aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2010-01-04 19:17:33 -0500
committerTejun Heo <tj@kernel.org>2010-01-04 19:17:33 -0500
commit32032df6c2f6c9c6b2ada2ce42322231824f70c2 (patch)
treeb1ce838a37044bb38dfc128e2116ca35630e629a /arch/x86/kernel
parent22b737f4c75197372d64afc6ed1bccd58c00e549 (diff)
parentc5974b835a909ff15c3b7e6cf6789b5eb919f419 (diff)
Merge branch 'master' into percpu
Conflicts: arch/powerpc/platforms/pseries/hvCall.S include/linux/percpu.h
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile3
-rw-r--r--arch/x86/kernel/acpi/Makefile2
-rw-r--r--arch/x86/kernel/acpi/boot.c3
-rw-r--r--arch/x86/kernel/acpi/cstate.c2
-rw-r--r--arch/x86/kernel/acpi/processor.c100
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.lds.S3
-rw-r--r--arch/x86/kernel/acpi/sleep.c26
-rw-r--r--arch/x86/kernel/amd_iommu.c1295
-rw-r--r--arch/x86/kernel/amd_iommu_init.c140
-rw-r--r--arch/x86/kernel/aperture_64.c13
-rw-r--r--arch/x86/kernel/apic/Makefile2
-rw-r--r--arch/x86/kernel/apic/apic.c38
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c5
-rw-r--r--arch/x86/kernel/apic/apic_noop.c200
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c18
-rw-r--r--arch/x86/kernel/apic/es7000_32.c28
-rw-r--r--arch/x86/kernel/apic/io_apic.c449
-rw-r--r--arch/x86/kernel/apic/nmi.c11
-rw-r--r--arch/x86/kernel/apic/numaq_32.c18
-rw-r--r--arch/x86/kernel/apic/probe_32.c2
-rw-r--r--arch/x86/kernel/apic/summit_32.c10
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c5
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c5
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c52
-rw-r--r--arch/x86/kernel/apm_32.c14
-rw-r--r--arch/x86/kernel/bios_uv.c8
-rw-r--r--arch/x86/kernel/cpu/Makefile1
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c15
-rw-r--r--arch/x86/kernel/cpu/amd.c57
-rw-r--r--arch/x86/kernel/cpu/centaur.c2
-rw-r--r--arch/x86/kernel/cpu/common.c44
-rw-r--r--arch/x86/kernel/cpu/cpu.h2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c44
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.c4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k6.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.c19
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c34
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c21
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-lib.c6
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-lib.h24
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-smi.c2
-rw-r--r--arch/x86/kernel/cpu/cyrix.c2
-rw-r--r--arch/x86/kernel/cpu/intel.c9
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c34
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c22
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c155
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c1
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c45
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c53
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c28
-rw-r--r--arch/x86/kernel/cpu/perf_event.c241
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c2
-rw-r--r--arch/x86/kernel/cpu/transmeta.c2
-rw-r--r--arch/x86/kernel/cpuid.c22
-rw-r--r--arch/x86/kernel/crash.c5
-rw-r--r--arch/x86/kernel/crash_dump_32.c19
-rw-r--r--arch/x86/kernel/dumpstack.c48
-rw-r--r--arch/x86/kernel/dumpstack.h6
-rw-r--r--arch/x86/kernel/dumpstack_32.c9
-rw-r--r--arch/x86/kernel/dumpstack_64.c83
-rw-r--r--arch/x86/kernel/e820.c17
-rw-r--r--arch/x86/kernel/early_printk.c5
-rw-r--r--arch/x86/kernel/efi.c2
-rw-r--r--arch/x86/kernel/entry_32.S100
-rw-r--r--arch/x86/kernel/entry_64.S84
-rw-r--r--arch/x86/kernel/ftrace.c101
-rw-r--r--arch/x86/kernel/geode_32.c196
-rw-r--r--arch/x86/kernel/head32.c2
-rw-r--r--arch/x86/kernel/head64.c2
-rw-r--r--arch/x86/kernel/head_32.S18
-rw-r--r--arch/x86/kernel/head_64.S7
-rw-r--r--arch/x86/kernel/hpet.c77
-rw-r--r--arch/x86/kernel/hw_breakpoint.c554
-rw-r--r--arch/x86/kernel/i386_ksyms_32.c2
-rw-r--r--arch/x86/kernel/ioport.c28
-rw-r--r--arch/x86/kernel/irq.c130
-rw-r--r--arch/x86/kernel/irq_32.c45
-rw-r--r--arch/x86/kernel/irq_64.c58
-rw-r--r--arch/x86/kernel/irqinit.c4
-rw-r--r--arch/x86/kernel/kgdb.c21
-rw-r--r--arch/x86/kernel/kprobes.c261
-rw-r--r--arch/x86/kernel/machine_kexec_32.c8
-rw-r--r--arch/x86/kernel/machine_kexec_64.c2
-rw-r--r--arch/x86/kernel/mfgpt_32.c410
-rw-r--r--arch/x86/kernel/microcode_amd.c99
-rw-r--r--arch/x86/kernel/microcode_core.c34
-rw-r--r--arch/x86/kernel/microcode_intel.c47
-rw-r--r--arch/x86/kernel/mpparse.c47
-rw-r--r--arch/x86/kernel/msr.c23
-rw-r--r--arch/x86/kernel/olpc.c4
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c4
-rw-r--r--arch/x86/kernel/pci-calgary_64.c100
-rw-r--r--arch/x86/kernel/pci-dma.c49
-rw-r--r--arch/x86/kernel/pci-gart_64.c164
-rw-r--r--arch/x86/kernel/pci-nommu.c11
-rw-r--r--arch/x86/kernel/pci-swiotlb.c21
-rw-r--r--arch/x86/kernel/process.c114
-rw-r--r--arch/x86/kernel/process_32.c111
-rw-r--r--arch/x86/kernel/process_64.c122
-rw-r--r--arch/x86/kernel/ptrace.c480
-rw-r--r--arch/x86/kernel/quirks.c9
-rw-r--r--arch/x86/kernel/reboot.c21
-rw-r--r--arch/x86/kernel/reboot_fixups_32.c3
-rw-r--r--arch/x86/kernel/setup.c123
-rw-r--r--arch/x86/kernel/setup_percpu.c13
-rw-r--r--arch/x86/kernel/signal.c24
-rw-r--r--arch/x86/kernel/smpboot.c58
-rw-r--r--arch/x86/kernel/stacktrace.c18
-rw-r--r--arch/x86/kernel/sys_i386_32.c27
-rw-r--r--arch/x86/kernel/sys_x86_64.c17
-rw-r--r--arch/x86/kernel/syscall_table_32.S3
-rw-r--r--arch/x86/kernel/time.c3
-rw-r--r--arch/x86/kernel/tlb_uv.c13
-rw-r--r--arch/x86/kernel/trampoline.c30
-rw-r--r--arch/x86/kernel/trampoline_64.S4
-rw-r--r--arch/x86/kernel/traps.c73
-rw-r--r--arch/x86/kernel/tsc.c1
-rw-r--r--arch/x86/kernel/tsc_sync.c23
-rw-r--r--arch/x86/kernel/uv_irq.c238
-rw-r--r--arch/x86/kernel/uv_time.c80
-rw-r--r--arch/x86/kernel/visws_quirks.c10
-rw-r--r--arch/x86/kernel/vm86_32.c11
-rw-r--r--arch/x86/kernel/vmi_32.c2
-rw-r--r--arch/x86/kernel/vmiclock_32.c2
-rw-r--r--arch/x86/kernel/vmlinux.lds.S45
-rw-r--r--arch/x86/kernel/vsyscall_64.c7
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c11
-rw-r--r--arch/x86/kernel/x86_init.c10
128 files changed, 4450 insertions, 3508 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d8e5d0cdd678..d87f09bc5a52 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -40,7 +40,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
40obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o 40obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
41obj-y += bootflag.o e820.o 41obj-y += bootflag.o e820.o
42obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o 42obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
43obj-y += alternative.o i8253.o pci-nommu.o 43obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
44obj-y += tsc.o io_delay.o rtc.o 44obj-y += tsc.o io_delay.o rtc.o
45 45
46obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o 46obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
@@ -89,7 +89,6 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
89obj-$(CONFIG_HPET_TIMER) += hpet.o 89obj-$(CONFIG_HPET_TIMER) += hpet.o
90 90
91obj-$(CONFIG_K8_NB) += k8.o 91obj-$(CONFIG_K8_NB) += k8.o
92obj-$(CONFIG_MGEODE_LX) += geode_32.o mfgpt_32.o
93obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o 92obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
94obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o 93obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
95 94
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index fd5ca97a2ad5..6f35260bb3ef 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -4,7 +4,7 @@ obj-$(CONFIG_ACPI) += boot.o
4obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_rm.o wakeup_$(BITS).o 4obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_rm.o wakeup_$(BITS).o
5 5
6ifneq ($(CONFIG_ACPI_PROCESSOR),) 6ifneq ($(CONFIG_ACPI_PROCESSOR),)
7obj-y += cstate.o processor.o 7obj-y += cstate.o
8endif 8endif
9 9
10$(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin 10$(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 67e929b89875..fb1035cd9a6a 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -624,6 +624,7 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
624 } 624 }
625 625
626 hpet_address = hpet_tbl->address.address; 626 hpet_address = hpet_tbl->address.address;
627 hpet_blockid = hpet_tbl->sequence;
627 628
628 /* 629 /*
629 * Some broken BIOSes advertise HPET at 0x0. We really do not 630 * Some broken BIOSes advertise HPET at 0x0. We really do not
@@ -1122,7 +1123,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
1122 if (!acpi_sci_override_gsi) 1123 if (!acpi_sci_override_gsi)
1123 acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0); 1124 acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0);
1124 1125
1125 /* Fill in identity legacy mapings where no override */ 1126 /* Fill in identity legacy mappings where no override */
1126 mp_config_acpi_legacy_irqs(); 1127 mp_config_acpi_legacy_irqs();
1127 1128
1128 count = 1129 count =
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 59cdfa4686b2..2e837f5080fe 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -48,7 +48,7 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
48 * P4, Core and beyond CPUs 48 * P4, Core and beyond CPUs
49 */ 49 */
50 if (c->x86_vendor == X86_VENDOR_INTEL && 50 if (c->x86_vendor == X86_VENDOR_INTEL &&
51 (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 14))) 51 (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f)))
52 flags->bm_control = 0; 52 flags->bm_control = 0;
53} 53}
54EXPORT_SYMBOL(acpi_processor_power_init_bm_check); 54EXPORT_SYMBOL(acpi_processor_power_init_bm_check);
diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c
deleted file mode 100644
index d296f4a195c9..000000000000
--- a/arch/x86/kernel/acpi/processor.c
+++ /dev/null
@@ -1,100 +0,0 @@
1/*
2 * Copyright (C) 2005 Intel Corporation
3 * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
4 * - Added _PDC for platforms with Intel CPUs
5 */
6
7#include <linux/kernel.h>
8#include <linux/module.h>
9#include <linux/init.h>
10#include <linux/acpi.h>
11
12#include <acpi/processor.h>
13#include <asm/acpi.h>
14
15static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c)
16{
17 struct acpi_object_list *obj_list;
18 union acpi_object *obj;
19 u32 *buf;
20
21 /* allocate and initialize pdc. It will be used later. */
22 obj_list = kmalloc(sizeof(struct acpi_object_list), GFP_KERNEL);
23 if (!obj_list) {
24 printk(KERN_ERR "Memory allocation error\n");
25 return;
26 }
27
28 obj = kmalloc(sizeof(union acpi_object), GFP_KERNEL);
29 if (!obj) {
30 printk(KERN_ERR "Memory allocation error\n");
31 kfree(obj_list);
32 return;
33 }
34
35 buf = kmalloc(12, GFP_KERNEL);
36 if (!buf) {
37 printk(KERN_ERR "Memory allocation error\n");
38 kfree(obj);
39 kfree(obj_list);
40 return;
41 }
42
43 buf[0] = ACPI_PDC_REVISION_ID;
44 buf[1] = 1;
45 buf[2] = ACPI_PDC_C_CAPABILITY_SMP;
46
47 /*
48 * The default of PDC_SMP_T_SWCOORD bit is set for intel x86 cpu so
49 * that OSPM is capable of native ACPI throttling software
50 * coordination using BIOS supplied _TSD info.
51 */
52 buf[2] |= ACPI_PDC_SMP_T_SWCOORD;
53 if (cpu_has(c, X86_FEATURE_EST))
54 buf[2] |= ACPI_PDC_EST_CAPABILITY_SWSMP;
55
56 if (cpu_has(c, X86_FEATURE_ACPI))
57 buf[2] |= ACPI_PDC_T_FFH;
58
59 /*
60 * If mwait/monitor is unsupported, C2/C3_FFH will be disabled
61 */
62 if (!cpu_has(c, X86_FEATURE_MWAIT))
63 buf[2] &= ~(ACPI_PDC_C_C2C3_FFH);
64
65 obj->type = ACPI_TYPE_BUFFER;
66 obj->buffer.length = 12;
67 obj->buffer.pointer = (u8 *) buf;
68 obj_list->count = 1;
69 obj_list->pointer = obj;
70 pr->pdc = obj_list;
71
72 return;
73}
74
75
76/* Initialize _PDC data based on the CPU vendor */
77void arch_acpi_processor_init_pdc(struct acpi_processor *pr)
78{
79 struct cpuinfo_x86 *c = &cpu_data(pr->id);
80
81 pr->pdc = NULL;
82 if (c->x86_vendor == X86_VENDOR_INTEL)
83 init_intel_pdc(pr, c);
84
85 return;
86}
87
88EXPORT_SYMBOL(arch_acpi_processor_init_pdc);
89
90void arch_acpi_processor_cleanup_pdc(struct acpi_processor *pr)
91{
92 if (pr->pdc) {
93 kfree(pr->pdc->pointer->buffer.pointer);
94 kfree(pr->pdc->pointer);
95 kfree(pr->pdc);
96 pr->pdc = NULL;
97 }
98}
99
100EXPORT_SYMBOL(arch_acpi_processor_cleanup_pdc);
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
index 7da00b799cda..060fff8f5c5b 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
@@ -57,5 +57,8 @@ SECTIONS
57 *(.note*) 57 *(.note*)
58 } 58 }
59 59
60 /*
61 * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
62 */
60 . = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!"); 63 . = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!");
61} 64}
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index ca93638ba430..f9961034e557 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -78,12 +78,9 @@ int acpi_save_state_mem(void)
78#ifndef CONFIG_64BIT 78#ifndef CONFIG_64BIT
79 store_gdt((struct desc_ptr *)&header->pmode_gdt); 79 store_gdt((struct desc_ptr *)&header->pmode_gdt);
80 80
81 header->pmode_efer_low = nx_enabled; 81 if (rdmsr_safe(MSR_EFER, &header->pmode_efer_low,
82 if (header->pmode_efer_low & 1) { 82 &header->pmode_efer_high))
83 /* This is strange, why not save efer, always? */ 83 header->pmode_efer_low = header->pmode_efer_high = 0;
84 rdmsr(MSR_EFER, header->pmode_efer_low,
85 header->pmode_efer_high);
86 }
87#endif /* !CONFIG_64BIT */ 84#endif /* !CONFIG_64BIT */
88 85
89 header->pmode_cr0 = read_cr0(); 86 header->pmode_cr0 = read_cr0();
@@ -119,29 +116,32 @@ void acpi_restore_state_mem(void)
119 116
120 117
121/** 118/**
122 * acpi_reserve_bootmem - do _very_ early ACPI initialisation 119 * acpi_reserve_wakeup_memory - do _very_ early ACPI initialisation
123 * 120 *
124 * We allocate a page from the first 1MB of memory for the wakeup 121 * We allocate a page from the first 1MB of memory for the wakeup
125 * routine for when we come back from a sleep state. The 122 * routine for when we come back from a sleep state. The
126 * runtime allocator allows specification of <16MB pages, but not 123 * runtime allocator allows specification of <16MB pages, but not
127 * <1MB pages. 124 * <1MB pages.
128 */ 125 */
129void __init acpi_reserve_bootmem(void) 126void __init acpi_reserve_wakeup_memory(void)
130{ 127{
128 unsigned long mem;
129
131 if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) { 130 if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) {
132 printk(KERN_ERR 131 printk(KERN_ERR
133 "ACPI: Wakeup code way too big, S3 disabled.\n"); 132 "ACPI: Wakeup code way too big, S3 disabled.\n");
134 return; 133 return;
135 } 134 }
136 135
137 acpi_realmode = (unsigned long)alloc_bootmem_low(WAKEUP_SIZE); 136 mem = find_e820_area(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE);
138 137
139 if (!acpi_realmode) { 138 if (mem == -1L) {
140 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); 139 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
141 return; 140 return;
142 } 141 }
143 142 acpi_realmode = (unsigned long) phys_to_virt(mem);
144 acpi_wakeup_address = virt_to_phys((void *)acpi_realmode); 143 acpi_wakeup_address = mem;
144 reserve_early(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP");
145} 145}
146 146
147 147
@@ -162,6 +162,8 @@ static int __init acpi_sleep_setup(char *str)
162#endif 162#endif
163 if (strncmp(str, "old_ordering", 12) == 0) 163 if (strncmp(str, "old_ordering", 12) == 0)
164 acpi_old_suspend_ordering(); 164 acpi_old_suspend_ordering();
165 if (strncmp(str, "sci_force_enable", 16) == 0)
166 acpi_set_sci_en_on_resume();
165 str = strchr(str, ','); 167 str = strchr(str, ',');
166 if (str != NULL) 168 if (str != NULL)
167 str += strspn(str, ", \t"); 169 str += strspn(str, ", \t");
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 98f230f6a28d..23824fef789c 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -19,7 +19,7 @@
19 19
20#include <linux/pci.h> 20#include <linux/pci.h>
21#include <linux/gfp.h> 21#include <linux/gfp.h>
22#include <linux/bitops.h> 22#include <linux/bitmap.h>
23#include <linux/debugfs.h> 23#include <linux/debugfs.h>
24#include <linux/scatterlist.h> 24#include <linux/scatterlist.h>
25#include <linux/dma-mapping.h> 25#include <linux/dma-mapping.h>
@@ -28,6 +28,7 @@
28#include <asm/proto.h> 28#include <asm/proto.h>
29#include <asm/iommu.h> 29#include <asm/iommu.h>
30#include <asm/gart.h> 30#include <asm/gart.h>
31#include <asm/amd_iommu_proto.h>
31#include <asm/amd_iommu_types.h> 32#include <asm/amd_iommu_types.h>
32#include <asm/amd_iommu.h> 33#include <asm/amd_iommu.h>
33 34
@@ -56,20 +57,152 @@ struct iommu_cmd {
56 u32 data[4]; 57 u32 data[4];
57}; 58};
58 59
59static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
60 struct unity_map_entry *e);
61static struct dma_ops_domain *find_protection_domain(u16 devid);
62static u64 *alloc_pte(struct protection_domain *domain,
63 unsigned long address, int end_lvl,
64 u64 **pte_page, gfp_t gfp);
65static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
66 unsigned long start_page,
67 unsigned int pages);
68static void reset_iommu_command_buffer(struct amd_iommu *iommu); 60static void reset_iommu_command_buffer(struct amd_iommu *iommu);
69static u64 *fetch_pte(struct protection_domain *domain,
70 unsigned long address, int map_size);
71static void update_domain(struct protection_domain *domain); 61static void update_domain(struct protection_domain *domain);
72 62
63/****************************************************************************
64 *
65 * Helper functions
66 *
67 ****************************************************************************/
68
69static inline u16 get_device_id(struct device *dev)
70{
71 struct pci_dev *pdev = to_pci_dev(dev);
72
73 return calc_devid(pdev->bus->number, pdev->devfn);
74}
75
76static struct iommu_dev_data *get_dev_data(struct device *dev)
77{
78 return dev->archdata.iommu;
79}
80
81/*
82 * In this function the list of preallocated protection domains is traversed to
83 * find the domain for a specific device
84 */
85static struct dma_ops_domain *find_protection_domain(u16 devid)
86{
87 struct dma_ops_domain *entry, *ret = NULL;
88 unsigned long flags;
89 u16 alias = amd_iommu_alias_table[devid];
90
91 if (list_empty(&iommu_pd_list))
92 return NULL;
93
94 spin_lock_irqsave(&iommu_pd_list_lock, flags);
95
96 list_for_each_entry(entry, &iommu_pd_list, list) {
97 if (entry->target_dev == devid ||
98 entry->target_dev == alias) {
99 ret = entry;
100 break;
101 }
102 }
103
104 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
105
106 return ret;
107}
108
109/*
110 * This function checks if the driver got a valid device from the caller to
111 * avoid dereferencing invalid pointers.
112 */
113static bool check_device(struct device *dev)
114{
115 u16 devid;
116
117 if (!dev || !dev->dma_mask)
118 return false;
119
120 /* No device or no PCI device */
121 if (!dev || dev->bus != &pci_bus_type)
122 return false;
123
124 devid = get_device_id(dev);
125
126 /* Out of our scope? */
127 if (devid > amd_iommu_last_bdf)
128 return false;
129
130 if (amd_iommu_rlookup_table[devid] == NULL)
131 return false;
132
133 return true;
134}
135
136static int iommu_init_device(struct device *dev)
137{
138 struct iommu_dev_data *dev_data;
139 struct pci_dev *pdev;
140 u16 devid, alias;
141
142 if (dev->archdata.iommu)
143 return 0;
144
145 dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
146 if (!dev_data)
147 return -ENOMEM;
148
149 dev_data->dev = dev;
150
151 devid = get_device_id(dev);
152 alias = amd_iommu_alias_table[devid];
153 pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff);
154 if (pdev)
155 dev_data->alias = &pdev->dev;
156
157 atomic_set(&dev_data->bind, 0);
158
159 dev->archdata.iommu = dev_data;
160
161
162 return 0;
163}
164
165static void iommu_uninit_device(struct device *dev)
166{
167 kfree(dev->archdata.iommu);
168}
169
170void __init amd_iommu_uninit_devices(void)
171{
172 struct pci_dev *pdev = NULL;
173
174 for_each_pci_dev(pdev) {
175
176 if (!check_device(&pdev->dev))
177 continue;
178
179 iommu_uninit_device(&pdev->dev);
180 }
181}
182
183int __init amd_iommu_init_devices(void)
184{
185 struct pci_dev *pdev = NULL;
186 int ret = 0;
187
188 for_each_pci_dev(pdev) {
189
190 if (!check_device(&pdev->dev))
191 continue;
192
193 ret = iommu_init_device(&pdev->dev);
194 if (ret)
195 goto out_free;
196 }
197
198 return 0;
199
200out_free:
201
202 amd_iommu_uninit_devices();
203
204 return ret;
205}
73#ifdef CONFIG_AMD_IOMMU_STATS 206#ifdef CONFIG_AMD_IOMMU_STATS
74 207
75/* 208/*
@@ -90,7 +223,6 @@ DECLARE_STATS_COUNTER(alloced_io_mem);
90DECLARE_STATS_COUNTER(total_map_requests); 223DECLARE_STATS_COUNTER(total_map_requests);
91 224
92static struct dentry *stats_dir; 225static struct dentry *stats_dir;
93static struct dentry *de_isolate;
94static struct dentry *de_fflush; 226static struct dentry *de_fflush;
95 227
96static void amd_iommu_stats_add(struct __iommu_counter *cnt) 228static void amd_iommu_stats_add(struct __iommu_counter *cnt)
@@ -108,9 +240,6 @@ static void amd_iommu_stats_init(void)
108 if (stats_dir == NULL) 240 if (stats_dir == NULL)
109 return; 241 return;
110 242
111 de_isolate = debugfs_create_bool("isolation", 0444, stats_dir,
112 (u32 *)&amd_iommu_isolate);
113
114 de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir, 243 de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir,
115 (u32 *)&amd_iommu_unmap_flush); 244 (u32 *)&amd_iommu_unmap_flush);
116 245
@@ -130,12 +259,6 @@ static void amd_iommu_stats_init(void)
130 259
131#endif 260#endif
132 261
133/* returns !0 if the IOMMU is caching non-present entries in its TLB */
134static int iommu_has_npcache(struct amd_iommu *iommu)
135{
136 return iommu->cap & (1UL << IOMMU_CAP_NPCACHE);
137}
138
139/**************************************************************************** 262/****************************************************************************
140 * 263 *
141 * Interrupt handling functions 264 * Interrupt handling functions
@@ -199,6 +322,7 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
199 break; 322 break;
200 case EVENT_TYPE_ILL_CMD: 323 case EVENT_TYPE_ILL_CMD:
201 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); 324 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
325 iommu->reset_in_progress = true;
202 reset_iommu_command_buffer(iommu); 326 reset_iommu_command_buffer(iommu);
203 dump_command(address); 327 dump_command(address);
204 break; 328 break;
@@ -321,11 +445,8 @@ static void __iommu_wait_for_completion(struct amd_iommu *iommu)
321 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; 445 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
322 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); 446 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
323 447
324 if (unlikely(i == EXIT_LOOP_COUNT)) { 448 if (unlikely(i == EXIT_LOOP_COUNT))
325 spin_unlock(&iommu->lock); 449 iommu->reset_in_progress = true;
326 reset_iommu_command_buffer(iommu);
327 spin_lock(&iommu->lock);
328 }
329} 450}
330 451
331/* 452/*
@@ -372,26 +493,46 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
372out: 493out:
373 spin_unlock_irqrestore(&iommu->lock, flags); 494 spin_unlock_irqrestore(&iommu->lock, flags);
374 495
496 if (iommu->reset_in_progress)
497 reset_iommu_command_buffer(iommu);
498
375 return 0; 499 return 0;
376} 500}
377 501
502static void iommu_flush_complete(struct protection_domain *domain)
503{
504 int i;
505
506 for (i = 0; i < amd_iommus_present; ++i) {
507 if (!domain->dev_iommu[i])
508 continue;
509
510 /*
511 * Devices of this domain are behind this IOMMU
512 * We need to wait for completion of all commands.
513 */
514 iommu_completion_wait(amd_iommus[i]);
515 }
516}
517
378/* 518/*
379 * Command send function for invalidating a device table entry 519 * Command send function for invalidating a device table entry
380 */ 520 */
381static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) 521static int iommu_flush_device(struct device *dev)
382{ 522{
523 struct amd_iommu *iommu;
383 struct iommu_cmd cmd; 524 struct iommu_cmd cmd;
384 int ret; 525 u16 devid;
385 526
386 BUG_ON(iommu == NULL); 527 devid = get_device_id(dev);
528 iommu = amd_iommu_rlookup_table[devid];
387 529
530 /* Build command */
388 memset(&cmd, 0, sizeof(cmd)); 531 memset(&cmd, 0, sizeof(cmd));
389 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); 532 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
390 cmd.data[0] = devid; 533 cmd.data[0] = devid;
391 534
392 ret = iommu_queue_command(iommu, &cmd); 535 return iommu_queue_command(iommu, &cmd);
393
394 return ret;
395} 536}
396 537
397static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, 538static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
@@ -430,11 +571,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
430 * It invalidates a single PTE if the range to flush is within a single 571 * It invalidates a single PTE if the range to flush is within a single
431 * page. Otherwise it flushes the whole TLB of the IOMMU. 572 * page. Otherwise it flushes the whole TLB of the IOMMU.
432 */ 573 */
433static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, 574static void __iommu_flush_pages(struct protection_domain *domain,
434 u64 address, size_t size) 575 u64 address, size_t size, int pde)
435{ 576{
436 int s = 0; 577 int s = 0, i;
437 unsigned pages = iommu_num_pages(address, size, PAGE_SIZE); 578 unsigned long pages = iommu_num_pages(address, size, PAGE_SIZE);
438 579
439 address &= PAGE_MASK; 580 address &= PAGE_MASK;
440 581
@@ -447,142 +588,212 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
447 s = 1; 588 s = 1;
448 } 589 }
449 590
450 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, s);
451 591
452 return 0; 592 for (i = 0; i < amd_iommus_present; ++i) {
593 if (!domain->dev_iommu[i])
594 continue;
595
596 /*
597 * Devices of this domain are behind this IOMMU
598 * We need a TLB flush
599 */
600 iommu_queue_inv_iommu_pages(amd_iommus[i], address,
601 domain->id, pde, s);
602 }
603
604 return;
453} 605}
454 606
455/* Flush the whole IO/TLB for a given protection domain */ 607static void iommu_flush_pages(struct protection_domain *domain,
456static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid) 608 u64 address, size_t size)
457{ 609{
458 u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; 610 __iommu_flush_pages(domain, address, size, 0);
459 611}
460 INC_STATS_COUNTER(domain_flush_single);
461 612
462 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1); 613/* Flush the whole IO/TLB for a given protection domain */
614static void iommu_flush_tlb(struct protection_domain *domain)
615{
616 __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
463} 617}
464 618
465/* Flush the whole IO/TLB for a given protection domain - including PDE */ 619/* Flush the whole IO/TLB for a given protection domain - including PDE */
466static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid) 620static void iommu_flush_tlb_pde(struct protection_domain *domain)
467{ 621{
468 u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; 622 __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
469
470 INC_STATS_COUNTER(domain_flush_single);
471
472 iommu_queue_inv_iommu_pages(iommu, address, domid, 1, 1);
473} 623}
474 624
625
475/* 626/*
476 * This function flushes one domain on one IOMMU 627 * This function flushes the DTEs for all devices in domain
477 */ 628 */
478static void flush_domain_on_iommu(struct amd_iommu *iommu, u16 domid) 629static void iommu_flush_domain_devices(struct protection_domain *domain)
479{ 630{
480 struct iommu_cmd cmd; 631 struct iommu_dev_data *dev_data;
481 unsigned long flags; 632 unsigned long flags;
482 633
483 __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 634 spin_lock_irqsave(&domain->lock, flags);
484 domid, 1, 1);
485 635
486 spin_lock_irqsave(&iommu->lock, flags); 636 list_for_each_entry(dev_data, &domain->dev_list, list)
487 __iommu_queue_command(iommu, &cmd); 637 iommu_flush_device(dev_data->dev);
488 __iommu_completion_wait(iommu); 638
489 __iommu_wait_for_completion(iommu); 639 spin_unlock_irqrestore(&domain->lock, flags);
490 spin_unlock_irqrestore(&iommu->lock, flags);
491} 640}
492 641
493static void flush_all_domains_on_iommu(struct amd_iommu *iommu) 642static void iommu_flush_all_domain_devices(void)
494{ 643{
495 int i; 644 struct protection_domain *domain;
645 unsigned long flags;
496 646
497 for (i = 1; i < MAX_DOMAIN_ID; ++i) { 647 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
498 if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) 648
499 continue; 649 list_for_each_entry(domain, &amd_iommu_pd_list, list) {
500 flush_domain_on_iommu(iommu, i); 650 iommu_flush_domain_devices(domain);
651 iommu_flush_complete(domain);
501 } 652 }
502 653
654 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
655}
656
657void amd_iommu_flush_all_devices(void)
658{
659 iommu_flush_all_domain_devices();
503} 660}
504 661
505/* 662/*
506 * This function is used to flush the IO/TLB for a given protection domain 663 * This function uses heavy locking and may disable irqs for some time. But
507 * on every IOMMU in the system 664 * this is no issue because it is only called during resume.
508 */ 665 */
509static void iommu_flush_domain(u16 domid) 666void amd_iommu_flush_all_domains(void)
510{ 667{
511 struct amd_iommu *iommu; 668 struct protection_domain *domain;
669 unsigned long flags;
512 670
513 INC_STATS_COUNTER(domain_flush_all); 671 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
514 672
515 for_each_iommu(iommu) 673 list_for_each_entry(domain, &amd_iommu_pd_list, list) {
516 flush_domain_on_iommu(iommu, domid); 674 spin_lock(&domain->lock);
675 iommu_flush_tlb_pde(domain);
676 iommu_flush_complete(domain);
677 spin_unlock(&domain->lock);
678 }
679
680 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
517} 681}
518 682
519void amd_iommu_flush_all_domains(void) 683static void reset_iommu_command_buffer(struct amd_iommu *iommu)
520{ 684{
521 struct amd_iommu *iommu; 685 pr_err("AMD-Vi: Resetting IOMMU command buffer\n");
522 686
523 for_each_iommu(iommu) 687 if (iommu->reset_in_progress)
524 flush_all_domains_on_iommu(iommu); 688 panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n");
689
690 amd_iommu_reset_cmd_buffer(iommu);
691 amd_iommu_flush_all_devices();
692 amd_iommu_flush_all_domains();
693
694 iommu->reset_in_progress = false;
525} 695}
526 696
527static void flush_all_devices_for_iommu(struct amd_iommu *iommu) 697/****************************************************************************
698 *
699 * The functions below are used the create the page table mappings for
700 * unity mapped regions.
701 *
702 ****************************************************************************/
703
704/*
705 * This function is used to add another level to an IO page table. Adding
706 * another level increases the size of the address space by 9 bits to a size up
707 * to 64 bits.
708 */
709static bool increase_address_space(struct protection_domain *domain,
710 gfp_t gfp)
528{ 711{
529 int i; 712 u64 *pte;
530 713
531 for (i = 0; i <= amd_iommu_last_bdf; ++i) { 714 if (domain->mode == PAGE_MODE_6_LEVEL)
532 if (iommu != amd_iommu_rlookup_table[i]) 715 /* address space already 64 bit large */
533 continue; 716 return false;
534 717
535 iommu_queue_inv_dev_entry(iommu, i); 718 pte = (void *)get_zeroed_page(gfp);
536 iommu_completion_wait(iommu); 719 if (!pte)
537 } 720 return false;
721
722 *pte = PM_LEVEL_PDE(domain->mode,
723 virt_to_phys(domain->pt_root));
724 domain->pt_root = pte;
725 domain->mode += 1;
726 domain->updated = true;
727
728 return true;
538} 729}
539 730
540static void flush_devices_by_domain(struct protection_domain *domain) 731static u64 *alloc_pte(struct protection_domain *domain,
732 unsigned long address,
733 int end_lvl,
734 u64 **pte_page,
735 gfp_t gfp)
541{ 736{
542 struct amd_iommu *iommu; 737 u64 *pte, *page;
543 int i; 738 int level;
544 739
545 for (i = 0; i <= amd_iommu_last_bdf; ++i) { 740 while (address > PM_LEVEL_SIZE(domain->mode))
546 if ((domain == NULL && amd_iommu_pd_table[i] == NULL) || 741 increase_address_space(domain, gfp);
547 (amd_iommu_pd_table[i] != domain))
548 continue;
549 742
550 iommu = amd_iommu_rlookup_table[i]; 743 level = domain->mode - 1;
551 if (!iommu) 744 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
552 continue; 745
746 while (level > end_lvl) {
747 if (!IOMMU_PTE_PRESENT(*pte)) {
748 page = (u64 *)get_zeroed_page(gfp);
749 if (!page)
750 return NULL;
751 *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
752 }
753
754 level -= 1;
755
756 pte = IOMMU_PTE_PAGE(*pte);
757
758 if (pte_page && level == end_lvl)
759 *pte_page = pte;
553 760
554 iommu_queue_inv_dev_entry(iommu, i); 761 pte = &pte[PM_LEVEL_INDEX(level, address)];
555 iommu_completion_wait(iommu);
556 } 762 }
763
764 return pte;
557} 765}
558 766
559static void reset_iommu_command_buffer(struct amd_iommu *iommu) 767/*
768 * This function checks if there is a PTE for a given dma address. If
769 * there is one, it returns the pointer to it.
770 */
771static u64 *fetch_pte(struct protection_domain *domain,
772 unsigned long address, int map_size)
560{ 773{
561 pr_err("AMD-Vi: Resetting IOMMU command buffer\n"); 774 int level;
775 u64 *pte;
562 776
563 if (iommu->reset_in_progress) 777 level = domain->mode - 1;
564 panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n"); 778 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
565 779
566 iommu->reset_in_progress = true; 780 while (level > map_size) {
781 if (!IOMMU_PTE_PRESENT(*pte))
782 return NULL;
567 783
568 amd_iommu_reset_cmd_buffer(iommu); 784 level -= 1;
569 flush_all_devices_for_iommu(iommu);
570 flush_all_domains_on_iommu(iommu);
571 785
572 iommu->reset_in_progress = false; 786 pte = IOMMU_PTE_PAGE(*pte);
573} 787 pte = &pte[PM_LEVEL_INDEX(level, address)];
574 788
575void amd_iommu_flush_all_devices(void) 789 if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) {
576{ 790 pte = NULL;
577 flush_devices_by_domain(NULL); 791 break;
578} 792 }
793 }
579 794
580/**************************************************************************** 795 return pte;
581 * 796}
582 * The functions below are used the create the page table mappings for
583 * unity mapped regions.
584 *
585 ****************************************************************************/
586 797
587/* 798/*
588 * Generic mapping functions. It maps a physical address into a DMA 799 * Generic mapping functions. It maps a physical address into a DMA
@@ -654,28 +865,6 @@ static int iommu_for_unity_map(struct amd_iommu *iommu,
654} 865}
655 866
656/* 867/*
657 * Init the unity mappings for a specific IOMMU in the system
658 *
659 * Basically iterates over all unity mapping entries and applies them to
660 * the default domain DMA of that IOMMU if necessary.
661 */
662static int iommu_init_unity_mappings(struct amd_iommu *iommu)
663{
664 struct unity_map_entry *entry;
665 int ret;
666
667 list_for_each_entry(entry, &amd_iommu_unity_map, list) {
668 if (!iommu_for_unity_map(iommu, entry))
669 continue;
670 ret = dma_ops_unity_map(iommu->default_dom, entry);
671 if (ret)
672 return ret;
673 }
674
675 return 0;
676}
677
678/*
679 * This function actually applies the mapping to the page table of the 868 * This function actually applies the mapping to the page table of the
680 * dma_ops domain. 869 * dma_ops domain.
681 */ 870 */
@@ -704,6 +893,28 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
704} 893}
705 894
706/* 895/*
896 * Init the unity mappings for a specific IOMMU in the system
897 *
898 * Basically iterates over all unity mapping entries and applies them to
899 * the default domain DMA of that IOMMU if necessary.
900 */
901static int iommu_init_unity_mappings(struct amd_iommu *iommu)
902{
903 struct unity_map_entry *entry;
904 int ret;
905
906 list_for_each_entry(entry, &amd_iommu_unity_map, list) {
907 if (!iommu_for_unity_map(iommu, entry))
908 continue;
909 ret = dma_ops_unity_map(iommu->default_dom, entry);
910 if (ret)
911 return ret;
912 }
913
914 return 0;
915}
916
917/*
707 * Inits the unity mappings required for a specific device 918 * Inits the unity mappings required for a specific device
708 */ 919 */
709static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, 920static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
@@ -740,34 +951,23 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
740 */ 951 */
741 952
742/* 953/*
743 * This function checks if there is a PTE for a given dma address. If 954 * Used to reserve address ranges in the aperture (e.g. for exclusion
744 * there is one, it returns the pointer to it. 955 * ranges.
745 */ 956 */
746static u64 *fetch_pte(struct protection_domain *domain, 957static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
747 unsigned long address, int map_size) 958 unsigned long start_page,
959 unsigned int pages)
748{ 960{
749 int level; 961 unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
750 u64 *pte;
751
752 level = domain->mode - 1;
753 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
754
755 while (level > map_size) {
756 if (!IOMMU_PTE_PRESENT(*pte))
757 return NULL;
758
759 level -= 1;
760 962
761 pte = IOMMU_PTE_PAGE(*pte); 963 if (start_page + pages > last_page)
762 pte = &pte[PM_LEVEL_INDEX(level, address)]; 964 pages = last_page - start_page;
763 965
764 if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) { 966 for (i = start_page; i < start_page + pages; ++i) {
765 pte = NULL; 967 int index = i / APERTURE_RANGE_PAGES;
766 break; 968 int page = i % APERTURE_RANGE_PAGES;
767 } 969 __set_bit(page, dom->aperture[index]->bitmap);
768 } 970 }
769
770 return pte;
771} 971}
772 972
773/* 973/*
@@ -775,11 +975,11 @@ static u64 *fetch_pte(struct protection_domain *domain,
775 * aperture in case of dma_ops domain allocation or address allocation 975 * aperture in case of dma_ops domain allocation or address allocation
776 * failure. 976 * failure.
777 */ 977 */
778static int alloc_new_range(struct amd_iommu *iommu, 978static int alloc_new_range(struct dma_ops_domain *dma_dom,
779 struct dma_ops_domain *dma_dom,
780 bool populate, gfp_t gfp) 979 bool populate, gfp_t gfp)
781{ 980{
782 int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; 981 int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
982 struct amd_iommu *iommu;
783 int i; 983 int i;
784 984
785#ifdef CONFIG_IOMMU_STRESS 985#ifdef CONFIG_IOMMU_STRESS
@@ -819,14 +1019,17 @@ static int alloc_new_range(struct amd_iommu *iommu,
819 dma_dom->aperture_size += APERTURE_RANGE_SIZE; 1019 dma_dom->aperture_size += APERTURE_RANGE_SIZE;
820 1020
821 /* Intialize the exclusion range if necessary */ 1021 /* Intialize the exclusion range if necessary */
822 if (iommu->exclusion_start && 1022 for_each_iommu(iommu) {
823 iommu->exclusion_start >= dma_dom->aperture[index]->offset && 1023 if (iommu->exclusion_start &&
824 iommu->exclusion_start < dma_dom->aperture_size) { 1024 iommu->exclusion_start >= dma_dom->aperture[index]->offset
825 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; 1025 && iommu->exclusion_start < dma_dom->aperture_size) {
826 int pages = iommu_num_pages(iommu->exclusion_start, 1026 unsigned long startpage;
827 iommu->exclusion_length, 1027 int pages = iommu_num_pages(iommu->exclusion_start,
828 PAGE_SIZE); 1028 iommu->exclusion_length,
829 dma_ops_reserve_addresses(dma_dom, startpage, pages); 1029 PAGE_SIZE);
1030 startpage = iommu->exclusion_start >> PAGE_SHIFT;
1031 dma_ops_reserve_addresses(dma_dom, startpage, pages);
1032 }
830 } 1033 }
831 1034
832 /* 1035 /*
@@ -928,7 +1131,7 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
928 } 1131 }
929 1132
930 if (unlikely(address == -1)) 1133 if (unlikely(address == -1))
931 address = bad_dma_address; 1134 address = DMA_ERROR_CODE;
932 1135
933 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); 1136 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
934 1137
@@ -959,7 +1162,7 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
959 1162
960 address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT; 1163 address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
961 1164
962 iommu_area_free(range->bitmap, address, pages); 1165 bitmap_clear(range->bitmap, address, pages);
963 1166
964} 1167}
965 1168
@@ -973,6 +1176,31 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
973 * 1176 *
974 ****************************************************************************/ 1177 ****************************************************************************/
975 1178
1179/*
1180 * This function adds a protection domain to the global protection domain list
1181 */
1182static void add_domain_to_list(struct protection_domain *domain)
1183{
1184 unsigned long flags;
1185
1186 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
1187 list_add(&domain->list, &amd_iommu_pd_list);
1188 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
1189}
1190
1191/*
1192 * This function removes a protection domain to the global
1193 * protection domain list
1194 */
1195static void del_domain_from_list(struct protection_domain *domain)
1196{
1197 unsigned long flags;
1198
1199 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
1200 list_del(&domain->list);
1201 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
1202}
1203
976static u16 domain_id_alloc(void) 1204static u16 domain_id_alloc(void)
977{ 1205{
978 unsigned long flags; 1206 unsigned long flags;
@@ -1000,26 +1228,6 @@ static void domain_id_free(int id)
1000 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1228 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1001} 1229}
1002 1230
1003/*
1004 * Used to reserve address ranges in the aperture (e.g. for exclusion
1005 * ranges.
1006 */
1007static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
1008 unsigned long start_page,
1009 unsigned int pages)
1010{
1011 unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
1012
1013 if (start_page + pages > last_page)
1014 pages = last_page - start_page;
1015
1016 for (i = start_page; i < start_page + pages; ++i) {
1017 int index = i / APERTURE_RANGE_PAGES;
1018 int page = i % APERTURE_RANGE_PAGES;
1019 __set_bit(page, dom->aperture[index]->bitmap);
1020 }
1021}
1022
1023static void free_pagetable(struct protection_domain *domain) 1231static void free_pagetable(struct protection_domain *domain)
1024{ 1232{
1025 int i, j; 1233 int i, j;
@@ -1061,6 +1269,8 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
1061 if (!dom) 1269 if (!dom)
1062 return; 1270 return;
1063 1271
1272 del_domain_from_list(&dom->domain);
1273
1064 free_pagetable(&dom->domain); 1274 free_pagetable(&dom->domain);
1065 1275
1066 for (i = 0; i < APERTURE_MAX_RANGES; ++i) { 1276 for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
@@ -1078,7 +1288,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
1078 * It also intializes the page table and the address allocator data 1288 * It also intializes the page table and the address allocator data
1079 * structures required for the dma_ops interface 1289 * structures required for the dma_ops interface
1080 */ 1290 */
1081static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) 1291static struct dma_ops_domain *dma_ops_domain_alloc(void)
1082{ 1292{
1083 struct dma_ops_domain *dma_dom; 1293 struct dma_ops_domain *dma_dom;
1084 1294
@@ -1091,6 +1301,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
1091 dma_dom->domain.id = domain_id_alloc(); 1301 dma_dom->domain.id = domain_id_alloc();
1092 if (dma_dom->domain.id == 0) 1302 if (dma_dom->domain.id == 0)
1093 goto free_dma_dom; 1303 goto free_dma_dom;
1304 INIT_LIST_HEAD(&dma_dom->domain.dev_list);
1094 dma_dom->domain.mode = PAGE_MODE_2_LEVEL; 1305 dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
1095 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); 1306 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
1096 dma_dom->domain.flags = PD_DMA_OPS_MASK; 1307 dma_dom->domain.flags = PD_DMA_OPS_MASK;
@@ -1101,7 +1312,9 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
1101 dma_dom->need_flush = false; 1312 dma_dom->need_flush = false;
1102 dma_dom->target_dev = 0xffff; 1313 dma_dom->target_dev = 0xffff;
1103 1314
1104 if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL)) 1315 add_domain_to_list(&dma_dom->domain);
1316
1317 if (alloc_new_range(dma_dom, true, GFP_KERNEL))
1105 goto free_dma_dom; 1318 goto free_dma_dom;
1106 1319
1107 /* 1320 /*
@@ -1129,22 +1342,6 @@ static bool dma_ops_domain(struct protection_domain *domain)
1129 return domain->flags & PD_DMA_OPS_MASK; 1342 return domain->flags & PD_DMA_OPS_MASK;
1130} 1343}
1131 1344
1132/*
1133 * Find out the protection domain structure for a given PCI device. This
1134 * will give us the pointer to the page table root for example.
1135 */
1136static struct protection_domain *domain_for_device(u16 devid)
1137{
1138 struct protection_domain *dom;
1139 unsigned long flags;
1140
1141 read_lock_irqsave(&amd_iommu_devtable_lock, flags);
1142 dom = amd_iommu_pd_table[devid];
1143 read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1144
1145 return dom;
1146}
1147
1148static void set_dte_entry(u16 devid, struct protection_domain *domain) 1345static void set_dte_entry(u16 devid, struct protection_domain *domain)
1149{ 1346{
1150 u64 pte_root = virt_to_phys(domain->pt_root); 1347 u64 pte_root = virt_to_phys(domain->pt_root);
@@ -1156,42 +1353,123 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain)
1156 amd_iommu_dev_table[devid].data[2] = domain->id; 1353 amd_iommu_dev_table[devid].data[2] = domain->id;
1157 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); 1354 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
1158 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); 1355 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
1356}
1159 1357
1160 amd_iommu_pd_table[devid] = domain; 1358static void clear_dte_entry(u16 devid)
1359{
1360 /* remove entry from the device table seen by the hardware */
1361 amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
1362 amd_iommu_dev_table[devid].data[1] = 0;
1363 amd_iommu_dev_table[devid].data[2] = 0;
1364
1365 amd_iommu_apply_erratum_63(devid);
1366}
1367
1368static void do_attach(struct device *dev, struct protection_domain *domain)
1369{
1370 struct iommu_dev_data *dev_data;
1371 struct amd_iommu *iommu;
1372 u16 devid;
1373
1374 devid = get_device_id(dev);
1375 iommu = amd_iommu_rlookup_table[devid];
1376 dev_data = get_dev_data(dev);
1377
1378 /* Update data structures */
1379 dev_data->domain = domain;
1380 list_add(&dev_data->list, &domain->dev_list);
1381 set_dte_entry(devid, domain);
1382
1383 /* Do reference counting */
1384 domain->dev_iommu[iommu->index] += 1;
1385 domain->dev_cnt += 1;
1386
1387 /* Flush the DTE entry */
1388 iommu_flush_device(dev);
1389}
1390
1391static void do_detach(struct device *dev)
1392{
1393 struct iommu_dev_data *dev_data;
1394 struct amd_iommu *iommu;
1395 u16 devid;
1396
1397 devid = get_device_id(dev);
1398 iommu = amd_iommu_rlookup_table[devid];
1399 dev_data = get_dev_data(dev);
1400
1401 /* decrease reference counters */
1402 dev_data->domain->dev_iommu[iommu->index] -= 1;
1403 dev_data->domain->dev_cnt -= 1;
1404
1405 /* Update data structures */
1406 dev_data->domain = NULL;
1407 list_del(&dev_data->list);
1408 clear_dte_entry(devid);
1409
1410 /* Flush the DTE entry */
1411 iommu_flush_device(dev);
1161} 1412}
1162 1413
1163/* 1414/*
1164 * If a device is not yet associated with a domain, this function does 1415 * If a device is not yet associated with a domain, this function does
1165 * assigns it visible for the hardware 1416 * assigns it visible for the hardware
1166 */ 1417 */
1167static void __attach_device(struct amd_iommu *iommu, 1418static int __attach_device(struct device *dev,
1168 struct protection_domain *domain, 1419 struct protection_domain *domain)
1169 u16 devid)
1170{ 1420{
1421 struct iommu_dev_data *dev_data, *alias_data;
1422
1423 dev_data = get_dev_data(dev);
1424 alias_data = get_dev_data(dev_data->alias);
1425
1426 if (!alias_data)
1427 return -EINVAL;
1428
1171 /* lock domain */ 1429 /* lock domain */
1172 spin_lock(&domain->lock); 1430 spin_lock(&domain->lock);
1173 1431
1174 /* update DTE entry */ 1432 /* Some sanity checks */
1175 set_dte_entry(devid, domain); 1433 if (alias_data->domain != NULL &&
1434 alias_data->domain != domain)
1435 return -EBUSY;
1436
1437 if (dev_data->domain != NULL &&
1438 dev_data->domain != domain)
1439 return -EBUSY;
1176 1440
1177 domain->dev_cnt += 1; 1441 /* Do real assignment */
1442 if (dev_data->alias != dev) {
1443 alias_data = get_dev_data(dev_data->alias);
1444 if (alias_data->domain == NULL)
1445 do_attach(dev_data->alias, domain);
1446
1447 atomic_inc(&alias_data->bind);
1448 }
1449
1450 if (dev_data->domain == NULL)
1451 do_attach(dev, domain);
1452
1453 atomic_inc(&dev_data->bind);
1178 1454
1179 /* ready */ 1455 /* ready */
1180 spin_unlock(&domain->lock); 1456 spin_unlock(&domain->lock);
1457
1458 return 0;
1181} 1459}
1182 1460
1183/* 1461/*
1184 * If a device is not yet associated with a domain, this function does 1462 * If a device is not yet associated with a domain, this function does
1185 * assigns it visible for the hardware 1463 * assigns it visible for the hardware
1186 */ 1464 */
1187static void attach_device(struct amd_iommu *iommu, 1465static int attach_device(struct device *dev,
1188 struct protection_domain *domain, 1466 struct protection_domain *domain)
1189 u16 devid)
1190{ 1467{
1191 unsigned long flags; 1468 unsigned long flags;
1469 int ret;
1192 1470
1193 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 1471 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1194 __attach_device(iommu, domain, devid); 1472 ret = __attach_device(dev, domain);
1195 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1473 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1196 1474
1197 /* 1475 /*
@@ -1199,96 +1477,125 @@ static void attach_device(struct amd_iommu *iommu,
1199 * left the caches in the IOMMU dirty. So we have to flush 1477 * left the caches in the IOMMU dirty. So we have to flush
1200 * here to evict all dirty stuff. 1478 * here to evict all dirty stuff.
1201 */ 1479 */
1202 iommu_queue_inv_dev_entry(iommu, devid); 1480 iommu_flush_tlb_pde(domain);
1203 iommu_flush_tlb_pde(iommu, domain->id); 1481
1482 return ret;
1204} 1483}
1205 1484
1206/* 1485/*
1207 * Removes a device from a protection domain (unlocked) 1486 * Removes a device from a protection domain (unlocked)
1208 */ 1487 */
1209static void __detach_device(struct protection_domain *domain, u16 devid) 1488static void __detach_device(struct device *dev)
1210{ 1489{
1490 struct iommu_dev_data *dev_data = get_dev_data(dev);
1491 struct iommu_dev_data *alias_data;
1492 unsigned long flags;
1211 1493
1212 /* lock domain */ 1494 BUG_ON(!dev_data->domain);
1213 spin_lock(&domain->lock);
1214 1495
1215 /* remove domain from the lookup table */ 1496 spin_lock_irqsave(&dev_data->domain->lock, flags);
1216 amd_iommu_pd_table[devid] = NULL;
1217 1497
1218 /* remove entry from the device table seen by the hardware */ 1498 if (dev_data->alias != dev) {
1219 amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV; 1499 alias_data = get_dev_data(dev_data->alias);
1220 amd_iommu_dev_table[devid].data[1] = 0; 1500 if (atomic_dec_and_test(&alias_data->bind))
1221 amd_iommu_dev_table[devid].data[2] = 0; 1501 do_detach(dev_data->alias);
1502 }
1222 1503
1223 /* decrease reference counter */ 1504 if (atomic_dec_and_test(&dev_data->bind))
1224 domain->dev_cnt -= 1; 1505 do_detach(dev);
1225 1506
1226 /* ready */ 1507 spin_unlock_irqrestore(&dev_data->domain->lock, flags);
1227 spin_unlock(&domain->lock);
1228 1508
1229 /* 1509 /*
1230 * If we run in passthrough mode the device must be assigned to the 1510 * If we run in passthrough mode the device must be assigned to the
1231 * passthrough domain if it is detached from any other domain 1511 * passthrough domain if it is detached from any other domain
1232 */ 1512 */
1233 if (iommu_pass_through) { 1513 if (iommu_pass_through && dev_data->domain == NULL)
1234 struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; 1514 __attach_device(dev, pt_domain);
1235 __attach_device(iommu, pt_domain, devid);
1236 }
1237} 1515}
1238 1516
1239/* 1517/*
1240 * Removes a device from a protection domain (with devtable_lock held) 1518 * Removes a device from a protection domain (with devtable_lock held)
1241 */ 1519 */
1242static void detach_device(struct protection_domain *domain, u16 devid) 1520static void detach_device(struct device *dev)
1243{ 1521{
1244 unsigned long flags; 1522 unsigned long flags;
1245 1523
1246 /* lock device table */ 1524 /* lock device table */
1247 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 1525 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1248 __detach_device(domain, devid); 1526 __detach_device(dev);
1249 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1527 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1250} 1528}
1251 1529
1530/*
1531 * Find out the protection domain structure for a given PCI device. This
1532 * will give us the pointer to the page table root for example.
1533 */
1534static struct protection_domain *domain_for_device(struct device *dev)
1535{
1536 struct protection_domain *dom;
1537 struct iommu_dev_data *dev_data, *alias_data;
1538 unsigned long flags;
1539 u16 devid, alias;
1540
1541 devid = get_device_id(dev);
1542 alias = amd_iommu_alias_table[devid];
1543 dev_data = get_dev_data(dev);
1544 alias_data = get_dev_data(dev_data->alias);
1545 if (!alias_data)
1546 return NULL;
1547
1548 read_lock_irqsave(&amd_iommu_devtable_lock, flags);
1549 dom = dev_data->domain;
1550 if (dom == NULL &&
1551 alias_data->domain != NULL) {
1552 __attach_device(dev, alias_data->domain);
1553 dom = alias_data->domain;
1554 }
1555
1556 read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1557
1558 return dom;
1559}
1560
1252static int device_change_notifier(struct notifier_block *nb, 1561static int device_change_notifier(struct notifier_block *nb,
1253 unsigned long action, void *data) 1562 unsigned long action, void *data)
1254{ 1563{
1255 struct device *dev = data; 1564 struct device *dev = data;
1256 struct pci_dev *pdev = to_pci_dev(dev); 1565 u16 devid;
1257 u16 devid = calc_devid(pdev->bus->number, pdev->devfn);
1258 struct protection_domain *domain; 1566 struct protection_domain *domain;
1259 struct dma_ops_domain *dma_domain; 1567 struct dma_ops_domain *dma_domain;
1260 struct amd_iommu *iommu; 1568 struct amd_iommu *iommu;
1261 unsigned long flags; 1569 unsigned long flags;
1262 1570
1263 if (devid > amd_iommu_last_bdf) 1571 if (!check_device(dev))
1264 goto out; 1572 return 0;
1265
1266 devid = amd_iommu_alias_table[devid];
1267
1268 iommu = amd_iommu_rlookup_table[devid];
1269 if (iommu == NULL)
1270 goto out;
1271
1272 domain = domain_for_device(devid);
1273 1573
1274 if (domain && !dma_ops_domain(domain)) 1574 devid = get_device_id(dev);
1275 WARN_ONCE(1, "AMD IOMMU WARNING: device %s already bound " 1575 iommu = amd_iommu_rlookup_table[devid];
1276 "to a non-dma-ops domain\n", dev_name(dev));
1277 1576
1278 switch (action) { 1577 switch (action) {
1279 case BUS_NOTIFY_UNBOUND_DRIVER: 1578 case BUS_NOTIFY_UNBOUND_DRIVER:
1579
1580 domain = domain_for_device(dev);
1581
1280 if (!domain) 1582 if (!domain)
1281 goto out; 1583 goto out;
1282 if (iommu_pass_through) 1584 if (iommu_pass_through)
1283 break; 1585 break;
1284 detach_device(domain, devid); 1586 detach_device(dev);
1285 break; 1587 break;
1286 case BUS_NOTIFY_ADD_DEVICE: 1588 case BUS_NOTIFY_ADD_DEVICE:
1589
1590 iommu_init_device(dev);
1591
1592 domain = domain_for_device(dev);
1593
1287 /* allocate a protection domain if a device is added */ 1594 /* allocate a protection domain if a device is added */
1288 dma_domain = find_protection_domain(devid); 1595 dma_domain = find_protection_domain(devid);
1289 if (dma_domain) 1596 if (dma_domain)
1290 goto out; 1597 goto out;
1291 dma_domain = dma_ops_domain_alloc(iommu); 1598 dma_domain = dma_ops_domain_alloc();
1292 if (!dma_domain) 1599 if (!dma_domain)
1293 goto out; 1600 goto out;
1294 dma_domain->target_dev = devid; 1601 dma_domain->target_dev = devid;
@@ -1298,11 +1605,15 @@ static int device_change_notifier(struct notifier_block *nb,
1298 spin_unlock_irqrestore(&iommu_pd_list_lock, flags); 1605 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
1299 1606
1300 break; 1607 break;
1608 case BUS_NOTIFY_DEL_DEVICE:
1609
1610 iommu_uninit_device(dev);
1611
1301 default: 1612 default:
1302 goto out; 1613 goto out;
1303 } 1614 }
1304 1615
1305 iommu_queue_inv_dev_entry(iommu, devid); 1616 iommu_flush_device(dev);
1306 iommu_completion_wait(iommu); 1617 iommu_completion_wait(iommu);
1307 1618
1308out: 1619out:
@@ -1313,6 +1624,11 @@ static struct notifier_block device_nb = {
1313 .notifier_call = device_change_notifier, 1624 .notifier_call = device_change_notifier,
1314}; 1625};
1315 1626
1627void amd_iommu_init_notifier(void)
1628{
1629 bus_register_notifier(&pci_bus_type, &device_nb);
1630}
1631
1316/***************************************************************************** 1632/*****************************************************************************
1317 * 1633 *
1318 * The next functions belong to the dma_ops mapping/unmapping code. 1634 * The next functions belong to the dma_ops mapping/unmapping code.
@@ -1320,106 +1636,46 @@ static struct notifier_block device_nb = {
1320 *****************************************************************************/ 1636 *****************************************************************************/
1321 1637
1322/* 1638/*
1323 * This function checks if the driver got a valid device from the caller to
1324 * avoid dereferencing invalid pointers.
1325 */
1326static bool check_device(struct device *dev)
1327{
1328 if (!dev || !dev->dma_mask)
1329 return false;
1330
1331 return true;
1332}
1333
1334/*
1335 * In this function the list of preallocated protection domains is traversed to
1336 * find the domain for a specific device
1337 */
1338static struct dma_ops_domain *find_protection_domain(u16 devid)
1339{
1340 struct dma_ops_domain *entry, *ret = NULL;
1341 unsigned long flags;
1342
1343 if (list_empty(&iommu_pd_list))
1344 return NULL;
1345
1346 spin_lock_irqsave(&iommu_pd_list_lock, flags);
1347
1348 list_for_each_entry(entry, &iommu_pd_list, list) {
1349 if (entry->target_dev == devid) {
1350 ret = entry;
1351 break;
1352 }
1353 }
1354
1355 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
1356
1357 return ret;
1358}
1359
1360/*
1361 * In the dma_ops path we only have the struct device. This function 1639 * In the dma_ops path we only have the struct device. This function
1362 * finds the corresponding IOMMU, the protection domain and the 1640 * finds the corresponding IOMMU, the protection domain and the
1363 * requestor id for a given device. 1641 * requestor id for a given device.
1364 * If the device is not yet associated with a domain this is also done 1642 * If the device is not yet associated with a domain this is also done
1365 * in this function. 1643 * in this function.
1366 */ 1644 */
1367static int get_device_resources(struct device *dev, 1645static struct protection_domain *get_domain(struct device *dev)
1368 struct amd_iommu **iommu,
1369 struct protection_domain **domain,
1370 u16 *bdf)
1371{ 1646{
1647 struct protection_domain *domain;
1372 struct dma_ops_domain *dma_dom; 1648 struct dma_ops_domain *dma_dom;
1373 struct pci_dev *pcidev; 1649 u16 devid = get_device_id(dev);
1374 u16 _bdf;
1375
1376 *iommu = NULL;
1377 *domain = NULL;
1378 *bdf = 0xffff;
1379
1380 if (dev->bus != &pci_bus_type)
1381 return 0;
1382
1383 pcidev = to_pci_dev(dev);
1384 _bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
1385 1650
1386 /* device not translated by any IOMMU in the system? */ 1651 if (!check_device(dev))
1387 if (_bdf > amd_iommu_last_bdf) 1652 return ERR_PTR(-EINVAL);
1388 return 0;
1389 1653
1390 *bdf = amd_iommu_alias_table[_bdf]; 1654 domain = domain_for_device(dev);
1655 if (domain != NULL && !dma_ops_domain(domain))
1656 return ERR_PTR(-EBUSY);
1391 1657
1392 *iommu = amd_iommu_rlookup_table[*bdf]; 1658 if (domain != NULL)
1393 if (*iommu == NULL) 1659 return domain;
1394 return 0;
1395 *domain = domain_for_device(*bdf);
1396 if (*domain == NULL) {
1397 dma_dom = find_protection_domain(*bdf);
1398 if (!dma_dom)
1399 dma_dom = (*iommu)->default_dom;
1400 *domain = &dma_dom->domain;
1401 attach_device(*iommu, *domain, *bdf);
1402 DUMP_printk("Using protection domain %d for device %s\n",
1403 (*domain)->id, dev_name(dev));
1404 }
1405 1660
1406 if (domain_for_device(_bdf) == NULL) 1661 /* Device not bount yet - bind it */
1407 attach_device(*iommu, *domain, _bdf); 1662 dma_dom = find_protection_domain(devid);
1663 if (!dma_dom)
1664 dma_dom = amd_iommu_rlookup_table[devid]->default_dom;
1665 attach_device(dev, &dma_dom->domain);
1666 DUMP_printk("Using protection domain %d for device %s\n",
1667 dma_dom->domain.id, dev_name(dev));
1408 1668
1409 return 1; 1669 return &dma_dom->domain;
1410} 1670}
1411 1671
1412static void update_device_table(struct protection_domain *domain) 1672static void update_device_table(struct protection_domain *domain)
1413{ 1673{
1414 unsigned long flags; 1674 struct iommu_dev_data *dev_data;
1415 int i;
1416 1675
1417 for (i = 0; i <= amd_iommu_last_bdf; ++i) { 1676 list_for_each_entry(dev_data, &domain->dev_list, list) {
1418 if (amd_iommu_pd_table[i] != domain) 1677 u16 devid = get_device_id(dev_data->dev);
1419 continue; 1678 set_dte_entry(devid, domain);
1420 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1421 set_dte_entry(i, domain);
1422 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1423 } 1679 }
1424} 1680}
1425 1681
@@ -1429,76 +1685,13 @@ static void update_domain(struct protection_domain *domain)
1429 return; 1685 return;
1430 1686
1431 update_device_table(domain); 1687 update_device_table(domain);
1432 flush_devices_by_domain(domain); 1688 iommu_flush_domain_devices(domain);
1433 iommu_flush_domain(domain->id); 1689 iommu_flush_tlb_pde(domain);
1434 1690
1435 domain->updated = false; 1691 domain->updated = false;
1436} 1692}
1437 1693
1438/* 1694/*
1439 * This function is used to add another level to an IO page table. Adding
1440 * another level increases the size of the address space by 9 bits to a size up
1441 * to 64 bits.
1442 */
1443static bool increase_address_space(struct protection_domain *domain,
1444 gfp_t gfp)
1445{
1446 u64 *pte;
1447
1448 if (domain->mode == PAGE_MODE_6_LEVEL)
1449 /* address space already 64 bit large */
1450 return false;
1451
1452 pte = (void *)get_zeroed_page(gfp);
1453 if (!pte)
1454 return false;
1455
1456 *pte = PM_LEVEL_PDE(domain->mode,
1457 virt_to_phys(domain->pt_root));
1458 domain->pt_root = pte;
1459 domain->mode += 1;
1460 domain->updated = true;
1461
1462 return true;
1463}
1464
1465static u64 *alloc_pte(struct protection_domain *domain,
1466 unsigned long address,
1467 int end_lvl,
1468 u64 **pte_page,
1469 gfp_t gfp)
1470{
1471 u64 *pte, *page;
1472 int level;
1473
1474 while (address > PM_LEVEL_SIZE(domain->mode))
1475 increase_address_space(domain, gfp);
1476
1477 level = domain->mode - 1;
1478 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
1479
1480 while (level > end_lvl) {
1481 if (!IOMMU_PTE_PRESENT(*pte)) {
1482 page = (u64 *)get_zeroed_page(gfp);
1483 if (!page)
1484 return NULL;
1485 *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
1486 }
1487
1488 level -= 1;
1489
1490 pte = IOMMU_PTE_PAGE(*pte);
1491
1492 if (pte_page && level == end_lvl)
1493 *pte_page = pte;
1494
1495 pte = &pte[PM_LEVEL_INDEX(level, address)];
1496 }
1497
1498 return pte;
1499}
1500
1501/*
1502 * This function fetches the PTE for a given address in the aperture 1695 * This function fetches the PTE for a given address in the aperture
1503 */ 1696 */
1504static u64* dma_ops_get_pte(struct dma_ops_domain *dom, 1697static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
@@ -1528,8 +1721,7 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
1528 * This is the generic map function. It maps one 4kb page at paddr to 1721 * This is the generic map function. It maps one 4kb page at paddr to
1529 * the given address in the DMA address space for the domain. 1722 * the given address in the DMA address space for the domain.
1530 */ 1723 */
1531static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, 1724static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom,
1532 struct dma_ops_domain *dom,
1533 unsigned long address, 1725 unsigned long address,
1534 phys_addr_t paddr, 1726 phys_addr_t paddr,
1535 int direction) 1727 int direction)
@@ -1542,7 +1734,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
1542 1734
1543 pte = dma_ops_get_pte(dom, address); 1735 pte = dma_ops_get_pte(dom, address);
1544 if (!pte) 1736 if (!pte)
1545 return bad_dma_address; 1737 return DMA_ERROR_CODE;
1546 1738
1547 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; 1739 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
1548 1740
@@ -1563,8 +1755,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
1563/* 1755/*
1564 * The generic unmapping function for on page in the DMA address space. 1756 * The generic unmapping function for on page in the DMA address space.
1565 */ 1757 */
1566static void dma_ops_domain_unmap(struct amd_iommu *iommu, 1758static void dma_ops_domain_unmap(struct dma_ops_domain *dom,
1567 struct dma_ops_domain *dom,
1568 unsigned long address) 1759 unsigned long address)
1569{ 1760{
1570 struct aperture_range *aperture; 1761 struct aperture_range *aperture;
@@ -1595,7 +1786,6 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
1595 * Must be called with the domain lock held. 1786 * Must be called with the domain lock held.
1596 */ 1787 */
1597static dma_addr_t __map_single(struct device *dev, 1788static dma_addr_t __map_single(struct device *dev,
1598 struct amd_iommu *iommu,
1599 struct dma_ops_domain *dma_dom, 1789 struct dma_ops_domain *dma_dom,
1600 phys_addr_t paddr, 1790 phys_addr_t paddr,
1601 size_t size, 1791 size_t size,
@@ -1623,7 +1813,7 @@ static dma_addr_t __map_single(struct device *dev,
1623retry: 1813retry:
1624 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, 1814 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
1625 dma_mask); 1815 dma_mask);
1626 if (unlikely(address == bad_dma_address)) { 1816 if (unlikely(address == DMA_ERROR_CODE)) {
1627 /* 1817 /*
1628 * setting next_address here will let the address 1818 * setting next_address here will let the address
1629 * allocator only scan the new allocated range in the 1819 * allocator only scan the new allocated range in the
@@ -1631,11 +1821,11 @@ retry:
1631 */ 1821 */
1632 dma_dom->next_address = dma_dom->aperture_size; 1822 dma_dom->next_address = dma_dom->aperture_size;
1633 1823
1634 if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC)) 1824 if (alloc_new_range(dma_dom, false, GFP_ATOMIC))
1635 goto out; 1825 goto out;
1636 1826
1637 /* 1827 /*
1638 * aperture was sucessfully enlarged by 128 MB, try 1828 * aperture was successfully enlarged by 128 MB, try
1639 * allocation again 1829 * allocation again
1640 */ 1830 */
1641 goto retry; 1831 goto retry;
@@ -1643,8 +1833,8 @@ retry:
1643 1833
1644 start = address; 1834 start = address;
1645 for (i = 0; i < pages; ++i) { 1835 for (i = 0; i < pages; ++i) {
1646 ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); 1836 ret = dma_ops_domain_map(dma_dom, start, paddr, dir);
1647 if (ret == bad_dma_address) 1837 if (ret == DMA_ERROR_CODE)
1648 goto out_unmap; 1838 goto out_unmap;
1649 1839
1650 paddr += PAGE_SIZE; 1840 paddr += PAGE_SIZE;
@@ -1655,10 +1845,10 @@ retry:
1655 ADD_STATS_COUNTER(alloced_io_mem, size); 1845 ADD_STATS_COUNTER(alloced_io_mem, size);
1656 1846
1657 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { 1847 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
1658 iommu_flush_tlb(iommu, dma_dom->domain.id); 1848 iommu_flush_tlb(&dma_dom->domain);
1659 dma_dom->need_flush = false; 1849 dma_dom->need_flush = false;
1660 } else if (unlikely(iommu_has_npcache(iommu))) 1850 } else if (unlikely(amd_iommu_np_cache))
1661 iommu_flush_pages(iommu, dma_dom->domain.id, address, size); 1851 iommu_flush_pages(&dma_dom->domain, address, size);
1662 1852
1663out: 1853out:
1664 return address; 1854 return address;
@@ -1667,20 +1857,19 @@ out_unmap:
1667 1857
1668 for (--i; i >= 0; --i) { 1858 for (--i; i >= 0; --i) {
1669 start -= PAGE_SIZE; 1859 start -= PAGE_SIZE;
1670 dma_ops_domain_unmap(iommu, dma_dom, start); 1860 dma_ops_domain_unmap(dma_dom, start);
1671 } 1861 }
1672 1862
1673 dma_ops_free_addresses(dma_dom, address, pages); 1863 dma_ops_free_addresses(dma_dom, address, pages);
1674 1864
1675 return bad_dma_address; 1865 return DMA_ERROR_CODE;
1676} 1866}
1677 1867
1678/* 1868/*
1679 * Does the reverse of the __map_single function. Must be called with 1869 * Does the reverse of the __map_single function. Must be called with
1680 * the domain lock held too 1870 * the domain lock held too
1681 */ 1871 */
1682static void __unmap_single(struct amd_iommu *iommu, 1872static void __unmap_single(struct dma_ops_domain *dma_dom,
1683 struct dma_ops_domain *dma_dom,
1684 dma_addr_t dma_addr, 1873 dma_addr_t dma_addr,
1685 size_t size, 1874 size_t size,
1686 int dir) 1875 int dir)
@@ -1688,7 +1877,7 @@ static void __unmap_single(struct amd_iommu *iommu,
1688 dma_addr_t i, start; 1877 dma_addr_t i, start;
1689 unsigned int pages; 1878 unsigned int pages;
1690 1879
1691 if ((dma_addr == bad_dma_address) || 1880 if ((dma_addr == DMA_ERROR_CODE) ||
1692 (dma_addr + size > dma_dom->aperture_size)) 1881 (dma_addr + size > dma_dom->aperture_size))
1693 return; 1882 return;
1694 1883
@@ -1697,7 +1886,7 @@ static void __unmap_single(struct amd_iommu *iommu,
1697 start = dma_addr; 1886 start = dma_addr;
1698 1887
1699 for (i = 0; i < pages; ++i) { 1888 for (i = 0; i < pages; ++i) {
1700 dma_ops_domain_unmap(iommu, dma_dom, start); 1889 dma_ops_domain_unmap(dma_dom, start);
1701 start += PAGE_SIZE; 1890 start += PAGE_SIZE;
1702 } 1891 }
1703 1892
@@ -1706,7 +1895,7 @@ static void __unmap_single(struct amd_iommu *iommu,
1706 dma_ops_free_addresses(dma_dom, dma_addr, pages); 1895 dma_ops_free_addresses(dma_dom, dma_addr, pages);
1707 1896
1708 if (amd_iommu_unmap_flush || dma_dom->need_flush) { 1897 if (amd_iommu_unmap_flush || dma_dom->need_flush) {
1709 iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size); 1898 iommu_flush_pages(&dma_dom->domain, dma_addr, size);
1710 dma_dom->need_flush = false; 1899 dma_dom->need_flush = false;
1711 } 1900 }
1712} 1901}
@@ -1720,36 +1909,29 @@ static dma_addr_t map_page(struct device *dev, struct page *page,
1720 struct dma_attrs *attrs) 1909 struct dma_attrs *attrs)
1721{ 1910{
1722 unsigned long flags; 1911 unsigned long flags;
1723 struct amd_iommu *iommu;
1724 struct protection_domain *domain; 1912 struct protection_domain *domain;
1725 u16 devid;
1726 dma_addr_t addr; 1913 dma_addr_t addr;
1727 u64 dma_mask; 1914 u64 dma_mask;
1728 phys_addr_t paddr = page_to_phys(page) + offset; 1915 phys_addr_t paddr = page_to_phys(page) + offset;
1729 1916
1730 INC_STATS_COUNTER(cnt_map_single); 1917 INC_STATS_COUNTER(cnt_map_single);
1731 1918
1732 if (!check_device(dev)) 1919 domain = get_domain(dev);
1733 return bad_dma_address; 1920 if (PTR_ERR(domain) == -EINVAL)
1734
1735 dma_mask = *dev->dma_mask;
1736
1737 get_device_resources(dev, &iommu, &domain, &devid);
1738
1739 if (iommu == NULL || domain == NULL)
1740 /* device not handled by any AMD IOMMU */
1741 return (dma_addr_t)paddr; 1921 return (dma_addr_t)paddr;
1922 else if (IS_ERR(domain))
1923 return DMA_ERROR_CODE;
1742 1924
1743 if (!dma_ops_domain(domain)) 1925 dma_mask = *dev->dma_mask;
1744 return bad_dma_address;
1745 1926
1746 spin_lock_irqsave(&domain->lock, flags); 1927 spin_lock_irqsave(&domain->lock, flags);
1747 addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false, 1928
1929 addr = __map_single(dev, domain->priv, paddr, size, dir, false,
1748 dma_mask); 1930 dma_mask);
1749 if (addr == bad_dma_address) 1931 if (addr == DMA_ERROR_CODE)
1750 goto out; 1932 goto out;
1751 1933
1752 iommu_completion_wait(iommu); 1934 iommu_flush_complete(domain);
1753 1935
1754out: 1936out:
1755 spin_unlock_irqrestore(&domain->lock, flags); 1937 spin_unlock_irqrestore(&domain->lock, flags);
@@ -1764,25 +1946,19 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
1764 enum dma_data_direction dir, struct dma_attrs *attrs) 1946 enum dma_data_direction dir, struct dma_attrs *attrs)
1765{ 1947{
1766 unsigned long flags; 1948 unsigned long flags;
1767 struct amd_iommu *iommu;
1768 struct protection_domain *domain; 1949 struct protection_domain *domain;
1769 u16 devid;
1770 1950
1771 INC_STATS_COUNTER(cnt_unmap_single); 1951 INC_STATS_COUNTER(cnt_unmap_single);
1772 1952
1773 if (!check_device(dev) || 1953 domain = get_domain(dev);
1774 !get_device_resources(dev, &iommu, &domain, &devid)) 1954 if (IS_ERR(domain))
1775 /* device not handled by any AMD IOMMU */
1776 return;
1777
1778 if (!dma_ops_domain(domain))
1779 return; 1955 return;
1780 1956
1781 spin_lock_irqsave(&domain->lock, flags); 1957 spin_lock_irqsave(&domain->lock, flags);
1782 1958
1783 __unmap_single(iommu, domain->priv, dma_addr, size, dir); 1959 __unmap_single(domain->priv, dma_addr, size, dir);
1784 1960
1785 iommu_completion_wait(iommu); 1961 iommu_flush_complete(domain);
1786 1962
1787 spin_unlock_irqrestore(&domain->lock, flags); 1963 spin_unlock_irqrestore(&domain->lock, flags);
1788} 1964}
@@ -1814,9 +1990,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
1814 struct dma_attrs *attrs) 1990 struct dma_attrs *attrs)
1815{ 1991{
1816 unsigned long flags; 1992 unsigned long flags;
1817 struct amd_iommu *iommu;
1818 struct protection_domain *domain; 1993 struct protection_domain *domain;
1819 u16 devid;
1820 int i; 1994 int i;
1821 struct scatterlist *s; 1995 struct scatterlist *s;
1822 phys_addr_t paddr; 1996 phys_addr_t paddr;
@@ -1825,25 +1999,20 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
1825 1999
1826 INC_STATS_COUNTER(cnt_map_sg); 2000 INC_STATS_COUNTER(cnt_map_sg);
1827 2001
1828 if (!check_device(dev)) 2002 domain = get_domain(dev);
2003 if (PTR_ERR(domain) == -EINVAL)
2004 return map_sg_no_iommu(dev, sglist, nelems, dir);
2005 else if (IS_ERR(domain))
1829 return 0; 2006 return 0;
1830 2007
1831 dma_mask = *dev->dma_mask; 2008 dma_mask = *dev->dma_mask;
1832 2009
1833 get_device_resources(dev, &iommu, &domain, &devid);
1834
1835 if (!iommu || !domain)
1836 return map_sg_no_iommu(dev, sglist, nelems, dir);
1837
1838 if (!dma_ops_domain(domain))
1839 return 0;
1840
1841 spin_lock_irqsave(&domain->lock, flags); 2010 spin_lock_irqsave(&domain->lock, flags);
1842 2011
1843 for_each_sg(sglist, s, nelems, i) { 2012 for_each_sg(sglist, s, nelems, i) {
1844 paddr = sg_phys(s); 2013 paddr = sg_phys(s);
1845 2014
1846 s->dma_address = __map_single(dev, iommu, domain->priv, 2015 s->dma_address = __map_single(dev, domain->priv,
1847 paddr, s->length, dir, false, 2016 paddr, s->length, dir, false,
1848 dma_mask); 2017 dma_mask);
1849 2018
@@ -1854,7 +2023,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
1854 goto unmap; 2023 goto unmap;
1855 } 2024 }
1856 2025
1857 iommu_completion_wait(iommu); 2026 iommu_flush_complete(domain);
1858 2027
1859out: 2028out:
1860 spin_unlock_irqrestore(&domain->lock, flags); 2029 spin_unlock_irqrestore(&domain->lock, flags);
@@ -1863,7 +2032,7 @@ out:
1863unmap: 2032unmap:
1864 for_each_sg(sglist, s, mapped_elems, i) { 2033 for_each_sg(sglist, s, mapped_elems, i) {
1865 if (s->dma_address) 2034 if (s->dma_address)
1866 __unmap_single(iommu, domain->priv, s->dma_address, 2035 __unmap_single(domain->priv, s->dma_address,
1867 s->dma_length, dir); 2036 s->dma_length, dir);
1868 s->dma_address = s->dma_length = 0; 2037 s->dma_address = s->dma_length = 0;
1869 } 2038 }
@@ -1882,30 +2051,25 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
1882 struct dma_attrs *attrs) 2051 struct dma_attrs *attrs)
1883{ 2052{
1884 unsigned long flags; 2053 unsigned long flags;
1885 struct amd_iommu *iommu;
1886 struct protection_domain *domain; 2054 struct protection_domain *domain;
1887 struct scatterlist *s; 2055 struct scatterlist *s;
1888 u16 devid;
1889 int i; 2056 int i;
1890 2057
1891 INC_STATS_COUNTER(cnt_unmap_sg); 2058 INC_STATS_COUNTER(cnt_unmap_sg);
1892 2059
1893 if (!check_device(dev) || 2060 domain = get_domain(dev);
1894 !get_device_resources(dev, &iommu, &domain, &devid)) 2061 if (IS_ERR(domain))
1895 return;
1896
1897 if (!dma_ops_domain(domain))
1898 return; 2062 return;
1899 2063
1900 spin_lock_irqsave(&domain->lock, flags); 2064 spin_lock_irqsave(&domain->lock, flags);
1901 2065
1902 for_each_sg(sglist, s, nelems, i) { 2066 for_each_sg(sglist, s, nelems, i) {
1903 __unmap_single(iommu, domain->priv, s->dma_address, 2067 __unmap_single(domain->priv, s->dma_address,
1904 s->dma_length, dir); 2068 s->dma_length, dir);
1905 s->dma_address = s->dma_length = 0; 2069 s->dma_address = s->dma_length = 0;
1906 } 2070 }
1907 2071
1908 iommu_completion_wait(iommu); 2072 iommu_flush_complete(domain);
1909 2073
1910 spin_unlock_irqrestore(&domain->lock, flags); 2074 spin_unlock_irqrestore(&domain->lock, flags);
1911} 2075}
@@ -1918,49 +2082,44 @@ static void *alloc_coherent(struct device *dev, size_t size,
1918{ 2082{
1919 unsigned long flags; 2083 unsigned long flags;
1920 void *virt_addr; 2084 void *virt_addr;
1921 struct amd_iommu *iommu;
1922 struct protection_domain *domain; 2085 struct protection_domain *domain;
1923 u16 devid;
1924 phys_addr_t paddr; 2086 phys_addr_t paddr;
1925 u64 dma_mask = dev->coherent_dma_mask; 2087 u64 dma_mask = dev->coherent_dma_mask;
1926 2088
1927 INC_STATS_COUNTER(cnt_alloc_coherent); 2089 INC_STATS_COUNTER(cnt_alloc_coherent);
1928 2090
1929 if (!check_device(dev)) 2091 domain = get_domain(dev);
2092 if (PTR_ERR(domain) == -EINVAL) {
2093 virt_addr = (void *)__get_free_pages(flag, get_order(size));
2094 *dma_addr = __pa(virt_addr);
2095 return virt_addr;
2096 } else if (IS_ERR(domain))
1930 return NULL; 2097 return NULL;
1931 2098
1932 if (!get_device_resources(dev, &iommu, &domain, &devid)) 2099 dma_mask = dev->coherent_dma_mask;
1933 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); 2100 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
2101 flag |= __GFP_ZERO;
1934 2102
1935 flag |= __GFP_ZERO;
1936 virt_addr = (void *)__get_free_pages(flag, get_order(size)); 2103 virt_addr = (void *)__get_free_pages(flag, get_order(size));
1937 if (!virt_addr) 2104 if (!virt_addr)
1938 return NULL; 2105 return NULL;
1939 2106
1940 paddr = virt_to_phys(virt_addr); 2107 paddr = virt_to_phys(virt_addr);
1941 2108
1942 if (!iommu || !domain) {
1943 *dma_addr = (dma_addr_t)paddr;
1944 return virt_addr;
1945 }
1946
1947 if (!dma_ops_domain(domain))
1948 goto out_free;
1949
1950 if (!dma_mask) 2109 if (!dma_mask)
1951 dma_mask = *dev->dma_mask; 2110 dma_mask = *dev->dma_mask;
1952 2111
1953 spin_lock_irqsave(&domain->lock, flags); 2112 spin_lock_irqsave(&domain->lock, flags);
1954 2113
1955 *dma_addr = __map_single(dev, iommu, domain->priv, paddr, 2114 *dma_addr = __map_single(dev, domain->priv, paddr,
1956 size, DMA_BIDIRECTIONAL, true, dma_mask); 2115 size, DMA_BIDIRECTIONAL, true, dma_mask);
1957 2116
1958 if (*dma_addr == bad_dma_address) { 2117 if (*dma_addr == DMA_ERROR_CODE) {
1959 spin_unlock_irqrestore(&domain->lock, flags); 2118 spin_unlock_irqrestore(&domain->lock, flags);
1960 goto out_free; 2119 goto out_free;
1961 } 2120 }
1962 2121
1963 iommu_completion_wait(iommu); 2122 iommu_flush_complete(domain);
1964 2123
1965 spin_unlock_irqrestore(&domain->lock, flags); 2124 spin_unlock_irqrestore(&domain->lock, flags);
1966 2125
@@ -1980,28 +2139,19 @@ static void free_coherent(struct device *dev, size_t size,
1980 void *virt_addr, dma_addr_t dma_addr) 2139 void *virt_addr, dma_addr_t dma_addr)
1981{ 2140{
1982 unsigned long flags; 2141 unsigned long flags;
1983 struct amd_iommu *iommu;
1984 struct protection_domain *domain; 2142 struct protection_domain *domain;
1985 u16 devid;
1986 2143
1987 INC_STATS_COUNTER(cnt_free_coherent); 2144 INC_STATS_COUNTER(cnt_free_coherent);
1988 2145
1989 if (!check_device(dev)) 2146 domain = get_domain(dev);
1990 return; 2147 if (IS_ERR(domain))
1991
1992 get_device_resources(dev, &iommu, &domain, &devid);
1993
1994 if (!iommu || !domain)
1995 goto free_mem;
1996
1997 if (!dma_ops_domain(domain))
1998 goto free_mem; 2148 goto free_mem;
1999 2149
2000 spin_lock_irqsave(&domain->lock, flags); 2150 spin_lock_irqsave(&domain->lock, flags);
2001 2151
2002 __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); 2152 __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
2003 2153
2004 iommu_completion_wait(iommu); 2154 iommu_flush_complete(domain);
2005 2155
2006 spin_unlock_irqrestore(&domain->lock, flags); 2156 spin_unlock_irqrestore(&domain->lock, flags);
2007 2157
@@ -2015,22 +2165,7 @@ free_mem:
2015 */ 2165 */
2016static int amd_iommu_dma_supported(struct device *dev, u64 mask) 2166static int amd_iommu_dma_supported(struct device *dev, u64 mask)
2017{ 2167{
2018 u16 bdf; 2168 return check_device(dev);
2019 struct pci_dev *pcidev;
2020
2021 /* No device or no PCI device */
2022 if (!dev || dev->bus != &pci_bus_type)
2023 return 0;
2024
2025 pcidev = to_pci_dev(dev);
2026
2027 bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
2028
2029 /* Out of our scope? */
2030 if (bdf > amd_iommu_last_bdf)
2031 return 0;
2032
2033 return 1;
2034} 2169}
2035 2170
2036/* 2171/*
@@ -2044,25 +2179,28 @@ static void prealloc_protection_domains(void)
2044{ 2179{
2045 struct pci_dev *dev = NULL; 2180 struct pci_dev *dev = NULL;
2046 struct dma_ops_domain *dma_dom; 2181 struct dma_ops_domain *dma_dom;
2047 struct amd_iommu *iommu;
2048 u16 devid; 2182 u16 devid;
2049 2183
2050 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 2184 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
2051 devid = calc_devid(dev->bus->number, dev->devfn); 2185
2052 if (devid > amd_iommu_last_bdf) 2186 /* Do we handle this device? */
2053 continue; 2187 if (!check_device(&dev->dev))
2054 devid = amd_iommu_alias_table[devid];
2055 if (domain_for_device(devid))
2056 continue; 2188 continue;
2057 iommu = amd_iommu_rlookup_table[devid]; 2189
2058 if (!iommu) 2190 /* Is there already any domain for it? */
2191 if (domain_for_device(&dev->dev))
2059 continue; 2192 continue;
2060 dma_dom = dma_ops_domain_alloc(iommu); 2193
2194 devid = get_device_id(&dev->dev);
2195
2196 dma_dom = dma_ops_domain_alloc();
2061 if (!dma_dom) 2197 if (!dma_dom)
2062 continue; 2198 continue;
2063 init_unity_mappings_for_device(dma_dom, devid); 2199 init_unity_mappings_for_device(dma_dom, devid);
2064 dma_dom->target_dev = devid; 2200 dma_dom->target_dev = devid;
2065 2201
2202 attach_device(&dev->dev, &dma_dom->domain);
2203
2066 list_add_tail(&dma_dom->list, &iommu_pd_list); 2204 list_add_tail(&dma_dom->list, &iommu_pd_list);
2067 } 2205 }
2068} 2206}
@@ -2091,7 +2229,7 @@ int __init amd_iommu_init_dma_ops(void)
2091 * protection domain will be assigned to the default one. 2229 * protection domain will be assigned to the default one.
2092 */ 2230 */
2093 for_each_iommu(iommu) { 2231 for_each_iommu(iommu) {
2094 iommu->default_dom = dma_ops_domain_alloc(iommu); 2232 iommu->default_dom = dma_ops_domain_alloc();
2095 if (iommu->default_dom == NULL) 2233 if (iommu->default_dom == NULL)
2096 return -ENOMEM; 2234 return -ENOMEM;
2097 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; 2235 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
@@ -2101,15 +2239,12 @@ int __init amd_iommu_init_dma_ops(void)
2101 } 2239 }
2102 2240
2103 /* 2241 /*
2104 * If device isolation is enabled, pre-allocate the protection 2242 * Pre-allocate the protection domains for each device.
2105 * domains for each device.
2106 */ 2243 */
2107 if (amd_iommu_isolate) 2244 prealloc_protection_domains();
2108 prealloc_protection_domains();
2109 2245
2110 iommu_detected = 1; 2246 iommu_detected = 1;
2111 force_iommu = 1; 2247 swiotlb = 0;
2112 bad_dma_address = 0;
2113#ifdef CONFIG_GART_IOMMU 2248#ifdef CONFIG_GART_IOMMU
2114 gart_iommu_aperture_disabled = 1; 2249 gart_iommu_aperture_disabled = 1;
2115 gart_iommu_aperture = 0; 2250 gart_iommu_aperture = 0;
@@ -2120,8 +2255,6 @@ int __init amd_iommu_init_dma_ops(void)
2120 2255
2121 register_iommu(&amd_iommu_ops); 2256 register_iommu(&amd_iommu_ops);
2122 2257
2123 bus_register_notifier(&pci_bus_type, &device_nb);
2124
2125 amd_iommu_stats_init(); 2258 amd_iommu_stats_init();
2126 2259
2127 return 0; 2260 return 0;
@@ -2148,14 +2281,17 @@ free_domains:
2148 2281
2149static void cleanup_domain(struct protection_domain *domain) 2282static void cleanup_domain(struct protection_domain *domain)
2150{ 2283{
2284 struct iommu_dev_data *dev_data, *next;
2151 unsigned long flags; 2285 unsigned long flags;
2152 u16 devid;
2153 2286
2154 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 2287 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
2155 2288
2156 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) 2289 list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) {
2157 if (amd_iommu_pd_table[devid] == domain) 2290 struct device *dev = dev_data->dev;
2158 __detach_device(domain, devid); 2291
2292 do_detach(dev);
2293 atomic_set(&dev_data->bind, 0);
2294 }
2159 2295
2160 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 2296 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
2161} 2297}
@@ -2165,6 +2301,8 @@ static void protection_domain_free(struct protection_domain *domain)
2165 if (!domain) 2301 if (!domain)
2166 return; 2302 return;
2167 2303
2304 del_domain_from_list(domain);
2305
2168 if (domain->id) 2306 if (domain->id)
2169 domain_id_free(domain->id); 2307 domain_id_free(domain->id);
2170 2308
@@ -2183,6 +2321,9 @@ static struct protection_domain *protection_domain_alloc(void)
2183 domain->id = domain_id_alloc(); 2321 domain->id = domain_id_alloc();
2184 if (!domain->id) 2322 if (!domain->id)
2185 goto out_err; 2323 goto out_err;
2324 INIT_LIST_HEAD(&domain->dev_list);
2325
2326 add_domain_to_list(domain);
2186 2327
2187 return domain; 2328 return domain;
2188 2329
@@ -2239,26 +2380,23 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom)
2239static void amd_iommu_detach_device(struct iommu_domain *dom, 2380static void amd_iommu_detach_device(struct iommu_domain *dom,
2240 struct device *dev) 2381 struct device *dev)
2241{ 2382{
2242 struct protection_domain *domain = dom->priv; 2383 struct iommu_dev_data *dev_data = dev->archdata.iommu;
2243 struct amd_iommu *iommu; 2384 struct amd_iommu *iommu;
2244 struct pci_dev *pdev;
2245 u16 devid; 2385 u16 devid;
2246 2386
2247 if (dev->bus != &pci_bus_type) 2387 if (!check_device(dev))
2248 return; 2388 return;
2249 2389
2250 pdev = to_pci_dev(dev); 2390 devid = get_device_id(dev);
2251 2391
2252 devid = calc_devid(pdev->bus->number, pdev->devfn); 2392 if (dev_data->domain != NULL)
2253 2393 detach_device(dev);
2254 if (devid > 0)
2255 detach_device(domain, devid);
2256 2394
2257 iommu = amd_iommu_rlookup_table[devid]; 2395 iommu = amd_iommu_rlookup_table[devid];
2258 if (!iommu) 2396 if (!iommu)
2259 return; 2397 return;
2260 2398
2261 iommu_queue_inv_dev_entry(iommu, devid); 2399 iommu_flush_device(dev);
2262 iommu_completion_wait(iommu); 2400 iommu_completion_wait(iommu);
2263} 2401}
2264 2402
@@ -2266,35 +2404,30 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
2266 struct device *dev) 2404 struct device *dev)
2267{ 2405{
2268 struct protection_domain *domain = dom->priv; 2406 struct protection_domain *domain = dom->priv;
2269 struct protection_domain *old_domain; 2407 struct iommu_dev_data *dev_data;
2270 struct amd_iommu *iommu; 2408 struct amd_iommu *iommu;
2271 struct pci_dev *pdev; 2409 int ret;
2272 u16 devid; 2410 u16 devid;
2273 2411
2274 if (dev->bus != &pci_bus_type) 2412 if (!check_device(dev))
2275 return -EINVAL; 2413 return -EINVAL;
2276 2414
2277 pdev = to_pci_dev(dev); 2415 dev_data = dev->archdata.iommu;
2278
2279 devid = calc_devid(pdev->bus->number, pdev->devfn);
2280 2416
2281 if (devid >= amd_iommu_last_bdf || 2417 devid = get_device_id(dev);
2282 devid != amd_iommu_alias_table[devid])
2283 return -EINVAL;
2284 2418
2285 iommu = amd_iommu_rlookup_table[devid]; 2419 iommu = amd_iommu_rlookup_table[devid];
2286 if (!iommu) 2420 if (!iommu)
2287 return -EINVAL; 2421 return -EINVAL;
2288 2422
2289 old_domain = domain_for_device(devid); 2423 if (dev_data->domain)
2290 if (old_domain) 2424 detach_device(dev);
2291 detach_device(old_domain, devid);
2292 2425
2293 attach_device(iommu, domain, devid); 2426 ret = attach_device(dev, domain);
2294 2427
2295 iommu_completion_wait(iommu); 2428 iommu_completion_wait(iommu);
2296 2429
2297 return 0; 2430 return ret;
2298} 2431}
2299 2432
2300static int amd_iommu_map_range(struct iommu_domain *dom, 2433static int amd_iommu_map_range(struct iommu_domain *dom,
@@ -2340,7 +2473,7 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom,
2340 iova += PAGE_SIZE; 2473 iova += PAGE_SIZE;
2341 } 2474 }
2342 2475
2343 iommu_flush_domain(domain->id); 2476 iommu_flush_tlb_pde(domain);
2344} 2477}
2345 2478
2346static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, 2479static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
@@ -2391,10 +2524,11 @@ static struct iommu_ops amd_iommu_ops = {
2391 2524
2392int __init amd_iommu_init_passthrough(void) 2525int __init amd_iommu_init_passthrough(void)
2393{ 2526{
2527 struct amd_iommu *iommu;
2394 struct pci_dev *dev = NULL; 2528 struct pci_dev *dev = NULL;
2395 u16 devid, devid2; 2529 u16 devid;
2396 2530
2397 /* allocate passthroug domain */ 2531 /* allocate passthrough domain */
2398 pt_domain = protection_domain_alloc(); 2532 pt_domain = protection_domain_alloc();
2399 if (!pt_domain) 2533 if (!pt_domain)
2400 return -ENOMEM; 2534 return -ENOMEM;
@@ -2402,20 +2536,17 @@ int __init amd_iommu_init_passthrough(void)
2402 pt_domain->mode |= PAGE_MODE_NONE; 2536 pt_domain->mode |= PAGE_MODE_NONE;
2403 2537
2404 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 2538 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
2405 struct amd_iommu *iommu;
2406 2539
2407 devid = calc_devid(dev->bus->number, dev->devfn); 2540 if (!check_device(&dev->dev))
2408 if (devid > amd_iommu_last_bdf)
2409 continue; 2541 continue;
2410 2542
2411 devid2 = amd_iommu_alias_table[devid]; 2543 devid = get_device_id(&dev->dev);
2412 2544
2413 iommu = amd_iommu_rlookup_table[devid2]; 2545 iommu = amd_iommu_rlookup_table[devid];
2414 if (!iommu) 2546 if (!iommu)
2415 continue; 2547 continue;
2416 2548
2417 __attach_device(iommu, pt_domain, devid); 2549 attach_device(&dev->dev, pt_domain);
2418 __attach_device(iommu, pt_domain, devid2);
2419 } 2550 }
2420 2551
2421 pr_info("AMD-Vi: Initialized for Passthrough Mode\n"); 2552 pr_info("AMD-Vi: Initialized for Passthrough Mode\n");
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index b4b61d462dcc..fb490ce7dd55 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -25,10 +25,12 @@
25#include <linux/interrupt.h> 25#include <linux/interrupt.h>
26#include <linux/msi.h> 26#include <linux/msi.h>
27#include <asm/pci-direct.h> 27#include <asm/pci-direct.h>
28#include <asm/amd_iommu_proto.h>
28#include <asm/amd_iommu_types.h> 29#include <asm/amd_iommu_types.h>
29#include <asm/amd_iommu.h> 30#include <asm/amd_iommu.h>
30#include <asm/iommu.h> 31#include <asm/iommu.h>
31#include <asm/gart.h> 32#include <asm/gart.h>
33#include <asm/x86_init.h>
32 34
33/* 35/*
34 * definitions for the ACPI scanning code 36 * definitions for the ACPI scanning code
@@ -123,18 +125,29 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have
123 to handle */ 125 to handle */
124LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings 126LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
125 we find in ACPI */ 127 we find in ACPI */
126#ifdef CONFIG_IOMMU_STRESS
127bool amd_iommu_isolate = false;
128#else
129bool amd_iommu_isolate = true; /* if true, device isolation is
130 enabled */
131#endif
132
133bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ 128bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
134 129
135LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the 130LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
136 system */ 131 system */
137 132
133/* Array to assign indices to IOMMUs*/
134struct amd_iommu *amd_iommus[MAX_IOMMUS];
135int amd_iommus_present;
136
137/* IOMMUs have a non-present cache? */
138bool amd_iommu_np_cache __read_mostly;
139
140/*
141 * Set to true if ACPI table parsing and hardware intialization went properly
142 */
143static bool amd_iommu_initialized;
144
145/*
146 * List of protection domains - used during resume
147 */
148LIST_HEAD(amd_iommu_pd_list);
149spinlock_t amd_iommu_pd_lock;
150
138/* 151/*
139 * Pointer to the device table which is shared by all AMD IOMMUs 152 * Pointer to the device table which is shared by all AMD IOMMUs
140 * it is indexed by the PCI device id or the HT unit id and contains 153 * it is indexed by the PCI device id or the HT unit id and contains
@@ -157,12 +170,6 @@ u16 *amd_iommu_alias_table;
157struct amd_iommu **amd_iommu_rlookup_table; 170struct amd_iommu **amd_iommu_rlookup_table;
158 171
159/* 172/*
160 * The pd table (protection domain table) is used to find the protection domain
161 * data structure a device belongs to. Indexed with the PCI device id too.
162 */
163struct protection_domain **amd_iommu_pd_table;
164
165/*
166 * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap 173 * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap
167 * to know which ones are already in use. 174 * to know which ones are already in use.
168 */ 175 */
@@ -240,7 +247,7 @@ static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
240 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); 247 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
241} 248}
242 249
243static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) 250static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
244{ 251{
245 u32 ctrl; 252 u32 ctrl;
246 253
@@ -519,6 +526,26 @@ static void set_dev_entry_bit(u16 devid, u8 bit)
519 amd_iommu_dev_table[devid].data[i] |= (1 << _bit); 526 amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
520} 527}
521 528
529static int get_dev_entry_bit(u16 devid, u8 bit)
530{
531 int i = (bit >> 5) & 0x07;
532 int _bit = bit & 0x1f;
533
534 return (amd_iommu_dev_table[devid].data[i] & (1 << _bit)) >> _bit;
535}
536
537
538void amd_iommu_apply_erratum_63(u16 devid)
539{
540 int sysmgt;
541
542 sysmgt = get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1) |
543 (get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2) << 1);
544
545 if (sysmgt == 0x01)
546 set_dev_entry_bit(devid, DEV_ENTRY_IW);
547}
548
522/* Writes the specific IOMMU for a device into the rlookup table */ 549/* Writes the specific IOMMU for a device into the rlookup table */
523static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) 550static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
524{ 551{
@@ -547,6 +574,8 @@ static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
547 if (flags & ACPI_DEVFLAG_LINT1) 574 if (flags & ACPI_DEVFLAG_LINT1)
548 set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS); 575 set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
549 576
577 amd_iommu_apply_erratum_63(devid);
578
550 set_iommu_for_device(iommu, devid); 579 set_iommu_for_device(iommu, devid);
551} 580}
552 581
@@ -816,7 +845,18 @@ static void __init free_iommu_all(void)
816static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) 845static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
817{ 846{
818 spin_lock_init(&iommu->lock); 847 spin_lock_init(&iommu->lock);
848
849 /* Add IOMMU to internal data structures */
819 list_add_tail(&iommu->list, &amd_iommu_list); 850 list_add_tail(&iommu->list, &amd_iommu_list);
851 iommu->index = amd_iommus_present++;
852
853 if (unlikely(iommu->index >= MAX_IOMMUS)) {
854 WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n");
855 return -ENOSYS;
856 }
857
858 /* Index is fine - add IOMMU to the array */
859 amd_iommus[iommu->index] = iommu;
820 860
821 /* 861 /*
822 * Copy data from ACPI table entry to the iommu struct 862 * Copy data from ACPI table entry to the iommu struct
@@ -846,6 +886,9 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
846 init_iommu_from_acpi(iommu, h); 886 init_iommu_from_acpi(iommu, h);
847 init_iommu_devices(iommu); 887 init_iommu_devices(iommu);
848 888
889 if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE))
890 amd_iommu_np_cache = true;
891
849 return pci_enable_device(iommu->dev); 892 return pci_enable_device(iommu->dev);
850} 893}
851 894
@@ -891,6 +934,8 @@ static int __init init_iommu_all(struct acpi_table_header *table)
891 } 934 }
892 WARN_ON(p != end); 935 WARN_ON(p != end);
893 936
937 amd_iommu_initialized = true;
938
894 return 0; 939 return 0;
895} 940}
896 941
@@ -903,7 +948,7 @@ static int __init init_iommu_all(struct acpi_table_header *table)
903 * 948 *
904 ****************************************************************************/ 949 ****************************************************************************/
905 950
906static int __init iommu_setup_msi(struct amd_iommu *iommu) 951static int iommu_setup_msi(struct amd_iommu *iommu)
907{ 952{
908 int r; 953 int r;
909 954
@@ -1154,19 +1199,10 @@ static struct sys_device device_amd_iommu = {
1154 * functions. Finally it prints some information about AMD IOMMUs and 1199 * functions. Finally it prints some information about AMD IOMMUs and
1155 * the driver state and enables the hardware. 1200 * the driver state and enables the hardware.
1156 */ 1201 */
1157int __init amd_iommu_init(void) 1202static int __init amd_iommu_init(void)
1158{ 1203{
1159 int i, ret = 0; 1204 int i, ret = 0;
1160 1205
1161
1162 if (no_iommu) {
1163 printk(KERN_INFO "AMD-Vi disabled by kernel command line\n");
1164 return 0;
1165 }
1166
1167 if (!amd_iommu_detected)
1168 return -ENODEV;
1169
1170 /* 1206 /*
1171 * First parse ACPI tables to find the largest Bus/Dev/Func 1207 * First parse ACPI tables to find the largest Bus/Dev/Func
1172 * we need to handle. Upon this information the shared data 1208 * we need to handle. Upon this information the shared data
@@ -1203,15 +1239,6 @@ int __init amd_iommu_init(void)
1203 if (amd_iommu_rlookup_table == NULL) 1239 if (amd_iommu_rlookup_table == NULL)
1204 goto free; 1240 goto free;
1205 1241
1206 /*
1207 * Protection Domain table - maps devices to protection domains
1208 * This table has the same size as the rlookup_table
1209 */
1210 amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
1211 get_order(rlookup_table_size));
1212 if (amd_iommu_pd_table == NULL)
1213 goto free;
1214
1215 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages( 1242 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
1216 GFP_KERNEL | __GFP_ZERO, 1243 GFP_KERNEL | __GFP_ZERO,
1217 get_order(MAX_DOMAIN_ID/8)); 1244 get_order(MAX_DOMAIN_ID/8));
@@ -1233,6 +1260,8 @@ int __init amd_iommu_init(void)
1233 */ 1260 */
1234 amd_iommu_pd_alloc_bitmap[0] = 1; 1261 amd_iommu_pd_alloc_bitmap[0] = 1;
1235 1262
1263 spin_lock_init(&amd_iommu_pd_lock);
1264
1236 /* 1265 /*
1237 * now the data structures are allocated and basically initialized 1266 * now the data structures are allocated and basically initialized
1238 * start the real acpi table scan 1267 * start the real acpi table scan
@@ -1241,6 +1270,9 @@ int __init amd_iommu_init(void)
1241 if (acpi_table_parse("IVRS", init_iommu_all) != 0) 1270 if (acpi_table_parse("IVRS", init_iommu_all) != 0)
1242 goto free; 1271 goto free;
1243 1272
1273 if (!amd_iommu_initialized)
1274 goto free;
1275
1244 if (acpi_table_parse("IVRS", init_memory_definitions) != 0) 1276 if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
1245 goto free; 1277 goto free;
1246 1278
@@ -1252,6 +1284,10 @@ int __init amd_iommu_init(void)
1252 if (ret) 1284 if (ret)
1253 goto free; 1285 goto free;
1254 1286
1287 ret = amd_iommu_init_devices();
1288 if (ret)
1289 goto free;
1290
1255 if (iommu_pass_through) 1291 if (iommu_pass_through)
1256 ret = amd_iommu_init_passthrough(); 1292 ret = amd_iommu_init_passthrough();
1257 else 1293 else
@@ -1259,32 +1295,29 @@ int __init amd_iommu_init(void)
1259 if (ret) 1295 if (ret)
1260 goto free; 1296 goto free;
1261 1297
1298 amd_iommu_init_notifier();
1299
1262 enable_iommus(); 1300 enable_iommus();
1263 1301
1264 if (iommu_pass_through) 1302 if (iommu_pass_through)
1265 goto out; 1303 goto out;
1266 1304
1267 printk(KERN_INFO "AMD-Vi: device isolation ");
1268 if (amd_iommu_isolate)
1269 printk("enabled\n");
1270 else
1271 printk("disabled\n");
1272
1273 if (amd_iommu_unmap_flush) 1305 if (amd_iommu_unmap_flush)
1274 printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n"); 1306 printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n");
1275 else 1307 else
1276 printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n"); 1308 printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
1277 1309
1310 x86_platform.iommu_shutdown = disable_iommus;
1278out: 1311out:
1279 return ret; 1312 return ret;
1280 1313
1281free: 1314free:
1315
1316 amd_iommu_uninit_devices();
1317
1282 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1318 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
1283 get_order(MAX_DOMAIN_ID/8)); 1319 get_order(MAX_DOMAIN_ID/8));
1284 1320
1285 free_pages((unsigned long)amd_iommu_pd_table,
1286 get_order(rlookup_table_size));
1287
1288 free_pages((unsigned long)amd_iommu_rlookup_table, 1321 free_pages((unsigned long)amd_iommu_rlookup_table,
1289 get_order(rlookup_table_size)); 1322 get_order(rlookup_table_size));
1290 1323
@@ -1301,11 +1334,6 @@ free:
1301 goto out; 1334 goto out;
1302} 1335}
1303 1336
1304void amd_iommu_shutdown(void)
1305{
1306 disable_iommus();
1307}
1308
1309/**************************************************************************** 1337/****************************************************************************
1310 * 1338 *
1311 * Early detect code. This code runs at IOMMU detection time in the DMA 1339 * Early detect code. This code runs at IOMMU detection time in the DMA
@@ -1320,16 +1348,16 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table)
1320 1348
1321void __init amd_iommu_detect(void) 1349void __init amd_iommu_detect(void)
1322{ 1350{
1323 if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture)) 1351 if (no_iommu || (iommu_detected && !gart_iommu_aperture))
1324 return; 1352 return;
1325 1353
1326 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { 1354 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
1327 iommu_detected = 1; 1355 iommu_detected = 1;
1328 amd_iommu_detected = 1; 1356 amd_iommu_detected = 1;
1329#ifdef CONFIG_GART_IOMMU 1357 x86_init.iommu.iommu_init = amd_iommu_init;
1330 gart_iommu_aperture_disabled = 1; 1358
1331 gart_iommu_aperture = 0; 1359 /* Make sure ACS will be enabled */
1332#endif 1360 pci_request_acs();
1333 } 1361 }
1334} 1362}
1335 1363
@@ -1350,10 +1378,6 @@ static int __init parse_amd_iommu_dump(char *str)
1350static int __init parse_amd_iommu_options(char *str) 1378static int __init parse_amd_iommu_options(char *str)
1351{ 1379{
1352 for (; *str; ++str) { 1380 for (; *str; ++str) {
1353 if (strncmp(str, "isolate", 7) == 0)
1354 amd_iommu_isolate = true;
1355 if (strncmp(str, "share", 5) == 0)
1356 amd_iommu_isolate = false;
1357 if (strncmp(str, "fullflush", 9) == 0) 1381 if (strncmp(str, "fullflush", 9) == 0)
1358 amd_iommu_unmap_flush = true; 1382 amd_iommu_unmap_flush = true;
1359 } 1383 }
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 128111d8ffe0..3704997e8b25 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -28,6 +28,7 @@
28#include <asm/pci-direct.h> 28#include <asm/pci-direct.h>
29#include <asm/dma.h> 29#include <asm/dma.h>
30#include <asm/k8.h> 30#include <asm/k8.h>
31#include <asm/x86_init.h>
31 32
32int gart_iommu_aperture; 33int gart_iommu_aperture;
33int gart_iommu_aperture_disabled __initdata; 34int gart_iommu_aperture_disabled __initdata;
@@ -279,7 +280,8 @@ void __init early_gart_iommu_check(void)
279 * or BIOS forget to put that in reserved. 280 * or BIOS forget to put that in reserved.
280 * try to update e820 to make that region as reserved. 281 * try to update e820 to make that region as reserved.
281 */ 282 */
282 int i, fix, slot; 283 u32 agp_aper_base = 0, agp_aper_order = 0;
284 int i, fix, slot, valid_agp = 0;
283 u32 ctl; 285 u32 ctl;
284 u32 aper_size = 0, aper_order = 0, last_aper_order = 0; 286 u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
285 u64 aper_base = 0, last_aper_base = 0; 287 u64 aper_base = 0, last_aper_base = 0;
@@ -289,6 +291,8 @@ void __init early_gart_iommu_check(void)
289 return; 291 return;
290 292
291 /* This is mostly duplicate of iommu_hole_init */ 293 /* This is mostly duplicate of iommu_hole_init */
294 agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
295
292 fix = 0; 296 fix = 0;
293 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 297 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
294 int bus; 298 int bus;
@@ -341,10 +345,10 @@ void __init early_gart_iommu_check(void)
341 } 345 }
342 } 346 }
343 347
344 if (!fix) 348 if (valid_agp)
345 return; 349 return;
346 350
347 /* different nodes have different setting, disable them all at first*/ 351 /* disable them all at first */
348 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 352 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
349 int bus; 353 int bus;
350 int dev_base, dev_limit; 354 int dev_base, dev_limit;
@@ -400,6 +404,7 @@ void __init gart_iommu_hole_init(void)
400 404
401 iommu_detected = 1; 405 iommu_detected = 1;
402 gart_iommu_aperture = 1; 406 gart_iommu_aperture = 1;
407 x86_init.iommu.iommu_init = gart_iommu_init;
403 408
404 aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7; 409 aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7;
405 aper_size = (32 * 1024 * 1024) << aper_order; 410 aper_size = (32 * 1024 * 1024) << aper_order;
@@ -456,8 +461,6 @@ out:
456 461
457 if (aper_alloc) { 462 if (aper_alloc) {
458 /* Got the aperture from the AGP bridge */ 463 /* Got the aperture from the AGP bridge */
459 } else if (swiotlb && !valid_agp) {
460 /* Do nothing */
461 } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) || 464 } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) ||
462 force_iommu || 465 force_iommu ||
463 valid_agp || 466 valid_agp ||
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index da7b7b9f8bd8..565c1bfc507d 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,7 +2,7 @@
2# Makefile for local APIC drivers and for the IO-APIC code 2# Makefile for local APIC drivers and for the IO-APIC code
3# 3#
4 4
5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o probe_$(BITS).o ipi.o nmi.o 5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o
6obj-$(CONFIG_X86_IO_APIC) += io_apic.o 6obj-$(CONFIG_X86_IO_APIC) += io_apic.o
7obj-$(CONFIG_SMP) += ipi.o 7obj-$(CONFIG_SMP) += ipi.o
8 8
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 894aa97f0717..aa57c079c98f 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -241,28 +241,13 @@ static int modern_apic(void)
241} 241}
242 242
243/* 243/*
244 * bare function to substitute write operation 244 * right after this call apic become NOOP driven
245 * and it's _that_ fast :) 245 * so apic->write/read doesn't do anything
246 */
247static void native_apic_write_dummy(u32 reg, u32 v)
248{
249 WARN_ON_ONCE((cpu_has_apic || !disable_apic));
250}
251
252static u32 native_apic_read_dummy(u32 reg)
253{
254 WARN_ON_ONCE((cpu_has_apic && !disable_apic));
255 return 0;
256}
257
258/*
259 * right after this call apic->write/read doesn't do anything
260 * note that there is no restore operation it works one way
261 */ 246 */
262void apic_disable(void) 247void apic_disable(void)
263{ 248{
264 apic->read = native_apic_read_dummy; 249 pr_info("APIC: switched to apic NOOP\n");
265 apic->write = native_apic_write_dummy; 250 apic = &apic_noop;
266} 251}
267 252
268void native_apic_wait_icr_idle(void) 253void native_apic_wait_icr_idle(void)
@@ -459,7 +444,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
459 v = apic_read(APIC_LVTT); 444 v = apic_read(APIC_LVTT);
460 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 445 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
461 apic_write(APIC_LVTT, v); 446 apic_write(APIC_LVTT, v);
462 apic_write(APIC_TMICT, 0xffffffff); 447 apic_write(APIC_TMICT, 0);
463 break; 448 break;
464 case CLOCK_EVT_MODE_RESUME: 449 case CLOCK_EVT_MODE_RESUME:
465 /* Nothing to do here */ 450 /* Nothing to do here */
@@ -662,7 +647,7 @@ static int __init calibrate_APIC_clock(void)
662 calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; 647 calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
663 648
664 apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); 649 apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
665 apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); 650 apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult);
666 apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", 651 apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
667 calibration_result); 652 calibration_result);
668 653
@@ -1356,7 +1341,7 @@ void enable_x2apic(void)
1356 1341
1357 rdmsr(MSR_IA32_APICBASE, msr, msr2); 1342 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1358 if (!(msr & X2APIC_ENABLE)) { 1343 if (!(msr & X2APIC_ENABLE)) {
1359 pr_info("Enabling x2apic\n"); 1344 printk_once(KERN_INFO "Enabling x2apic\n");
1360 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); 1345 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
1361 } 1346 }
1362} 1347}
@@ -1392,14 +1377,11 @@ void __init enable_IR_x2apic(void)
1392 unsigned long flags; 1377 unsigned long flags;
1393 struct IO_APIC_route_entry **ioapic_entries = NULL; 1378 struct IO_APIC_route_entry **ioapic_entries = NULL;
1394 int ret, x2apic_enabled = 0; 1379 int ret, x2apic_enabled = 0;
1395 int dmar_table_init_ret = 0; 1380 int dmar_table_init_ret;
1396 1381
1397#ifdef CONFIG_INTR_REMAP
1398 dmar_table_init_ret = dmar_table_init(); 1382 dmar_table_init_ret = dmar_table_init();
1399 if (dmar_table_init_ret) 1383 if (dmar_table_init_ret && !x2apic_supported())
1400 pr_debug("dmar_table_init() failed with %d:\n", 1384 return;
1401 dmar_table_init_ret);
1402#endif
1403 1385
1404 ioapic_entries = alloc_ioapic_entries(); 1386 ioapic_entries = alloc_ioapic_entries();
1405 if (!ioapic_entries) { 1387 if (!ioapic_entries) {
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index d0c99abc26c3..eacbd2b31d27 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -306,10 +306,7 @@ physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
306 if (cpumask_test_cpu(cpu, cpu_online_mask)) 306 if (cpumask_test_cpu(cpu, cpu_online_mask))
307 break; 307 break;
308 } 308 }
309 if (cpu < nr_cpu_ids) 309 return per_cpu(x86_cpu_to_apicid, cpu);
310 return per_cpu(x86_cpu_to_apicid, cpu);
311
312 return BAD_APICID;
313} 310}
314 311
315struct apic apic_physflat = { 312struct apic apic_physflat = {
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
new file mode 100644
index 000000000000..e31b9ffe25f5
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -0,0 +1,200 @@
1/*
2 * NOOP APIC driver.
3 *
4 * Does almost nothing and should be substituted by a real apic driver via
5 * probe routine.
6 *
7 * Though in case if apic is disabled (for some reason) we try
8 * to not uglify the caller's code and allow to call (some) apic routines
9 * like self-ipi, etc...
10 */
11
12#include <linux/threads.h>
13#include <linux/cpumask.h>
14#include <linux/module.h>
15#include <linux/string.h>
16#include <linux/kernel.h>
17#include <linux/ctype.h>
18#include <linux/init.h>
19#include <linux/errno.h>
20#include <asm/fixmap.h>
21#include <asm/mpspec.h>
22#include <asm/apicdef.h>
23#include <asm/apic.h>
24#include <asm/setup.h>
25
26#include <linux/smp.h>
27#include <asm/ipi.h>
28
29#include <linux/interrupt.h>
30#include <asm/acpi.h>
31#include <asm/e820.h>
32
33static void noop_init_apic_ldr(void) { }
34static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { }
35static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { }
36static void noop_send_IPI_allbutself(int vector) { }
37static void noop_send_IPI_all(int vector) { }
38static void noop_send_IPI_self(int vector) { }
39static void noop_apic_wait_icr_idle(void) { }
40static void noop_apic_icr_write(u32 low, u32 id) { }
41
42static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip)
43{
44 return -1;
45}
46
47static u32 noop_safe_apic_wait_icr_idle(void)
48{
49 return 0;
50}
51
52static u64 noop_apic_icr_read(void)
53{
54 return 0;
55}
56
57static int noop_cpu_to_logical_apicid(int cpu)
58{
59 return 0;
60}
61
62static int noop_phys_pkg_id(int cpuid_apic, int index_msb)
63{
64 return 0;
65}
66
67static unsigned int noop_get_apic_id(unsigned long x)
68{
69 return 0;
70}
71
72static int noop_probe(void)
73{
74 /*
75 * NOOP apic should not ever be
76 * enabled via probe routine
77 */
78 return 0;
79}
80
81static int noop_apic_id_registered(void)
82{
83 /*
84 * if we would be really "pedantic"
85 * we should pass read_apic_id() here
86 * but since NOOP suppose APIC ID = 0
87 * lets save a few cycles
88 */
89 return physid_isset(0, phys_cpu_present_map);
90}
91
92static const struct cpumask *noop_target_cpus(void)
93{
94 /* only BSP here */
95 return cpumask_of(0);
96}
97
98static unsigned long noop_check_apicid_used(physid_mask_t *map, int apicid)
99{
100 return physid_isset(apicid, *map);
101}
102
103static unsigned long noop_check_apicid_present(int bit)
104{
105 return physid_isset(bit, phys_cpu_present_map);
106}
107
108static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask)
109{
110 if (cpu != 0)
111 pr_warning("APIC: Vector allocated for non-BSP cpu\n");
112 cpumask_clear(retmask);
113 cpumask_set_cpu(cpu, retmask);
114}
115
116int noop_apicid_to_node(int logical_apicid)
117{
118 /* we're always on node 0 */
119 return 0;
120}
121
122static u32 noop_apic_read(u32 reg)
123{
124 WARN_ON_ONCE((cpu_has_apic && !disable_apic));
125 return 0;
126}
127
128static void noop_apic_write(u32 reg, u32 v)
129{
130 WARN_ON_ONCE(cpu_has_apic && !disable_apic);
131}
132
133struct apic apic_noop = {
134 .name = "noop",
135 .probe = noop_probe,
136 .acpi_madt_oem_check = NULL,
137
138 .apic_id_registered = noop_apic_id_registered,
139
140 .irq_delivery_mode = dest_LowestPrio,
141 /* logical delivery broadcast to all CPUs: */
142 .irq_dest_mode = 1,
143
144 .target_cpus = noop_target_cpus,
145 .disable_esr = 0,
146 .dest_logical = APIC_DEST_LOGICAL,
147 .check_apicid_used = noop_check_apicid_used,
148 .check_apicid_present = noop_check_apicid_present,
149
150 .vector_allocation_domain = noop_vector_allocation_domain,
151 .init_apic_ldr = noop_init_apic_ldr,
152
153 .ioapic_phys_id_map = default_ioapic_phys_id_map,
154 .setup_apic_routing = NULL,
155 .multi_timer_check = NULL,
156 .apicid_to_node = noop_apicid_to_node,
157
158 .cpu_to_logical_apicid = noop_cpu_to_logical_apicid,
159 .cpu_present_to_apicid = default_cpu_present_to_apicid,
160 .apicid_to_cpu_present = physid_set_mask_of_physid,
161
162 .setup_portio_remap = NULL,
163 .check_phys_apicid_present = default_check_phys_apicid_present,
164 .enable_apic_mode = NULL,
165
166 .phys_pkg_id = noop_phys_pkg_id,
167
168 .mps_oem_check = NULL,
169
170 .get_apic_id = noop_get_apic_id,
171 .set_apic_id = NULL,
172 .apic_id_mask = 0x0F << 24,
173
174 .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
175 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
176
177 .send_IPI_mask = noop_send_IPI_mask,
178 .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself,
179 .send_IPI_allbutself = noop_send_IPI_allbutself,
180 .send_IPI_all = noop_send_IPI_all,
181 .send_IPI_self = noop_send_IPI_self,
182
183 .wakeup_secondary_cpu = noop_wakeup_secondary_cpu,
184
185 /* should be safe */
186 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
187 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
188
189 .wait_for_init_deassert = NULL,
190
191 .smp_callin_clear_local_apic = NULL,
192 .inquire_remote_apic = NULL,
193
194 .read = noop_apic_read,
195 .write = noop_apic_write,
196 .icr_read = noop_apic_icr_read,
197 .icr_write = noop_apic_icr_write,
198 .wait_icr_idle = noop_apic_wait_icr_idle,
199 .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle,
200};
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 77a06413b6b2..cb804c5091b9 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -35,7 +35,7 @@ static const struct cpumask *bigsmp_target_cpus(void)
35#endif 35#endif
36} 36}
37 37
38static unsigned long bigsmp_check_apicid_used(physid_mask_t bitmap, int apicid) 38static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
39{ 39{
40 return 0; 40 return 0;
41} 41}
@@ -93,11 +93,6 @@ static int bigsmp_cpu_present_to_apicid(int mps_cpu)
93 return BAD_APICID; 93 return BAD_APICID;
94} 94}
95 95
96static physid_mask_t bigsmp_apicid_to_cpu_present(int phys_apicid)
97{
98 return physid_mask_of_physid(phys_apicid);
99}
100
101/* Mapping from cpu number to logical apicid */ 96/* Mapping from cpu number to logical apicid */
102static inline int bigsmp_cpu_to_logical_apicid(int cpu) 97static inline int bigsmp_cpu_to_logical_apicid(int cpu)
103{ 98{
@@ -106,10 +101,10 @@ static inline int bigsmp_cpu_to_logical_apicid(int cpu)
106 return cpu_physical_id(cpu); 101 return cpu_physical_id(cpu);
107} 102}
108 103
109static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map) 104static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
110{ 105{
111 /* For clustered we don't have a good way to do this yet - hack */ 106 /* For clustered we don't have a good way to do this yet - hack */
112 return physids_promote(0xFFL); 107 physids_promote(0xFFL, retmap);
113} 108}
114 109
115static int bigsmp_check_phys_apicid_present(int phys_apicid) 110static int bigsmp_check_phys_apicid_present(int phys_apicid)
@@ -136,10 +131,7 @@ static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
136 if (cpumask_test_cpu(cpu, cpu_online_mask)) 131 if (cpumask_test_cpu(cpu, cpu_online_mask))
137 break; 132 break;
138 } 133 }
139 if (cpu < nr_cpu_ids) 134 return bigsmp_cpu_to_logical_apicid(cpu);
140 return bigsmp_cpu_to_logical_apicid(cpu);
141
142 return BAD_APICID;
143} 135}
144 136
145static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) 137static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
@@ -230,7 +222,7 @@ struct apic apic_bigsmp = {
230 .apicid_to_node = bigsmp_apicid_to_node, 222 .apicid_to_node = bigsmp_apicid_to_node,
231 .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid, 223 .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid,
232 .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, 224 .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
233 .apicid_to_cpu_present = bigsmp_apicid_to_cpu_present, 225 .apicid_to_cpu_present = physid_set_mask_of_physid,
234 .setup_portio_remap = NULL, 226 .setup_portio_remap = NULL,
235 .check_phys_apicid_present = bigsmp_check_phys_apicid_present, 227 .check_phys_apicid_present = bigsmp_check_phys_apicid_present,
236 .enable_apic_mode = NULL, 228 .enable_apic_mode = NULL,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 89174f847b49..dd2b5f264643 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -27,6 +27,9 @@
27 * 27 *
28 * http://www.unisys.com 28 * http://www.unisys.com
29 */ 29 */
30
31#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
32
30#include <linux/notifier.h> 33#include <linux/notifier.h>
31#include <linux/spinlock.h> 34#include <linux/spinlock.h>
32#include <linux/cpumask.h> 35#include <linux/cpumask.h>
@@ -223,9 +226,9 @@ static int parse_unisys_oem(char *oemptr)
223 mip_addr = val; 226 mip_addr = val;
224 mip = (struct mip_reg *)val; 227 mip = (struct mip_reg *)val;
225 mip_reg = __va(mip); 228 mip_reg = __va(mip);
226 pr_debug("es7000_mipcfg: host_reg = 0x%lx \n", 229 pr_debug("host_reg = 0x%lx\n",
227 (unsigned long)host_reg); 230 (unsigned long)host_reg);
228 pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n", 231 pr_debug("mip_reg = 0x%lx\n",
229 (unsigned long)mip_reg); 232 (unsigned long)mip_reg);
230 success++; 233 success++;
231 break; 234 break;
@@ -401,7 +404,7 @@ static void es7000_enable_apic_mode(void)
401 if (!es7000_plat) 404 if (!es7000_plat)
402 return; 405 return;
403 406
404 printk(KERN_INFO "ES7000: Enabling APIC mode.\n"); 407 pr_info("Enabling APIC mode.\n");
405 memset(&es7000_mip_reg, 0, sizeof(struct mip_reg)); 408 memset(&es7000_mip_reg, 0, sizeof(struct mip_reg));
406 es7000_mip_reg.off_0x00 = MIP_SW_APIC; 409 es7000_mip_reg.off_0x00 = MIP_SW_APIC;
407 es7000_mip_reg.off_0x38 = MIP_VALID; 410 es7000_mip_reg.off_0x38 = MIP_VALID;
@@ -466,11 +469,11 @@ static const struct cpumask *es7000_target_cpus(void)
466 return cpumask_of(smp_processor_id()); 469 return cpumask_of(smp_processor_id());
467} 470}
468 471
469static unsigned long 472static unsigned long es7000_check_apicid_used(physid_mask_t *map, int apicid)
470es7000_check_apicid_used(physid_mask_t bitmap, int apicid)
471{ 473{
472 return 0; 474 return 0;
473} 475}
476
474static unsigned long es7000_check_apicid_present(int bit) 477static unsigned long es7000_check_apicid_present(int bit)
475{ 478{
476 return physid_isset(bit, phys_cpu_present_map); 479 return physid_isset(bit, phys_cpu_present_map);
@@ -514,8 +517,7 @@ static void es7000_setup_apic_routing(void)
514{ 517{
515 int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id()); 518 int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
516 519
517 printk(KERN_INFO 520 pr_info("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
518 "Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
519 (apic_version[apic] == 0x14) ? 521 (apic_version[apic] == 0x14) ?
520 "Physical Cluster" : "Logical Cluster", 522 "Physical Cluster" : "Logical Cluster",
521 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); 523 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
@@ -539,14 +541,10 @@ static int es7000_cpu_present_to_apicid(int mps_cpu)
539 541
540static int cpu_id; 542static int cpu_id;
541 543
542static physid_mask_t es7000_apicid_to_cpu_present(int phys_apicid) 544static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap)
543{ 545{
544 physid_mask_t mask; 546 physid_set_mask_of_physid(cpu_id, retmap);
545
546 mask = physid_mask_of_physid(cpu_id);
547 ++cpu_id; 547 ++cpu_id;
548
549 return mask;
550} 548}
551 549
552/* Mapping from cpu number to logical apicid */ 550/* Mapping from cpu number to logical apicid */
@@ -561,10 +559,10 @@ static int es7000_cpu_to_logical_apicid(int cpu)
561#endif 559#endif
562} 560}
563 561
564static physid_mask_t es7000_ioapic_phys_id_map(physid_mask_t phys_map) 562static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
565{ 563{
566 /* For clustered we don't have a good way to do this yet - hack */ 564 /* For clustered we don't have a good way to do this yet - hack */
567 return physids_promote(0xff); 565 physids_promote(0xFFL, retmap);
568} 566}
569 567
570static int es7000_check_phys_apicid_present(int cpu_physical_apicid) 568static int es7000_check_phys_apicid_present(int cpu_physical_apicid)
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index dc69f28489f5..de00c4619a55 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -60,8 +60,6 @@
60#include <asm/irq_remapping.h> 60#include <asm/irq_remapping.h>
61#include <asm/hpet.h> 61#include <asm/hpet.h>
62#include <asm/hw_irq.h> 62#include <asm/hw_irq.h>
63#include <asm/uv/uv_hub.h>
64#include <asm/uv/uv_irq.h>
65 63
66#include <asm/apic.h> 64#include <asm/apic.h>
67 65
@@ -140,20 +138,6 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node)
140 return pin; 138 return pin;
141} 139}
142 140
143/*
144 * This is performance-critical, we want to do it O(1)
145 *
146 * Most irqs are mapped 1:1 with pins.
147 */
148struct irq_cfg {
149 struct irq_pin_list *irq_2_pin;
150 cpumask_var_t domain;
151 cpumask_var_t old_domain;
152 unsigned move_cleanup_count;
153 u8 vector;
154 u8 move_in_progress : 1;
155};
156
157/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ 141/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
158#ifdef CONFIG_SPARSE_IRQ 142#ifdef CONFIG_SPARSE_IRQ
159static struct irq_cfg irq_cfgx[] = { 143static struct irq_cfg irq_cfgx[] = {
@@ -209,7 +193,7 @@ int __init arch_early_irq_init(void)
209} 193}
210 194
211#ifdef CONFIG_SPARSE_IRQ 195#ifdef CONFIG_SPARSE_IRQ
212static struct irq_cfg *irq_cfg(unsigned int irq) 196struct irq_cfg *irq_cfg(unsigned int irq)
213{ 197{
214 struct irq_cfg *cfg = NULL; 198 struct irq_cfg *cfg = NULL;
215 struct irq_desc *desc; 199 struct irq_desc *desc;
@@ -361,7 +345,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
361/* end for move_irq_desc */ 345/* end for move_irq_desc */
362 346
363#else 347#else
364static struct irq_cfg *irq_cfg(unsigned int irq) 348struct irq_cfg *irq_cfg(unsigned int irq)
365{ 349{
366 return irq < nr_irqs ? irq_cfgx + irq : NULL; 350 return irq < nr_irqs ? irq_cfgx + irq : NULL;
367} 351}
@@ -555,23 +539,41 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
555 add_pin_to_irq_node(cfg, node, newapic, newpin); 539 add_pin_to_irq_node(cfg, node, newapic, newpin);
556} 540}
557 541
542static void __io_apic_modify_irq(struct irq_pin_list *entry,
543 int mask_and, int mask_or,
544 void (*final)(struct irq_pin_list *entry))
545{
546 unsigned int reg, pin;
547
548 pin = entry->pin;
549 reg = io_apic_read(entry->apic, 0x10 + pin * 2);
550 reg &= mask_and;
551 reg |= mask_or;
552 io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
553 if (final)
554 final(entry);
555}
556
558static void io_apic_modify_irq(struct irq_cfg *cfg, 557static void io_apic_modify_irq(struct irq_cfg *cfg,
559 int mask_and, int mask_or, 558 int mask_and, int mask_or,
560 void (*final)(struct irq_pin_list *entry)) 559 void (*final)(struct irq_pin_list *entry))
561{ 560{
562 int pin;
563 struct irq_pin_list *entry; 561 struct irq_pin_list *entry;
564 562
565 for_each_irq_pin(entry, cfg->irq_2_pin) { 563 for_each_irq_pin(entry, cfg->irq_2_pin)
566 unsigned int reg; 564 __io_apic_modify_irq(entry, mask_and, mask_or, final);
567 pin = entry->pin; 565}
568 reg = io_apic_read(entry->apic, 0x10 + pin * 2); 566
569 reg &= mask_and; 567static void __mask_and_edge_IO_APIC_irq(struct irq_pin_list *entry)
570 reg |= mask_or; 568{
571 io_apic_modify(entry->apic, 0x10 + pin * 2, reg); 569 __io_apic_modify_irq(entry, ~IO_APIC_REDIR_LEVEL_TRIGGER,
572 if (final) 570 IO_APIC_REDIR_MASKED, NULL);
573 final(entry); 571}
574 } 572
573static void __unmask_and_level_IO_APIC_irq(struct irq_pin_list *entry)
574{
575 __io_apic_modify_irq(entry, ~IO_APIC_REDIR_MASKED,
576 IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
575} 577}
576 578
577static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) 579static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
@@ -595,18 +597,6 @@ static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
595 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); 597 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
596} 598}
597 599
598static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
599{
600 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
601 IO_APIC_REDIR_MASKED, NULL);
602}
603
604static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
605{
606 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
607 IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
608}
609
610static void mask_IO_APIC_irq_desc(struct irq_desc *desc) 600static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
611{ 601{
612 struct irq_cfg *cfg = desc->chip_data; 602 struct irq_cfg *cfg = desc->chip_data;
@@ -1177,7 +1167,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1177 int cpu, err; 1167 int cpu, err;
1178 cpumask_var_t tmp_mask; 1168 cpumask_var_t tmp_mask;
1179 1169
1180 if ((cfg->move_in_progress) || cfg->move_cleanup_count) 1170 if (cfg->move_in_progress)
1181 return -EBUSY; 1171 return -EBUSY;
1182 1172
1183 if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) 1173 if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
@@ -1237,8 +1227,7 @@ next:
1237 return err; 1227 return err;
1238} 1228}
1239 1229
1240static int 1230int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1241assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1242{ 1231{
1243 int err; 1232 int err;
1244 unsigned long flags; 1233 unsigned long flags;
@@ -1599,9 +1588,6 @@ __apicdebuginit(void) print_IO_APIC(void)
1599 struct irq_desc *desc; 1588 struct irq_desc *desc;
1600 unsigned int irq; 1589 unsigned int irq;
1601 1590
1602 if (apic_verbosity == APIC_QUIET)
1603 return;
1604
1605 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); 1591 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
1606 for (i = 0; i < nr_ioapics; i++) 1592 for (i = 0; i < nr_ioapics; i++)
1607 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", 1593 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
@@ -1708,9 +1694,6 @@ __apicdebuginit(void) print_APIC_field(int base)
1708{ 1694{
1709 int i; 1695 int i;
1710 1696
1711 if (apic_verbosity == APIC_QUIET)
1712 return;
1713
1714 printk(KERN_DEBUG); 1697 printk(KERN_DEBUG);
1715 1698
1716 for (i = 0; i < 8; i++) 1699 for (i = 0; i < 8; i++)
@@ -1724,9 +1707,6 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1724 unsigned int i, v, ver, maxlvt; 1707 unsigned int i, v, ver, maxlvt;
1725 u64 icr; 1708 u64 icr;
1726 1709
1727 if (apic_verbosity == APIC_QUIET)
1728 return;
1729
1730 printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", 1710 printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
1731 smp_processor_id(), hard_smp_processor_id()); 1711 smp_processor_id(), hard_smp_processor_id());
1732 v = apic_read(APIC_ID); 1712 v = apic_read(APIC_ID);
@@ -1824,13 +1804,19 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1824 printk("\n"); 1804 printk("\n");
1825} 1805}
1826 1806
1827__apicdebuginit(void) print_all_local_APICs(void) 1807__apicdebuginit(void) print_local_APICs(int maxcpu)
1828{ 1808{
1829 int cpu; 1809 int cpu;
1830 1810
1811 if (!maxcpu)
1812 return;
1813
1831 preempt_disable(); 1814 preempt_disable();
1832 for_each_online_cpu(cpu) 1815 for_each_online_cpu(cpu) {
1816 if (cpu >= maxcpu)
1817 break;
1833 smp_call_function_single(cpu, print_local_APIC, NULL, 1); 1818 smp_call_function_single(cpu, print_local_APIC, NULL, 1);
1819 }
1834 preempt_enable(); 1820 preempt_enable();
1835} 1821}
1836 1822
@@ -1839,7 +1825,7 @@ __apicdebuginit(void) print_PIC(void)
1839 unsigned int v; 1825 unsigned int v;
1840 unsigned long flags; 1826 unsigned long flags;
1841 1827
1842 if (apic_verbosity == APIC_QUIET || !nr_legacy_irqs) 1828 if (!nr_legacy_irqs)
1843 return; 1829 return;
1844 1830
1845 printk(KERN_DEBUG "\nprinting PIC contents\n"); 1831 printk(KERN_DEBUG "\nprinting PIC contents\n");
@@ -1866,21 +1852,41 @@ __apicdebuginit(void) print_PIC(void)
1866 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); 1852 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1867} 1853}
1868 1854
1869__apicdebuginit(int) print_all_ICs(void) 1855static int __initdata show_lapic = 1;
1856static __init int setup_show_lapic(char *arg)
1857{
1858 int num = -1;
1859
1860 if (strcmp(arg, "all") == 0) {
1861 show_lapic = CONFIG_NR_CPUS;
1862 } else {
1863 get_option(&arg, &num);
1864 if (num >= 0)
1865 show_lapic = num;
1866 }
1867
1868 return 1;
1869}
1870__setup("show_lapic=", setup_show_lapic);
1871
1872__apicdebuginit(int) print_ICs(void)
1870{ 1873{
1874 if (apic_verbosity == APIC_QUIET)
1875 return 0;
1876
1871 print_PIC(); 1877 print_PIC();
1872 1878
1873 /* don't print out if apic is not there */ 1879 /* don't print out if apic is not there */
1874 if (!cpu_has_apic && !apic_from_smp_config()) 1880 if (!cpu_has_apic && !apic_from_smp_config())
1875 return 0; 1881 return 0;
1876 1882
1877 print_all_local_APICs(); 1883 print_local_APICs(show_lapic);
1878 print_IO_APIC(); 1884 print_IO_APIC();
1879 1885
1880 return 0; 1886 return 0;
1881} 1887}
1882 1888
1883fs_initcall(print_all_ICs); 1889fs_initcall(print_ICs);
1884 1890
1885 1891
1886/* Where if anywhere is the i8259 connect in external int mode */ 1892/* Where if anywhere is the i8259 connect in external int mode */
@@ -2031,7 +2037,7 @@ void __init setup_ioapic_ids_from_mpc(void)
2031 * This is broken; anything with a real cpu count has to 2037 * This is broken; anything with a real cpu count has to
2032 * circumvent this idiocy regardless. 2038 * circumvent this idiocy regardless.
2033 */ 2039 */
2034 phys_id_present_map = apic->ioapic_phys_id_map(phys_cpu_present_map); 2040 apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map);
2035 2041
2036 /* 2042 /*
2037 * Set the IOAPIC ID to the value stored in the MPC table. 2043 * Set the IOAPIC ID to the value stored in the MPC table.
@@ -2058,7 +2064,7 @@ void __init setup_ioapic_ids_from_mpc(void)
2058 * system must have a unique ID or we get lots of nice 2064 * system must have a unique ID or we get lots of nice
2059 * 'stuck on smp_invalidate_needed IPI wait' messages. 2065 * 'stuck on smp_invalidate_needed IPI wait' messages.
2060 */ 2066 */
2061 if (apic->check_apicid_used(phys_id_present_map, 2067 if (apic->check_apicid_used(&phys_id_present_map,
2062 mp_ioapics[apic_id].apicid)) { 2068 mp_ioapics[apic_id].apicid)) {
2063 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", 2069 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
2064 apic_id, mp_ioapics[apic_id].apicid); 2070 apic_id, mp_ioapics[apic_id].apicid);
@@ -2073,7 +2079,7 @@ void __init setup_ioapic_ids_from_mpc(void)
2073 mp_ioapics[apic_id].apicid = i; 2079 mp_ioapics[apic_id].apicid = i;
2074 } else { 2080 } else {
2075 physid_mask_t tmp; 2081 physid_mask_t tmp;
2076 tmp = apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid); 2082 apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid, &tmp);
2077 apic_printk(APIC_VERBOSE, "Setting %d in the " 2083 apic_printk(APIC_VERBOSE, "Setting %d in the "
2078 "phys_id_present_map\n", 2084 "phys_id_present_map\n",
2079 mp_ioapics[apic_id].apicid); 2085 mp_ioapics[apic_id].apicid);
@@ -2228,20 +2234,16 @@ static int ioapic_retrigger_irq(unsigned int irq)
2228 */ 2234 */
2229 2235
2230#ifdef CONFIG_SMP 2236#ifdef CONFIG_SMP
2231static void send_cleanup_vector(struct irq_cfg *cfg) 2237void send_cleanup_vector(struct irq_cfg *cfg)
2232{ 2238{
2233 cpumask_var_t cleanup_mask; 2239 cpumask_var_t cleanup_mask;
2234 2240
2235 if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { 2241 if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
2236 unsigned int i; 2242 unsigned int i;
2237 cfg->move_cleanup_count = 0;
2238 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
2239 cfg->move_cleanup_count++;
2240 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) 2243 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
2241 apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); 2244 apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
2242 } else { 2245 } else {
2243 cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); 2246 cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
2244 cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
2245 apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); 2247 apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
2246 free_cpumask_var(cleanup_mask); 2248 free_cpumask_var(cleanup_mask);
2247 } 2249 }
@@ -2272,31 +2274,30 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
2272 } 2274 }
2273} 2275}
2274 2276
2275static int
2276assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
2277
2278/* 2277/*
2279 * Either sets desc->affinity to a valid value, and returns 2278 * Either sets desc->affinity to a valid value, and returns
2280 * ->cpu_mask_to_apicid of that, or returns BAD_APICID and 2279 * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
2281 * leaves desc->affinity untouched. 2280 * leaves desc->affinity untouched.
2282 */ 2281 */
2283static unsigned int 2282unsigned int
2284set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) 2283set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask,
2284 unsigned int *dest_id)
2285{ 2285{
2286 struct irq_cfg *cfg; 2286 struct irq_cfg *cfg;
2287 unsigned int irq; 2287 unsigned int irq;
2288 2288
2289 if (!cpumask_intersects(mask, cpu_online_mask)) 2289 if (!cpumask_intersects(mask, cpu_online_mask))
2290 return BAD_APICID; 2290 return -1;
2291 2291
2292 irq = desc->irq; 2292 irq = desc->irq;
2293 cfg = desc->chip_data; 2293 cfg = desc->chip_data;
2294 if (assign_irq_vector(irq, cfg, mask)) 2294 if (assign_irq_vector(irq, cfg, mask))
2295 return BAD_APICID; 2295 return -1;
2296 2296
2297 cpumask_copy(desc->affinity, mask); 2297 cpumask_copy(desc->affinity, mask);
2298 2298
2299 return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); 2299 *dest_id = apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
2300 return 0;
2300} 2301}
2301 2302
2302static int 2303static int
@@ -2312,12 +2313,11 @@ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2312 cfg = desc->chip_data; 2313 cfg = desc->chip_data;
2313 2314
2314 spin_lock_irqsave(&ioapic_lock, flags); 2315 spin_lock_irqsave(&ioapic_lock, flags);
2315 dest = set_desc_affinity(desc, mask); 2316 ret = set_desc_affinity(desc, mask, &dest);
2316 if (dest != BAD_APICID) { 2317 if (!ret) {
2317 /* Only the high 8 bits are valid. */ 2318 /* Only the high 8 bits are valid. */
2318 dest = SET_APIC_LOGICAL_ID(dest); 2319 dest = SET_APIC_LOGICAL_ID(dest);
2319 __target_IO_APIC_irq(irq, dest, cfg); 2320 __target_IO_APIC_irq(irq, dest, cfg);
2320 ret = 0;
2321 } 2321 }
2322 spin_unlock_irqrestore(&ioapic_lock, flags); 2322 spin_unlock_irqrestore(&ioapic_lock, flags);
2323 2323
@@ -2432,9 +2432,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2432 continue; 2432 continue;
2433 2433
2434 cfg = irq_cfg(irq); 2434 cfg = irq_cfg(irq);
2435 spin_lock(&desc->lock); 2435 raw_spin_lock(&desc->lock);
2436 if (!cfg->move_cleanup_count)
2437 goto unlock;
2438 2436
2439 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) 2437 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
2440 goto unlock; 2438 goto unlock;
@@ -2452,29 +2450,40 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2452 goto unlock; 2450 goto unlock;
2453 } 2451 }
2454 __get_cpu_var(vector_irq)[vector] = -1; 2452 __get_cpu_var(vector_irq)[vector] = -1;
2455 cfg->move_cleanup_count--;
2456unlock: 2453unlock:
2457 spin_unlock(&desc->lock); 2454 raw_spin_unlock(&desc->lock);
2458 } 2455 }
2459 2456
2460 irq_exit(); 2457 irq_exit();
2461} 2458}
2462 2459
2463static void irq_complete_move(struct irq_desc **descp) 2460static void __irq_complete_move(struct irq_desc **descp, unsigned vector)
2464{ 2461{
2465 struct irq_desc *desc = *descp; 2462 struct irq_desc *desc = *descp;
2466 struct irq_cfg *cfg = desc->chip_data; 2463 struct irq_cfg *cfg = desc->chip_data;
2467 unsigned vector, me; 2464 unsigned me;
2468 2465
2469 if (likely(!cfg->move_in_progress)) 2466 if (likely(!cfg->move_in_progress))
2470 return; 2467 return;
2471 2468
2472 vector = ~get_irq_regs()->orig_ax;
2473 me = smp_processor_id(); 2469 me = smp_processor_id();
2474 2470
2475 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) 2471 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
2476 send_cleanup_vector(cfg); 2472 send_cleanup_vector(cfg);
2477} 2473}
2474
2475static void irq_complete_move(struct irq_desc **descp)
2476{
2477 __irq_complete_move(descp, ~get_irq_regs()->orig_ax);
2478}
2479
2480void irq_force_complete_move(int irq)
2481{
2482 struct irq_desc *desc = irq_to_desc(irq);
2483 struct irq_cfg *cfg = desc->chip_data;
2484
2485 __irq_complete_move(&desc, cfg->vector);
2486}
2478#else 2487#else
2479static inline void irq_complete_move(struct irq_desc **descp) {} 2488static inline void irq_complete_move(struct irq_desc **descp) {}
2480#endif 2489#endif
@@ -2490,6 +2499,59 @@ static void ack_apic_edge(unsigned int irq)
2490 2499
2491atomic_t irq_mis_count; 2500atomic_t irq_mis_count;
2492 2501
2502/*
2503 * IO-APIC versions below 0x20 don't support EOI register.
2504 * For the record, here is the information about various versions:
2505 * 0Xh 82489DX
2506 * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant
2507 * 2Xh I/O(x)APIC which is PCI 2.2 Compliant
2508 * 30h-FFh Reserved
2509 *
2510 * Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic
2511 * version as 0x2. This is an error with documentation and these ICH chips
2512 * use io-apic's of version 0x20.
2513 *
2514 * For IO-APIC's with EOI register, we use that to do an explicit EOI.
2515 * Otherwise, we simulate the EOI message manually by changing the trigger
2516 * mode to edge and then back to level, with RTE being masked during this.
2517*/
2518static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2519{
2520 struct irq_pin_list *entry;
2521
2522 for_each_irq_pin(entry, cfg->irq_2_pin) {
2523 if (mp_ioapics[entry->apic].apicver >= 0x20) {
2524 /*
2525 * Intr-remapping uses pin number as the virtual vector
2526 * in the RTE. Actual vector is programmed in
2527 * intr-remapping table entry. Hence for the io-apic
2528 * EOI we use the pin number.
2529 */
2530 if (irq_remapped(irq))
2531 io_apic_eoi(entry->apic, entry->pin);
2532 else
2533 io_apic_eoi(entry->apic, cfg->vector);
2534 } else {
2535 __mask_and_edge_IO_APIC_irq(entry);
2536 __unmask_and_level_IO_APIC_irq(entry);
2537 }
2538 }
2539}
2540
2541static void eoi_ioapic_irq(struct irq_desc *desc)
2542{
2543 struct irq_cfg *cfg;
2544 unsigned long flags;
2545 unsigned int irq;
2546
2547 irq = desc->irq;
2548 cfg = desc->chip_data;
2549
2550 spin_lock_irqsave(&ioapic_lock, flags);
2551 __eoi_ioapic_irq(irq, cfg);
2552 spin_unlock_irqrestore(&ioapic_lock, flags);
2553}
2554
2493static void ack_apic_level(unsigned int irq) 2555static void ack_apic_level(unsigned int irq)
2494{ 2556{
2495 struct irq_desc *desc = irq_to_desc(irq); 2557 struct irq_desc *desc = irq_to_desc(irq);
@@ -2525,6 +2587,19 @@ static void ack_apic_level(unsigned int irq)
2525 * level-triggered interrupt. We mask the source for the time of the 2587 * level-triggered interrupt. We mask the source for the time of the
2526 * operation to prevent an edge-triggered interrupt escaping meanwhile. 2588 * operation to prevent an edge-triggered interrupt escaping meanwhile.
2527 * The idea is from Manfred Spraul. --macro 2589 * The idea is from Manfred Spraul. --macro
2590 *
2591 * Also in the case when cpu goes offline, fixup_irqs() will forward
2592 * any unhandled interrupt on the offlined cpu to the new cpu
2593 * destination that is handling the corresponding interrupt. This
2594 * interrupt forwarding is done via IPI's. Hence, in this case also
2595 * level-triggered io-apic interrupt will be seen as an edge
2596 * interrupt in the IRR. And we can't rely on the cpu's EOI
2597 * to be broadcasted to the IO-APIC's which will clear the remoteIRR
2598 * corresponding to the level-triggered interrupt. Hence on IO-APIC's
2599 * supporting EOI register, we do an explicit EOI to clear the
2600 * remote IRR and on IO-APIC's which don't have an EOI register,
2601 * we use the above logic (mask+edge followed by unmask+level) from
2602 * Manfred Spraul to clear the remote IRR.
2528 */ 2603 */
2529 cfg = desc->chip_data; 2604 cfg = desc->chip_data;
2530 i = cfg->vector; 2605 i = cfg->vector;
@@ -2536,6 +2611,19 @@ static void ack_apic_level(unsigned int irq)
2536 */ 2611 */
2537 ack_APIC_irq(); 2612 ack_APIC_irq();
2538 2613
2614 /*
2615 * Tail end of clearing remote IRR bit (either by delivering the EOI
2616 * message via io-apic EOI register write or simulating it using
2617 * mask+edge followed by unnask+level logic) manually when the
2618 * level triggered interrupt is seen as the edge triggered interrupt
2619 * at the cpu.
2620 */
2621 if (!(v & (1 << (i & 0x1f)))) {
2622 atomic_inc(&irq_mis_count);
2623
2624 eoi_ioapic_irq(desc);
2625 }
2626
2539 /* Now we can move and renable the irq */ 2627 /* Now we can move and renable the irq */
2540 if (unlikely(do_unmask_irq)) { 2628 if (unlikely(do_unmask_irq)) {
2541 /* Only migrate the irq if the ack has been received. 2629 /* Only migrate the irq if the ack has been received.
@@ -2569,41 +2657,9 @@ static void ack_apic_level(unsigned int irq)
2569 move_masked_irq(irq); 2657 move_masked_irq(irq);
2570 unmask_IO_APIC_irq_desc(desc); 2658 unmask_IO_APIC_irq_desc(desc);
2571 } 2659 }
2572
2573 /* Tail end of version 0x11 I/O APIC bug workaround */
2574 if (!(v & (1 << (i & 0x1f)))) {
2575 atomic_inc(&irq_mis_count);
2576 spin_lock(&ioapic_lock);
2577 __mask_and_edge_IO_APIC_irq(cfg);
2578 __unmask_and_level_IO_APIC_irq(cfg);
2579 spin_unlock(&ioapic_lock);
2580 }
2581} 2660}
2582 2661
2583#ifdef CONFIG_INTR_REMAP 2662#ifdef CONFIG_INTR_REMAP
2584static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2585{
2586 struct irq_pin_list *entry;
2587
2588 for_each_irq_pin(entry, cfg->irq_2_pin)
2589 io_apic_eoi(entry->apic, entry->pin);
2590}
2591
2592static void
2593eoi_ioapic_irq(struct irq_desc *desc)
2594{
2595 struct irq_cfg *cfg;
2596 unsigned long flags;
2597 unsigned int irq;
2598
2599 irq = desc->irq;
2600 cfg = desc->chip_data;
2601
2602 spin_lock_irqsave(&ioapic_lock, flags);
2603 __eoi_ioapic_irq(irq, cfg);
2604 spin_unlock_irqrestore(&ioapic_lock, flags);
2605}
2606
2607static void ir_ack_apic_edge(unsigned int irq) 2663static void ir_ack_apic_edge(unsigned int irq)
2608{ 2664{
2609 ack_APIC_irq(); 2665 ack_APIC_irq();
@@ -3157,6 +3213,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
3157 continue; 3213 continue;
3158 3214
3159 desc_new = move_irq_desc(desc_new, node); 3215 desc_new = move_irq_desc(desc_new, node);
3216 cfg_new = desc_new->chip_data;
3160 3217
3161 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) 3218 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
3162 irq = new; 3219 irq = new;
@@ -3211,7 +3268,8 @@ void destroy_irq(unsigned int irq)
3211 * MSI message composition 3268 * MSI message composition
3212 */ 3269 */
3213#ifdef CONFIG_PCI_MSI 3270#ifdef CONFIG_PCI_MSI
3214static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) 3271static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3272 struct msi_msg *msg, u8 hpet_id)
3215{ 3273{
3216 struct irq_cfg *cfg; 3274 struct irq_cfg *cfg;
3217 int err; 3275 int err;
@@ -3245,7 +3303,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3245 irte.dest_id = IRTE_DEST(dest); 3303 irte.dest_id = IRTE_DEST(dest);
3246 3304
3247 /* Set source-id of interrupt request */ 3305 /* Set source-id of interrupt request */
3248 set_msi_sid(&irte, pdev); 3306 if (pdev)
3307 set_msi_sid(&irte, pdev);
3308 else
3309 set_hpet_sid(&irte, hpet_id);
3249 3310
3250 modify_irte(irq, &irte); 3311 modify_irte(irq, &irte);
3251 3312
@@ -3291,8 +3352,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3291 struct msi_msg msg; 3352 struct msi_msg msg;
3292 unsigned int dest; 3353 unsigned int dest;
3293 3354
3294 dest = set_desc_affinity(desc, mask); 3355 if (set_desc_affinity(desc, mask, &dest))
3295 if (dest == BAD_APICID)
3296 return -1; 3356 return -1;
3297 3357
3298 cfg = desc->chip_data; 3358 cfg = desc->chip_data;
@@ -3324,8 +3384,7 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3324 if (get_irte(irq, &irte)) 3384 if (get_irte(irq, &irte))
3325 return -1; 3385 return -1;
3326 3386
3327 dest = set_desc_affinity(desc, mask); 3387 if (set_desc_affinity(desc, mask, &dest))
3328 if (dest == BAD_APICID)
3329 return -1; 3388 return -1;
3330 3389
3331 irte.vector = cfg->vector; 3390 irte.vector = cfg->vector;
@@ -3410,7 +3469,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3410 int ret; 3469 int ret;
3411 struct msi_msg msg; 3470 struct msi_msg msg;
3412 3471
3413 ret = msi_compose_msg(dev, irq, &msg); 3472 ret = msi_compose_msg(dev, irq, &msg, -1);
3414 if (ret < 0) 3473 if (ret < 0)
3415 return ret; 3474 return ret;
3416 3475
@@ -3507,8 +3566,7 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3507 struct msi_msg msg; 3566 struct msi_msg msg;
3508 unsigned int dest; 3567 unsigned int dest;
3509 3568
3510 dest = set_desc_affinity(desc, mask); 3569 if (set_desc_affinity(desc, mask, &dest))
3511 if (dest == BAD_APICID)
3512 return -1; 3570 return -1;
3513 3571
3514 cfg = desc->chip_data; 3572 cfg = desc->chip_data;
@@ -3543,7 +3601,7 @@ int arch_setup_dmar_msi(unsigned int irq)
3543 int ret; 3601 int ret;
3544 struct msi_msg msg; 3602 struct msi_msg msg;
3545 3603
3546 ret = msi_compose_msg(NULL, irq, &msg); 3604 ret = msi_compose_msg(NULL, irq, &msg, -1);
3547 if (ret < 0) 3605 if (ret < 0)
3548 return ret; 3606 return ret;
3549 dmar_msi_write(irq, &msg); 3607 dmar_msi_write(irq, &msg);
@@ -3563,8 +3621,7 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3563 struct msi_msg msg; 3621 struct msi_msg msg;
3564 unsigned int dest; 3622 unsigned int dest;
3565 3623
3566 dest = set_desc_affinity(desc, mask); 3624 if (set_desc_affinity(desc, mask, &dest))
3567 if (dest == BAD_APICID)
3568 return -1; 3625 return -1;
3569 3626
3570 cfg = desc->chip_data; 3627 cfg = desc->chip_data;
@@ -3583,6 +3640,19 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3583 3640
3584#endif /* CONFIG_SMP */ 3641#endif /* CONFIG_SMP */
3585 3642
3643static struct irq_chip ir_hpet_msi_type = {
3644 .name = "IR-HPET_MSI",
3645 .unmask = hpet_msi_unmask,
3646 .mask = hpet_msi_mask,
3647#ifdef CONFIG_INTR_REMAP
3648 .ack = ir_ack_apic_edge,
3649#ifdef CONFIG_SMP
3650 .set_affinity = ir_set_msi_irq_affinity,
3651#endif
3652#endif
3653 .retrigger = ioapic_retrigger_irq,
3654};
3655
3586static struct irq_chip hpet_msi_type = { 3656static struct irq_chip hpet_msi_type = {
3587 .name = "HPET_MSI", 3657 .name = "HPET_MSI",
3588 .unmask = hpet_msi_unmask, 3658 .unmask = hpet_msi_unmask,
@@ -3594,20 +3664,36 @@ static struct irq_chip hpet_msi_type = {
3594 .retrigger = ioapic_retrigger_irq, 3664 .retrigger = ioapic_retrigger_irq,
3595}; 3665};
3596 3666
3597int arch_setup_hpet_msi(unsigned int irq) 3667int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
3598{ 3668{
3599 int ret; 3669 int ret;
3600 struct msi_msg msg; 3670 struct msi_msg msg;
3601 struct irq_desc *desc = irq_to_desc(irq); 3671 struct irq_desc *desc = irq_to_desc(irq);
3602 3672
3603 ret = msi_compose_msg(NULL, irq, &msg); 3673 if (intr_remapping_enabled) {
3674 struct intel_iommu *iommu = map_hpet_to_ir(id);
3675 int index;
3676
3677 if (!iommu)
3678 return -1;
3679
3680 index = alloc_irte(iommu, irq, 1);
3681 if (index < 0)
3682 return -1;
3683 }
3684
3685 ret = msi_compose_msg(NULL, irq, &msg, id);
3604 if (ret < 0) 3686 if (ret < 0)
3605 return ret; 3687 return ret;
3606 3688
3607 hpet_msi_write(irq, &msg); 3689 hpet_msi_write(irq, &msg);
3608 desc->status |= IRQ_MOVE_PCNTXT; 3690 desc->status |= IRQ_MOVE_PCNTXT;
3609 set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq, 3691 if (irq_remapped(irq))
3610 "edge"); 3692 set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type,
3693 handle_edge_irq, "edge");
3694 else
3695 set_irq_chip_and_handler_name(irq, &hpet_msi_type,
3696 handle_edge_irq, "edge");
3611 3697
3612 return 0; 3698 return 0;
3613} 3699}
@@ -3641,8 +3727,7 @@ static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
3641 struct irq_cfg *cfg; 3727 struct irq_cfg *cfg;
3642 unsigned int dest; 3728 unsigned int dest;
3643 3729
3644 dest = set_desc_affinity(desc, mask); 3730 if (set_desc_affinity(desc, mask, &dest))
3645 if (dest == BAD_APICID)
3646 return -1; 3731 return -1;
3647 3732
3648 cfg = desc->chip_data; 3733 cfg = desc->chip_data;
@@ -3708,75 +3793,6 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3708} 3793}
3709#endif /* CONFIG_HT_IRQ */ 3794#endif /* CONFIG_HT_IRQ */
3710 3795
3711#ifdef CONFIG_X86_UV
3712/*
3713 * Re-target the irq to the specified CPU and enable the specified MMR located
3714 * on the specified blade to allow the sending of MSIs to the specified CPU.
3715 */
3716int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3717 unsigned long mmr_offset)
3718{
3719 const struct cpumask *eligible_cpu = cpumask_of(cpu);
3720 struct irq_cfg *cfg;
3721 int mmr_pnode;
3722 unsigned long mmr_value;
3723 struct uv_IO_APIC_route_entry *entry;
3724 unsigned long flags;
3725 int err;
3726
3727 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3728
3729 cfg = irq_cfg(irq);
3730
3731 err = assign_irq_vector(irq, cfg, eligible_cpu);
3732 if (err != 0)
3733 return err;
3734
3735 spin_lock_irqsave(&vector_lock, flags);
3736 set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
3737 irq_name);
3738 spin_unlock_irqrestore(&vector_lock, flags);
3739
3740 mmr_value = 0;
3741 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3742 entry->vector = cfg->vector;
3743 entry->delivery_mode = apic->irq_delivery_mode;
3744 entry->dest_mode = apic->irq_dest_mode;
3745 entry->polarity = 0;
3746 entry->trigger = 0;
3747 entry->mask = 0;
3748 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
3749
3750 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3751 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
3752
3753 if (cfg->move_in_progress)
3754 send_cleanup_vector(cfg);
3755
3756 return irq;
3757}
3758
3759/*
3760 * Disable the specified MMR located on the specified blade so that MSIs are
3761 * longer allowed to be sent.
3762 */
3763void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset)
3764{
3765 unsigned long mmr_value;
3766 struct uv_IO_APIC_route_entry *entry;
3767 int mmr_pnode;
3768
3769 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3770
3771 mmr_value = 0;
3772 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3773 entry->mask = 1;
3774
3775 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3776 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
3777}
3778#endif /* CONFIG_X86_64 */
3779
3780int __init io_apic_get_redir_entries (int ioapic) 3796int __init io_apic_get_redir_entries (int ioapic)
3781{ 3797{
3782 union IO_APIC_reg_01 reg_01; 3798 union IO_APIC_reg_01 reg_01;
@@ -3944,7 +3960,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3944 */ 3960 */
3945 3961
3946 if (physids_empty(apic_id_map)) 3962 if (physids_empty(apic_id_map))
3947 apic_id_map = apic->ioapic_phys_id_map(phys_cpu_present_map); 3963 apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map);
3948 3964
3949 spin_lock_irqsave(&ioapic_lock, flags); 3965 spin_lock_irqsave(&ioapic_lock, flags);
3950 reg_00.raw = io_apic_read(ioapic, 0); 3966 reg_00.raw = io_apic_read(ioapic, 0);
@@ -3960,10 +3976,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3960 * Every APIC in a system must have a unique ID or we get lots of nice 3976 * Every APIC in a system must have a unique ID or we get lots of nice
3961 * 'stuck on smp_invalidate_needed IPI wait' messages. 3977 * 'stuck on smp_invalidate_needed IPI wait' messages.
3962 */ 3978 */
3963 if (apic->check_apicid_used(apic_id_map, apic_id)) { 3979 if (apic->check_apicid_used(&apic_id_map, apic_id)) {
3964 3980
3965 for (i = 0; i < get_physical_broadcast(); i++) { 3981 for (i = 0; i < get_physical_broadcast(); i++) {
3966 if (!apic->check_apicid_used(apic_id_map, i)) 3982 if (!apic->check_apicid_used(&apic_id_map, i))
3967 break; 3983 break;
3968 } 3984 }
3969 3985
@@ -3976,7 +3992,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3976 apic_id = i; 3992 apic_id = i;
3977 } 3993 }
3978 3994
3979 tmp = apic->apicid_to_cpu_present(apic_id); 3995 apic->apicid_to_cpu_present(apic_id, &tmp);
3980 physids_or(apic_id_map, apic_id_map, tmp); 3996 physids_or(apic_id_map, apic_id_map, tmp);
3981 3997
3982 if (reg_00.bits.ID != apic_id) { 3998 if (reg_00.bits.ID != apic_id) {
@@ -4106,7 +4122,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics)
4106 for (i = 0; i < nr_ioapics; i++) { 4122 for (i = 0; i < nr_ioapics; i++) {
4107 res[i].name = mem; 4123 res[i].name = mem;
4108 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; 4124 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
4109 sprintf(mem, "IOAPIC %u", i); 4125 snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i);
4110 mem += IOAPIC_RESOURCE_NAME_SIZE; 4126 mem += IOAPIC_RESOURCE_NAME_SIZE;
4111 } 4127 }
4112 4128
@@ -4140,18 +4156,17 @@ void __init ioapic_init_mappings(void)
4140#ifdef CONFIG_X86_32 4156#ifdef CONFIG_X86_32
4141fake_ioapic_page: 4157fake_ioapic_page:
4142#endif 4158#endif
4143 ioapic_phys = (unsigned long) 4159 ioapic_phys = (unsigned long)alloc_bootmem_pages(PAGE_SIZE);
4144 alloc_bootmem_pages(PAGE_SIZE);
4145 ioapic_phys = __pa(ioapic_phys); 4160 ioapic_phys = __pa(ioapic_phys);
4146 } 4161 }
4147 set_fixmap_nocache(idx, ioapic_phys); 4162 set_fixmap_nocache(idx, ioapic_phys);
4148 apic_printk(APIC_VERBOSE, 4163 apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n",
4149 "mapped IOAPIC to %08lx (%08lx)\n", 4164 __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK),
4150 __fix_to_virt(idx), ioapic_phys); 4165 ioapic_phys);
4151 idx++; 4166 idx++;
4152 4167
4153 ioapic_res->start = ioapic_phys; 4168 ioapic_res->start = ioapic_phys;
4154 ioapic_res->end = ioapic_phys + (4 * 1024) - 1; 4169 ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1;
4155 ioapic_res++; 4170 ioapic_res++;
4156 } 4171 }
4157} 4172}
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 45404379d173..4ada42c3dabb 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -39,7 +39,8 @@
39int unknown_nmi_panic; 39int unknown_nmi_panic;
40int nmi_watchdog_enabled; 40int nmi_watchdog_enabled;
41 41
42static cpumask_t backtrace_mask __read_mostly; 42/* For reliability, we're prepared to waste bits here. */
43static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
43 44
44/* nmi_active: 45/* nmi_active:
45 * >0: the lapic NMI watchdog is active, but can be disabled 46 * >0: the lapic NMI watchdog is active, but can be disabled
@@ -414,7 +415,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
414 } 415 }
415 416
416 /* We can be called before check_nmi_watchdog, hence NULL check. */ 417 /* We can be called before check_nmi_watchdog, hence NULL check. */
417 if (cpumask_test_cpu(cpu, &backtrace_mask)) { 418 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
418 static DEFINE_SPINLOCK(lock); /* Serialise the printks */ 419 static DEFINE_SPINLOCK(lock); /* Serialise the printks */
419 420
420 spin_lock(&lock); 421 spin_lock(&lock);
@@ -422,7 +423,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
422 show_regs(regs); 423 show_regs(regs);
423 dump_stack(); 424 dump_stack();
424 spin_unlock(&lock); 425 spin_unlock(&lock);
425 cpumask_clear_cpu(cpu, &backtrace_mask); 426 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
426 427
427 rc = 1; 428 rc = 1;
428 } 429 }
@@ -558,14 +559,14 @@ void arch_trigger_all_cpu_backtrace(void)
558{ 559{
559 int i; 560 int i;
560 561
561 cpumask_copy(&backtrace_mask, cpu_online_mask); 562 cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
562 563
563 printk(KERN_INFO "sending NMI to all CPUs:\n"); 564 printk(KERN_INFO "sending NMI to all CPUs:\n");
564 apic->send_IPI_all(NMI_VECTOR); 565 apic->send_IPI_all(NMI_VECTOR);
565 566
566 /* Wait for up to 10 seconds for all CPUs to do the backtrace */ 567 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
567 for (i = 0; i < 10 * 1000; i++) { 568 for (i = 0; i < 10 * 1000; i++) {
568 if (cpumask_empty(&backtrace_mask)) 569 if (cpumask_empty(to_cpumask(backtrace_mask)))
569 break; 570 break;
570 mdelay(1); 571 mdelay(1);
571 } 572 }
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index efa00e2b8505..98c4665f251c 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -264,11 +264,6 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc)
264static __init void early_check_numaq(void) 264static __init void early_check_numaq(void)
265{ 265{
266 /* 266 /*
267 * Find possible boot-time SMP configuration:
268 */
269 early_find_smp_config();
270
271 /*
272 * get boot-time SMP configuration: 267 * get boot-time SMP configuration:
273 */ 268 */
274 if (smp_found_config) 269 if (smp_found_config)
@@ -334,10 +329,9 @@ static inline const struct cpumask *numaq_target_cpus(void)
334 return cpu_all_mask; 329 return cpu_all_mask;
335} 330}
336 331
337static inline unsigned long 332static unsigned long numaq_check_apicid_used(physid_mask_t *map, int apicid)
338numaq_check_apicid_used(physid_mask_t bitmap, int apicid)
339{ 333{
340 return physid_isset(apicid, bitmap); 334 return physid_isset(apicid, *map);
341} 335}
342 336
343static inline unsigned long numaq_check_apicid_present(int bit) 337static inline unsigned long numaq_check_apicid_present(int bit)
@@ -371,10 +365,10 @@ static inline int numaq_multi_timer_check(int apic, int irq)
371 return apic != 0 && irq == 0; 365 return apic != 0 && irq == 0;
372} 366}
373 367
374static inline physid_mask_t numaq_ioapic_phys_id_map(physid_mask_t phys_map) 368static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
375{ 369{
376 /* We don't have a good way to do this yet - hack */ 370 /* We don't have a good way to do this yet - hack */
377 return physids_promote(0xFUL); 371 return physids_promote(0xFUL, retmap);
378} 372}
379 373
380static inline int numaq_cpu_to_logical_apicid(int cpu) 374static inline int numaq_cpu_to_logical_apicid(int cpu)
@@ -402,12 +396,12 @@ static inline int numaq_apicid_to_node(int logical_apicid)
402 return logical_apicid >> 4; 396 return logical_apicid >> 4;
403} 397}
404 398
405static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid) 399static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap)
406{ 400{
407 int node = numaq_apicid_to_node(logical_apicid); 401 int node = numaq_apicid_to_node(logical_apicid);
408 int cpu = __ffs(logical_apicid & 0xf); 402 int cpu = __ffs(logical_apicid & 0xf);
409 403
410 return physid_mask_of_physid(cpu + 4*node); 404 physid_set_mask_of_physid(cpu + 4*node, retmap);
411} 405}
412 406
413/* Where the IO area was mapped on multiquad, always 0 otherwise */ 407/* Where the IO area was mapped on multiquad, always 0 otherwise */
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 0c0182cc947d..1a6559f6768c 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -108,7 +108,7 @@ struct apic apic_default = {
108 .apicid_to_node = default_apicid_to_node, 108 .apicid_to_node = default_apicid_to_node,
109 .cpu_to_logical_apicid = default_cpu_to_logical_apicid, 109 .cpu_to_logical_apicid = default_cpu_to_logical_apicid,
110 .cpu_present_to_apicid = default_cpu_present_to_apicid, 110 .cpu_present_to_apicid = default_cpu_present_to_apicid,
111 .apicid_to_cpu_present = default_apicid_to_cpu_present, 111 .apicid_to_cpu_present = physid_set_mask_of_physid,
112 .setup_portio_remap = NULL, 112 .setup_portio_remap = NULL,
113 .check_phys_apicid_present = default_check_phys_apicid_present, 113 .check_phys_apicid_present = default_check_phys_apicid_present,
114 .enable_apic_mode = NULL, 114 .enable_apic_mode = NULL,
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 645ecc4ff0be..9b419263d90d 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -183,7 +183,7 @@ static const struct cpumask *summit_target_cpus(void)
183 return cpumask_of(0); 183 return cpumask_of(0);
184} 184}
185 185
186static unsigned long summit_check_apicid_used(physid_mask_t bitmap, int apicid) 186static unsigned long summit_check_apicid_used(physid_mask_t *map, int apicid)
187{ 187{
188 return 0; 188 return 0;
189} 189}
@@ -261,15 +261,15 @@ static int summit_cpu_present_to_apicid(int mps_cpu)
261 return BAD_APICID; 261 return BAD_APICID;
262} 262}
263 263
264static physid_mask_t summit_ioapic_phys_id_map(physid_mask_t phys_id_map) 264static void summit_ioapic_phys_id_map(physid_mask_t *phys_id_map, physid_mask_t *retmap)
265{ 265{
266 /* For clustered we don't have a good way to do this yet - hack */ 266 /* For clustered we don't have a good way to do this yet - hack */
267 return physids_promote(0x0F); 267 physids_promote(0x0FL, retmap);
268} 268}
269 269
270static physid_mask_t summit_apicid_to_cpu_present(int apicid) 270static void summit_apicid_to_cpu_present(int apicid, physid_mask_t *retmap)
271{ 271{
272 return physid_mask_of_physid(0); 272 physid_set_mask_of_physid(0, retmap);
273} 273}
274 274
275static int summit_check_phys_apicid_present(int physical_apicid) 275static int summit_check_phys_apicid_present(int physical_apicid)
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index a5371ec36776..cf69c59f4910 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -148,10 +148,7 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
148 break; 148 break;
149 } 149 }
150 150
151 if (cpu < nr_cpu_ids) 151 return per_cpu(x86_cpu_to_logical_apicid, cpu);
152 return per_cpu(x86_cpu_to_logical_apicid, cpu);
153
154 return BAD_APICID;
155} 152}
156 153
157static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x) 154static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x)
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index a8989aadc99a..8972f38c5ced 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -146,10 +146,7 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
146 break; 146 break;
147 } 147 }
148 148
149 if (cpu < nr_cpu_ids) 149 return per_cpu(x86_cpu_to_apicid, cpu);
150 return per_cpu(x86_cpu_to_apicid, cpu);
151
152 return BAD_APICID;
153} 150}
154 151
155static unsigned int x2apic_phys_get_apic_id(unsigned long x) 152static unsigned int x2apic_phys_get_apic_id(unsigned long x)
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index f5f5886a6b53..5f92494dab61 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -30,10 +30,22 @@
30#include <asm/apic.h> 30#include <asm/apic.h>
31#include <asm/ipi.h> 31#include <asm/ipi.h>
32#include <asm/smp.h> 32#include <asm/smp.h>
33#include <asm/x86_init.h>
33 34
34DEFINE_PER_CPU(int, x2apic_extra_bits); 35DEFINE_PER_CPU(int, x2apic_extra_bits);
35 36
36static enum uv_system_type uv_system_type; 37static enum uv_system_type uv_system_type;
38static u64 gru_start_paddr, gru_end_paddr;
39
40static inline bool is_GRU_range(u64 start, u64 end)
41{
42 return start >= gru_start_paddr && end <= gru_end_paddr;
43}
44
45static bool uv_is_untracked_pat_range(u64 start, u64 end)
46{
47 return is_ISA_range(start, end) || is_GRU_range(start, end);
48}
37 49
38static int early_get_nodeid(void) 50static int early_get_nodeid(void)
39{ 51{
@@ -49,6 +61,7 @@ static int early_get_nodeid(void)
49static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 61static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
50{ 62{
51 if (!strcmp(oem_id, "SGI")) { 63 if (!strcmp(oem_id, "SGI")) {
64 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
52 if (!strcmp(oem_table_id, "UVL")) 65 if (!strcmp(oem_table_id, "UVL"))
53 uv_system_type = UV_LEGACY_APIC; 66 uv_system_type = UV_LEGACY_APIC;
54 else if (!strcmp(oem_table_id, "UVX")) 67 else if (!strcmp(oem_table_id, "UVX"))
@@ -212,10 +225,7 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
212 if (cpumask_test_cpu(cpu, cpu_online_mask)) 225 if (cpumask_test_cpu(cpu, cpu_online_mask))
213 break; 226 break;
214 } 227 }
215 if (cpu < nr_cpu_ids) 228 return per_cpu(x86_cpu_to_apicid, cpu);
216 return per_cpu(x86_cpu_to_apicid, cpu);
217
218 return BAD_APICID;
219} 229}
220 230
221static unsigned int x2apic_get_apic_id(unsigned long x) 231static unsigned int x2apic_get_apic_id(unsigned long x)
@@ -352,14 +362,14 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
352 362
353 for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) { 363 for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) {
354 alias.v = uv_read_local_mmr(redir_addrs[i].alias); 364 alias.v = uv_read_local_mmr(redir_addrs[i].alias);
355 if (alias.s.base == 0) { 365 if (alias.s.enable && alias.s.base == 0) {
356 *size = (1UL << alias.s.m_alias); 366 *size = (1UL << alias.s.m_alias);
357 redirect.v = uv_read_local_mmr(redir_addrs[i].redirect); 367 redirect.v = uv_read_local_mmr(redir_addrs[i].redirect);
358 *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT; 368 *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT;
359 return; 369 return;
360 } 370 }
361 } 371 }
362 BUG(); 372 *base = *size = 0;
363} 373}
364 374
365enum map_type {map_wb, map_uc}; 375enum map_type {map_wb, map_uc};
@@ -385,8 +395,12 @@ static __init void map_gru_high(int max_pnode)
385 int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; 395 int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT;
386 396
387 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); 397 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
388 if (gru.s.enable) 398 if (gru.s.enable) {
389 map_high("GRU", gru.s.base, shift, max_pnode, map_wb); 399 map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
400 gru_start_paddr = ((u64)gru.s.base << shift);
401 gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1);
402
403 }
390} 404}
391 405
392static __init void map_mmr_high(int max_pnode) 406static __init void map_mmr_high(int max_pnode)
@@ -409,6 +423,12 @@ static __init void map_mmioh_high(int max_pnode)
409 map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc); 423 map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc);
410} 424}
411 425
426static __init void map_low_mmrs(void)
427{
428 init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);
429 init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);
430}
431
412static __init void uv_rtc_init(void) 432static __init void uv_rtc_init(void)
413{ 433{
414 long status; 434 long status;
@@ -550,6 +570,8 @@ void __init uv_system_init(void)
550 unsigned long mmr_base, present, paddr; 570 unsigned long mmr_base, present, paddr;
551 unsigned short pnode_mask; 571 unsigned short pnode_mask;
552 572
573 map_low_mmrs();
574
553 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); 575 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
554 m_val = m_n_config.s.m_skt; 576 m_val = m_n_config.s.m_skt;
555 n_val = m_n_config.s.n_skt; 577 n_val = m_n_config.s.n_skt;
@@ -607,8 +629,10 @@ void __init uv_system_init(void)
607 uv_rtc_init(); 629 uv_rtc_init();
608 630
609 for_each_present_cpu(cpu) { 631 for_each_present_cpu(cpu) {
632 int apicid = per_cpu(x86_cpu_to_apicid, cpu);
633
610 nid = cpu_to_node(cpu); 634 nid = cpu_to_node(cpu);
611 pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); 635 pnode = uv_apicid_to_pnode(apicid);
612 blade = boot_pnode_to_blade(pnode); 636 blade = boot_pnode_to_blade(pnode);
613 lcpu = uv_blade_info[blade].nr_possible_cpus; 637 lcpu = uv_blade_info[blade].nr_possible_cpus;
614 uv_blade_info[blade].nr_possible_cpus++; 638 uv_blade_info[blade].nr_possible_cpus++;
@@ -619,25 +643,23 @@ void __init uv_system_init(void)
619 uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; 643 uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
620 uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; 644 uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size;
621 uv_cpu_hub_info(cpu)->m_val = m_val; 645 uv_cpu_hub_info(cpu)->m_val = m_val;
622 uv_cpu_hub_info(cpu)->n_val = m_val; 646 uv_cpu_hub_info(cpu)->n_val = n_val;
623 uv_cpu_hub_info(cpu)->numa_blade_id = blade; 647 uv_cpu_hub_info(cpu)->numa_blade_id = blade;
624 uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; 648 uv_cpu_hub_info(cpu)->blade_processor_id = lcpu;
625 uv_cpu_hub_info(cpu)->pnode = pnode; 649 uv_cpu_hub_info(cpu)->pnode = pnode;
626 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; 650 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
627 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; 651 uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1;
628 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; 652 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
629 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; 653 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
630 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; 654 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
631 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; 655 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
632 uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; 656 uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid);
633 uv_node_to_blade[nid] = blade; 657 uv_node_to_blade[nid] = blade;
634 uv_cpu_to_blade[cpu] = blade; 658 uv_cpu_to_blade[cpu] = blade;
635 max_pnode = max(pnode, max_pnode); 659 max_pnode = max(pnode, max_pnode);
636 660
637 printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, " 661 printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, lcpu %d, blade %d\n",
638 "lcpu %d, blade %d\n", 662 cpu, apicid, pnode, nid, lcpu, blade);
639 cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid,
640 lcpu, blade);
641 } 663 }
642 664
643 /* Add blade/pnode info for nodes without cpus */ 665 /* Add blade/pnode info for nodes without cpus */
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 151ace69a5aa..b5b6b23bce53 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -204,7 +204,6 @@
204#include <linux/module.h> 204#include <linux/module.h>
205 205
206#include <linux/poll.h> 206#include <linux/poll.h>
207#include <linux/smp_lock.h>
208#include <linux/types.h> 207#include <linux/types.h>
209#include <linux/stddef.h> 208#include <linux/stddef.h>
210#include <linux/timer.h> 209#include <linux/timer.h>
@@ -403,6 +402,7 @@ static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
403static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); 402static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
404static struct apm_user *user_list; 403static struct apm_user *user_list;
405static DEFINE_SPINLOCK(user_list_lock); 404static DEFINE_SPINLOCK(user_list_lock);
405static DEFINE_MUTEX(apm_mutex);
406 406
407/* 407/*
408 * Set up a segment that references the real mode segment 0x40 408 * Set up a segment that references the real mode segment 0x40
@@ -1531,7 +1531,7 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
1531 return -EPERM; 1531 return -EPERM;
1532 switch (cmd) { 1532 switch (cmd) {
1533 case APM_IOC_STANDBY: 1533 case APM_IOC_STANDBY:
1534 lock_kernel(); 1534 mutex_lock(&apm_mutex);
1535 if (as->standbys_read > 0) { 1535 if (as->standbys_read > 0) {
1536 as->standbys_read--; 1536 as->standbys_read--;
1537 as->standbys_pending--; 1537 as->standbys_pending--;
@@ -1540,10 +1540,10 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
1540 queue_event(APM_USER_STANDBY, as); 1540 queue_event(APM_USER_STANDBY, as);
1541 if (standbys_pending <= 0) 1541 if (standbys_pending <= 0)
1542 standby(); 1542 standby();
1543 unlock_kernel(); 1543 mutex_unlock(&apm_mutex);
1544 break; 1544 break;
1545 case APM_IOC_SUSPEND: 1545 case APM_IOC_SUSPEND:
1546 lock_kernel(); 1546 mutex_lock(&apm_mutex);
1547 if (as->suspends_read > 0) { 1547 if (as->suspends_read > 0) {
1548 as->suspends_read--; 1548 as->suspends_read--;
1549 as->suspends_pending--; 1549 as->suspends_pending--;
@@ -1552,13 +1552,14 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
1552 queue_event(APM_USER_SUSPEND, as); 1552 queue_event(APM_USER_SUSPEND, as);
1553 if (suspends_pending <= 0) { 1553 if (suspends_pending <= 0) {
1554 ret = suspend(1); 1554 ret = suspend(1);
1555 mutex_unlock(&apm_mutex);
1555 } else { 1556 } else {
1556 as->suspend_wait = 1; 1557 as->suspend_wait = 1;
1558 mutex_unlock(&apm_mutex);
1557 wait_event_interruptible(apm_suspend_waitqueue, 1559 wait_event_interruptible(apm_suspend_waitqueue,
1558 as->suspend_wait == 0); 1560 as->suspend_wait == 0);
1559 ret = as->suspend_result; 1561 ret = as->suspend_result;
1560 } 1562 }
1561 unlock_kernel();
1562 return ret; 1563 return ret;
1563 default: 1564 default:
1564 return -ENOTTY; 1565 return -ENOTTY;
@@ -1608,12 +1609,10 @@ static int do_open(struct inode *inode, struct file *filp)
1608{ 1609{
1609 struct apm_user *as; 1610 struct apm_user *as;
1610 1611
1611 lock_kernel();
1612 as = kmalloc(sizeof(*as), GFP_KERNEL); 1612 as = kmalloc(sizeof(*as), GFP_KERNEL);
1613 if (as == NULL) { 1613 if (as == NULL) {
1614 printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n", 1614 printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n",
1615 sizeof(*as)); 1615 sizeof(*as));
1616 unlock_kernel();
1617 return -ENOMEM; 1616 return -ENOMEM;
1618 } 1617 }
1619 as->magic = APM_BIOS_MAGIC; 1618 as->magic = APM_BIOS_MAGIC;
@@ -1635,7 +1634,6 @@ static int do_open(struct inode *inode, struct file *filp)
1635 user_list = as; 1634 user_list = as;
1636 spin_unlock(&user_list_lock); 1635 spin_unlock(&user_list_lock);
1637 filp->private_data = as; 1636 filp->private_data = as;
1638 unlock_kernel();
1639 return 0; 1637 return 0;
1640} 1638}
1641 1639
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index 63a88e1f987d..b0206a211b09 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -101,21 +101,17 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
101} 101}
102 102
103int 103int
104uv_bios_mq_watchlist_alloc(int blade, unsigned long addr, unsigned int mq_size, 104uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size,
105 unsigned long *intr_mmr_offset) 105 unsigned long *intr_mmr_offset)
106{ 106{
107 union uv_watchlist_u size_blade;
108 u64 watchlist; 107 u64 watchlist;
109 s64 ret; 108 s64 ret;
110 109
111 size_blade.size = mq_size;
112 size_blade.blade = blade;
113
114 /* 110 /*
115 * bios returns watchlist number or negative error number. 111 * bios returns watchlist number or negative error number.
116 */ 112 */
117 ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr, 113 ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr,
118 size_blade.val, (u64)intr_mmr_offset, 114 mq_size, (u64)intr_mmr_offset,
119 (u64)&watchlist, 0); 115 (u64)&watchlist, 0);
120 if (ret < BIOS_STATUS_SUCCESS) 116 if (ret < BIOS_STATUS_SUCCESS)
121 return ret; 117 return ret;
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 68537e957a9b..1d2cb383410e 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -5,6 +5,7 @@
5# Don't trace early stages of a secondary CPU boot 5# Don't trace early stages of a secondary CPU boot
6ifdef CONFIG_FUNCTION_TRACER 6ifdef CONFIG_FUNCTION_TRACER
7CFLAGS_REMOVE_common.o = -pg 7CFLAGS_REMOVE_common.o = -pg
8CFLAGS_REMOVE_perf_event.o = -pg
8endif 9endif
9 10
10# Make sure load_percpu_segment has no stackprotector 11# Make sure load_percpu_segment has no stackprotector
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index c965e5212714..468489b57aae 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -74,6 +74,7 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
74 unsigned int eax, ebx, ecx, edx, sub_index; 74 unsigned int eax, ebx, ecx, edx, sub_index;
75 unsigned int ht_mask_width, core_plus_mask_width; 75 unsigned int ht_mask_width, core_plus_mask_width;
76 unsigned int core_select_mask, core_level_siblings; 76 unsigned int core_select_mask, core_level_siblings;
77 static bool printed;
77 78
78 if (c->cpuid_level < 0xb) 79 if (c->cpuid_level < 0xb)
79 return; 80 return;
@@ -127,12 +128,14 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
127 128
128 c->x86_max_cores = (core_level_siblings / smp_num_siblings); 129 c->x86_max_cores = (core_level_siblings / smp_num_siblings);
129 130
130 131 if (!printed) {
131 printk(KERN_INFO "CPU: Physical Processor ID: %d\n", 132 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
132 c->phys_proc_id); 133 c->phys_proc_id);
133 if (c->x86_max_cores > 1) 134 if (c->x86_max_cores > 1)
134 printk(KERN_INFO "CPU: Processor Core ID: %d\n", 135 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
135 c->cpu_core_id); 136 c->cpu_core_id);
137 printed = 1;
138 }
136 return; 139 return;
137#endif 140#endif
138} 141}
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index c910a716a71c..e485825130d2 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -254,59 +254,36 @@ static int __cpuinit nearby_node(int apicid)
254 254
255/* 255/*
256 * Fixup core topology information for AMD multi-node processors. 256 * Fixup core topology information for AMD multi-node processors.
257 * Assumption 1: Number of cores in each internal node is the same. 257 * Assumption: Number of cores in each internal node is the same.
258 * Assumption 2: Mixed systems with both single-node and dual-node
259 * processors are not supported.
260 */ 258 */
261#ifdef CONFIG_X86_HT 259#ifdef CONFIG_X86_HT
262static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c) 260static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c)
263{ 261{
264#ifdef CONFIG_PCI 262 unsigned long long value;
265 u32 t, cpn; 263 u32 nodes, cores_per_node;
266 u8 n, n_id;
267 int cpu = smp_processor_id(); 264 int cpu = smp_processor_id();
268 265
266 if (!cpu_has(c, X86_FEATURE_NODEID_MSR))
267 return;
268
269 /* fixup topology information only once for a core */ 269 /* fixup topology information only once for a core */
270 if (cpu_has(c, X86_FEATURE_AMD_DCM)) 270 if (cpu_has(c, X86_FEATURE_AMD_DCM))
271 return; 271 return;
272 272
273 /* check for multi-node processor on boot cpu */ 273 rdmsrl(MSR_FAM10H_NODE_ID, value);
274 t = read_pci_config(0, 24, 3, 0xe8); 274
275 if (!(t & (1 << 29))) 275 nodes = ((value >> 3) & 7) + 1;
276 if (nodes == 1)
276 return; 277 return;
277 278
278 set_cpu_cap(c, X86_FEATURE_AMD_DCM); 279 set_cpu_cap(c, X86_FEATURE_AMD_DCM);
280 cores_per_node = c->x86_max_cores / nodes;
279 281
280 /* cores per node: each internal node has half the number of cores */ 282 /* store NodeID, use llc_shared_map to store sibling info */
281 cpn = c->x86_max_cores >> 1; 283 per_cpu(cpu_llc_id, cpu) = value & 7;
282
283 /* even-numbered NB_id of this dual-node processor */
284 n = c->phys_proc_id << 1;
285
286 /*
287 * determine internal node id and assign cores fifty-fifty to
288 * each node of the dual-node processor
289 */
290 t = read_pci_config(0, 24 + n, 3, 0xe8);
291 n = (t>>30) & 0x3;
292 if (n == 0) {
293 if (c->cpu_core_id < cpn)
294 n_id = 0;
295 else
296 n_id = 1;
297 } else {
298 if (c->cpu_core_id < cpn)
299 n_id = 1;
300 else
301 n_id = 0;
302 }
303
304 /* compute entire NodeID, use llc_shared_map to store sibling info */
305 per_cpu(cpu_llc_id, cpu) = (c->phys_proc_id << 1) + n_id;
306 284
307 /* fixup core id to be in range from 0 to cpn */ 285 /* fixup core id to be in range from 0 to (cores_per_node - 1) */
308 c->cpu_core_id = c->cpu_core_id % cpn; 286 c->cpu_core_id = c->cpu_core_id % cores_per_node;
309#endif
310} 287}
311#endif 288#endif
312 289
@@ -375,8 +352,6 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
375 node = nearby_node(apicid); 352 node = nearby_node(apicid);
376 } 353 }
377 numa_set_node(cpu, node); 354 numa_set_node(cpu, node);
378
379 printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
380#endif 355#endif
381} 356}
382 357
@@ -535,7 +510,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
535 } 510 }
536 } 511 }
537 512
538 display_cacheinfo(c); 513 cpu_detect_cache_sizes(c);
539 514
540 /* Multi core CPU? */ 515 /* Multi core CPU? */
541 if (c->extended_cpuid_level >= 0x80000008) { 516 if (c->extended_cpuid_level >= 0x80000008) {
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index c95e831bb095..e58d978e0758 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -294,7 +294,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
294 set_cpu_cap(c, X86_FEATURE_REP_GOOD); 294 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
295 } 295 }
296 296
297 display_cacheinfo(c); 297 cpu_detect_cache_sizes(c);
298} 298}
299 299
300enum { 300enum {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 3192f22f2fdd..4868e4a951ee 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -61,7 +61,7 @@ void __init setup_cpu_local_masks(void)
61static void __cpuinit default_init(struct cpuinfo_x86 *c) 61static void __cpuinit default_init(struct cpuinfo_x86 *c)
62{ 62{
63#ifdef CONFIG_X86_64 63#ifdef CONFIG_X86_64
64 display_cacheinfo(c); 64 cpu_detect_cache_sizes(c);
65#else 65#else
66 /* Not much we can do here... */ 66 /* Not much we can do here... */
67 /* Check if at least it has cpuid */ 67 /* Check if at least it has cpuid */
@@ -383,7 +383,7 @@ static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
383 } 383 }
384} 384}
385 385
386void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) 386void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
387{ 387{
388 unsigned int n, dummy, ebx, ecx, edx, l2size; 388 unsigned int n, dummy, ebx, ecx, edx, l2size;
389 389
@@ -391,8 +391,6 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
391 391
392 if (n >= 0x80000005) { 392 if (n >= 0x80000005) {
393 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); 393 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
394 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
395 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
396 c->x86_cache_size = (ecx>>24) + (edx>>24); 394 c->x86_cache_size = (ecx>>24) + (edx>>24);
397#ifdef CONFIG_X86_64 395#ifdef CONFIG_X86_64
398 /* On K8 L1 TLB is inclusive, so don't count it */ 396 /* On K8 L1 TLB is inclusive, so don't count it */
@@ -422,9 +420,6 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
422#endif 420#endif
423 421
424 c->x86_cache_size = l2size; 422 c->x86_cache_size = l2size;
425
426 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
427 l2size, ecx & 0xFF);
428} 423}
429 424
430void __cpuinit detect_ht(struct cpuinfo_x86 *c) 425void __cpuinit detect_ht(struct cpuinfo_x86 *c)
@@ -432,6 +427,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
432#ifdef CONFIG_X86_HT 427#ifdef CONFIG_X86_HT
433 u32 eax, ebx, ecx, edx; 428 u32 eax, ebx, ecx, edx;
434 int index_msb, core_bits; 429 int index_msb, core_bits;
430 static bool printed;
435 431
436 if (!cpu_has(c, X86_FEATURE_HT)) 432 if (!cpu_has(c, X86_FEATURE_HT))
437 return; 433 return;
@@ -447,7 +443,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
447 smp_num_siblings = (ebx & 0xff0000) >> 16; 443 smp_num_siblings = (ebx & 0xff0000) >> 16;
448 444
449 if (smp_num_siblings == 1) { 445 if (smp_num_siblings == 1) {
450 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); 446 printk_once(KERN_INFO "CPU0: Hyper-Threading is disabled\n");
451 goto out; 447 goto out;
452 } 448 }
453 449
@@ -474,11 +470,12 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
474 ((1 << core_bits) - 1); 470 ((1 << core_bits) - 1);
475 471
476out: 472out:
477 if ((c->x86_max_cores * smp_num_siblings) > 1) { 473 if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) {
478 printk(KERN_INFO "CPU: Physical Processor ID: %d\n", 474 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
479 c->phys_proc_id); 475 c->phys_proc_id);
480 printk(KERN_INFO "CPU: Processor Core ID: %d\n", 476 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
481 c->cpu_core_id); 477 c->cpu_core_id);
478 printed = 1;
482 } 479 }
483#endif 480#endif
484} 481}
@@ -659,24 +656,31 @@ void __init early_cpu_init(void)
659 const struct cpu_dev *const *cdev; 656 const struct cpu_dev *const *cdev;
660 int count = 0; 657 int count = 0;
661 658
659#ifdef PROCESSOR_SELECT
662 printk(KERN_INFO "KERNEL supported cpus:\n"); 660 printk(KERN_INFO "KERNEL supported cpus:\n");
661#endif
662
663 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { 663 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
664 const struct cpu_dev *cpudev = *cdev; 664 const struct cpu_dev *cpudev = *cdev;
665 unsigned int j;
666 665
667 if (count >= X86_VENDOR_NUM) 666 if (count >= X86_VENDOR_NUM)
668 break; 667 break;
669 cpu_devs[count] = cpudev; 668 cpu_devs[count] = cpudev;
670 count++; 669 count++;
671 670
672 for (j = 0; j < 2; j++) { 671#ifdef PROCESSOR_SELECT
673 if (!cpudev->c_ident[j]) 672 {
674 continue; 673 unsigned int j;
675 printk(KERN_INFO " %s %s\n", cpudev->c_vendor, 674
676 cpudev->c_ident[j]); 675 for (j = 0; j < 2; j++) {
676 if (!cpudev->c_ident[j])
677 continue;
678 printk(KERN_INFO " %s %s\n", cpudev->c_vendor,
679 cpudev->c_ident[j]);
680 }
677 } 681 }
682#endif
678 } 683 }
679
680 early_identify_cpu(&boot_cpu_data); 684 early_identify_cpu(&boot_cpu_data);
681} 685}
682 686
@@ -837,10 +841,8 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
837 boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; 841 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
838 } 842 }
839 843
840#ifdef CONFIG_X86_MCE
841 /* Init Machine Check Exception if available. */ 844 /* Init Machine Check Exception if available. */
842 mcheck_init(c); 845 mcheck_cpu_init(c);
843#endif
844 846
845 select_idle_routine(c); 847 select_idle_routine(c);
846 848
@@ -1115,7 +1117,7 @@ void __cpuinit cpu_init(void)
1115 if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) 1117 if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask))
1116 panic("CPU#%d already initialized!\n", cpu); 1118 panic("CPU#%d already initialized!\n", cpu);
1117 1119
1118 printk(KERN_INFO "Initializing CPU#%d\n", cpu); 1120 pr_debug("Initializing CPU#%d\n", cpu);
1119 1121
1120 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); 1122 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
1121 1123
@@ -1136,7 +1138,7 @@ void __cpuinit cpu_init(void)
1136 wrmsrl(MSR_KERNEL_GS_BASE, 0); 1138 wrmsrl(MSR_KERNEL_GS_BASE, 0);
1137 barrier(); 1139 barrier();
1138 1140
1139 check_efer(); 1141 x86_configure_nx();
1140 if (cpu != 0) 1142 if (cpu != 0)
1141 enable_x2apic(); 1143 enable_x2apic();
1142 1144
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 6de9a908e400..3624e8a0f71b 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -32,6 +32,6 @@ struct cpu_dev {
32extern const struct cpu_dev *const __x86_cpu_dev_start[], 32extern const struct cpu_dev *const __x86_cpu_dev_start[],
33 *const __x86_cpu_dev_end[]; 33 *const __x86_cpu_dev_end[];
34 34
35extern void display_cacheinfo(struct cpuinfo_x86 *c); 35extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
36 36
37#endif 37#endif
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 43eb3465dda7..1b1920fa7c80 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -190,9 +190,11 @@ static void do_drv_write(void *_cmd)
190 190
191static void drv_read(struct drv_cmd *cmd) 191static void drv_read(struct drv_cmd *cmd)
192{ 192{
193 int err;
193 cmd->val = 0; 194 cmd->val = 0;
194 195
195 smp_call_function_single(cpumask_any(cmd->mask), do_drv_read, cmd, 1); 196 err = smp_call_function_any(cmd->mask, do_drv_read, cmd, 1);
197 WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? */
196} 198}
197 199
198static void drv_write(struct drv_cmd *cmd) 200static void drv_write(struct drv_cmd *cmd)
@@ -526,15 +528,21 @@ static const struct dmi_system_id sw_any_bug_dmi_table[] = {
526 528
527static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c) 529static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c)
528{ 530{
529 /* http://www.intel.com/Assets/PDF/specupdate/314554.pdf 531 /* Intel Xeon Processor 7100 Series Specification Update
532 * http://www.intel.com/Assets/PDF/specupdate/314554.pdf
530 * AL30: A Machine Check Exception (MCE) Occurring during an 533 * AL30: A Machine Check Exception (MCE) Occurring during an
531 * Enhanced Intel SpeedStep Technology Ratio Change May Cause 534 * Enhanced Intel SpeedStep Technology Ratio Change May Cause
532 * Both Processor Cores to Lock Up when HT is enabled*/ 535 * Both Processor Cores to Lock Up. */
533 if (c->x86_vendor == X86_VENDOR_INTEL) { 536 if (c->x86_vendor == X86_VENDOR_INTEL) {
534 if ((c->x86 == 15) && 537 if ((c->x86 == 15) &&
535 (c->x86_model == 6) && 538 (c->x86_model == 6) &&
536 (c->x86_mask == 8) && smt_capable()) 539 (c->x86_mask == 8)) {
540 printk(KERN_INFO "acpi-cpufreq: Intel(R) "
541 "Xeon(R) 7100 Errata AL30, processors may "
542 "lock up on frequency changes: disabling "
543 "acpi-cpufreq.\n");
537 return -ENODEV; 544 return -ENODEV;
545 }
538 } 546 }
539 return 0; 547 return 0;
540} 548}
@@ -549,13 +557,18 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
549 unsigned int result = 0; 557 unsigned int result = 0;
550 struct cpuinfo_x86 *c = &cpu_data(policy->cpu); 558 struct cpuinfo_x86 *c = &cpu_data(policy->cpu);
551 struct acpi_processor_performance *perf; 559 struct acpi_processor_performance *perf;
560#ifdef CONFIG_SMP
561 static int blacklisted;
562#endif
552 563
553 dprintk("acpi_cpufreq_cpu_init\n"); 564 dprintk("acpi_cpufreq_cpu_init\n");
554 565
555#ifdef CONFIG_SMP 566#ifdef CONFIG_SMP
556 result = acpi_cpufreq_blacklist(c); 567 if (blacklisted)
557 if (result) 568 return blacklisted;
558 return result; 569 blacklisted = acpi_cpufreq_blacklist(c);
570 if (blacklisted)
571 return blacklisted;
559#endif 572#endif
560 573
561 data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL); 574 data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL);
@@ -753,14 +766,15 @@ static struct freq_attr *acpi_cpufreq_attr[] = {
753}; 766};
754 767
755static struct cpufreq_driver acpi_cpufreq_driver = { 768static struct cpufreq_driver acpi_cpufreq_driver = {
756 .verify = acpi_cpufreq_verify, 769 .verify = acpi_cpufreq_verify,
757 .target = acpi_cpufreq_target, 770 .target = acpi_cpufreq_target,
758 .init = acpi_cpufreq_cpu_init, 771 .bios_limit = acpi_processor_get_bios_limit,
759 .exit = acpi_cpufreq_cpu_exit, 772 .init = acpi_cpufreq_cpu_init,
760 .resume = acpi_cpufreq_resume, 773 .exit = acpi_cpufreq_cpu_exit,
761 .name = "acpi-cpufreq", 774 .resume = acpi_cpufreq_resume,
762 .owner = THIS_MODULE, 775 .name = "acpi-cpufreq",
763 .attr = acpi_cpufreq_attr, 776 .owner = THIS_MODULE,
777 .attr = acpi_cpufreq_attr,
764}; 778};
765 779
766static int __init acpi_cpufreq_init(void) 780static int __init acpi_cpufreq_init(void)
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index ce2ed3e4aad9..7e7eea4f8261 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -813,7 +813,7 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
813 memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr)); 813 memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr));
814 break; 814 break;
815 case 1 ... 15: 815 case 1 ... 15:
816 longhaul_version = TYPE_LONGHAUL_V1; 816 longhaul_version = TYPE_LONGHAUL_V2;
817 if (c->x86_mask < 8) { 817 if (c->x86_mask < 8) {
818 cpu_model = CPU_SAMUEL2; 818 cpu_model = CPU_SAMUEL2;
819 cpuname = "C3 'Samuel 2' [C5B]"; 819 cpuname = "C3 'Samuel 2' [C5B]";
@@ -885,7 +885,7 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
885 885
886 /* Find ACPI data for processor */ 886 /* Find ACPI data for processor */
887 acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, 887 acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
888 ACPI_UINT32_MAX, &longhaul_walk_callback, 888 ACPI_UINT32_MAX, &longhaul_walk_callback, NULL,
889 NULL, (void *)&pr); 889 NULL, (void *)&pr);
890 890
891 /* Check ACPI support for C3 state */ 891 /* Check ACPI support for C3 state */
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
index f10dea409f40..cb01dac267d3 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -164,7 +164,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
164 } 164 }
165 165
166 /* cpuinfo and default policy values */ 166 /* cpuinfo and default policy values */
167 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; 167 policy->cpuinfo.transition_latency = 200000;
168 policy->cur = busfreq * max_multiplier; 168 policy->cur = busfreq * max_multiplier;
169 169
170 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); 170 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index d47c775eb0ab..9a97116f89e5 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -714,14 +714,17 @@ static struct freq_attr *powernow_table_attr[] = {
714}; 714};
715 715
716static struct cpufreq_driver powernow_driver = { 716static struct cpufreq_driver powernow_driver = {
717 .verify = powernow_verify, 717 .verify = powernow_verify,
718 .target = powernow_target, 718 .target = powernow_target,
719 .get = powernow_get, 719 .get = powernow_get,
720 .init = powernow_cpu_init, 720#ifdef CONFIG_X86_POWERNOW_K7_ACPI
721 .exit = powernow_cpu_exit, 721 .bios_limit = acpi_processor_get_bios_limit,
722 .name = "powernow-k7", 722#endif
723 .owner = THIS_MODULE, 723 .init = powernow_cpu_init,
724 .attr = powernow_table_attr, 724 .exit = powernow_cpu_exit,
725 .name = "powernow-k7",
726 .owner = THIS_MODULE,
727 .attr = powernow_table_attr,
725}; 728};
726 729
727static int __init powernow_init(void) 730static int __init powernow_init(void)
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 6394aa5c7985..f125e5c551c0 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1022,7 +1022,7 @@ static int get_transition_latency(struct powernow_k8_data *data)
1022 * set it to 1 to avoid problems in the future. 1022 * set it to 1 to avoid problems in the future.
1023 * For all others it's a BIOS bug. 1023 * For all others it's a BIOS bug.
1024 */ 1024 */
1025 if (!boot_cpu_data.x86 == 0x11) 1025 if (boot_cpu_data.x86 != 0x11)
1026 printk(KERN_ERR FW_WARN PFX "Invalid zero transition " 1026 printk(KERN_ERR FW_WARN PFX "Invalid zero transition "
1027 "latency\n"); 1027 "latency\n");
1028 max_latency = 1; 1028 max_latency = 1;
@@ -1118,7 +1118,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data,
1118static int powernowk8_target(struct cpufreq_policy *pol, 1118static int powernowk8_target(struct cpufreq_policy *pol,
1119 unsigned targfreq, unsigned relation) 1119 unsigned targfreq, unsigned relation)
1120{ 1120{
1121 cpumask_t oldmask; 1121 cpumask_var_t oldmask;
1122 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); 1122 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1123 u32 checkfid; 1123 u32 checkfid;
1124 u32 checkvid; 1124 u32 checkvid;
@@ -1131,9 +1131,13 @@ static int powernowk8_target(struct cpufreq_policy *pol,
1131 checkfid = data->currfid; 1131 checkfid = data->currfid;
1132 checkvid = data->currvid; 1132 checkvid = data->currvid;
1133 1133
1134 /* only run on specific CPU from here on */ 1134 /* only run on specific CPU from here on. */
1135 oldmask = current->cpus_allowed; 1135 /* This is poor form: use a workqueue or smp_call_function_single */
1136 set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); 1136 if (!alloc_cpumask_var(&oldmask, GFP_KERNEL))
1137 return -ENOMEM;
1138
1139 cpumask_copy(oldmask, tsk_cpus_allowed(current));
1140 set_cpus_allowed_ptr(current, cpumask_of(pol->cpu));
1137 1141
1138 if (smp_processor_id() != pol->cpu) { 1142 if (smp_processor_id() != pol->cpu) {
1139 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); 1143 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
@@ -1193,7 +1197,8 @@ static int powernowk8_target(struct cpufreq_policy *pol,
1193 ret = 0; 1197 ret = 0;
1194 1198
1195err_out: 1199err_out:
1196 set_cpus_allowed_ptr(current, &oldmask); 1200 set_cpus_allowed_ptr(current, oldmask);
1201 free_cpumask_var(oldmask);
1197 return ret; 1202 return ret;
1198} 1203}
1199 1204
@@ -1393,14 +1398,15 @@ static struct freq_attr *powernow_k8_attr[] = {
1393}; 1398};
1394 1399
1395static struct cpufreq_driver cpufreq_amd64_driver = { 1400static struct cpufreq_driver cpufreq_amd64_driver = {
1396 .verify = powernowk8_verify, 1401 .verify = powernowk8_verify,
1397 .target = powernowk8_target, 1402 .target = powernowk8_target,
1398 .init = powernowk8_cpu_init, 1403 .bios_limit = acpi_processor_get_bios_limit,
1399 .exit = __devexit_p(powernowk8_cpu_exit), 1404 .init = powernowk8_cpu_init,
1400 .get = powernowk8_get, 1405 .exit = __devexit_p(powernowk8_cpu_exit),
1401 .name = "powernow-k8", 1406 .get = powernowk8_get,
1402 .owner = THIS_MODULE, 1407 .name = "powernow-k8",
1403 .attr = powernow_k8_attr, 1408 .owner = THIS_MODULE,
1409 .attr = powernow_k8_attr,
1404}; 1410};
1405 1411
1406/* driver entry point for init */ 1412/* driver entry point for init */
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 6911e91fb4f6..2ce8e0b5cc54 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -39,7 +39,7 @@ static struct pci_dev *speedstep_chipset_dev;
39 39
40/* speedstep_processor 40/* speedstep_processor
41 */ 41 */
42static unsigned int speedstep_processor; 42static enum speedstep_processor speedstep_processor;
43 43
44static u32 pmbase; 44static u32 pmbase;
45 45
@@ -232,28 +232,23 @@ static unsigned int speedstep_detect_chipset(void)
232 return 0; 232 return 0;
233} 233}
234 234
235struct get_freq_data { 235static void get_freq_data(void *_speed)
236 unsigned int speed;
237 unsigned int processor;
238};
239
240static void get_freq_data(void *_data)
241{ 236{
242 struct get_freq_data *data = _data; 237 unsigned int *speed = _speed;
243 238
244 data->speed = speedstep_get_frequency(data->processor); 239 *speed = speedstep_get_frequency(speedstep_processor);
245} 240}
246 241
247static unsigned int speedstep_get(unsigned int cpu) 242static unsigned int speedstep_get(unsigned int cpu)
248{ 243{
249 struct get_freq_data data = { .processor = cpu }; 244 unsigned int speed;
250 245
251 /* You're supposed to ensure CPU is online. */ 246 /* You're supposed to ensure CPU is online. */
252 if (smp_call_function_single(cpu, get_freq_data, &data, 1) != 0) 247 if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0)
253 BUG(); 248 BUG();
254 249
255 dprintk("detected %u kHz as current frequency\n", data.speed); 250 dprintk("detected %u kHz as current frequency\n", speed);
256 return data.speed; 251 return speed;
257} 252}
258 253
259/** 254/**
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
index f4c290b8482f..ad0083abfa23 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
@@ -34,7 +34,7 @@ static int relaxed_check;
34 * GET PROCESSOR CORE SPEED IN KHZ * 34 * GET PROCESSOR CORE SPEED IN KHZ *
35 *********************************************************************/ 35 *********************************************************************/
36 36
37static unsigned int pentium3_get_frequency(unsigned int processor) 37static unsigned int pentium3_get_frequency(enum speedstep_processor processor)
38{ 38{
39 /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */ 39 /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */
40 struct { 40 struct {
@@ -227,7 +227,7 @@ static unsigned int pentium4_get_frequency(void)
227 227
228 228
229/* Warning: may get called from smp_call_function_single. */ 229/* Warning: may get called from smp_call_function_single. */
230unsigned int speedstep_get_frequency(unsigned int processor) 230unsigned int speedstep_get_frequency(enum speedstep_processor processor)
231{ 231{
232 switch (processor) { 232 switch (processor) {
233 case SPEEDSTEP_CPU_PCORE: 233 case SPEEDSTEP_CPU_PCORE:
@@ -380,7 +380,7 @@ EXPORT_SYMBOL_GPL(speedstep_detect_processor);
380 * DETECT SPEEDSTEP SPEEDS * 380 * DETECT SPEEDSTEP SPEEDS *
381 *********************************************************************/ 381 *********************************************************************/
382 382
383unsigned int speedstep_get_freqs(unsigned int processor, 383unsigned int speedstep_get_freqs(enum speedstep_processor processor,
384 unsigned int *low_speed, 384 unsigned int *low_speed,
385 unsigned int *high_speed, 385 unsigned int *high_speed,
386 unsigned int *transition_latency, 386 unsigned int *transition_latency,
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
index 2b6c04e5a304..70d9cea1219d 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
@@ -11,18 +11,18 @@
11 11
12 12
13/* processors */ 13/* processors */
14 14enum speedstep_processor {
15#define SPEEDSTEP_CPU_PIII_C_EARLY 0x00000001 /* Coppermine core */ 15 SPEEDSTEP_CPU_PIII_C_EARLY = 0x00000001, /* Coppermine core */
16#define SPEEDSTEP_CPU_PIII_C 0x00000002 /* Coppermine core */ 16 SPEEDSTEP_CPU_PIII_C = 0x00000002, /* Coppermine core */
17#define SPEEDSTEP_CPU_PIII_T 0x00000003 /* Tualatin core */ 17 SPEEDSTEP_CPU_PIII_T = 0x00000003, /* Tualatin core */
18#define SPEEDSTEP_CPU_P4M 0x00000004 /* P4-M */ 18 SPEEDSTEP_CPU_P4M = 0x00000004, /* P4-M */
19
20/* the following processors are not speedstep-capable and are not auto-detected 19/* the following processors are not speedstep-capable and are not auto-detected
21 * in speedstep_detect_processor(). However, their speed can be detected using 20 * in speedstep_detect_processor(). However, their speed can be detected using
22 * the speedstep_get_frequency() call. */ 21 * the speedstep_get_frequency() call. */
23#define SPEEDSTEP_CPU_PM 0xFFFFFF03 /* Pentium M */ 22 SPEEDSTEP_CPU_PM = 0xFFFFFF03, /* Pentium M */
24#define SPEEDSTEP_CPU_P4D 0xFFFFFF04 /* desktop P4 */ 23 SPEEDSTEP_CPU_P4D = 0xFFFFFF04, /* desktop P4 */
25#define SPEEDSTEP_CPU_PCORE 0xFFFFFF05 /* Core */ 24 SPEEDSTEP_CPU_PCORE = 0xFFFFFF05, /* Core */
25};
26 26
27/* speedstep states -- only two of them */ 27/* speedstep states -- only two of them */
28 28
@@ -31,10 +31,10 @@
31 31
32 32
33/* detect a speedstep-capable processor */ 33/* detect a speedstep-capable processor */
34extern unsigned int speedstep_detect_processor (void); 34extern enum speedstep_processor speedstep_detect_processor(void);
35 35
36/* detect the current speed (in khz) of the processor */ 36/* detect the current speed (in khz) of the processor */
37extern unsigned int speedstep_get_frequency(unsigned int processor); 37extern unsigned int speedstep_get_frequency(enum speedstep_processor processor);
38 38
39 39
40/* detect the low and high speeds of the processor. The callback 40/* detect the low and high speeds of the processor. The callback
@@ -42,7 +42,7 @@ extern unsigned int speedstep_get_frequency(unsigned int processor);
42 * SPEEDSTEP_LOW; the second argument is zero so that no 42 * SPEEDSTEP_LOW; the second argument is zero so that no
43 * cpufreq_notify_transition calls are initiated. 43 * cpufreq_notify_transition calls are initiated.
44 */ 44 */
45extern unsigned int speedstep_get_freqs(unsigned int processor, 45extern unsigned int speedstep_get_freqs(enum speedstep_processor processor,
46 unsigned int *low_speed, 46 unsigned int *low_speed,
47 unsigned int *high_speed, 47 unsigned int *high_speed,
48 unsigned int *transition_latency, 48 unsigned int *transition_latency,
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
index befea088e4f5..04d73c114e49 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
@@ -35,7 +35,7 @@ static int smi_cmd;
35static unsigned int smi_sig; 35static unsigned int smi_sig;
36 36
37/* info about the processor */ 37/* info about the processor */
38static unsigned int speedstep_processor; 38static enum speedstep_processor speedstep_processor;
39 39
40/* 40/*
41 * There are only two frequency states for each processor. Values 41 * There are only two frequency states for each processor. Values
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 19807b89f058..4fbd384fb645 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -373,7 +373,7 @@ static void __cpuinit init_nsc(struct cpuinfo_x86 *c)
373 /* Handle the GX (Formally known as the GX2) */ 373 /* Handle the GX (Formally known as the GX2) */
374 374
375 if (c->x86 == 5 && c->x86_model == 5) 375 if (c->x86 == 5 && c->x86_model == 5)
376 display_cacheinfo(c); 376 cpu_detect_cache_sizes(c);
377 else 377 else
378 init_cyrix(c); 378 init_cyrix(c);
379} 379}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 40e1835b35e8..879666f4d871 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -70,7 +70,6 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
70 if (c->x86_power & (1 << 8)) { 70 if (c->x86_power & (1 << 8)) {
71 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 71 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
72 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); 72 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
73 set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
74 sched_clock_stable = 1; 73 sched_clock_stable = 1;
75 } 74 }
76 75
@@ -263,11 +262,13 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
263 /* Don't do the funky fallback heuristics the AMD version employs 262 /* Don't do the funky fallback heuristics the AMD version employs
264 for now. */ 263 for now. */
265 node = apicid_to_node[apicid]; 264 node = apicid_to_node[apicid];
266 if (node == NUMA_NO_NODE || !node_online(node)) 265 if (node == NUMA_NO_NODE)
267 node = first_node(node_online_map); 266 node = first_node(node_online_map);
267 else if (!node_online(node)) {
268 /* reuse the value from init_cpu_to_node() */
269 node = cpu_to_node(cpu);
270 }
268 numa_set_node(cpu, node); 271 numa_set_node(cpu, node);
269
270 printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
271#endif 272#endif
272} 273}
273 274
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index f5ccb4fa5a5d..fc6c8ef92dcc 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -94,7 +94,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
94 { 0xd1, LVL_3, 1024 }, /* 4-way set assoc, 64 byte line size */ 94 { 0xd1, LVL_3, 1024 }, /* 4-way set assoc, 64 byte line size */
95 { 0xd2, LVL_3, 2048 }, /* 4-way set assoc, 64 byte line size */ 95 { 0xd2, LVL_3, 2048 }, /* 4-way set assoc, 64 byte line size */
96 { 0xd6, LVL_3, 1024 }, /* 8-way set assoc, 64 byte line size */ 96 { 0xd6, LVL_3, 1024 }, /* 8-way set assoc, 64 byte line size */
97 { 0xd7, LVL_3, 2038 }, /* 8-way set assoc, 64 byte line size */ 97 { 0xd7, LVL_3, 2048 }, /* 8-way set assoc, 64 byte line size */
98 { 0xd8, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ 98 { 0xd8, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */
99 { 0xdc, LVL_3, 2048 }, /* 12-way set assoc, 64 byte line size */ 99 { 0xdc, LVL_3, 2048 }, /* 12-way set assoc, 64 byte line size */
100 { 0xdd, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ 100 { 0xdd, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */
@@ -102,6 +102,9 @@ static const struct _cache_table __cpuinitconst cache_table[] =
102 { 0xe2, LVL_3, 2048 }, /* 16-way set assoc, 64 byte line size */ 102 { 0xe2, LVL_3, 2048 }, /* 16-way set assoc, 64 byte line size */
103 { 0xe3, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */ 103 { 0xe3, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */
104 { 0xe4, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ 104 { 0xe4, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */
105 { 0xea, LVL_3, 12288 }, /* 24-way set assoc, 64 byte line size */
106 { 0xeb, LVL_3, 18432 }, /* 24-way set assoc, 64 byte line size */
107 { 0xec, LVL_3, 24576 }, /* 24-way set assoc, 64 byte line size */
105 { 0x00, 0, 0} 108 { 0x00, 0, 0}
106}; 109};
107 110
@@ -488,22 +491,6 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
488#endif 491#endif
489 } 492 }
490 493
491 if (trace)
492 printk(KERN_INFO "CPU: Trace cache: %dK uops", trace);
493 else if (l1i)
494 printk(KERN_INFO "CPU: L1 I cache: %dK", l1i);
495
496 if (l1d)
497 printk(KERN_CONT ", L1 D cache: %dK\n", l1d);
498 else
499 printk(KERN_CONT "\n");
500
501 if (l2)
502 printk(KERN_INFO "CPU: L2 cache: %dK\n", l2);
503
504 if (l3)
505 printk(KERN_INFO "CPU: L3 cache: %dK\n", l3);
506
507 c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d)); 494 c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
508 495
509 return l2; 496 return l2;
@@ -520,18 +507,19 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
520{ 507{
521 struct _cpuid4_info *this_leaf, *sibling_leaf; 508 struct _cpuid4_info *this_leaf, *sibling_leaf;
522 unsigned long num_threads_sharing; 509 unsigned long num_threads_sharing;
523 int index_msb, i; 510 int index_msb, i, sibling;
524 struct cpuinfo_x86 *c = &cpu_data(cpu); 511 struct cpuinfo_x86 *c = &cpu_data(cpu);
525 512
526 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { 513 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) {
527 struct cpuinfo_x86 *d; 514 for_each_cpu(i, c->llc_shared_map) {
528 for_each_online_cpu(i) {
529 if (!per_cpu(ici_cpuid4_info, i)) 515 if (!per_cpu(ici_cpuid4_info, i))
530 continue; 516 continue;
531 d = &cpu_data(i);
532 this_leaf = CPUID4_INFO_IDX(i, index); 517 this_leaf = CPUID4_INFO_IDX(i, index);
533 cpumask_copy(to_cpumask(this_leaf->shared_cpu_map), 518 for_each_cpu(sibling, c->llc_shared_map) {
534 d->llc_shared_map); 519 if (!cpu_online(sibling))
520 continue;
521 set_bit(sibling, this_leaf->shared_cpu_map);
522 }
535 } 523 }
536 return; 524 return;
537 } 525 }
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index 472763d92098..73734baa50f2 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -74,7 +74,7 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs)
74 m->finished = 0; 74 m->finished = 0;
75} 75}
76 76
77static cpumask_t mce_inject_cpumask; 77static cpumask_var_t mce_inject_cpumask;
78 78
79static int mce_raise_notify(struct notifier_block *self, 79static int mce_raise_notify(struct notifier_block *self,
80 unsigned long val, void *data) 80 unsigned long val, void *data)
@@ -82,9 +82,9 @@ static int mce_raise_notify(struct notifier_block *self,
82 struct die_args *args = (struct die_args *)data; 82 struct die_args *args = (struct die_args *)data;
83 int cpu = smp_processor_id(); 83 int cpu = smp_processor_id();
84 struct mce *m = &__get_cpu_var(injectm); 84 struct mce *m = &__get_cpu_var(injectm);
85 if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask)) 85 if (val != DIE_NMI_IPI || !cpumask_test_cpu(cpu, mce_inject_cpumask))
86 return NOTIFY_DONE; 86 return NOTIFY_DONE;
87 cpu_clear(cpu, mce_inject_cpumask); 87 cpumask_clear_cpu(cpu, mce_inject_cpumask);
88 if (m->inject_flags & MCJ_EXCEPTION) 88 if (m->inject_flags & MCJ_EXCEPTION)
89 raise_exception(m, args->regs); 89 raise_exception(m, args->regs);
90 else if (m->status) 90 else if (m->status)
@@ -148,22 +148,22 @@ static void raise_mce(struct mce *m)
148 unsigned long start; 148 unsigned long start;
149 int cpu; 149 int cpu;
150 get_online_cpus(); 150 get_online_cpus();
151 mce_inject_cpumask = cpu_online_map; 151 cpumask_copy(mce_inject_cpumask, cpu_online_mask);
152 cpu_clear(get_cpu(), mce_inject_cpumask); 152 cpumask_clear_cpu(get_cpu(), mce_inject_cpumask);
153 for_each_online_cpu(cpu) { 153 for_each_online_cpu(cpu) {
154 struct mce *mcpu = &per_cpu(injectm, cpu); 154 struct mce *mcpu = &per_cpu(injectm, cpu);
155 if (!mcpu->finished || 155 if (!mcpu->finished ||
156 MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) 156 MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)
157 cpu_clear(cpu, mce_inject_cpumask); 157 cpumask_clear_cpu(cpu, mce_inject_cpumask);
158 } 158 }
159 if (!cpus_empty(mce_inject_cpumask)) 159 if (!cpumask_empty(mce_inject_cpumask))
160 apic->send_IPI_mask(&mce_inject_cpumask, NMI_VECTOR); 160 apic->send_IPI_mask(mce_inject_cpumask, NMI_VECTOR);
161 start = jiffies; 161 start = jiffies;
162 while (!cpus_empty(mce_inject_cpumask)) { 162 while (!cpumask_empty(mce_inject_cpumask)) {
163 if (!time_before(jiffies, start + 2*HZ)) { 163 if (!time_before(jiffies, start + 2*HZ)) {
164 printk(KERN_ERR 164 printk(KERN_ERR
165 "Timeout waiting for mce inject NMI %lx\n", 165 "Timeout waiting for mce inject NMI %lx\n",
166 *cpus_addr(mce_inject_cpumask)); 166 *cpumask_bits(mce_inject_cpumask));
167 break; 167 break;
168 } 168 }
169 cpu_relax(); 169 cpu_relax();
@@ -210,6 +210,8 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf,
210 210
211static int inject_init(void) 211static int inject_init(void)
212{ 212{
213 if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
214 return -ENOMEM;
213 printk(KERN_INFO "Machine check injector initialized\n"); 215 printk(KERN_INFO "Machine check injector initialized\n");
214 mce_chrdev_ops.write = mce_write; 216 mce_chrdev_ops.write = mce_write;
215 register_die_notifier(&mce_raise_nb); 217 register_die_notifier(&mce_raise_nb);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 183c3457d2f4..a8aacd4b513c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -46,6 +46,9 @@
46 46
47#include "mce-internal.h" 47#include "mce-internal.h"
48 48
49#define CREATE_TRACE_POINTS
50#include <trace/events/mce.h>
51
49int mce_disabled __read_mostly; 52int mce_disabled __read_mostly;
50 53
51#define MISC_MCELOG_MINOR 227 54#define MISC_MCELOG_MINOR 227
@@ -85,6 +88,26 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
85static DEFINE_PER_CPU(struct mce, mces_seen); 88static DEFINE_PER_CPU(struct mce, mces_seen);
86static int cpu_missing; 89static int cpu_missing;
87 90
91/*
92 * CPU/chipset specific EDAC code can register a notifier call here to print
93 * MCE errors in a human-readable form.
94 */
95ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
96EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
97
98static int default_decode_mce(struct notifier_block *nb, unsigned long val,
99 void *data)
100{
101 pr_emerg("No human readable MCE decoding support on this CPU type.\n");
102 pr_emerg("Run the message through 'mcelog --ascii' to decode.\n");
103
104 return NOTIFY_STOP;
105}
106
107static struct notifier_block mce_dec_nb = {
108 .notifier_call = default_decode_mce,
109 .priority = -1,
110};
88 111
89/* MCA banks polled by the period polling timer for corrected events */ 112/* MCA banks polled by the period polling timer for corrected events */
90DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 113DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
@@ -129,6 +152,9 @@ void mce_log(struct mce *mce)
129{ 152{
130 unsigned next, entry; 153 unsigned next, entry;
131 154
155 /* Emit the trace record: */
156 trace_mce_record(mce);
157
132 mce->finished = 0; 158 mce->finished = 0;
133 wmb(); 159 wmb();
134 for (;;) { 160 for (;;) {
@@ -165,46 +191,46 @@ void mce_log(struct mce *mce)
165 set_bit(0, &mce_need_notify); 191 set_bit(0, &mce_need_notify);
166} 192}
167 193
168void __weak decode_mce(struct mce *m)
169{
170 return;
171}
172
173static void print_mce(struct mce *m) 194static void print_mce(struct mce *m)
174{ 195{
175 printk(KERN_EMERG 196 pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
176 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
177 m->extcpu, m->mcgstatus, m->bank, m->status); 197 m->extcpu, m->mcgstatus, m->bank, m->status);
198
178 if (m->ip) { 199 if (m->ip) {
179 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", 200 pr_emerg("RIP%s %02x:<%016Lx> ",
180 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", 201 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
181 m->cs, m->ip); 202 m->cs, m->ip);
203
182 if (m->cs == __KERNEL_CS) 204 if (m->cs == __KERNEL_CS)
183 print_symbol("{%s}", m->ip); 205 print_symbol("{%s}", m->ip);
184 printk(KERN_CONT "\n"); 206 pr_cont("\n");
185 } 207 }
186 printk(KERN_EMERG "TSC %llx ", m->tsc); 208
209 pr_emerg("TSC %llx ", m->tsc);
187 if (m->addr) 210 if (m->addr)
188 printk(KERN_CONT "ADDR %llx ", m->addr); 211 pr_cont("ADDR %llx ", m->addr);
189 if (m->misc) 212 if (m->misc)
190 printk(KERN_CONT "MISC %llx ", m->misc); 213 pr_cont("MISC %llx ", m->misc);
191 printk(KERN_CONT "\n"); 214
192 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", 215 pr_cont("\n");
193 m->cpuvendor, m->cpuid, m->time, m->socketid, 216 pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
194 m->apicid); 217 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
195 218
196 decode_mce(m); 219 /*
220 * Print out human-readable details about the MCE error,
221 * (if the CPU has an implementation for that)
222 */
223 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
197} 224}
198 225
199static void print_mce_head(void) 226static void print_mce_head(void)
200{ 227{
201 printk(KERN_EMERG "\nHARDWARE ERROR\n"); 228 pr_emerg("\nHARDWARE ERROR\n");
202} 229}
203 230
204static void print_mce_tail(void) 231static void print_mce_tail(void)
205{ 232{
206 printk(KERN_EMERG "This is not a software problem!\n" 233 pr_emerg("This is not a software problem!\n");
207 "Run through mcelog --ascii to decode and contact your hardware vendor\n");
208} 234}
209 235
210#define PANIC_TIMEOUT 5 /* 5 seconds */ 236#define PANIC_TIMEOUT 5 /* 5 seconds */
@@ -218,6 +244,7 @@ static atomic_t mce_fake_paniced;
218static void wait_for_panic(void) 244static void wait_for_panic(void)
219{ 245{
220 long timeout = PANIC_TIMEOUT*USEC_PER_SEC; 246 long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
247
221 preempt_disable(); 248 preempt_disable();
222 local_irq_enable(); 249 local_irq_enable();
223 while (timeout-- > 0) 250 while (timeout-- > 0)
@@ -285,6 +312,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
285static int msr_to_offset(u32 msr) 312static int msr_to_offset(u32 msr)
286{ 313{
287 unsigned bank = __get_cpu_var(injectm.bank); 314 unsigned bank = __get_cpu_var(injectm.bank);
315
288 if (msr == rip_msr) 316 if (msr == rip_msr)
289 return offsetof(struct mce, ip); 317 return offsetof(struct mce, ip);
290 if (msr == MSR_IA32_MCx_STATUS(bank)) 318 if (msr == MSR_IA32_MCx_STATUS(bank))
@@ -1108,7 +1136,7 @@ static int check_interval = 5 * 60; /* 5 minutes */
1108static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ 1136static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
1109static DEFINE_PER_CPU(struct timer_list, mce_timer); 1137static DEFINE_PER_CPU(struct timer_list, mce_timer);
1110 1138
1111static void mcheck_timer(unsigned long data) 1139static void mce_start_timer(unsigned long data)
1112{ 1140{
1113 struct timer_list *t = &per_cpu(mce_timer, data); 1141 struct timer_list *t = &per_cpu(mce_timer, data);
1114 int *n; 1142 int *n;
@@ -1173,7 +1201,7 @@ int mce_notify_irq(void)
1173} 1201}
1174EXPORT_SYMBOL_GPL(mce_notify_irq); 1202EXPORT_SYMBOL_GPL(mce_notify_irq);
1175 1203
1176static int mce_banks_init(void) 1204static int __cpuinit __mcheck_cpu_mce_banks_init(void)
1177{ 1205{
1178 int i; 1206 int i;
1179 1207
@@ -1192,7 +1220,7 @@ static int mce_banks_init(void)
1192/* 1220/*
1193 * Initialize Machine Checks for a CPU. 1221 * Initialize Machine Checks for a CPU.
1194 */ 1222 */
1195static int __cpuinit mce_cap_init(void) 1223static int __cpuinit __mcheck_cpu_cap_init(void)
1196{ 1224{
1197 unsigned b; 1225 unsigned b;
1198 u64 cap; 1226 u64 cap;
@@ -1200,7 +1228,8 @@ static int __cpuinit mce_cap_init(void)
1200 rdmsrl(MSR_IA32_MCG_CAP, cap); 1228 rdmsrl(MSR_IA32_MCG_CAP, cap);
1201 1229
1202 b = cap & MCG_BANKCNT_MASK; 1230 b = cap & MCG_BANKCNT_MASK;
1203 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); 1231 if (!banks)
1232 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
1204 1233
1205 if (b > MAX_NR_BANKS) { 1234 if (b > MAX_NR_BANKS) {
1206 printk(KERN_WARNING 1235 printk(KERN_WARNING
@@ -1213,7 +1242,7 @@ static int __cpuinit mce_cap_init(void)
1213 WARN_ON(banks != 0 && b != banks); 1242 WARN_ON(banks != 0 && b != banks);
1214 banks = b; 1243 banks = b;
1215 if (!mce_banks) { 1244 if (!mce_banks) {
1216 int err = mce_banks_init(); 1245 int err = __mcheck_cpu_mce_banks_init();
1217 1246
1218 if (err) 1247 if (err)
1219 return err; 1248 return err;
@@ -1229,7 +1258,7 @@ static int __cpuinit mce_cap_init(void)
1229 return 0; 1258 return 0;
1230} 1259}
1231 1260
1232static void mce_init(void) 1261static void __mcheck_cpu_init_generic(void)
1233{ 1262{
1234 mce_banks_t all_banks; 1263 mce_banks_t all_banks;
1235 u64 cap; 1264 u64 cap;
@@ -1258,7 +1287,7 @@ static void mce_init(void)
1258} 1287}
1259 1288
1260/* Add per CPU specific workarounds here */ 1289/* Add per CPU specific workarounds here */
1261static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) 1290static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1262{ 1291{
1263 if (c->x86_vendor == X86_VENDOR_UNKNOWN) { 1292 if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1264 pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); 1293 pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
@@ -1326,7 +1355,7 @@ static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
1326 return 0; 1355 return 0;
1327} 1356}
1328 1357
1329static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) 1358static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1330{ 1359{
1331 if (c->x86 != 5) 1360 if (c->x86 != 5)
1332 return; 1361 return;
@@ -1340,7 +1369,7 @@ static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
1340 } 1369 }
1341} 1370}
1342 1371
1343static void mce_cpu_features(struct cpuinfo_x86 *c) 1372static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1344{ 1373{
1345 switch (c->x86_vendor) { 1374 switch (c->x86_vendor) {
1346 case X86_VENDOR_INTEL: 1375 case X86_VENDOR_INTEL:
@@ -1354,18 +1383,19 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
1354 } 1383 }
1355} 1384}
1356 1385
1357static void mce_init_timer(void) 1386static void __mcheck_cpu_init_timer(void)
1358{ 1387{
1359 struct timer_list *t = &__get_cpu_var(mce_timer); 1388 struct timer_list *t = &__get_cpu_var(mce_timer);
1360 int *n = &__get_cpu_var(mce_next_interval); 1389 int *n = &__get_cpu_var(mce_next_interval);
1361 1390
1391 setup_timer(t, mce_start_timer, smp_processor_id());
1392
1362 if (mce_ignore_ce) 1393 if (mce_ignore_ce)
1363 return; 1394 return;
1364 1395
1365 *n = check_interval * HZ; 1396 *n = check_interval * HZ;
1366 if (!*n) 1397 if (!*n)
1367 return; 1398 return;
1368 setup_timer(t, mcheck_timer, smp_processor_id());
1369 t->expires = round_jiffies(jiffies + *n); 1399 t->expires = round_jiffies(jiffies + *n);
1370 add_timer_on(t, smp_processor_id()); 1400 add_timer_on(t, smp_processor_id());
1371} 1401}
@@ -1385,27 +1415,28 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) =
1385 * Called for each booted CPU to set up machine checks. 1415 * Called for each booted CPU to set up machine checks.
1386 * Must be called with preempt off: 1416 * Must be called with preempt off:
1387 */ 1417 */
1388void __cpuinit mcheck_init(struct cpuinfo_x86 *c) 1418void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
1389{ 1419{
1390 if (mce_disabled) 1420 if (mce_disabled)
1391 return; 1421 return;
1392 1422
1393 mce_ancient_init(c); 1423 __mcheck_cpu_ancient_init(c);
1394 1424
1395 if (!mce_available(c)) 1425 if (!mce_available(c))
1396 return; 1426 return;
1397 1427
1398 if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) { 1428 if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1399 mce_disabled = 1; 1429 mce_disabled = 1;
1400 return; 1430 return;
1401 } 1431 }
1402 1432
1403 machine_check_vector = do_machine_check; 1433 machine_check_vector = do_machine_check;
1404 1434
1405 mce_init(); 1435 __mcheck_cpu_init_generic();
1406 mce_cpu_features(c); 1436 __mcheck_cpu_init_vendor(c);
1407 mce_init_timer(); 1437 __mcheck_cpu_init_timer();
1408 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); 1438 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
1439
1409} 1440}
1410 1441
1411/* 1442/*
@@ -1625,6 +1656,15 @@ static int __init mcheck_enable(char *str)
1625} 1656}
1626__setup("mce", mcheck_enable); 1657__setup("mce", mcheck_enable);
1627 1658
1659int __init mcheck_init(void)
1660{
1661 atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);
1662
1663 mcheck_intel_therm_init();
1664
1665 return 0;
1666}
1667
1628/* 1668/*
1629 * Sysfs support 1669 * Sysfs support
1630 */ 1670 */
@@ -1633,7 +1673,7 @@ __setup("mce", mcheck_enable);
1633 * Disable machine checks on suspend and shutdown. We can't really handle 1673 * Disable machine checks on suspend and shutdown. We can't really handle
1634 * them later. 1674 * them later.
1635 */ 1675 */
1636static int mce_disable(void) 1676static int mce_disable_error_reporting(void)
1637{ 1677{
1638 int i; 1678 int i;
1639 1679
@@ -1648,12 +1688,12 @@ static int mce_disable(void)
1648 1688
1649static int mce_suspend(struct sys_device *dev, pm_message_t state) 1689static int mce_suspend(struct sys_device *dev, pm_message_t state)
1650{ 1690{
1651 return mce_disable(); 1691 return mce_disable_error_reporting();
1652} 1692}
1653 1693
1654static int mce_shutdown(struct sys_device *dev) 1694static int mce_shutdown(struct sys_device *dev)
1655{ 1695{
1656 return mce_disable(); 1696 return mce_disable_error_reporting();
1657} 1697}
1658 1698
1659/* 1699/*
@@ -1663,8 +1703,8 @@ static int mce_shutdown(struct sys_device *dev)
1663 */ 1703 */
1664static int mce_resume(struct sys_device *dev) 1704static int mce_resume(struct sys_device *dev)
1665{ 1705{
1666 mce_init(); 1706 __mcheck_cpu_init_generic();
1667 mce_cpu_features(&current_cpu_data); 1707 __mcheck_cpu_init_vendor(&current_cpu_data);
1668 1708
1669 return 0; 1709 return 0;
1670} 1710}
@@ -1674,8 +1714,8 @@ static void mce_cpu_restart(void *data)
1674 del_timer_sync(&__get_cpu_var(mce_timer)); 1714 del_timer_sync(&__get_cpu_var(mce_timer));
1675 if (!mce_available(&current_cpu_data)) 1715 if (!mce_available(&current_cpu_data))
1676 return; 1716 return;
1677 mce_init(); 1717 __mcheck_cpu_init_generic();
1678 mce_init_timer(); 1718 __mcheck_cpu_init_timer();
1679} 1719}
1680 1720
1681/* Reinit MCEs after user configuration changes */ 1721/* Reinit MCEs after user configuration changes */
@@ -1701,7 +1741,7 @@ static void mce_enable_ce(void *all)
1701 cmci_reenable(); 1741 cmci_reenable();
1702 cmci_recheck(); 1742 cmci_recheck();
1703 if (all) 1743 if (all)
1704 mce_init_timer(); 1744 __mcheck_cpu_init_timer();
1705} 1745}
1706 1746
1707static struct sysdev_class mce_sysclass = { 1747static struct sysdev_class mce_sysclass = {
@@ -1889,7 +1929,7 @@ error2:
1889 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); 1929 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
1890error: 1930error:
1891 while (--i >= 0) 1931 while (--i >= 0)
1892 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); 1932 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1893 1933
1894 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1934 sysdev_unregister(&per_cpu(mce_dev, cpu));
1895 1935
@@ -1914,13 +1954,14 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
1914} 1954}
1915 1955
1916/* Make sure there are no machine checks on offlined CPUs. */ 1956/* Make sure there are no machine checks on offlined CPUs. */
1917static void mce_disable_cpu(void *h) 1957static void __cpuinit mce_disable_cpu(void *h)
1918{ 1958{
1919 unsigned long action = *(unsigned long *)h; 1959 unsigned long action = *(unsigned long *)h;
1920 int i; 1960 int i;
1921 1961
1922 if (!mce_available(&current_cpu_data)) 1962 if (!mce_available(&current_cpu_data))
1923 return; 1963 return;
1964
1924 if (!(action & CPU_TASKS_FROZEN)) 1965 if (!(action & CPU_TASKS_FROZEN))
1925 cmci_clear(); 1966 cmci_clear();
1926 for (i = 0; i < banks; i++) { 1967 for (i = 0; i < banks; i++) {
@@ -1931,7 +1972,7 @@ static void mce_disable_cpu(void *h)
1931 } 1972 }
1932} 1973}
1933 1974
1934static void mce_reenable_cpu(void *h) 1975static void __cpuinit mce_reenable_cpu(void *h)
1935{ 1976{
1936 unsigned long action = *(unsigned long *)h; 1977 unsigned long action = *(unsigned long *)h;
1937 int i; 1978 int i;
@@ -1976,9 +2017,11 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1976 break; 2017 break;
1977 case CPU_DOWN_FAILED: 2018 case CPU_DOWN_FAILED:
1978 case CPU_DOWN_FAILED_FROZEN: 2019 case CPU_DOWN_FAILED_FROZEN:
1979 t->expires = round_jiffies(jiffies + 2020 if (!mce_ignore_ce && check_interval) {
2021 t->expires = round_jiffies(jiffies +
1980 __get_cpu_var(mce_next_interval)); 2022 __get_cpu_var(mce_next_interval));
1981 add_timer_on(t, cpu); 2023 add_timer_on(t, cpu);
2024 }
1982 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 2025 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1983 break; 2026 break;
1984 case CPU_POST_DEAD: 2027 case CPU_POST_DEAD:
@@ -2010,7 +2053,7 @@ static __init void mce_init_banks(void)
2010 } 2053 }
2011} 2054}
2012 2055
2013static __init int mce_init_device(void) 2056static __init int mcheck_init_device(void)
2014{ 2057{
2015 int err; 2058 int err;
2016 int i = 0; 2059 int i = 0;
@@ -2038,7 +2081,7 @@ static __init int mce_init_device(void)
2038 return err; 2081 return err;
2039} 2082}
2040 2083
2041device_initcall(mce_init_device); 2084device_initcall(mcheck_init_device);
2042 2085
2043/* 2086/*
2044 * Old style boot options parsing. Only for compatibility. 2087 * Old style boot options parsing. Only for compatibility.
@@ -2086,7 +2129,7 @@ static int fake_panic_set(void *data, u64 val)
2086DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, 2129DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
2087 fake_panic_set, "%llu\n"); 2130 fake_panic_set, "%llu\n");
2088 2131
2089static int __init mce_debugfs_init(void) 2132static int __init mcheck_debugfs_init(void)
2090{ 2133{
2091 struct dentry *dmce, *ffake_panic; 2134 struct dentry *dmce, *ffake_panic;
2092 2135
@@ -2100,5 +2143,5 @@ static int __init mce_debugfs_init(void)
2100 2143
2101 return 0; 2144 return 0;
2102} 2145}
2103late_initcall(mce_debugfs_init); 2146late_initcall(mcheck_debugfs_init);
2104#endif 2147#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 889f665fe93d..7c785634af2b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -8,6 +8,7 @@
8#include <linux/init.h> 8#include <linux/init.h>
9#include <linux/interrupt.h> 9#include <linux/interrupt.h>
10#include <linux/percpu.h> 10#include <linux/percpu.h>
11#include <linux/sched.h>
11#include <asm/apic.h> 12#include <asm/apic.h>
12#include <asm/processor.h> 13#include <asm/processor.h>
13#include <asm/msr.h> 14#include <asm/msr.h>
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index b3a1dba75330..81c499eceb21 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -49,6 +49,8 @@ static DEFINE_PER_CPU(struct thermal_state, thermal_state);
49 49
50static atomic_t therm_throt_en = ATOMIC_INIT(0); 50static atomic_t therm_throt_en = ATOMIC_INIT(0);
51 51
52static u32 lvtthmr_init __read_mostly;
53
52#ifdef CONFIG_SYSFS 54#ifdef CONFIG_SYSFS
53#define define_therm_throt_sysdev_one_ro(_name) \ 55#define define_therm_throt_sysdev_one_ro(_name) \
54 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) 56 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
@@ -254,14 +256,34 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
254 ack_APIC_irq(); 256 ack_APIC_irq();
255} 257}
256 258
259/* Thermal monitoring depends on APIC, ACPI and clock modulation */
260static int intel_thermal_supported(struct cpuinfo_x86 *c)
261{
262 if (!cpu_has_apic)
263 return 0;
264 if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
265 return 0;
266 return 1;
267}
268
269void __init mcheck_intel_therm_init(void)
270{
271 /*
272 * This function is only called on boot CPU. Save the init thermal
273 * LVT value on BSP and use that value to restore APs' thermal LVT
274 * entry BIOS programmed later
275 */
276 if (intel_thermal_supported(&boot_cpu_data))
277 lvtthmr_init = apic_read(APIC_LVTTHMR);
278}
279
257void intel_init_thermal(struct cpuinfo_x86 *c) 280void intel_init_thermal(struct cpuinfo_x86 *c)
258{ 281{
259 unsigned int cpu = smp_processor_id(); 282 unsigned int cpu = smp_processor_id();
260 int tm2 = 0; 283 int tm2 = 0;
261 u32 l, h; 284 u32 l, h;
262 285
263 /* Thermal monitoring depends on ACPI and clock modulation*/ 286 if (!intel_thermal_supported(c))
264 if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
265 return; 287 return;
266 288
267 /* 289 /*
@@ -270,7 +292,20 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
270 * since it might be delivered via SMI already: 292 * since it might be delivered via SMI already:
271 */ 293 */
272 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 294 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
273 h = apic_read(APIC_LVTTHMR); 295
296 /*
297 * The initial value of thermal LVT entries on all APs always reads
298 * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
299 * sequence to them and LVT registers are reset to 0s except for
300 * the mask bits which are set to 1s when APs receive INIT IPI.
301 * Always restore the value that BIOS has programmed on AP based on
302 * BSP's info we saved since BIOS is always setting the same value
303 * for all threads/cores
304 */
305 apic_write(APIC_LVTTHMR, lvtthmr_init);
306
307 h = lvtthmr_init;
308
274 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { 309 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
275 printk(KERN_DEBUG 310 printk(KERN_DEBUG
276 "CPU%d: Thermal monitoring handled by SMI\n", cpu); 311 "CPU%d: Thermal monitoring handled by SMI\n", cpu);
@@ -312,8 +347,8 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
312 l = apic_read(APIC_LVTTHMR); 347 l = apic_read(APIC_LVTTHMR);
313 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); 348 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
314 349
315 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", 350 printk_once(KERN_INFO "CPU0: Thermal monitoring enabled (%s)\n",
316 cpu, tm2 ? "TM2" : "TM1"); 351 tm2 ? "TM2" : "TM1");
317 352
318 /* enable thermal throttle processing */ 353 /* enable thermal throttle processing */
319 atomic_set(&therm_throt_en, 1); 354 atomic_set(&therm_throt_en, 1);
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 315738c74aad..09b1698e0466 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -170,6 +170,41 @@ static int __init cmp_range(const void *x1, const void *x2)
170 return start1 - start2; 170 return start1 - start2;
171} 171}
172 172
173static int __init clean_sort_range(struct res_range *range, int az)
174{
175 int i, j, k = az - 1, nr_range = 0;
176
177 for (i = 0; i < k; i++) {
178 if (range[i].end)
179 continue;
180 for (j = k; j > i; j--) {
181 if (range[j].end) {
182 k = j;
183 break;
184 }
185 }
186 if (j == i)
187 break;
188 range[i].start = range[k].start;
189 range[i].end = range[k].end;
190 range[k].start = 0;
191 range[k].end = 0;
192 k--;
193 }
194 /* count it */
195 for (i = 0; i < az; i++) {
196 if (!range[i].end) {
197 nr_range = i;
198 break;
199 }
200 }
201
202 /* sort them */
203 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
204
205 return nr_range;
206}
207
173#define BIOS_BUG_MSG KERN_WARNING \ 208#define BIOS_BUG_MSG KERN_WARNING \
174 "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" 209 "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
175 210
@@ -223,22 +258,18 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
223 subtract_range(range, extra_remove_base, 258 subtract_range(range, extra_remove_base,
224 extra_remove_base + extra_remove_size - 1); 259 extra_remove_base + extra_remove_size - 1);
225 260
226 /* get new range num */
227 nr_range = 0;
228 for (i = 0; i < RANGE_NUM; i++) {
229 if (!range[i].end)
230 continue;
231 nr_range++;
232 }
233 if (debug_print) { 261 if (debug_print) {
234 printk(KERN_DEBUG "After UC checking\n"); 262 printk(KERN_DEBUG "After UC checking\n");
235 for (i = 0; i < nr_range; i++) 263 for (i = 0; i < RANGE_NUM; i++) {
264 if (!range[i].end)
265 continue;
236 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", 266 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
237 range[i].start, range[i].end + 1); 267 range[i].start, range[i].end + 1);
268 }
238 } 269 }
239 270
240 /* sort the ranges */ 271 /* sort the ranges */
241 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); 272 nr_range = clean_sort_range(range, RANGE_NUM);
242 if (debug_print) { 273 if (debug_print) {
243 printk(KERN_DEBUG "After sorting\n"); 274 printk(KERN_DEBUG "After sorting\n");
244 for (i = 0; i < nr_range; i++) 275 for (i = 0; i < nr_range; i++)
@@ -689,8 +720,6 @@ static int __init mtrr_need_cleanup(void)
689 continue; 720 continue;
690 if (!size) 721 if (!size)
691 type = MTRR_NUM_TYPES; 722 type = MTRR_NUM_TYPES;
692 if (type == MTRR_TYPE_WRPROT)
693 type = MTRR_TYPE_UNCACHABLE;
694 num[type]++; 723 num[type]++;
695 } 724 }
696 725
@@ -846,7 +875,7 @@ int __init mtrr_cleanup(unsigned address_bits)
846 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); 875 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
847 876
848 range_sums = sum_ranges(range, nr_range); 877 range_sums = sum_ranges(range, nr_range);
849 printk(KERN_INFO "total RAM coverred: %ldM\n", 878 printk(KERN_INFO "total RAM covered: %ldM\n",
850 range_sums >> (20 - PAGE_SHIFT)); 879 range_sums >> (20 - PAGE_SHIFT));
851 880
852 if (mtrr_chunk_size && mtrr_gran_size) { 881 if (mtrr_chunk_size && mtrr_gran_size) {
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index f04e72527604..e006e56f699c 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -4,6 +4,7 @@
4#include <linux/proc_fs.h> 4#include <linux/proc_fs.h>
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/ctype.h> 6#include <linux/ctype.h>
7#include <linux/string.h>
7#include <linux/init.h> 8#include <linux/init.h>
8 9
9#define LINE_SIZE 80 10#define LINE_SIZE 80
@@ -96,17 +97,24 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
96 unsigned long long base, size; 97 unsigned long long base, size;
97 char *ptr; 98 char *ptr;
98 char line[LINE_SIZE]; 99 char line[LINE_SIZE];
100 int length;
99 size_t linelen; 101 size_t linelen;
100 102
101 if (!capable(CAP_SYS_ADMIN)) 103 if (!capable(CAP_SYS_ADMIN))
102 return -EPERM; 104 return -EPERM;
103 if (!len)
104 return -EINVAL;
105 105
106 memset(line, 0, LINE_SIZE); 106 memset(line, 0, LINE_SIZE);
107 if (len > LINE_SIZE) 107
108 len = LINE_SIZE; 108 length = len;
109 if (copy_from_user(line, buf, len - 1)) 109 length--;
110
111 if (length > LINE_SIZE - 1)
112 length = LINE_SIZE - 1;
113
114 if (length < 0)
115 return -EINVAL;
116
117 if (copy_from_user(line, buf, length))
110 return -EFAULT; 118 return -EFAULT;
111 119
112 linelen = strlen(line); 120 linelen = strlen(line);
@@ -126,8 +134,7 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
126 return -EINVAL; 134 return -EINVAL;
127 135
128 base = simple_strtoull(line + 5, &ptr, 0); 136 base = simple_strtoull(line + 5, &ptr, 0);
129 while (isspace(*ptr)) 137 ptr = skip_spaces(ptr);
130 ptr++;
131 138
132 if (strncmp(ptr, "size=", 5)) 139 if (strncmp(ptr, "size=", 5))
133 return -EINVAL; 140 return -EINVAL;
@@ -135,14 +142,11 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
135 size = simple_strtoull(ptr + 5, &ptr, 0); 142 size = simple_strtoull(ptr + 5, &ptr, 0);
136 if ((base & 0xfff) || (size & 0xfff)) 143 if ((base & 0xfff) || (size & 0xfff))
137 return -EINVAL; 144 return -EINVAL;
138 while (isspace(*ptr)) 145 ptr = skip_spaces(ptr);
139 ptr++;
140 146
141 if (strncmp(ptr, "type=", 5)) 147 if (strncmp(ptr, "type=", 5))
142 return -EINVAL; 148 return -EINVAL;
143 ptr += 5; 149 ptr = skip_spaces(ptr + 5);
144 while (isspace(*ptr))
145 ptr++;
146 150
147 for (i = 0; i < MTRR_NUM_TYPES; ++i) { 151 for (i = 0; i < MTRR_NUM_TYPES; ++i) {
148 if (strcmp(ptr, mtrr_strings[i])) 152 if (strcmp(ptr, mtrr_strings[i]))
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index b5801c311846..d616c06e99b4 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -77,6 +77,18 @@ struct cpu_hw_events {
77 struct debug_store *ds; 77 struct debug_store *ds;
78}; 78};
79 79
80struct event_constraint {
81 unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
82 int code;
83};
84
85#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) }
86#define EVENT_CONSTRAINT_END { .code = 0, .idxmsk[0] = 0 }
87
88#define for_each_event_constraint(e, c) \
89 for ((e) = (c); (e)->idxmsk[0]; (e)++)
90
91
80/* 92/*
81 * struct x86_pmu - generic x86 pmu 93 * struct x86_pmu - generic x86 pmu
82 */ 94 */
@@ -102,6 +114,8 @@ struct x86_pmu {
102 u64 intel_ctrl; 114 u64 intel_ctrl;
103 void (*enable_bts)(u64 config); 115 void (*enable_bts)(u64 config);
104 void (*disable_bts)(void); 116 void (*disable_bts)(void);
117 int (*get_event_idx)(struct cpu_hw_events *cpuc,
118 struct hw_perf_event *hwc);
105}; 119};
106 120
107static struct x86_pmu x86_pmu __read_mostly; 121static struct x86_pmu x86_pmu __read_mostly;
@@ -110,6 +124,8 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
110 .enabled = 1, 124 .enabled = 1,
111}; 125};
112 126
127static const struct event_constraint *event_constraints;
128
113/* 129/*
114 * Not sure about some of these 130 * Not sure about some of these
115 */ 131 */
@@ -155,6 +171,16 @@ static u64 p6_pmu_raw_event(u64 hw_event)
155 return hw_event & P6_EVNTSEL_MASK; 171 return hw_event & P6_EVNTSEL_MASK;
156} 172}
157 173
174static const struct event_constraint intel_p6_event_constraints[] =
175{
176 EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
177 EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
178 EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */
179 EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
180 EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
181 EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
182 EVENT_CONSTRAINT_END
183};
158 184
159/* 185/*
160 * Intel PerfMon v3. Used on Core2 and later. 186 * Intel PerfMon v3. Used on Core2 and later.
@@ -170,6 +196,35 @@ static const u64 intel_perfmon_event_map[] =
170 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, 196 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
171}; 197};
172 198
199static const struct event_constraint intel_core_event_constraints[] =
200{
201 EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
202 EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
203 EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
204 EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
205 EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
206 EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
207 EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
208 EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
209 EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
210 EVENT_CONSTRAINT_END
211};
212
213static const struct event_constraint intel_nehalem_event_constraints[] =
214{
215 EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
216 EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
217 EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
218 EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
219 EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
220 EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */
221 EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
222 EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */
223 EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */
224 EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */
225 EVENT_CONSTRAINT_END
226};
227
173static u64 intel_pmu_event_map(int hw_event) 228static u64 intel_pmu_event_map(int hw_event)
174{ 229{
175 return intel_perfmon_event_map[hw_event]; 230 return intel_perfmon_event_map[hw_event];
@@ -190,7 +245,7 @@ static u64 __read_mostly hw_cache_event_ids
190 [PERF_COUNT_HW_CACHE_OP_MAX] 245 [PERF_COUNT_HW_CACHE_OP_MAX]
191 [PERF_COUNT_HW_CACHE_RESULT_MAX]; 246 [PERF_COUNT_HW_CACHE_RESULT_MAX];
192 247
193static const u64 nehalem_hw_cache_event_ids 248static __initconst u64 nehalem_hw_cache_event_ids
194 [PERF_COUNT_HW_CACHE_MAX] 249 [PERF_COUNT_HW_CACHE_MAX]
195 [PERF_COUNT_HW_CACHE_OP_MAX] 250 [PERF_COUNT_HW_CACHE_OP_MAX]
196 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 251 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -281,7 +336,7 @@ static const u64 nehalem_hw_cache_event_ids
281 }, 336 },
282}; 337};
283 338
284static const u64 core2_hw_cache_event_ids 339static __initconst u64 core2_hw_cache_event_ids
285 [PERF_COUNT_HW_CACHE_MAX] 340 [PERF_COUNT_HW_CACHE_MAX]
286 [PERF_COUNT_HW_CACHE_OP_MAX] 341 [PERF_COUNT_HW_CACHE_OP_MAX]
287 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 342 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -372,7 +427,7 @@ static const u64 core2_hw_cache_event_ids
372 }, 427 },
373}; 428};
374 429
375static const u64 atom_hw_cache_event_ids 430static __initconst u64 atom_hw_cache_event_ids
376 [PERF_COUNT_HW_CACHE_MAX] 431 [PERF_COUNT_HW_CACHE_MAX]
377 [PERF_COUNT_HW_CACHE_OP_MAX] 432 [PERF_COUNT_HW_CACHE_OP_MAX]
378 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 433 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -469,7 +524,7 @@ static u64 intel_pmu_raw_event(u64 hw_event)
469#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL 524#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
470#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL 525#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
471#define CORE_EVNTSEL_INV_MASK 0x00800000ULL 526#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
472#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL 527#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
473 528
474#define CORE_EVNTSEL_MASK \ 529#define CORE_EVNTSEL_MASK \
475 (CORE_EVNTSEL_EVENT_MASK | \ 530 (CORE_EVNTSEL_EVENT_MASK | \
@@ -481,7 +536,7 @@ static u64 intel_pmu_raw_event(u64 hw_event)
481 return hw_event & CORE_EVNTSEL_MASK; 536 return hw_event & CORE_EVNTSEL_MASK;
482} 537}
483 538
484static const u64 amd_hw_cache_event_ids 539static __initconst u64 amd_hw_cache_event_ids
485 [PERF_COUNT_HW_CACHE_MAX] 540 [PERF_COUNT_HW_CACHE_MAX]
486 [PERF_COUNT_HW_CACHE_OP_MAX] 541 [PERF_COUNT_HW_CACHE_OP_MAX]
487 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 542 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -932,6 +987,8 @@ static int __hw_perf_event_init(struct perf_event *event)
932 */ 987 */
933 hwc->config = ARCH_PERFMON_EVENTSEL_INT; 988 hwc->config = ARCH_PERFMON_EVENTSEL_INT;
934 989
990 hwc->idx = -1;
991
935 /* 992 /*
936 * Count user and OS events unless requested not to. 993 * Count user and OS events unless requested not to.
937 */ 994 */
@@ -1229,7 +1286,7 @@ x86_perf_event_set_period(struct perf_event *event,
1229 return 0; 1286 return 0;
1230 1287
1231 /* 1288 /*
1232 * If we are way outside a reasoable range then just skip forward: 1289 * If we are way outside a reasonable range then just skip forward:
1233 */ 1290 */
1234 if (unlikely(left <= -period)) { 1291 if (unlikely(left <= -period)) {
1235 left = period; 1292 left = period;
@@ -1334,8 +1391,7 @@ static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1334 x86_pmu_enable_event(hwc, idx); 1391 x86_pmu_enable_event(hwc, idx);
1335} 1392}
1336 1393
1337static int 1394static int fixed_mode_idx(struct hw_perf_event *hwc)
1338fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
1339{ 1395{
1340 unsigned int hw_event; 1396 unsigned int hw_event;
1341 1397
@@ -1349,6 +1405,12 @@ fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
1349 if (!x86_pmu.num_events_fixed) 1405 if (!x86_pmu.num_events_fixed)
1350 return -1; 1406 return -1;
1351 1407
1408 /*
1409 * fixed counters do not take all possible filters
1410 */
1411 if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK)
1412 return -1;
1413
1352 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) 1414 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
1353 return X86_PMC_IDX_FIXED_INSTRUCTIONS; 1415 return X86_PMC_IDX_FIXED_INSTRUCTIONS;
1354 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) 1416 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
@@ -1360,22 +1422,57 @@ fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
1360} 1422}
1361 1423
1362/* 1424/*
1363 * Find a PMC slot for the freshly enabled / scheduled in event: 1425 * generic counter allocator: get next free counter
1364 */ 1426 */
1365static int x86_pmu_enable(struct perf_event *event) 1427static int
1428gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1429{
1430 int idx;
1431
1432 idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events);
1433 return idx == x86_pmu.num_events ? -1 : idx;
1434}
1435
1436/*
1437 * intel-specific counter allocator: check event constraints
1438 */
1439static int
1440intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1441{
1442 const struct event_constraint *event_constraint;
1443 int i, code;
1444
1445 if (!event_constraints)
1446 goto skip;
1447
1448 code = hwc->config & CORE_EVNTSEL_EVENT_MASK;
1449
1450 for_each_event_constraint(event_constraint, event_constraints) {
1451 if (code == event_constraint->code) {
1452 for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) {
1453 if (!test_and_set_bit(i, cpuc->used_mask))
1454 return i;
1455 }
1456 return -1;
1457 }
1458 }
1459skip:
1460 return gen_get_event_idx(cpuc, hwc);
1461}
1462
1463static int
1464x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1366{ 1465{
1367 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1368 struct hw_perf_event *hwc = &event->hw;
1369 int idx; 1466 int idx;
1370 1467
1371 idx = fixed_mode_idx(event, hwc); 1468 idx = fixed_mode_idx(hwc);
1372 if (idx == X86_PMC_IDX_FIXED_BTS) { 1469 if (idx == X86_PMC_IDX_FIXED_BTS) {
1373 /* BTS is already occupied. */ 1470 /* BTS is already occupied. */
1374 if (test_and_set_bit(idx, cpuc->used_mask)) 1471 if (test_and_set_bit(idx, cpuc->used_mask))
1375 return -EAGAIN; 1472 return -EAGAIN;
1376 1473
1377 hwc->config_base = 0; 1474 hwc->config_base = 0;
1378 hwc->event_base = 0; 1475 hwc->event_base = 0;
1379 hwc->idx = idx; 1476 hwc->idx = idx;
1380 } else if (idx >= 0) { 1477 } else if (idx >= 0) {
1381 /* 1478 /*
@@ -1396,20 +1493,35 @@ static int x86_pmu_enable(struct perf_event *event)
1396 } else { 1493 } else {
1397 idx = hwc->idx; 1494 idx = hwc->idx;
1398 /* Try to get the previous generic event again */ 1495 /* Try to get the previous generic event again */
1399 if (test_and_set_bit(idx, cpuc->used_mask)) { 1496 if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) {
1400try_generic: 1497try_generic:
1401 idx = find_first_zero_bit(cpuc->used_mask, 1498 idx = x86_pmu.get_event_idx(cpuc, hwc);
1402 x86_pmu.num_events); 1499 if (idx == -1)
1403 if (idx == x86_pmu.num_events)
1404 return -EAGAIN; 1500 return -EAGAIN;
1405 1501
1406 set_bit(idx, cpuc->used_mask); 1502 set_bit(idx, cpuc->used_mask);
1407 hwc->idx = idx; 1503 hwc->idx = idx;
1408 } 1504 }
1409 hwc->config_base = x86_pmu.eventsel; 1505 hwc->config_base = x86_pmu.eventsel;
1410 hwc->event_base = x86_pmu.perfctr; 1506 hwc->event_base = x86_pmu.perfctr;
1411 } 1507 }
1412 1508
1509 return idx;
1510}
1511
1512/*
1513 * Find a PMC slot for the freshly enabled / scheduled in event:
1514 */
1515static int x86_pmu_enable(struct perf_event *event)
1516{
1517 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1518 struct hw_perf_event *hwc = &event->hw;
1519 int idx;
1520
1521 idx = x86_schedule_event(cpuc, hwc);
1522 if (idx < 0)
1523 return idx;
1524
1413 perf_events_lapic_init(); 1525 perf_events_lapic_init();
1414 1526
1415 x86_pmu.disable(hwc, idx); 1527 x86_pmu.disable(hwc, idx);
@@ -1520,6 +1632,7 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc)
1520 1632
1521 data.period = event->hw.last_period; 1633 data.period = event->hw.last_period;
1522 data.addr = 0; 1634 data.addr = 0;
1635 data.raw = NULL;
1523 regs.ip = 0; 1636 regs.ip = 0;
1524 1637
1525 /* 1638 /*
@@ -1637,6 +1750,7 @@ static int p6_pmu_handle_irq(struct pt_regs *regs)
1637 u64 val; 1750 u64 val;
1638 1751
1639 data.addr = 0; 1752 data.addr = 0;
1753 data.raw = NULL;
1640 1754
1641 cpuc = &__get_cpu_var(cpu_hw_events); 1755 cpuc = &__get_cpu_var(cpu_hw_events);
1642 1756
@@ -1682,6 +1796,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
1682 u64 ack, status; 1796 u64 ack, status;
1683 1797
1684 data.addr = 0; 1798 data.addr = 0;
1799 data.raw = NULL;
1685 1800
1686 cpuc = &__get_cpu_var(cpu_hw_events); 1801 cpuc = &__get_cpu_var(cpu_hw_events);
1687 1802
@@ -1745,6 +1860,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
1745 u64 val; 1860 u64 val;
1746 1861
1747 data.addr = 0; 1862 data.addr = 0;
1863 data.raw = NULL;
1748 1864
1749 cpuc = &__get_cpu_var(cpu_hw_events); 1865 cpuc = &__get_cpu_var(cpu_hw_events);
1750 1866
@@ -1852,7 +1968,7 @@ static __read_mostly struct notifier_block perf_event_nmi_notifier = {
1852 .priority = 1 1968 .priority = 1
1853}; 1969};
1854 1970
1855static struct x86_pmu p6_pmu = { 1971static __initconst struct x86_pmu p6_pmu = {
1856 .name = "p6", 1972 .name = "p6",
1857 .handle_irq = p6_pmu_handle_irq, 1973 .handle_irq = p6_pmu_handle_irq,
1858 .disable_all = p6_pmu_disable_all, 1974 .disable_all = p6_pmu_disable_all,
@@ -1877,9 +1993,10 @@ static struct x86_pmu p6_pmu = {
1877 */ 1993 */
1878 .event_bits = 32, 1994 .event_bits = 32,
1879 .event_mask = (1ULL << 32) - 1, 1995 .event_mask = (1ULL << 32) - 1,
1996 .get_event_idx = intel_get_event_idx,
1880}; 1997};
1881 1998
1882static struct x86_pmu intel_pmu = { 1999static __initconst struct x86_pmu intel_pmu = {
1883 .name = "Intel", 2000 .name = "Intel",
1884 .handle_irq = intel_pmu_handle_irq, 2001 .handle_irq = intel_pmu_handle_irq,
1885 .disable_all = intel_pmu_disable_all, 2002 .disable_all = intel_pmu_disable_all,
@@ -1900,9 +2017,10 @@ static struct x86_pmu intel_pmu = {
1900 .max_period = (1ULL << 31) - 1, 2017 .max_period = (1ULL << 31) - 1,
1901 .enable_bts = intel_pmu_enable_bts, 2018 .enable_bts = intel_pmu_enable_bts,
1902 .disable_bts = intel_pmu_disable_bts, 2019 .disable_bts = intel_pmu_disable_bts,
2020 .get_event_idx = intel_get_event_idx,
1903}; 2021};
1904 2022
1905static struct x86_pmu amd_pmu = { 2023static __initconst struct x86_pmu amd_pmu = {
1906 .name = "AMD", 2024 .name = "AMD",
1907 .handle_irq = amd_pmu_handle_irq, 2025 .handle_irq = amd_pmu_handle_irq,
1908 .disable_all = amd_pmu_disable_all, 2026 .disable_all = amd_pmu_disable_all,
@@ -1920,9 +2038,10 @@ static struct x86_pmu amd_pmu = {
1920 .apic = 1, 2038 .apic = 1,
1921 /* use highest bit to detect overflow */ 2039 /* use highest bit to detect overflow */
1922 .max_period = (1ULL << 47) - 1, 2040 .max_period = (1ULL << 47) - 1,
2041 .get_event_idx = gen_get_event_idx,
1923}; 2042};
1924 2043
1925static int p6_pmu_init(void) 2044static __init int p6_pmu_init(void)
1926{ 2045{
1927 switch (boot_cpu_data.x86_model) { 2046 switch (boot_cpu_data.x86_model) {
1928 case 1: 2047 case 1:
@@ -1932,10 +2051,12 @@ static int p6_pmu_init(void)
1932 case 7: 2051 case 7:
1933 case 8: 2052 case 8:
1934 case 11: /* Pentium III */ 2053 case 11: /* Pentium III */
2054 event_constraints = intel_p6_event_constraints;
1935 break; 2055 break;
1936 case 9: 2056 case 9:
1937 case 13: 2057 case 13:
1938 /* Pentium M */ 2058 /* Pentium M */
2059 event_constraints = intel_p6_event_constraints;
1939 break; 2060 break;
1940 default: 2061 default:
1941 pr_cont("unsupported p6 CPU model %d ", 2062 pr_cont("unsupported p6 CPU model %d ",
@@ -1945,16 +2066,10 @@ static int p6_pmu_init(void)
1945 2066
1946 x86_pmu = p6_pmu; 2067 x86_pmu = p6_pmu;
1947 2068
1948 if (!cpu_has_apic) {
1949 pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
1950 pr_info("no hardware sampling interrupt available.\n");
1951 x86_pmu.apic = 0;
1952 }
1953
1954 return 0; 2069 return 0;
1955} 2070}
1956 2071
1957static int intel_pmu_init(void) 2072static __init int intel_pmu_init(void)
1958{ 2073{
1959 union cpuid10_edx edx; 2074 union cpuid10_edx edx;
1960 union cpuid10_eax eax; 2075 union cpuid10_eax eax;
@@ -2007,12 +2122,14 @@ static int intel_pmu_init(void)
2007 sizeof(hw_cache_event_ids)); 2122 sizeof(hw_cache_event_ids));
2008 2123
2009 pr_cont("Core2 events, "); 2124 pr_cont("Core2 events, ");
2125 event_constraints = intel_core_event_constraints;
2010 break; 2126 break;
2011 default: 2127 default:
2012 case 26: 2128 case 26:
2013 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, 2129 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
2014 sizeof(hw_cache_event_ids)); 2130 sizeof(hw_cache_event_ids));
2015 2131
2132 event_constraints = intel_nehalem_event_constraints;
2016 pr_cont("Nehalem/Corei7 events, "); 2133 pr_cont("Nehalem/Corei7 events, ");
2017 break; 2134 break;
2018 case 28: 2135 case 28:
@@ -2025,7 +2142,7 @@ static int intel_pmu_init(void)
2025 return 0; 2142 return 0;
2026} 2143}
2027 2144
2028static int amd_pmu_init(void) 2145static __init int amd_pmu_init(void)
2029{ 2146{
2030 /* Performance-monitoring supported from K7 and later: */ 2147 /* Performance-monitoring supported from K7 and later: */
2031 if (boot_cpu_data.x86 < 6) 2148 if (boot_cpu_data.x86 < 6)
@@ -2040,6 +2157,16 @@ static int amd_pmu_init(void)
2040 return 0; 2157 return 0;
2041} 2158}
2042 2159
2160static void __init pmu_check_apic(void)
2161{
2162 if (cpu_has_apic)
2163 return;
2164
2165 x86_pmu.apic = 0;
2166 pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
2167 pr_info("no hardware sampling interrupt available.\n");
2168}
2169
2043void __init init_hw_perf_events(void) 2170void __init init_hw_perf_events(void)
2044{ 2171{
2045 int err; 2172 int err;
@@ -2061,6 +2188,8 @@ void __init init_hw_perf_events(void)
2061 return; 2188 return;
2062 } 2189 }
2063 2190
2191 pmu_check_apic();
2192
2064 pr_cont("%s PMU driver.\n", x86_pmu.name); 2193 pr_cont("%s PMU driver.\n", x86_pmu.name);
2065 2194
2066 if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) { 2195 if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) {
@@ -2105,11 +2234,47 @@ static const struct pmu pmu = {
2105 .unthrottle = x86_pmu_unthrottle, 2234 .unthrottle = x86_pmu_unthrottle,
2106}; 2235};
2107 2236
2237static int
2238validate_event(struct cpu_hw_events *cpuc, struct perf_event *event)
2239{
2240 struct hw_perf_event fake_event = event->hw;
2241
2242 if (event->pmu && event->pmu != &pmu)
2243 return 0;
2244
2245 return x86_schedule_event(cpuc, &fake_event) >= 0;
2246}
2247
2248static int validate_group(struct perf_event *event)
2249{
2250 struct perf_event *sibling, *leader = event->group_leader;
2251 struct cpu_hw_events fake_pmu;
2252
2253 memset(&fake_pmu, 0, sizeof(fake_pmu));
2254
2255 if (!validate_event(&fake_pmu, leader))
2256 return -ENOSPC;
2257
2258 list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
2259 if (!validate_event(&fake_pmu, sibling))
2260 return -ENOSPC;
2261 }
2262
2263 if (!validate_event(&fake_pmu, event))
2264 return -ENOSPC;
2265
2266 return 0;
2267}
2268
2108const struct pmu *hw_perf_event_init(struct perf_event *event) 2269const struct pmu *hw_perf_event_init(struct perf_event *event)
2109{ 2270{
2110 int err; 2271 int err;
2111 2272
2112 err = __hw_perf_event_init(event); 2273 err = __hw_perf_event_init(event);
2274 if (!err) {
2275 if (event->group_leader != event)
2276 err = validate_group(event);
2277 }
2113 if (err) { 2278 if (err) {
2114 if (event->destroy) 2279 if (event->destroy)
2115 event->destroy(event); 2280 event->destroy(event);
@@ -2132,7 +2297,7 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip)
2132 2297
2133static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); 2298static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
2134static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); 2299static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
2135static DEFINE_PER_CPU(int, in_nmi_frame); 2300static DEFINE_PER_CPU(int, in_ignored_frame);
2136 2301
2137 2302
2138static void 2303static void
@@ -2148,8 +2313,9 @@ static void backtrace_warning(void *data, char *msg)
2148 2313
2149static int backtrace_stack(void *data, char *name) 2314static int backtrace_stack(void *data, char *name)
2150{ 2315{
2151 per_cpu(in_nmi_frame, smp_processor_id()) = 2316 per_cpu(in_ignored_frame, smp_processor_id()) =
2152 x86_is_stack_id(NMI_STACK, name); 2317 x86_is_stack_id(NMI_STACK, name) ||
2318 x86_is_stack_id(DEBUG_STACK, name);
2153 2319
2154 return 0; 2320 return 0;
2155} 2321}
@@ -2158,7 +2324,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
2158{ 2324{
2159 struct perf_callchain_entry *entry = data; 2325 struct perf_callchain_entry *entry = data;
2160 2326
2161 if (per_cpu(in_nmi_frame, smp_processor_id())) 2327 if (per_cpu(in_ignored_frame, smp_processor_id()))
2162 return; 2328 return;
2163 2329
2164 if (reliable) 2330 if (reliable)
@@ -2170,6 +2336,7 @@ static const struct stacktrace_ops backtrace_ops = {
2170 .warning_symbol = backtrace_warning_symbol, 2336 .warning_symbol = backtrace_warning_symbol,
2171 .stack = backtrace_stack, 2337 .stack = backtrace_stack,
2172 .address = backtrace_address, 2338 .address = backtrace_address,
2339 .walk_stack = print_context_stack_bp,
2173}; 2340};
2174 2341
2175#include "../dumpstack.h" 2342#include "../dumpstack.h"
@@ -2180,7 +2347,7 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
2180 callchain_store(entry, PERF_CONTEXT_KERNEL); 2347 callchain_store(entry, PERF_CONTEXT_KERNEL);
2181 callchain_store(entry, regs->ip); 2348 callchain_store(entry, regs->ip);
2182 2349
2183 dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); 2350 dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
2184} 2351}
2185 2352
2186/* 2353/*
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index fab786f60ed6..898df9719afb 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -712,7 +712,7 @@ static void probe_nmi_watchdog(void)
712 switch (boot_cpu_data.x86_vendor) { 712 switch (boot_cpu_data.x86_vendor) {
713 case X86_VENDOR_AMD: 713 case X86_VENDOR_AMD:
714 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && 714 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
715 boot_cpu_data.x86 != 16) 715 boot_cpu_data.x86 != 16 && boot_cpu_data.x86 != 17)
716 return; 716 return;
717 wd_ops = &k7_wd_ops; 717 wd_ops = &k7_wd_ops;
718 break; 718 break;
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index bb62b3e5caad..28000743bbb0 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -26,7 +26,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
26 26
27 early_init_transmeta(c); 27 early_init_transmeta(c);
28 28
29 display_cacheinfo(c); 29 cpu_detect_cache_sizes(c);
30 30
31 /* Print CMS and CPU revision */ 31 /* Print CMS and CPU revision */
32 max = cpuid_eax(0x80860000); 32 max = cpuid_eax(0x80860000);
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 6a52d4b36a30..cb27fd6136c9 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -116,21 +116,16 @@ static int cpuid_open(struct inode *inode, struct file *file)
116{ 116{
117 unsigned int cpu; 117 unsigned int cpu;
118 struct cpuinfo_x86 *c; 118 struct cpuinfo_x86 *c;
119 int ret = 0;
120
121 lock_kernel();
122 119
123 cpu = iminor(file->f_path.dentry->d_inode); 120 cpu = iminor(file->f_path.dentry->d_inode);
124 if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { 121 if (cpu >= nr_cpu_ids || !cpu_online(cpu))
125 ret = -ENXIO; /* No such CPU */ 122 return -ENXIO; /* No such CPU */
126 goto out; 123
127 }
128 c = &cpu_data(cpu); 124 c = &cpu_data(cpu);
129 if (c->cpuid_level < 0) 125 if (c->cpuid_level < 0)
130 ret = -EIO; /* CPUID not supported */ 126 return -EIO; /* CPUID not supported */
131out: 127
132 unlock_kernel(); 128 return 0;
133 return ret;
134} 129}
135 130
136/* 131/*
@@ -192,7 +187,8 @@ static int __init cpuid_init(void)
192 int i, err = 0; 187 int i, err = 0;
193 i = 0; 188 i = 0;
194 189
195 if (register_chrdev(CPUID_MAJOR, "cpu/cpuid", &cpuid_fops)) { 190 if (__register_chrdev(CPUID_MAJOR, 0, NR_CPUS,
191 "cpu/cpuid", &cpuid_fops)) {
196 printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n", 192 printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n",
197 CPUID_MAJOR); 193 CPUID_MAJOR);
198 err = -EBUSY; 194 err = -EBUSY;
@@ -221,7 +217,7 @@ out_class:
221 } 217 }
222 class_destroy(cpuid_class); 218 class_destroy(cpuid_class);
223out_chrdev: 219out_chrdev:
224 unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); 220 __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
225out: 221out:
226 return err; 222 return err;
227} 223}
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 5e409dc298a4..a4849c10a77e 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -27,8 +27,7 @@
27#include <asm/cpu.h> 27#include <asm/cpu.h>
28#include <asm/reboot.h> 28#include <asm/reboot.h>
29#include <asm/virtext.h> 29#include <asm/virtext.h>
30#include <asm/iommu.h> 30#include <asm/x86_init.h>
31
32 31
33#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) 32#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
34 33
@@ -106,7 +105,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
106#endif 105#endif
107 106
108#ifdef CONFIG_X86_64 107#ifdef CONFIG_X86_64
109 pci_iommu_shutdown(); 108 x86_platform.iommu_shutdown();
110#endif 109#endif
111 110
112 crash_save_cpu(regs, safe_smp_processor_id()); 111 crash_save_cpu(regs, safe_smp_processor_id());
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index f7cdb3b457aa..cd97ce18c29d 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -16,6 +16,22 @@ static void *kdump_buf_page;
16/* Stores the physical address of elf header of crash image. */ 16/* Stores the physical address of elf header of crash image. */
17unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; 17unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
18 18
19static inline bool is_crashed_pfn_valid(unsigned long pfn)
20{
21#ifndef CONFIG_X86_PAE
22 /*
23 * non-PAE kdump kernel executed from a PAE one will crop high pte
24 * bits and poke unwanted space counting again from address 0, we
25 * don't want that. pte must fit into unsigned long. In fact the
26 * test checks high 12 bits for being zero (pfn will be shifted left
27 * by PAGE_SHIFT).
28 */
29 return pte_pfn(pfn_pte(pfn, __pgprot(0))) == pfn;
30#else
31 return true;
32#endif
33}
34
19/** 35/**
20 * copy_oldmem_page - copy one page from "oldmem" 36 * copy_oldmem_page - copy one page from "oldmem"
21 * @pfn: page frame number to be copied 37 * @pfn: page frame number to be copied
@@ -41,6 +57,9 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
41 if (!csize) 57 if (!csize)
42 return 0; 58 return 0;
43 59
60 if (!is_crashed_pfn_valid(pfn))
61 return -EFAULT;
62
44 vaddr = kmap_atomic_pfn(pfn, KM_PTE0); 63 vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
45 64
46 if (!userbuf) { 65 if (!userbuf) {
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 2d8a371d4339..c56bc2873030 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -109,6 +109,30 @@ print_context_stack(struct thread_info *tinfo,
109 } 109 }
110 return bp; 110 return bp;
111} 111}
112EXPORT_SYMBOL_GPL(print_context_stack);
113
114unsigned long
115print_context_stack_bp(struct thread_info *tinfo,
116 unsigned long *stack, unsigned long bp,
117 const struct stacktrace_ops *ops, void *data,
118 unsigned long *end, int *graph)
119{
120 struct stack_frame *frame = (struct stack_frame *)bp;
121 unsigned long *ret_addr = &frame->return_address;
122
123 while (valid_stack_ptr(tinfo, ret_addr, sizeof(*ret_addr), end)) {
124 unsigned long addr = *ret_addr;
125
126 if (__kernel_text_address(addr)) {
127 ops->address(data, addr, 1);
128 frame = frame->next_frame;
129 ret_addr = &frame->return_address;
130 print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
131 }
132 }
133 return (unsigned long)frame;
134}
135EXPORT_SYMBOL_GPL(print_context_stack_bp);
112 136
113 137
114static void 138static void
@@ -141,10 +165,11 @@ static void print_trace_address(void *data, unsigned long addr, int reliable)
141} 165}
142 166
143static const struct stacktrace_ops print_trace_ops = { 167static const struct stacktrace_ops print_trace_ops = {
144 .warning = print_trace_warning, 168 .warning = print_trace_warning,
145 .warning_symbol = print_trace_warning_symbol, 169 .warning_symbol = print_trace_warning_symbol,
146 .stack = print_trace_stack, 170 .stack = print_trace_stack,
147 .address = print_trace_address, 171 .address = print_trace_address,
172 .walk_stack = print_context_stack,
148}; 173};
149 174
150void 175void
@@ -188,7 +213,7 @@ void dump_stack(void)
188} 213}
189EXPORT_SYMBOL(dump_stack); 214EXPORT_SYMBOL(dump_stack);
190 215
191static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; 216static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
192static int die_owner = -1; 217static int die_owner = -1;
193static unsigned int die_nest_count; 218static unsigned int die_nest_count;
194 219
@@ -207,11 +232,11 @@ unsigned __kprobes long oops_begin(void)
207 /* racy, but better than risking deadlock. */ 232 /* racy, but better than risking deadlock. */
208 raw_local_irq_save(flags); 233 raw_local_irq_save(flags);
209 cpu = smp_processor_id(); 234 cpu = smp_processor_id();
210 if (!__raw_spin_trylock(&die_lock)) { 235 if (!arch_spin_trylock(&die_lock)) {
211 if (cpu == die_owner) 236 if (cpu == die_owner)
212 /* nested oops. should stop eventually */; 237 /* nested oops. should stop eventually */;
213 else 238 else
214 __raw_spin_lock(&die_lock); 239 arch_spin_lock(&die_lock);
215 } 240 }
216 die_nest_count++; 241 die_nest_count++;
217 die_owner = cpu; 242 die_owner = cpu;
@@ -231,7 +256,7 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
231 die_nest_count--; 256 die_nest_count--;
232 if (!die_nest_count) 257 if (!die_nest_count)
233 /* Nest count reaches zero, release the lock. */ 258 /* Nest count reaches zero, release the lock. */
234 __raw_spin_unlock(&die_lock); 259 arch_spin_unlock(&die_lock);
235 raw_local_irq_restore(flags); 260 raw_local_irq_restore(flags);
236 oops_exit(); 261 oops_exit();
237 262
@@ -268,11 +293,12 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
268 293
269 show_registers(regs); 294 show_registers(regs);
270#ifdef CONFIG_X86_32 295#ifdef CONFIG_X86_32
271 sp = (unsigned long) (&regs->sp); 296 if (user_mode_vm(regs)) {
272 savesegment(ss, ss);
273 if (user_mode(regs)) {
274 sp = regs->sp; 297 sp = regs->sp;
275 ss = regs->ss & 0xffff; 298 ss = regs->ss & 0xffff;
299 } else {
300 sp = kernel_stack_pointer(regs);
301 savesegment(ss, ss);
276 } 302 }
277 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); 303 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
278 print_symbol("%s", regs->ip); 304 print_symbol("%s", regs->ip);
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
index 81086c227ab7..4fd1420faffa 100644
--- a/arch/x86/kernel/dumpstack.h
+++ b/arch/x86/kernel/dumpstack.h
@@ -14,12 +14,6 @@
14#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) 14#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
15#endif 15#endif
16 16
17extern unsigned long
18print_context_stack(struct thread_info *tinfo,
19 unsigned long *stack, unsigned long bp,
20 const struct stacktrace_ops *ops, void *data,
21 unsigned long *end, int *graph);
22
23extern void 17extern void
24show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 18show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
25 unsigned long *stack, unsigned long bp, char *log_lvl); 19 unsigned long *stack, unsigned long bp, char *log_lvl);
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index f7dd2a7c3bf4..ae775ca47b25 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -10,9 +10,9 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/ptrace.h> 11#include <linux/ptrace.h>
12#include <linux/kexec.h> 12#include <linux/kexec.h>
13#include <linux/sysfs.h>
13#include <linux/bug.h> 14#include <linux/bug.h>
14#include <linux/nmi.h> 15#include <linux/nmi.h>
15#include <linux/sysfs.h>
16 16
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
@@ -35,6 +35,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
35 35
36 if (!stack) { 36 if (!stack) {
37 unsigned long dummy; 37 unsigned long dummy;
38
38 stack = &dummy; 39 stack = &dummy;
39 if (task && task != current) 40 if (task && task != current)
40 stack = (unsigned long *)task->thread.sp; 41 stack = (unsigned long *)task->thread.sp;
@@ -57,8 +58,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
57 58
58 context = (struct thread_info *) 59 context = (struct thread_info *)
59 ((unsigned long)stack & (~(THREAD_SIZE - 1))); 60 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
60 bp = print_context_stack(context, stack, bp, ops, 61 bp = ops->walk_stack(context, stack, bp, ops, data, NULL, &graph);
61 data, NULL, &graph);
62 62
63 stack = (unsigned long *)context->previous_esp; 63 stack = (unsigned long *)context->previous_esp;
64 if (!stack) 64 if (!stack)
@@ -72,7 +72,7 @@ EXPORT_SYMBOL(dump_trace);
72 72
73void 73void
74show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 74show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
75 unsigned long *sp, unsigned long bp, char *log_lvl) 75 unsigned long *sp, unsigned long bp, char *log_lvl)
76{ 76{
77 unsigned long *stack; 77 unsigned long *stack;
78 int i; 78 int i;
@@ -156,4 +156,3 @@ int is_valid_bugaddr(unsigned long ip)
156 156
157 return ud2 == 0x0b0f; 157 return ud2 == 0x0b0f;
158} 158}
159
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index a071e6be177e..0ad9597073f5 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -10,26 +10,28 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/ptrace.h> 11#include <linux/ptrace.h>
12#include <linux/kexec.h> 12#include <linux/kexec.h>
13#include <linux/sysfs.h>
13#include <linux/bug.h> 14#include <linux/bug.h>
14#include <linux/nmi.h> 15#include <linux/nmi.h>
15#include <linux/sysfs.h>
16 16
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
19#include "dumpstack.h" 19#include "dumpstack.h"
20 20
21#define N_EXCEPTION_STACKS_END \
22 (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2)
21 23
22static char x86_stack_ids[][8] = { 24static char x86_stack_ids[][8] = {
23 [DEBUG_STACK - 1] = "#DB", 25 [ DEBUG_STACK-1 ] = "#DB",
24 [NMI_STACK - 1] = "NMI", 26 [ NMI_STACK-1 ] = "NMI",
25 [DOUBLEFAULT_STACK - 1] = "#DF", 27 [ DOUBLEFAULT_STACK-1 ] = "#DF",
26 [STACKFAULT_STACK - 1] = "#SS", 28 [ STACKFAULT_STACK-1 ] = "#SS",
27 [MCE_STACK - 1] = "#MC", 29 [ MCE_STACK-1 ] = "#MC",
28#if DEBUG_STKSZ > EXCEPTION_STKSZ 30#if DEBUG_STKSZ > EXCEPTION_STKSZ
29 [N_EXCEPTION_STACKS ... 31 [ N_EXCEPTION_STACKS ...
30 N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" 32 N_EXCEPTION_STACKS_END ] = "#DB[?]"
31#endif 33#endif
32 }; 34};
33 35
34int x86_is_stack_id(int id, char *name) 36int x86_is_stack_id(int id, char *name)
35{ 37{
@@ -37,7 +39,7 @@ int x86_is_stack_id(int id, char *name)
37} 39}
38 40
39static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 41static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
40 unsigned *usedp, char **idp) 42 unsigned *usedp, char **idp)
41{ 43{
42 unsigned k; 44 unsigned k;
43 45
@@ -101,6 +103,35 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
101 return NULL; 103 return NULL;
102} 104}
103 105
106static inline int
107in_irq_stack(unsigned long *stack, unsigned long *irq_stack,
108 unsigned long *irq_stack_end)
109{
110 return (stack >= irq_stack && stack < irq_stack_end);
111}
112
113/*
114 * We are returning from the irq stack and go to the previous one.
115 * If the previous stack is also in the irq stack, then bp in the first
116 * frame of the irq stack points to the previous, interrupted one.
117 * Otherwise we have another level of indirection: We first save
118 * the bp of the previous stack, then we switch the stack to the irq one
119 * and save a new bp that links to the previous one.
120 * (See save_args())
121 */
122static inline unsigned long
123fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
124 unsigned long *irq_stack, unsigned long *irq_stack_end)
125{
126#ifdef CONFIG_FRAME_POINTER
127 struct stack_frame *frame = (struct stack_frame *)bp;
128
129 if (!in_irq_stack(stack, irq_stack, irq_stack_end))
130 return (unsigned long)frame->next_frame;
131#endif
132 return bp;
133}
134
104/* 135/*
105 * x86-64 can have up to three kernel stacks: 136 * x86-64 can have up to three kernel stacks:
106 * process stack 137 * process stack
@@ -157,8 +188,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
157 if (ops->stack(data, id) < 0) 188 if (ops->stack(data, id) < 0)
158 break; 189 break;
159 190
160 bp = print_context_stack(tinfo, stack, bp, ops, 191 bp = ops->walk_stack(tinfo, stack, bp, ops,
161 data, estack_end, &graph); 192 data, estack_end, &graph);
162 ops->stack(data, "<EOE>"); 193 ops->stack(data, "<EOE>");
163 /* 194 /*
164 * We link to the next stack via the 195 * We link to the next stack via the
@@ -173,7 +204,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
173 irq_stack = irq_stack_end - 204 irq_stack = irq_stack_end -
174 (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack); 205 (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack);
175 206
176 if (stack >= irq_stack && stack < irq_stack_end) { 207 if (in_irq_stack(stack, irq_stack, irq_stack_end)) {
177 if (ops->stack(data, "IRQ") < 0) 208 if (ops->stack(data, "IRQ") < 0)
178 break; 209 break;
179 bp = print_context_stack(tinfo, stack, bp, 210 bp = print_context_stack(tinfo, stack, bp,
@@ -184,6 +215,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
184 * pointer (index -1 to end) in the IRQ stack: 215 * pointer (index -1 to end) in the IRQ stack:
185 */ 216 */
186 stack = (unsigned long *) (irq_stack_end[-1]); 217 stack = (unsigned long *) (irq_stack_end[-1]);
218 bp = fixup_bp_irq_link(bp, stack, irq_stack,
219 irq_stack_end);
187 irq_stack_end = NULL; 220 irq_stack_end = NULL;
188 ops->stack(data, "EOI"); 221 ops->stack(data, "EOI");
189 continue; 222 continue;
@@ -202,21 +235,24 @@ EXPORT_SYMBOL(dump_trace);
202 235
203void 236void
204show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 237show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
205 unsigned long *sp, unsigned long bp, char *log_lvl) 238 unsigned long *sp, unsigned long bp, char *log_lvl)
206{ 239{
240 unsigned long *irq_stack_end;
241 unsigned long *irq_stack;
207 unsigned long *stack; 242 unsigned long *stack;
243 int cpu;
208 int i; 244 int i;
209 const int cpu = smp_processor_id(); 245
210 unsigned long *irq_stack_end = 246 preempt_disable();
211 (unsigned long *)(per_cpu(irq_stack_ptr, cpu)); 247 cpu = smp_processor_id();
212 unsigned long *irq_stack = 248
213 (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE); 249 irq_stack_end = (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
250 irq_stack = (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE);
214 251
215 /* 252 /*
216 * debugging aid: "show_stack(NULL, NULL);" prints the 253 * Debugging aid: "show_stack(NULL, NULL);" prints the
217 * back trace for this cpu. 254 * back trace for this cpu:
218 */ 255 */
219
220 if (sp == NULL) { 256 if (sp == NULL) {
221 if (task) 257 if (task)
222 sp = (unsigned long *)task->thread.sp; 258 sp = (unsigned long *)task->thread.sp;
@@ -240,6 +276,8 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
240 printk(" %016lx", *stack++); 276 printk(" %016lx", *stack++);
241 touch_nmi_watchdog(); 277 touch_nmi_watchdog();
242 } 278 }
279 preempt_enable();
280
243 printk("\n"); 281 printk("\n");
244 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 282 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
245} 283}
@@ -303,4 +341,3 @@ int is_valid_bugaddr(unsigned long ip)
303 341
304 return ud2 == 0x0b0f; 342 return ud2 == 0x0b0f;
305} 343}
306
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 85419bb7d4ab..05ed7ab2ca48 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -724,7 +724,7 @@ core_initcall(e820_mark_nvs_memory);
724/* 724/*
725 * Early reserved memory areas. 725 * Early reserved memory areas.
726 */ 726 */
727#define MAX_EARLY_RES 20 727#define MAX_EARLY_RES 32
728 728
729struct early_res { 729struct early_res {
730 u64 start, end; 730 u64 start, end;
@@ -732,7 +732,16 @@ struct early_res {
732 char overlap_ok; 732 char overlap_ok;
733}; 733};
734static struct early_res early_res[MAX_EARLY_RES] __initdata = { 734static struct early_res early_res[MAX_EARLY_RES] __initdata = {
735 { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ 735 { 0, PAGE_SIZE, "BIOS data page", 1 }, /* BIOS data page */
736#ifdef CONFIG_X86_32
737 /*
738 * But first pinch a few for the stack/trampoline stuff
739 * FIXME: Don't need the extra page at 4K, but need to fix
740 * trampoline before removing it. (see the GDT stuff)
741 */
742 { PAGE_SIZE, PAGE_SIZE, "EX TRAMPOLINE", 1 },
743#endif
744
736 {} 745 {}
737}; 746};
738 747
@@ -1378,8 +1387,8 @@ static unsigned long ram_alignment(resource_size_t pos)
1378 if (mb < 16) 1387 if (mb < 16)
1379 return 1024*1024; 1388 return 1024*1024;
1380 1389
1381 /* To 32MB for anything above that */ 1390 /* To 64MB for anything above that */
1382 return 32*1024*1024; 1391 return 64*1024*1024;
1383} 1392}
1384 1393
1385#define MAX_RESOURCE_SIZE ((resource_size_t)-1) 1394#define MAX_RESOURCE_SIZE ((resource_size_t)-1)
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 41fd965c80c6..b9c830c12b4a 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -206,8 +206,11 @@ static int __init setup_early_printk(char *buf)
206 206
207 while (*buf != '\0') { 207 while (*buf != '\0') {
208 if (!strncmp(buf, "serial", 6)) { 208 if (!strncmp(buf, "serial", 6)) {
209 early_serial_init(buf + 6); 209 buf += 6;
210 early_serial_init(buf);
210 early_console_register(&early_serial_console, keep); 211 early_console_register(&early_serial_console, keep);
212 if (!strncmp(buf, ",ttyS", 5))
213 buf += 5;
211 } 214 }
212 if (!strncmp(buf, "ttyS", 4)) { 215 if (!strncmp(buf, "ttyS", 4)) {
213 early_serial_init(buf + 4); 216 early_serial_init(buf + 4);
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index ad5bd988fb79..cdcfb122f256 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -454,8 +454,10 @@ void __init efi_init(void)
454 if (add_efi_memmap) 454 if (add_efi_memmap)
455 do_add_efi_memmap(); 455 do_add_efi_memmap();
456 456
457#ifdef CONFIG_X86_32
457 x86_platform.get_wallclock = efi_get_time; 458 x86_platform.get_wallclock = efi_get_time;
458 x86_platform.set_wallclock = efi_set_rtc_mmss; 459 x86_platform.set_wallclock = efi_set_rtc_mmss;
460#endif
459 461
460 /* Setup for EFI runtime service */ 462 /* Setup for EFI runtime service */
461 reboot_type = BOOT_EFI; 463 reboot_type = BOOT_EFI;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c097e7d607c6..44a8e0dc6737 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -334,6 +334,10 @@ ENTRY(ret_from_fork)
334END(ret_from_fork) 334END(ret_from_fork)
335 335
336/* 336/*
337 * Interrupt exit functions should be protected against kprobes
338 */
339 .pushsection .kprobes.text, "ax"
340/*
337 * Return to user mode is not as complex as all this looks, 341 * Return to user mode is not as complex as all this looks,
338 * but we want the default path for a system call return to 342 * but we want the default path for a system call return to
339 * go as quickly as possible which is why some of this is 343 * go as quickly as possible which is why some of this is
@@ -383,6 +387,10 @@ need_resched:
383END(resume_kernel) 387END(resume_kernel)
384#endif 388#endif
385 CFI_ENDPROC 389 CFI_ENDPROC
390/*
391 * End of kprobes section
392 */
393 .popsection
386 394
387/* SYSENTER_RETURN points to after the "sysenter" instruction in 395/* SYSENTER_RETURN points to after the "sysenter" instruction in
388 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ 396 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
@@ -513,6 +521,10 @@ sysexit_audit:
513 PTGS_TO_GS_EX 521 PTGS_TO_GS_EX
514ENDPROC(ia32_sysenter_target) 522ENDPROC(ia32_sysenter_target)
515 523
524/*
525 * syscall stub including irq exit should be protected against kprobes
526 */
527 .pushsection .kprobes.text, "ax"
516 # system call handler stub 528 # system call handler stub
517ENTRY(system_call) 529ENTRY(system_call)
518 RING0_INT_FRAME # can't unwind into user space anyway 530 RING0_INT_FRAME # can't unwind into user space anyway
@@ -705,26 +717,69 @@ syscall_badsys:
705 jmp resume_userspace 717 jmp resume_userspace
706END(syscall_badsys) 718END(syscall_badsys)
707 CFI_ENDPROC 719 CFI_ENDPROC
720/*
721 * End of kprobes section
722 */
723 .popsection
708 724
709/* 725/*
710 * System calls that need a pt_regs pointer. 726 * System calls that need a pt_regs pointer.
711 */ 727 */
712#define PTREGSCALL(name) \ 728#define PTREGSCALL0(name) \
713 ALIGN; \ 729 ALIGN; \
714ptregs_##name: \ 730ptregs_##name: \
715 leal 4(%esp),%eax; \ 731 leal 4(%esp),%eax; \
716 jmp sys_##name; 732 jmp sys_##name;
717 733
718PTREGSCALL(iopl) 734#define PTREGSCALL1(name) \
719PTREGSCALL(fork) 735 ALIGN; \
720PTREGSCALL(clone) 736ptregs_##name: \
721PTREGSCALL(vfork) 737 leal 4(%esp),%edx; \
722PTREGSCALL(execve) 738 movl (PT_EBX+4)(%esp),%eax; \
723PTREGSCALL(sigaltstack) 739 jmp sys_##name;
724PTREGSCALL(sigreturn) 740
725PTREGSCALL(rt_sigreturn) 741#define PTREGSCALL2(name) \
726PTREGSCALL(vm86) 742 ALIGN; \
727PTREGSCALL(vm86old) 743ptregs_##name: \
744 leal 4(%esp),%ecx; \
745 movl (PT_ECX+4)(%esp),%edx; \
746 movl (PT_EBX+4)(%esp),%eax; \
747 jmp sys_##name;
748
749#define PTREGSCALL3(name) \
750 ALIGN; \
751ptregs_##name: \
752 leal 4(%esp),%eax; \
753 pushl %eax; \
754 movl PT_EDX(%eax),%ecx; \
755 movl PT_ECX(%eax),%edx; \
756 movl PT_EBX(%eax),%eax; \
757 call sys_##name; \
758 addl $4,%esp; \
759 ret
760
761PTREGSCALL1(iopl)
762PTREGSCALL0(fork)
763PTREGSCALL0(vfork)
764PTREGSCALL3(execve)
765PTREGSCALL2(sigaltstack)
766PTREGSCALL0(sigreturn)
767PTREGSCALL0(rt_sigreturn)
768PTREGSCALL2(vm86)
769PTREGSCALL1(vm86old)
770
771/* Clone is an oddball. The 4th arg is in %edi */
772 ALIGN;
773ptregs_clone:
774 leal 4(%esp),%eax
775 pushl %eax
776 pushl PT_EDI(%eax)
777 movl PT_EDX(%eax),%ecx
778 movl PT_ECX(%eax),%edx
779 movl PT_EBX(%eax),%eax
780 call sys_clone
781 addl $8,%esp
782 ret
728 783
729.macro FIXUP_ESPFIX_STACK 784.macro FIXUP_ESPFIX_STACK
730/* 785/*
@@ -814,6 +869,10 @@ common_interrupt:
814ENDPROC(common_interrupt) 869ENDPROC(common_interrupt)
815 CFI_ENDPROC 870 CFI_ENDPROC
816 871
872/*
873 * Irq entries should be protected against kprobes
874 */
875 .pushsection .kprobes.text, "ax"
817#define BUILD_INTERRUPT3(name, nr, fn) \ 876#define BUILD_INTERRUPT3(name, nr, fn) \
818ENTRY(name) \ 877ENTRY(name) \
819 RING0_INT_FRAME; \ 878 RING0_INT_FRAME; \
@@ -980,16 +1039,16 @@ ENTRY(spurious_interrupt_bug)
980 jmp error_code 1039 jmp error_code
981 CFI_ENDPROC 1040 CFI_ENDPROC
982END(spurious_interrupt_bug) 1041END(spurious_interrupt_bug)
1042/*
1043 * End of kprobes section
1044 */
1045 .popsection
983 1046
984ENTRY(kernel_thread_helper) 1047ENTRY(kernel_thread_helper)
985 pushl $0 # fake return address for unwinder 1048 pushl $0 # fake return address for unwinder
986 CFI_STARTPROC 1049 CFI_STARTPROC
987 movl %edx,%eax 1050 movl %edi,%eax
988 push %edx 1051 call *%esi
989 CFI_ADJUST_CFA_OFFSET 4
990 call *%ebx
991 push %eax
992 CFI_ADJUST_CFA_OFFSET 4
993 call do_exit 1052 call do_exit
994 ud2 # padding for call trace 1053 ud2 # padding for call trace
995 CFI_ENDPROC 1054 CFI_ENDPROC
@@ -1185,17 +1244,14 @@ END(ftrace_graph_caller)
1185 1244
1186.globl return_to_handler 1245.globl return_to_handler
1187return_to_handler: 1246return_to_handler:
1188 pushl $0
1189 pushl %eax 1247 pushl %eax
1190 pushl %ecx
1191 pushl %edx 1248 pushl %edx
1192 movl %ebp, %eax 1249 movl %ebp, %eax
1193 call ftrace_return_to_handler 1250 call ftrace_return_to_handler
1194 movl %eax, 0xc(%esp) 1251 movl %eax, %ecx
1195 popl %edx 1252 popl %edx
1196 popl %ecx
1197 popl %eax 1253 popl %eax
1198 ret 1254 jmp *%ecx
1199#endif 1255#endif
1200 1256
1201.section .rodata,"a" 1257.section .rodata,"a"
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b5c061f8f358..0697ff139837 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -155,11 +155,11 @@ GLOBAL(return_to_handler)
155 155
156 call ftrace_return_to_handler 156 call ftrace_return_to_handler
157 157
158 movq %rax, 16(%rsp) 158 movq %rax, %rdi
159 movq 8(%rsp), %rdx 159 movq 8(%rsp), %rdx
160 movq (%rsp), %rax 160 movq (%rsp), %rax
161 addq $16, %rsp 161 addq $24, %rsp
162 retq 162 jmp *%rdi
163#endif 163#endif
164 164
165 165
@@ -803,6 +803,10 @@ END(interrupt)
803 call \func 803 call \func
804 .endm 804 .endm
805 805
806/*
807 * Interrupt entry/exit should be protected against kprobes
808 */
809 .pushsection .kprobes.text, "ax"
806 /* 810 /*
807 * The interrupt stubs push (~vector+0x80) onto the stack and 811 * The interrupt stubs push (~vector+0x80) onto the stack and
808 * then jump to common_interrupt. 812 * then jump to common_interrupt.
@@ -941,6 +945,10 @@ ENTRY(retint_kernel)
941 945
942 CFI_ENDPROC 946 CFI_ENDPROC
943END(common_interrupt) 947END(common_interrupt)
948/*
949 * End of kprobes section
950 */
951 .popsection
944 952
945/* 953/*
946 * APIC interrupts. 954 * APIC interrupts.
@@ -969,8 +977,8 @@ apicinterrupt UV_BAU_MESSAGE \
969#endif 977#endif
970apicinterrupt LOCAL_TIMER_VECTOR \ 978apicinterrupt LOCAL_TIMER_VECTOR \
971 apic_timer_interrupt smp_apic_timer_interrupt 979 apic_timer_interrupt smp_apic_timer_interrupt
972apicinterrupt GENERIC_INTERRUPT_VECTOR \ 980apicinterrupt X86_PLATFORM_IPI_VECTOR \
973 generic_interrupt smp_generic_interrupt 981 x86_platform_ipi smp_x86_platform_ipi
974 982
975#ifdef CONFIG_SMP 983#ifdef CONFIG_SMP
976apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ 984apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
@@ -1068,10 +1076,10 @@ ENTRY(\sym)
1068 TRACE_IRQS_OFF 1076 TRACE_IRQS_OFF
1069 movq %rsp,%rdi /* pt_regs pointer */ 1077 movq %rsp,%rdi /* pt_regs pointer */
1070 xorl %esi,%esi /* no error code */ 1078 xorl %esi,%esi /* no error code */
1071 PER_CPU(init_tss, %rbp) 1079 PER_CPU(init_tss, %r12)
1072 subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp) 1080 subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
1073 call \do_sym 1081 call \do_sym
1074 addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp) 1082 addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
1075 jmp paranoid_exit /* %ebx: no swapgs flag */ 1083 jmp paranoid_exit /* %ebx: no swapgs flag */
1076 CFI_ENDPROC 1084 CFI_ENDPROC
1077END(\sym) 1085END(\sym)
@@ -1158,63 +1166,20 @@ bad_gs:
1158 jmp 2b 1166 jmp 2b
1159 .previous 1167 .previous
1160 1168
1161/* 1169ENTRY(kernel_thread_helper)
1162 * Create a kernel thread.
1163 *
1164 * C extern interface:
1165 * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
1166 *
1167 * asm input arguments:
1168 * rdi: fn, rsi: arg, rdx: flags
1169 */
1170ENTRY(kernel_thread)
1171 CFI_STARTPROC
1172 FAKE_STACK_FRAME $child_rip
1173 SAVE_ALL
1174
1175 # rdi: flags, rsi: usp, rdx: will be &pt_regs
1176 movq %rdx,%rdi
1177 orq kernel_thread_flags(%rip),%rdi
1178 movq $-1, %rsi
1179 movq %rsp, %rdx
1180
1181 xorl %r8d,%r8d
1182 xorl %r9d,%r9d
1183
1184 # clone now
1185 call do_fork
1186 movq %rax,RAX(%rsp)
1187 xorl %edi,%edi
1188
1189 /*
1190 * It isn't worth to check for reschedule here,
1191 * so internally to the x86_64 port you can rely on kernel_thread()
1192 * not to reschedule the child before returning, this avoids the need
1193 * of hacks for example to fork off the per-CPU idle tasks.
1194 * [Hopefully no generic code relies on the reschedule -AK]
1195 */
1196 RESTORE_ALL
1197 UNFAKE_STACK_FRAME
1198 ret
1199 CFI_ENDPROC
1200END(kernel_thread)
1201
1202ENTRY(child_rip)
1203 pushq $0 # fake return address 1170 pushq $0 # fake return address
1204 CFI_STARTPROC 1171 CFI_STARTPROC
1205 /* 1172 /*
1206 * Here we are in the child and the registers are set as they were 1173 * Here we are in the child and the registers are set as they were
1207 * at kernel_thread() invocation in the parent. 1174 * at kernel_thread() invocation in the parent.
1208 */ 1175 */
1209 movq %rdi, %rax 1176 call *%rsi
1210 movq %rsi, %rdi
1211 call *%rax
1212 # exit 1177 # exit
1213 mov %eax, %edi 1178 mov %eax, %edi
1214 call do_exit 1179 call do_exit
1215 ud2 # padding for call trace 1180 ud2 # padding for call trace
1216 CFI_ENDPROC 1181 CFI_ENDPROC
1217END(child_rip) 1182END(kernel_thread_helper)
1218 1183
1219/* 1184/*
1220 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. 1185 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
@@ -1491,12 +1456,17 @@ error_kernelspace:
1491 leaq irq_return(%rip),%rcx 1456 leaq irq_return(%rip),%rcx
1492 cmpq %rcx,RIP+8(%rsp) 1457 cmpq %rcx,RIP+8(%rsp)
1493 je error_swapgs 1458 je error_swapgs
1494 movl %ecx,%ecx /* zero extend */ 1459 movl %ecx,%eax /* zero extend */
1495 cmpq %rcx,RIP+8(%rsp) 1460 cmpq %rax,RIP+8(%rsp)
1496 je error_swapgs 1461 je bstep_iret
1497 cmpq $gs_change,RIP+8(%rsp) 1462 cmpq $gs_change,RIP+8(%rsp)
1498 je error_swapgs 1463 je error_swapgs
1499 jmp error_sti 1464 jmp error_sti
1465
1466bstep_iret:
1467 /* Fix truncated RIP */
1468 movq %rcx,RIP+8(%rsp)
1469 jmp error_swapgs
1500END(error_entry) 1470END(error_entry)
1501 1471
1502 1472
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 9dbb527e1652..309689245431 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -9,6 +9,8 @@
9 * the dangers of modifying code on the run. 9 * the dangers of modifying code on the run.
10 */ 10 */
11 11
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13
12#include <linux/spinlock.h> 14#include <linux/spinlock.h>
13#include <linux/hardirq.h> 15#include <linux/hardirq.h>
14#include <linux/uaccess.h> 16#include <linux/uaccess.h>
@@ -187,9 +189,26 @@ static void wait_for_nmi(void)
187 nmi_wait_count++; 189 nmi_wait_count++;
188} 190}
189 191
192static inline int
193within(unsigned long addr, unsigned long start, unsigned long end)
194{
195 return addr >= start && addr < end;
196}
197
190static int 198static int
191do_ftrace_mod_code(unsigned long ip, void *new_code) 199do_ftrace_mod_code(unsigned long ip, void *new_code)
192{ 200{
201 /*
202 * On x86_64, kernel text mappings are mapped read-only with
203 * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
204 * of the kernel text mapping to modify the kernel text.
205 *
206 * For 32bit kernels, these mappings are same and we can use
207 * kernel identity mapping to modify code.
208 */
209 if (within(ip, (unsigned long)_text, (unsigned long)_etext))
210 ip = (unsigned long)__va(__pa(ip));
211
193 mod_code_ip = (void *)ip; 212 mod_code_ip = (void *)ip;
194 mod_code_newcode = new_code; 213 mod_code_newcode = new_code;
195 214
@@ -336,15 +355,15 @@ int __init ftrace_dyn_arch_init(void *data)
336 355
337 switch (faulted) { 356 switch (faulted) {
338 case 0: 357 case 0:
339 pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n"); 358 pr_info("converting mcount calls to 0f 1f 44 00 00\n");
340 memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE); 359 memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE);
341 break; 360 break;
342 case 1: 361 case 1:
343 pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n"); 362 pr_info("converting mcount calls to 66 66 66 66 90\n");
344 memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE); 363 memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE);
345 break; 364 break;
346 case 2: 365 case 2:
347 pr_info("ftrace: converting mcount calls to jmp . + 5\n"); 366 pr_info("converting mcount calls to jmp . + 5\n");
348 memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE); 367 memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE);
349 break; 368 break;
350 } 369 }
@@ -468,82 +487,10 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
468 487
469#ifdef CONFIG_FTRACE_SYSCALLS 488#ifdef CONFIG_FTRACE_SYSCALLS
470 489
471extern unsigned long __start_syscalls_metadata[];
472extern unsigned long __stop_syscalls_metadata[];
473extern unsigned long *sys_call_table; 490extern unsigned long *sys_call_table;
474 491
475static struct syscall_metadata **syscalls_metadata; 492unsigned long __init arch_syscall_addr(int nr)
476
477static struct syscall_metadata *find_syscall_meta(unsigned long *syscall)
478{
479 struct syscall_metadata *start;
480 struct syscall_metadata *stop;
481 char str[KSYM_SYMBOL_LEN];
482
483
484 start = (struct syscall_metadata *)__start_syscalls_metadata;
485 stop = (struct syscall_metadata *)__stop_syscalls_metadata;
486 kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str);
487
488 for ( ; start < stop; start++) {
489 if (start->name && !strcmp(start->name, str))
490 return start;
491 }
492 return NULL;
493}
494
495struct syscall_metadata *syscall_nr_to_meta(int nr)
496{
497 if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
498 return NULL;
499
500 return syscalls_metadata[nr];
501}
502
503int syscall_name_to_nr(char *name)
504{
505 int i;
506
507 if (!syscalls_metadata)
508 return -1;
509
510 for (i = 0; i < NR_syscalls; i++) {
511 if (syscalls_metadata[i]) {
512 if (!strcmp(syscalls_metadata[i]->name, name))
513 return i;
514 }
515 }
516 return -1;
517}
518
519void set_syscall_enter_id(int num, int id)
520{
521 syscalls_metadata[num]->enter_id = id;
522}
523
524void set_syscall_exit_id(int num, int id)
525{ 493{
526 syscalls_metadata[num]->exit_id = id; 494 return (unsigned long)(&sys_call_table)[nr];
527}
528
529static int __init arch_init_ftrace_syscalls(void)
530{
531 int i;
532 struct syscall_metadata *meta;
533 unsigned long **psys_syscall_table = &sys_call_table;
534
535 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
536 NR_syscalls, GFP_KERNEL);
537 if (!syscalls_metadata) {
538 WARN_ON(1);
539 return -ENOMEM;
540 }
541
542 for (i = 0; i < NR_syscalls; i++) {
543 meta = find_syscall_meta(psys_syscall_table[i]);
544 syscalls_metadata[i] = meta;
545 }
546 return 0;
547} 495}
548arch_initcall(arch_init_ftrace_syscalls);
549#endif 496#endif
diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c
deleted file mode 100644
index 9b08e852fd1a..000000000000
--- a/arch/x86/kernel/geode_32.c
+++ /dev/null
@@ -1,196 +0,0 @@
1/*
2 * AMD Geode southbridge support code
3 * Copyright (C) 2006, Advanced Micro Devices, Inc.
4 * Copyright (C) 2007, Andres Salomon <dilinger@debian.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of version 2 of the GNU General Public License
8 * as published by the Free Software Foundation.
9 */
10
11#include <linux/kernel.h>
12#include <linux/module.h>
13#include <linux/ioport.h>
14#include <linux/io.h>
15#include <asm/msr.h>
16#include <asm/geode.h>
17
18static struct {
19 char *name;
20 u32 msr;
21 int size;
22 u32 base;
23} lbars[] = {
24 { "geode-pms", MSR_LBAR_PMS, LBAR_PMS_SIZE, 0 },
25 { "geode-acpi", MSR_LBAR_ACPI, LBAR_ACPI_SIZE, 0 },
26 { "geode-gpio", MSR_LBAR_GPIO, LBAR_GPIO_SIZE, 0 },
27 { "geode-mfgpt", MSR_LBAR_MFGPT, LBAR_MFGPT_SIZE, 0 }
28};
29
30static void __init init_lbars(void)
31{
32 u32 lo, hi;
33 int i;
34
35 for (i = 0; i < ARRAY_SIZE(lbars); i++) {
36 rdmsr(lbars[i].msr, lo, hi);
37 if (hi & 0x01)
38 lbars[i].base = lo & 0x0000ffff;
39
40 if (lbars[i].base == 0)
41 printk(KERN_ERR "geode: Couldn't initialize '%s'\n",
42 lbars[i].name);
43 }
44}
45
46int geode_get_dev_base(unsigned int dev)
47{
48 BUG_ON(dev >= ARRAY_SIZE(lbars));
49 return lbars[dev].base;
50}
51EXPORT_SYMBOL_GPL(geode_get_dev_base);
52
53/* === GPIO API === */
54
55void geode_gpio_set(u32 gpio, unsigned int reg)
56{
57 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
58
59 if (!base)
60 return;
61
62 /* low bank register */
63 if (gpio & 0xFFFF)
64 outl(gpio & 0xFFFF, base + reg);
65 /* high bank register */
66 gpio >>= 16;
67 if (gpio)
68 outl(gpio, base + 0x80 + reg);
69}
70EXPORT_SYMBOL_GPL(geode_gpio_set);
71
72void geode_gpio_clear(u32 gpio, unsigned int reg)
73{
74 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
75
76 if (!base)
77 return;
78
79 /* low bank register */
80 if (gpio & 0xFFFF)
81 outl((gpio & 0xFFFF) << 16, base + reg);
82 /* high bank register */
83 gpio &= (0xFFFF << 16);
84 if (gpio)
85 outl(gpio, base + 0x80 + reg);
86}
87EXPORT_SYMBOL_GPL(geode_gpio_clear);
88
89int geode_gpio_isset(u32 gpio, unsigned int reg)
90{
91 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
92 u32 val;
93
94 if (!base)
95 return 0;
96
97 /* low bank register */
98 if (gpio & 0xFFFF) {
99 val = inl(base + reg) & (gpio & 0xFFFF);
100 if ((gpio & 0xFFFF) == val)
101 return 1;
102 }
103 /* high bank register */
104 gpio >>= 16;
105 if (gpio) {
106 val = inl(base + 0x80 + reg) & gpio;
107 if (gpio == val)
108 return 1;
109 }
110 return 0;
111}
112EXPORT_SYMBOL_GPL(geode_gpio_isset);
113
114void geode_gpio_set_irq(unsigned int group, unsigned int irq)
115{
116 u32 lo, hi;
117
118 if (group > 7 || irq > 15)
119 return;
120
121 rdmsr(MSR_PIC_ZSEL_HIGH, lo, hi);
122
123 lo &= ~(0xF << (group * 4));
124 lo |= (irq & 0xF) << (group * 4);
125
126 wrmsr(MSR_PIC_ZSEL_HIGH, lo, hi);
127}
128EXPORT_SYMBOL_GPL(geode_gpio_set_irq);
129
130void geode_gpio_setup_event(unsigned int gpio, int pair, int pme)
131{
132 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
133 u32 offset, shift, val;
134
135 if (gpio >= 24)
136 offset = GPIO_MAP_W;
137 else if (gpio >= 16)
138 offset = GPIO_MAP_Z;
139 else if (gpio >= 8)
140 offset = GPIO_MAP_Y;
141 else
142 offset = GPIO_MAP_X;
143
144 shift = (gpio % 8) * 4;
145
146 val = inl(base + offset);
147
148 /* Clear whatever was there before */
149 val &= ~(0xF << shift);
150
151 /* And set the new value */
152
153 val |= ((pair & 7) << shift);
154
155 /* Set the PME bit if this is a PME event */
156
157 if (pme)
158 val |= (1 << (shift + 3));
159
160 outl(val, base + offset);
161}
162EXPORT_SYMBOL_GPL(geode_gpio_setup_event);
163
164int geode_has_vsa2(void)
165{
166 static int has_vsa2 = -1;
167
168 if (has_vsa2 == -1) {
169 u16 val;
170
171 /*
172 * The VSA has virtual registers that we can query for a
173 * signature.
174 */
175 outw(VSA_VR_UNLOCK, VSA_VRC_INDEX);
176 outw(VSA_VR_SIGNATURE, VSA_VRC_INDEX);
177
178 val = inw(VSA_VRC_DATA);
179 has_vsa2 = (val == AMD_VSA_SIG || val == GSW_VSA_SIG);
180 }
181
182 return has_vsa2;
183}
184EXPORT_SYMBOL_GPL(geode_has_vsa2);
185
186static int __init geode_southbridge_init(void)
187{
188 if (!is_geode())
189 return -ENODEV;
190
191 init_lbars();
192 (void) mfgpt_timer_setup();
193 return 0;
194}
195
196postcore_initcall(geode_southbridge_init);
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 4f8e2507e8f3..5051b94c9069 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -29,8 +29,6 @@ static void __init i386_default_early_setup(void)
29 29
30void __init i386_start_kernel(void) 30void __init i386_start_kernel(void)
31{ 31{
32 reserve_trampoline_memory();
33
34 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); 32 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
35 33
36#ifdef CONFIG_BLK_DEV_INITRD 34#ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 0b06cd778fd9..b5a9896ca1e7 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -98,8 +98,6 @@ void __init x86_64_start_reservations(char *real_mode_data)
98{ 98{
99 copy_bootdata(__va(real_mode_data)); 99 copy_bootdata(__va(real_mode_data));
100 100
101 reserve_trampoline_memory();
102
103 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); 101 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
104 102
105#ifdef CONFIG_BLK_DEV_INITRD 103#ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index fd39eaf83b84..37c3d4b17d85 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -18,6 +18,8 @@
18#include <asm/asm-offsets.h> 18#include <asm/asm-offsets.h>
19#include <asm/setup.h> 19#include <asm/setup.h>
20#include <asm/processor-flags.h> 20#include <asm/processor-flags.h>
21#include <asm/msr-index.h>
22#include <asm/cpufeature.h>
21#include <asm/percpu.h> 23#include <asm/percpu.h>
22 24
23/* Physical address */ 25/* Physical address */
@@ -297,25 +299,27 @@ ENTRY(startup_32_smp)
297 orl %edx,%eax 299 orl %edx,%eax
298 movl %eax,%cr4 300 movl %eax,%cr4
299 301
300 btl $5, %eax # check if PAE is enabled 302 testb $X86_CR4_PAE, %al # check if PAE is enabled
301 jnc 6f 303 jz 6f
302 304
303 /* Check if extended functions are implemented */ 305 /* Check if extended functions are implemented */
304 movl $0x80000000, %eax 306 movl $0x80000000, %eax
305 cpuid 307 cpuid
306 cmpl $0x80000000, %eax 308 /* Value must be in the range 0x80000001 to 0x8000ffff */
307 jbe 6f 309 subl $0x80000001, %eax
310 cmpl $(0x8000ffff-0x80000001), %eax
311 ja 6f
308 mov $0x80000001, %eax 312 mov $0x80000001, %eax
309 cpuid 313 cpuid
310 /* Execute Disable bit supported? */ 314 /* Execute Disable bit supported? */
311 btl $20, %edx 315 btl $(X86_FEATURE_NX & 31), %edx
312 jnc 6f 316 jnc 6f
313 317
314 /* Setup EFER (Extended Feature Enable Register) */ 318 /* Setup EFER (Extended Feature Enable Register) */
315 movl $0xc0000080, %ecx 319 movl $MSR_EFER, %ecx
316 rdmsr 320 rdmsr
317 321
318 btsl $11, %eax 322 btsl $_EFER_NX, %eax
319 /* Make changes effective */ 323 /* Make changes effective */
320 wrmsr 324 wrmsr
321 325
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 780cd928fcd5..2d8b5035371c 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -212,8 +212,8 @@ ENTRY(secondary_startup_64)
212 */ 212 */
213 lgdt early_gdt_descr(%rip) 213 lgdt early_gdt_descr(%rip)
214 214
215 /* set up data segments. actually 0 would do too */ 215 /* set up data segments */
216 movl $__KERNEL_DS,%eax 216 xorl %eax,%eax
217 movl %eax,%ds 217 movl %eax,%ds
218 movl %eax,%ss 218 movl %eax,%ss
219 movl %eax,%es 219 movl %eax,%es
@@ -262,11 +262,11 @@ ENTRY(secondary_startup_64)
262 .quad x86_64_start_kernel 262 .quad x86_64_start_kernel
263 ENTRY(initial_gs) 263 ENTRY(initial_gs)
264 .quad INIT_PER_CPU_VAR(irq_stack_union) 264 .quad INIT_PER_CPU_VAR(irq_stack_union)
265 __FINITDATA
266 265
267 ENTRY(stack_start) 266 ENTRY(stack_start)
268 .quad init_thread_union+THREAD_SIZE-8 267 .quad init_thread_union+THREAD_SIZE-8
269 .word 0 268 .word 0
269 __FINITDATA
270 270
271bad_address: 271bad_address:
272 jmp bad_address 272 jmp bad_address
@@ -340,6 +340,7 @@ ENTRY(name)
340 i = i + 1 ; \ 340 i = i + 1 ; \
341 .endr 341 .endr
342 342
343 .data
343 /* 344 /*
344 * This default setting generates an ident mapping at address 0x100000 345 * This default setting generates an ident mapping at address 0x100000
345 * and a mapping for the kernel that precisely maps virtual address 346 * and a mapping for the kernel that precisely maps virtual address
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index dedc2bddf7a5..ba6e65884603 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -33,6 +33,7 @@
33 * HPET address is set in acpi/boot.c, when an ACPI entry exists 33 * HPET address is set in acpi/boot.c, when an ACPI entry exists
34 */ 34 */
35unsigned long hpet_address; 35unsigned long hpet_address;
36u8 hpet_blockid; /* OS timer block num */
36#ifdef CONFIG_PCI_MSI 37#ifdef CONFIG_PCI_MSI
37static unsigned long hpet_num_timers; 38static unsigned long hpet_num_timers;
38#endif 39#endif
@@ -47,12 +48,12 @@ struct hpet_dev {
47 char name[10]; 48 char name[10];
48}; 49};
49 50
50unsigned long hpet_readl(unsigned long a) 51inline unsigned int hpet_readl(unsigned int a)
51{ 52{
52 return readl(hpet_virt_address + a); 53 return readl(hpet_virt_address + a);
53} 54}
54 55
55static inline void hpet_writel(unsigned long d, unsigned long a) 56static inline void hpet_writel(unsigned int d, unsigned int a)
56{ 57{
57 writel(d, hpet_virt_address + a); 58 writel(d, hpet_virt_address + a);
58} 59}
@@ -167,7 +168,7 @@ do { \
167 168
168static void hpet_reserve_msi_timers(struct hpet_data *hd); 169static void hpet_reserve_msi_timers(struct hpet_data *hd);
169 170
170static void hpet_reserve_platform_timers(unsigned long id) 171static void hpet_reserve_platform_timers(unsigned int id)
171{ 172{
172 struct hpet __iomem *hpet = hpet_virt_address; 173 struct hpet __iomem *hpet = hpet_virt_address;
173 struct hpet_timer __iomem *timer = &hpet->hpet_timers[2]; 174 struct hpet_timer __iomem *timer = &hpet->hpet_timers[2];
@@ -205,7 +206,7 @@ static void hpet_reserve_platform_timers(unsigned long id)
205 206
206} 207}
207#else 208#else
208static void hpet_reserve_platform_timers(unsigned long id) { } 209static void hpet_reserve_platform_timers(unsigned int id) { }
209#endif 210#endif
210 211
211/* 212/*
@@ -246,7 +247,7 @@ static void hpet_reset_counter(void)
246 247
247static void hpet_start_counter(void) 248static void hpet_start_counter(void)
248{ 249{
249 unsigned long cfg = hpet_readl(HPET_CFG); 250 unsigned int cfg = hpet_readl(HPET_CFG);
250 cfg |= HPET_CFG_ENABLE; 251 cfg |= HPET_CFG_ENABLE;
251 hpet_writel(cfg, HPET_CFG); 252 hpet_writel(cfg, HPET_CFG);
252} 253}
@@ -271,7 +272,7 @@ static void hpet_resume_counter(void)
271 272
272static void hpet_enable_legacy_int(void) 273static void hpet_enable_legacy_int(void)
273{ 274{
274 unsigned long cfg = hpet_readl(HPET_CFG); 275 unsigned int cfg = hpet_readl(HPET_CFG);
275 276
276 cfg |= HPET_CFG_LEGACY; 277 cfg |= HPET_CFG_LEGACY;
277 hpet_writel(cfg, HPET_CFG); 278 hpet_writel(cfg, HPET_CFG);
@@ -314,7 +315,7 @@ static int hpet_setup_msi_irq(unsigned int irq);
314static void hpet_set_mode(enum clock_event_mode mode, 315static void hpet_set_mode(enum clock_event_mode mode,
315 struct clock_event_device *evt, int timer) 316 struct clock_event_device *evt, int timer)
316{ 317{
317 unsigned long cfg, cmp, now; 318 unsigned int cfg, cmp, now;
318 uint64_t delta; 319 uint64_t delta;
319 320
320 switch (mode) { 321 switch (mode) {
@@ -323,7 +324,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
323 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult; 324 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult;
324 delta >>= evt->shift; 325 delta >>= evt->shift;
325 now = hpet_readl(HPET_COUNTER); 326 now = hpet_readl(HPET_COUNTER);
326 cmp = now + (unsigned long) delta; 327 cmp = now + (unsigned int) delta;
327 cfg = hpet_readl(HPET_Tn_CFG(timer)); 328 cfg = hpet_readl(HPET_Tn_CFG(timer));
328 /* Make sure we use edge triggered interrupts */ 329 /* Make sure we use edge triggered interrupts */
329 cfg &= ~HPET_TN_LEVEL; 330 cfg &= ~HPET_TN_LEVEL;
@@ -339,7 +340,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
339 * (See AMD-8111 HyperTransport I/O Hub Data Sheet, 340 * (See AMD-8111 HyperTransport I/O Hub Data Sheet,
340 * Publication # 24674) 341 * Publication # 24674)
341 */ 342 */
342 hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer)); 343 hpet_writel((unsigned int) delta, HPET_Tn_CMP(timer));
343 hpet_start_counter(); 344 hpet_start_counter();
344 hpet_print_config(); 345 hpet_print_config();
345 break; 346 break;
@@ -383,13 +384,24 @@ static int hpet_next_event(unsigned long delta,
383 hpet_writel(cnt, HPET_Tn_CMP(timer)); 384 hpet_writel(cnt, HPET_Tn_CMP(timer));
384 385
385 /* 386 /*
386 * We need to read back the CMP register to make sure that 387 * We need to read back the CMP register on certain HPET
387 * what we wrote hit the chip before we compare it to the 388 * implementations (ATI chipsets) which seem to delay the
388 * counter. 389 * transfer of the compare register into the internal compare
390 * logic. With small deltas this might actually be too late as
391 * the counter could already be higher than the compare value
392 * at that point and we would wait for the next hpet interrupt
393 * forever. We found out that reading the CMP register back
394 * forces the transfer so we can rely on the comparison with
395 * the counter register below. If the read back from the
396 * compare register does not match the value we programmed
397 * then we might have a real hardware problem. We can not do
398 * much about it here, but at least alert the user/admin with
399 * a prominent warning.
389 */ 400 */
390 WARN_ON_ONCE((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt); 401 WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt,
402 KERN_WARNING "hpet: compare register read back failed.\n");
391 403
392 return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; 404 return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
393} 405}
394 406
395static void hpet_legacy_set_mode(enum clock_event_mode mode, 407static void hpet_legacy_set_mode(enum clock_event_mode mode,
@@ -415,7 +427,7 @@ static struct hpet_dev *hpet_devs;
415void hpet_msi_unmask(unsigned int irq) 427void hpet_msi_unmask(unsigned int irq)
416{ 428{
417 struct hpet_dev *hdev = get_irq_data(irq); 429 struct hpet_dev *hdev = get_irq_data(irq);
418 unsigned long cfg; 430 unsigned int cfg;
419 431
420 /* unmask it */ 432 /* unmask it */
421 cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); 433 cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
@@ -425,7 +437,7 @@ void hpet_msi_unmask(unsigned int irq)
425 437
426void hpet_msi_mask(unsigned int irq) 438void hpet_msi_mask(unsigned int irq)
427{ 439{
428 unsigned long cfg; 440 unsigned int cfg;
429 struct hpet_dev *hdev = get_irq_data(irq); 441 struct hpet_dev *hdev = get_irq_data(irq);
430 442
431 /* mask it */ 443 /* mask it */
@@ -467,7 +479,7 @@ static int hpet_msi_next_event(unsigned long delta,
467 479
468static int hpet_setup_msi_irq(unsigned int irq) 480static int hpet_setup_msi_irq(unsigned int irq)
469{ 481{
470 if (arch_setup_hpet_msi(irq)) { 482 if (arch_setup_hpet_msi(irq, hpet_blockid)) {
471 destroy_irq(irq); 483 destroy_irq(irq);
472 return -EINVAL; 484 return -EINVAL;
473 } 485 }
@@ -584,6 +596,8 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
584 unsigned int num_timers_used = 0; 596 unsigned int num_timers_used = 0;
585 int i; 597 int i;
586 598
599 if (boot_cpu_has(X86_FEATURE_ARAT))
600 return;
587 id = hpet_readl(HPET_ID); 601 id = hpet_readl(HPET_ID);
588 602
589 num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); 603 num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
@@ -598,7 +612,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
598 612
599 for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) { 613 for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) {
600 struct hpet_dev *hdev = &hpet_devs[num_timers_used]; 614 struct hpet_dev *hdev = &hpet_devs[num_timers_used];
601 unsigned long cfg = hpet_readl(HPET_Tn_CFG(i)); 615 unsigned int cfg = hpet_readl(HPET_Tn_CFG(i));
602 616
603 /* Only consider HPET timer with MSI support */ 617 /* Only consider HPET timer with MSI support */
604 if (!(cfg & HPET_TN_FSB_CAP)) 618 if (!(cfg & HPET_TN_FSB_CAP))
@@ -813,7 +827,7 @@ static int hpet_clocksource_register(void)
813 */ 827 */
814int __init hpet_enable(void) 828int __init hpet_enable(void)
815{ 829{
816 unsigned long id; 830 unsigned int id;
817 int i; 831 int i;
818 832
819 if (!is_hpet_capable()) 833 if (!is_hpet_capable())
@@ -872,10 +886,8 @@ int __init hpet_enable(void)
872 886
873 if (id & HPET_ID_LEGSUP) { 887 if (id & HPET_ID_LEGSUP) {
874 hpet_legacy_clockevent_register(); 888 hpet_legacy_clockevent_register();
875 hpet_msi_capability_lookup(2);
876 return 1; 889 return 1;
877 } 890 }
878 hpet_msi_capability_lookup(0);
879 return 0; 891 return 0;
880 892
881out_nohpet: 893out_nohpet:
@@ -908,9 +920,17 @@ static __init int hpet_late_init(void)
908 if (!hpet_virt_address) 920 if (!hpet_virt_address)
909 return -ENODEV; 921 return -ENODEV;
910 922
923 if (hpet_readl(HPET_ID) & HPET_ID_LEGSUP)
924 hpet_msi_capability_lookup(2);
925 else
926 hpet_msi_capability_lookup(0);
927
911 hpet_reserve_platform_timers(hpet_readl(HPET_ID)); 928 hpet_reserve_platform_timers(hpet_readl(HPET_ID));
912 hpet_print_config(); 929 hpet_print_config();
913 930
931 if (boot_cpu_has(X86_FEATURE_ARAT))
932 return 0;
933
914 for_each_online_cpu(cpu) { 934 for_each_online_cpu(cpu) {
915 hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu); 935 hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu);
916 } 936 }
@@ -925,7 +945,7 @@ fs_initcall(hpet_late_init);
925void hpet_disable(void) 945void hpet_disable(void)
926{ 946{
927 if (is_hpet_capable()) { 947 if (is_hpet_capable()) {
928 unsigned long cfg = hpet_readl(HPET_CFG); 948 unsigned int cfg = hpet_readl(HPET_CFG);
929 949
930 if (hpet_legacy_int_enabled) { 950 if (hpet_legacy_int_enabled) {
931 cfg &= ~HPET_CFG_LEGACY; 951 cfg &= ~HPET_CFG_LEGACY;
@@ -965,8 +985,8 @@ static int hpet_prev_update_sec;
965static struct rtc_time hpet_alarm_time; 985static struct rtc_time hpet_alarm_time;
966static unsigned long hpet_pie_count; 986static unsigned long hpet_pie_count;
967static u32 hpet_t1_cmp; 987static u32 hpet_t1_cmp;
968static unsigned long hpet_default_delta; 988static u32 hpet_default_delta;
969static unsigned long hpet_pie_delta; 989static u32 hpet_pie_delta;
970static unsigned long hpet_pie_limit; 990static unsigned long hpet_pie_limit;
971 991
972static rtc_irq_handler irq_handler; 992static rtc_irq_handler irq_handler;
@@ -1017,7 +1037,8 @@ EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler);
1017 */ 1037 */
1018int hpet_rtc_timer_init(void) 1038int hpet_rtc_timer_init(void)
1019{ 1039{
1020 unsigned long cfg, cnt, delta, flags; 1040 unsigned int cfg, cnt, delta;
1041 unsigned long flags;
1021 1042
1022 if (!is_hpet_enabled()) 1043 if (!is_hpet_enabled())
1023 return 0; 1044 return 0;
@@ -1027,7 +1048,7 @@ int hpet_rtc_timer_init(void)
1027 1048
1028 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; 1049 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
1029 clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT; 1050 clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT;
1030 hpet_default_delta = (unsigned long) clc; 1051 hpet_default_delta = clc;
1031 } 1052 }
1032 1053
1033 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) 1054 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
@@ -1113,7 +1134,7 @@ int hpet_set_periodic_freq(unsigned long freq)
1113 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; 1134 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
1114 do_div(clc, freq); 1135 do_div(clc, freq);
1115 clc >>= hpet_clockevent.shift; 1136 clc >>= hpet_clockevent.shift;
1116 hpet_pie_delta = (unsigned long) clc; 1137 hpet_pie_delta = clc;
1117 } 1138 }
1118 return 1; 1139 return 1;
1119} 1140}
@@ -1127,7 +1148,7 @@ EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq);
1127 1148
1128static void hpet_rtc_timer_reinit(void) 1149static void hpet_rtc_timer_reinit(void)
1129{ 1150{
1130 unsigned long cfg, delta; 1151 unsigned int cfg, delta;
1131 int lost_ints = -1; 1152 int lost_ints = -1;
1132 1153
1133 if (unlikely(!hpet_rtc_flags)) { 1154 if (unlikely(!hpet_rtc_flags)) {
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
new file mode 100644
index 000000000000..05d5fec64a94
--- /dev/null
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -0,0 +1,554 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) 2007 Alan Stern
17 * Copyright (C) 2009 IBM Corporation
18 * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com>
19 *
20 * Authors: Alan Stern <stern@rowland.harvard.edu>
21 * K.Prasad <prasad@linux.vnet.ibm.com>
22 * Frederic Weisbecker <fweisbec@gmail.com>
23 */
24
25/*
26 * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
27 * using the CPU's debug registers.
28 */
29
30#include <linux/perf_event.h>
31#include <linux/hw_breakpoint.h>
32#include <linux/irqflags.h>
33#include <linux/notifier.h>
34#include <linux/kallsyms.h>
35#include <linux/kprobes.h>
36#include <linux/percpu.h>
37#include <linux/kdebug.h>
38#include <linux/kernel.h>
39#include <linux/module.h>
40#include <linux/sched.h>
41#include <linux/init.h>
42#include <linux/smp.h>
43
44#include <asm/hw_breakpoint.h>
45#include <asm/processor.h>
46#include <asm/debugreg.h>
47
48/* Per cpu debug control register value */
49DEFINE_PER_CPU(unsigned long, cpu_dr7);
50EXPORT_PER_CPU_SYMBOL(cpu_dr7);
51
52/* Per cpu debug address registers values */
53static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]);
54
55/*
56 * Stores the breakpoints currently in use on each breakpoint address
57 * register for each cpus
58 */
59static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]);
60
61
62static inline unsigned long
63__encode_dr7(int drnum, unsigned int len, unsigned int type)
64{
65 unsigned long bp_info;
66
67 bp_info = (len | type) & 0xf;
68 bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
69 bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE));
70
71 return bp_info;
72}
73
74/*
75 * Encode the length, type, Exact, and Enable bits for a particular breakpoint
76 * as stored in debug register 7.
77 */
78unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
79{
80 return __encode_dr7(drnum, len, type) | DR_GLOBAL_SLOWDOWN;
81}
82
83/*
84 * Decode the length and type bits for a particular breakpoint as
85 * stored in debug register 7. Return the "enabled" status.
86 */
87int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type)
88{
89 int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);
90
91 *len = (bp_info & 0xc) | 0x40;
92 *type = (bp_info & 0x3) | 0x80;
93
94 return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3;
95}
96
97/*
98 * Install a perf counter breakpoint.
99 *
100 * We seek a free debug address register and use it for this
101 * breakpoint. Eventually we enable it in the debug control register.
102 *
103 * Atomic: we hold the counter->ctx->lock and we only handle variables
104 * and registers local to this cpu.
105 */
106int arch_install_hw_breakpoint(struct perf_event *bp)
107{
108 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
109 unsigned long *dr7;
110 int i;
111
112 for (i = 0; i < HBP_NUM; i++) {
113 struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
114
115 if (!*slot) {
116 *slot = bp;
117 break;
118 }
119 }
120
121 if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
122 return -EBUSY;
123
124 set_debugreg(info->address, i);
125 __get_cpu_var(cpu_debugreg[i]) = info->address;
126
127 dr7 = &__get_cpu_var(cpu_dr7);
128 *dr7 |= encode_dr7(i, info->len, info->type);
129
130 set_debugreg(*dr7, 7);
131
132 return 0;
133}
134
135/*
136 * Uninstall the breakpoint contained in the given counter.
137 *
138 * First we search the debug address register it uses and then we disable
139 * it.
140 *
141 * Atomic: we hold the counter->ctx->lock and we only handle variables
142 * and registers local to this cpu.
143 */
144void arch_uninstall_hw_breakpoint(struct perf_event *bp)
145{
146 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
147 unsigned long *dr7;
148 int i;
149
150 for (i = 0; i < HBP_NUM; i++) {
151 struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
152
153 if (*slot == bp) {
154 *slot = NULL;
155 break;
156 }
157 }
158
159 if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
160 return;
161
162 dr7 = &__get_cpu_var(cpu_dr7);
163 *dr7 &= ~__encode_dr7(i, info->len, info->type);
164
165 set_debugreg(*dr7, 7);
166}
167
168static int get_hbp_len(u8 hbp_len)
169{
170 unsigned int len_in_bytes = 0;
171
172 switch (hbp_len) {
173 case X86_BREAKPOINT_LEN_1:
174 len_in_bytes = 1;
175 break;
176 case X86_BREAKPOINT_LEN_2:
177 len_in_bytes = 2;
178 break;
179 case X86_BREAKPOINT_LEN_4:
180 len_in_bytes = 4;
181 break;
182#ifdef CONFIG_X86_64
183 case X86_BREAKPOINT_LEN_8:
184 len_in_bytes = 8;
185 break;
186#endif
187 }
188 return len_in_bytes;
189}
190
191/*
192 * Check for virtual address in user space.
193 */
194int arch_check_va_in_userspace(unsigned long va, u8 hbp_len)
195{
196 unsigned int len;
197
198 len = get_hbp_len(hbp_len);
199
200 return (va <= TASK_SIZE - len);
201}
202
203/*
204 * Check for virtual address in kernel space.
205 */
206static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
207{
208 unsigned int len;
209
210 len = get_hbp_len(hbp_len);
211
212 return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
213}
214
215/*
216 * Store a breakpoint's encoded address, length, and type.
217 */
218static int arch_store_info(struct perf_event *bp)
219{
220 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
221 /*
222 * For kernel-addresses, either the address or symbol name can be
223 * specified.
224 */
225 if (info->name)
226 info->address = (unsigned long)
227 kallsyms_lookup_name(info->name);
228 if (info->address)
229 return 0;
230
231 return -EINVAL;
232}
233
234int arch_bp_generic_fields(int x86_len, int x86_type,
235 int *gen_len, int *gen_type)
236{
237 /* Len */
238 switch (x86_len) {
239 case X86_BREAKPOINT_LEN_1:
240 *gen_len = HW_BREAKPOINT_LEN_1;
241 break;
242 case X86_BREAKPOINT_LEN_2:
243 *gen_len = HW_BREAKPOINT_LEN_2;
244 break;
245 case X86_BREAKPOINT_LEN_4:
246 *gen_len = HW_BREAKPOINT_LEN_4;
247 break;
248#ifdef CONFIG_X86_64
249 case X86_BREAKPOINT_LEN_8:
250 *gen_len = HW_BREAKPOINT_LEN_8;
251 break;
252#endif
253 default:
254 return -EINVAL;
255 }
256
257 /* Type */
258 switch (x86_type) {
259 case X86_BREAKPOINT_EXECUTE:
260 *gen_type = HW_BREAKPOINT_X;
261 break;
262 case X86_BREAKPOINT_WRITE:
263 *gen_type = HW_BREAKPOINT_W;
264 break;
265 case X86_BREAKPOINT_RW:
266 *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
267 break;
268 default:
269 return -EINVAL;
270 }
271
272 return 0;
273}
274
275
276static int arch_build_bp_info(struct perf_event *bp)
277{
278 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
279
280 info->address = bp->attr.bp_addr;
281
282 /* Len */
283 switch (bp->attr.bp_len) {
284 case HW_BREAKPOINT_LEN_1:
285 info->len = X86_BREAKPOINT_LEN_1;
286 break;
287 case HW_BREAKPOINT_LEN_2:
288 info->len = X86_BREAKPOINT_LEN_2;
289 break;
290 case HW_BREAKPOINT_LEN_4:
291 info->len = X86_BREAKPOINT_LEN_4;
292 break;
293#ifdef CONFIG_X86_64
294 case HW_BREAKPOINT_LEN_8:
295 info->len = X86_BREAKPOINT_LEN_8;
296 break;
297#endif
298 default:
299 return -EINVAL;
300 }
301
302 /* Type */
303 switch (bp->attr.bp_type) {
304 case HW_BREAKPOINT_W:
305 info->type = X86_BREAKPOINT_WRITE;
306 break;
307 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
308 info->type = X86_BREAKPOINT_RW;
309 break;
310 case HW_BREAKPOINT_X:
311 info->type = X86_BREAKPOINT_EXECUTE;
312 break;
313 default:
314 return -EINVAL;
315 }
316
317 return 0;
318}
319/*
320 * Validate the arch-specific HW Breakpoint register settings
321 */
322int arch_validate_hwbkpt_settings(struct perf_event *bp,
323 struct task_struct *tsk)
324{
325 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
326 unsigned int align;
327 int ret;
328
329
330 ret = arch_build_bp_info(bp);
331 if (ret)
332 return ret;
333
334 ret = -EINVAL;
335
336 if (info->type == X86_BREAKPOINT_EXECUTE)
337 /*
338 * Ptrace-refactoring code
339 * For now, we'll allow instruction breakpoint only for user-space
340 * addresses
341 */
342 if ((!arch_check_va_in_userspace(info->address, info->len)) &&
343 info->len != X86_BREAKPOINT_EXECUTE)
344 return ret;
345
346 switch (info->len) {
347 case X86_BREAKPOINT_LEN_1:
348 align = 0;
349 break;
350 case X86_BREAKPOINT_LEN_2:
351 align = 1;
352 break;
353 case X86_BREAKPOINT_LEN_4:
354 align = 3;
355 break;
356#ifdef CONFIG_X86_64
357 case X86_BREAKPOINT_LEN_8:
358 align = 7;
359 break;
360#endif
361 default:
362 return ret;
363 }
364
365 ret = arch_store_info(bp);
366
367 if (ret < 0)
368 return ret;
369 /*
370 * Check that the low-order bits of the address are appropriate
371 * for the alignment implied by len.
372 */
373 if (info->address & align)
374 return -EINVAL;
375
376 /* Check that the virtual address is in the proper range */
377 if (tsk) {
378 if (!arch_check_va_in_userspace(info->address, info->len))
379 return -EFAULT;
380 } else {
381 if (!arch_check_va_in_kernelspace(info->address, info->len))
382 return -EFAULT;
383 }
384
385 return 0;
386}
387
388/*
389 * Dump the debug register contents to the user.
390 * We can't dump our per cpu values because it
391 * may contain cpu wide breakpoint, something that
392 * doesn't belong to the current task.
393 *
394 * TODO: include non-ptrace user breakpoints (perf)
395 */
396void aout_dump_debugregs(struct user *dump)
397{
398 int i;
399 int dr7 = 0;
400 struct perf_event *bp;
401 struct arch_hw_breakpoint *info;
402 struct thread_struct *thread = &current->thread;
403
404 for (i = 0; i < HBP_NUM; i++) {
405 bp = thread->ptrace_bps[i];
406
407 if (bp && !bp->attr.disabled) {
408 dump->u_debugreg[i] = bp->attr.bp_addr;
409 info = counter_arch_bp(bp);
410 dr7 |= encode_dr7(i, info->len, info->type);
411 } else {
412 dump->u_debugreg[i] = 0;
413 }
414 }
415
416 dump->u_debugreg[4] = 0;
417 dump->u_debugreg[5] = 0;
418 dump->u_debugreg[6] = current->thread.debugreg6;
419
420 dump->u_debugreg[7] = dr7;
421}
422EXPORT_SYMBOL_GPL(aout_dump_debugregs);
423
424/*
425 * Release the user breakpoints used by ptrace
426 */
427void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
428{
429 int i;
430 struct thread_struct *t = &tsk->thread;
431
432 for (i = 0; i < HBP_NUM; i++) {
433 unregister_hw_breakpoint(t->ptrace_bps[i]);
434 t->ptrace_bps[i] = NULL;
435 }
436}
437
438void hw_breakpoint_restore(void)
439{
440 set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0);
441 set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1);
442 set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2);
443 set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3);
444 set_debugreg(current->thread.debugreg6, 6);
445 set_debugreg(__get_cpu_var(cpu_dr7), 7);
446}
447EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
448
449/*
450 * Handle debug exception notifications.
451 *
452 * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below.
453 *
454 * NOTIFY_DONE returned if one of the following conditions is true.
455 * i) When the causative address is from user-space and the exception
456 * is a valid one, i.e. not triggered as a result of lazy debug register
457 * switching
458 * ii) When there are more bits than trap<n> set in DR6 register (such
459 * as BD, BS or BT) indicating that more than one debug condition is
460 * met and requires some more action in do_debug().
461 *
462 * NOTIFY_STOP returned for all other cases
463 *
464 */
465static int __kprobes hw_breakpoint_handler(struct die_args *args)
466{
467 int i, cpu, rc = NOTIFY_STOP;
468 struct perf_event *bp;
469 unsigned long dr7, dr6;
470 unsigned long *dr6_p;
471
472 /* The DR6 value is pointed by args->err */
473 dr6_p = (unsigned long *)ERR_PTR(args->err);
474 dr6 = *dr6_p;
475
476 /* Do an early return if no trap bits are set in DR6 */
477 if ((dr6 & DR_TRAP_BITS) == 0)
478 return NOTIFY_DONE;
479
480 get_debugreg(dr7, 7);
481 /* Disable breakpoints during exception handling */
482 set_debugreg(0UL, 7);
483 /*
484 * Assert that local interrupts are disabled
485 * Reset the DRn bits in the virtualized register value.
486 * The ptrace trigger routine will add in whatever is needed.
487 */
488 current->thread.debugreg6 &= ~DR_TRAP_BITS;
489 cpu = get_cpu();
490
491 /* Handle all the breakpoints that were triggered */
492 for (i = 0; i < HBP_NUM; ++i) {
493 if (likely(!(dr6 & (DR_TRAP0 << i))))
494 continue;
495
496 /*
497 * The counter may be concurrently released but that can only
498 * occur from a call_rcu() path. We can then safely fetch
499 * the breakpoint, use its callback, touch its counter
500 * while we are in an rcu_read_lock() path.
501 */
502 rcu_read_lock();
503
504 bp = per_cpu(bp_per_reg[i], cpu);
505 if (bp)
506 rc = NOTIFY_DONE;
507 /*
508 * Reset the 'i'th TRAP bit in dr6 to denote completion of
509 * exception handling
510 */
511 (*dr6_p) &= ~(DR_TRAP0 << i);
512 /*
513 * bp can be NULL due to lazy debug register switching
514 * or due to concurrent perf counter removing.
515 */
516 if (!bp) {
517 rcu_read_unlock();
518 break;
519 }
520
521 perf_bp_event(bp, args->regs);
522
523 rcu_read_unlock();
524 }
525 if (dr6 & (~DR_TRAP_BITS))
526 rc = NOTIFY_DONE;
527
528 set_debugreg(dr7, 7);
529 put_cpu();
530
531 return rc;
532}
533
534/*
535 * Handle debug exception notifications.
536 */
537int __kprobes hw_breakpoint_exceptions_notify(
538 struct notifier_block *unused, unsigned long val, void *data)
539{
540 if (val != DIE_DEBUG)
541 return NOTIFY_DONE;
542
543 return hw_breakpoint_handler(data);
544}
545
546void hw_breakpoint_pmu_read(struct perf_event *bp)
547{
548 /* TODO */
549}
550
551void hw_breakpoint_pmu_unthrottle(struct perf_event *bp)
552{
553 /* TODO */
554}
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index 1736c5a725aa..9c3bd4a2050e 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -15,8 +15,10 @@ EXPORT_SYMBOL(mcount);
15 * the export, but dont use it from C code, it is used 15 * the export, but dont use it from C code, it is used
16 * by assembly code and is not using C calling convention! 16 * by assembly code and is not using C calling convention!
17 */ 17 */
18#ifndef CONFIG_X86_CMPXCHG64
18extern void cmpxchg8b_emu(void); 19extern void cmpxchg8b_emu(void);
19EXPORT_SYMBOL(cmpxchg8b_emu); 20EXPORT_SYMBOL(cmpxchg8b_emu);
21#endif
20 22
21/* Networking helper routines. */ 23/* Networking helper routines. */
22EXPORT_SYMBOL(csum_partial_copy_generic); 24EXPORT_SYMBOL(csum_partial_copy_generic);
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 99c4d308f16b..8eec0ec59af2 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -103,9 +103,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
103 * on system-call entry - see also fork() and the signal handling 103 * on system-call entry - see also fork() and the signal handling
104 * code. 104 * code.
105 */ 105 */
106static int do_iopl(unsigned int level, struct pt_regs *regs) 106long sys_iopl(unsigned int level, struct pt_regs *regs)
107{ 107{
108 unsigned int old = (regs->flags >> 12) & 3; 108 unsigned int old = (regs->flags >> 12) & 3;
109 struct thread_struct *t = &current->thread;
109 110
110 if (level > 3) 111 if (level > 3)
111 return -EINVAL; 112 return -EINVAL;
@@ -115,29 +116,8 @@ static int do_iopl(unsigned int level, struct pt_regs *regs)
115 return -EPERM; 116 return -EPERM;
116 } 117 }
117 regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12); 118 regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12);
118
119 return 0;
120}
121
122#ifdef CONFIG_X86_32
123long sys_iopl(struct pt_regs *regs)
124{
125 unsigned int level = regs->bx;
126 struct thread_struct *t = &current->thread;
127 int rc;
128
129 rc = do_iopl(level, regs);
130 if (rc < 0)
131 goto out;
132
133 t->iopl = level << 12; 119 t->iopl = level << 12;
134 set_iopl_mask(t->iopl); 120 set_iopl_mask(t->iopl);
135out: 121
136 return rc; 122 return 0;
137}
138#else
139asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
140{
141 return do_iopl(level, regs);
142} 123}
143#endif
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 74656d1d4e30..91fd0c70a18a 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -18,7 +18,7 @@
18atomic_t irq_err_count; 18atomic_t irq_err_count;
19 19
20/* Function pointer for generic interrupt vector handling */ 20/* Function pointer for generic interrupt vector handling */
21void (*generic_interrupt_extension)(void) = NULL; 21void (*x86_platform_ipi_callback)(void) = NULL;
22 22
23/* 23/*
24 * 'what should we do if we get a hw irq event on an illegal vector'. 24 * 'what should we do if we get a hw irq event on an illegal vector'.
@@ -63,19 +63,19 @@ static int show_other_interrupts(struct seq_file *p, int prec)
63 for_each_online_cpu(j) 63 for_each_online_cpu(j)
64 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); 64 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
65 seq_printf(p, " Spurious interrupts\n"); 65 seq_printf(p, " Spurious interrupts\n");
66 seq_printf(p, "%*s: ", prec, "CNT"); 66 seq_printf(p, "%*s: ", prec, "PMI");
67 for_each_online_cpu(j) 67 for_each_online_cpu(j)
68 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); 68 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
69 seq_printf(p, " Performance counter interrupts\n"); 69 seq_printf(p, " Performance monitoring interrupts\n");
70 seq_printf(p, "%*s: ", prec, "PND"); 70 seq_printf(p, "%*s: ", prec, "PND");
71 for_each_online_cpu(j) 71 for_each_online_cpu(j)
72 seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); 72 seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
73 seq_printf(p, " Performance pending work\n"); 73 seq_printf(p, " Performance pending work\n");
74#endif 74#endif
75 if (generic_interrupt_extension) { 75 if (x86_platform_ipi_callback) {
76 seq_printf(p, "%*s: ", prec, "PLT"); 76 seq_printf(p, "%*s: ", prec, "PLT");
77 for_each_online_cpu(j) 77 for_each_online_cpu(j)
78 seq_printf(p, "%10u ", irq_stats(j)->generic_irqs); 78 seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis);
79 seq_printf(p, " Platform interrupts\n"); 79 seq_printf(p, " Platform interrupts\n");
80 } 80 }
81#ifdef CONFIG_SMP 81#ifdef CONFIG_SMP
@@ -92,17 +92,17 @@ static int show_other_interrupts(struct seq_file *p, int prec)
92 seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); 92 seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
93 seq_printf(p, " TLB shootdowns\n"); 93 seq_printf(p, " TLB shootdowns\n");
94#endif 94#endif
95#ifdef CONFIG_X86_MCE 95#ifdef CONFIG_X86_THERMAL_VECTOR
96 seq_printf(p, "%*s: ", prec, "TRM"); 96 seq_printf(p, "%*s: ", prec, "TRM");
97 for_each_online_cpu(j) 97 for_each_online_cpu(j)
98 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); 98 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
99 seq_printf(p, " Thermal event interrupts\n"); 99 seq_printf(p, " Thermal event interrupts\n");
100# ifdef CONFIG_X86_MCE_THRESHOLD 100#endif
101#ifdef CONFIG_X86_MCE_THRESHOLD
101 seq_printf(p, "%*s: ", prec, "THR"); 102 seq_printf(p, "%*s: ", prec, "THR");
102 for_each_online_cpu(j) 103 for_each_online_cpu(j)
103 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); 104 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
104 seq_printf(p, " Threshold APIC interrupts\n"); 105 seq_printf(p, " Threshold APIC interrupts\n");
105# endif
106#endif 106#endif
107#ifdef CONFIG_X86_MCE 107#ifdef CONFIG_X86_MCE
108 seq_printf(p, "%*s: ", prec, "MCE"); 108 seq_printf(p, "%*s: ", prec, "MCE");
@@ -149,7 +149,7 @@ int show_interrupts(struct seq_file *p, void *v)
149 if (!desc) 149 if (!desc)
150 return 0; 150 return 0;
151 151
152 spin_lock_irqsave(&desc->lock, flags); 152 raw_spin_lock_irqsave(&desc->lock, flags);
153 for_each_online_cpu(j) 153 for_each_online_cpu(j)
154 any_count |= kstat_irqs_cpu(i, j); 154 any_count |= kstat_irqs_cpu(i, j);
155 action = desc->action; 155 action = desc->action;
@@ -170,7 +170,7 @@ int show_interrupts(struct seq_file *p, void *v)
170 170
171 seq_putc(p, '\n'); 171 seq_putc(p, '\n');
172out: 172out:
173 spin_unlock_irqrestore(&desc->lock, flags); 173 raw_spin_unlock_irqrestore(&desc->lock, flags);
174 return 0; 174 return 0;
175} 175}
176 176
@@ -187,18 +187,18 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
187 sum += irq_stats(cpu)->apic_perf_irqs; 187 sum += irq_stats(cpu)->apic_perf_irqs;
188 sum += irq_stats(cpu)->apic_pending_irqs; 188 sum += irq_stats(cpu)->apic_pending_irqs;
189#endif 189#endif
190 if (generic_interrupt_extension) 190 if (x86_platform_ipi_callback)
191 sum += irq_stats(cpu)->generic_irqs; 191 sum += irq_stats(cpu)->x86_platform_ipis;
192#ifdef CONFIG_SMP 192#ifdef CONFIG_SMP
193 sum += irq_stats(cpu)->irq_resched_count; 193 sum += irq_stats(cpu)->irq_resched_count;
194 sum += irq_stats(cpu)->irq_call_count; 194 sum += irq_stats(cpu)->irq_call_count;
195 sum += irq_stats(cpu)->irq_tlb_count; 195 sum += irq_stats(cpu)->irq_tlb_count;
196#endif 196#endif
197#ifdef CONFIG_X86_MCE 197#ifdef CONFIG_X86_THERMAL_VECTOR
198 sum += irq_stats(cpu)->irq_thermal_count; 198 sum += irq_stats(cpu)->irq_thermal_count;
199# ifdef CONFIG_X86_MCE_THRESHOLD 199#endif
200#ifdef CONFIG_X86_MCE_THRESHOLD
200 sum += irq_stats(cpu)->irq_threshold_count; 201 sum += irq_stats(cpu)->irq_threshold_count;
201# endif
202#endif 202#endif
203#ifdef CONFIG_X86_MCE 203#ifdef CONFIG_X86_MCE
204 sum += per_cpu(mce_exception_count, cpu); 204 sum += per_cpu(mce_exception_count, cpu);
@@ -251,9 +251,9 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
251} 251}
252 252
253/* 253/*
254 * Handler for GENERIC_INTERRUPT_VECTOR. 254 * Handler for X86_PLATFORM_IPI_VECTOR.
255 */ 255 */
256void smp_generic_interrupt(struct pt_regs *regs) 256void smp_x86_platform_ipi(struct pt_regs *regs)
257{ 257{
258 struct pt_regs *old_regs = set_irq_regs(regs); 258 struct pt_regs *old_regs = set_irq_regs(regs);
259 259
@@ -263,10 +263,10 @@ void smp_generic_interrupt(struct pt_regs *regs)
263 263
264 irq_enter(); 264 irq_enter();
265 265
266 inc_irq_stat(generic_irqs); 266 inc_irq_stat(x86_platform_ipis);
267 267
268 if (generic_interrupt_extension) 268 if (x86_platform_ipi_callback)
269 generic_interrupt_extension(); 269 x86_platform_ipi_callback();
270 270
271 irq_exit(); 271 irq_exit();
272 272
@@ -274,3 +274,93 @@ void smp_generic_interrupt(struct pt_regs *regs)
274} 274}
275 275
276EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); 276EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
277
278#ifdef CONFIG_HOTPLUG_CPU
279/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
280void fixup_irqs(void)
281{
282 unsigned int irq, vector;
283 static int warned;
284 struct irq_desc *desc;
285
286 for_each_irq_desc(irq, desc) {
287 int break_affinity = 0;
288 int set_affinity = 1;
289 const struct cpumask *affinity;
290
291 if (!desc)
292 continue;
293 if (irq == 2)
294 continue;
295
296 /* interrupt's are disabled at this point */
297 raw_spin_lock(&desc->lock);
298
299 affinity = desc->affinity;
300 if (!irq_has_action(irq) ||
301 cpumask_equal(affinity, cpu_online_mask)) {
302 raw_spin_unlock(&desc->lock);
303 continue;
304 }
305
306 /*
307 * Complete the irq move. This cpu is going down and for
308 * non intr-remapping case, we can't wait till this interrupt
309 * arrives at this cpu before completing the irq move.
310 */
311 irq_force_complete_move(irq);
312
313 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
314 break_affinity = 1;
315 affinity = cpu_all_mask;
316 }
317
318 if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask)
319 desc->chip->mask(irq);
320
321 if (desc->chip->set_affinity)
322 desc->chip->set_affinity(irq, affinity);
323 else if (!(warned++))
324 set_affinity = 0;
325
326 if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask)
327 desc->chip->unmask(irq);
328
329 raw_spin_unlock(&desc->lock);
330
331 if (break_affinity && set_affinity)
332 printk("Broke affinity for irq %i\n", irq);
333 else if (!set_affinity)
334 printk("Cannot set affinity for irq %i\n", irq);
335 }
336
337 /*
338 * We can remove mdelay() and then send spuriuous interrupts to
339 * new cpu targets for all the irqs that were handled previously by
340 * this cpu. While it works, I have seen spurious interrupt messages
341 * (nothing wrong but still...).
342 *
343 * So for now, retain mdelay(1) and check the IRR and then send those
344 * interrupts to new targets as this cpu is already offlined...
345 */
346 mdelay(1);
347
348 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
349 unsigned int irr;
350
351 if (__get_cpu_var(vector_irq)[vector] < 0)
352 continue;
353
354 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
355 if (irr & (1 << (vector % 32))) {
356 irq = __get_cpu_var(vector_irq)[vector];
357
358 desc = irq_to_desc(irq);
359 raw_spin_lock(&desc->lock);
360 if (desc->chip->retrigger)
361 desc->chip->retrigger(irq);
362 raw_spin_unlock(&desc->lock);
363 }
364 }
365}
366#endif
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 7d35d0fe2329..10709f29d166 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -211,48 +211,3 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
211 211
212 return true; 212 return true;
213} 213}
214
215#ifdef CONFIG_HOTPLUG_CPU
216
217/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
218void fixup_irqs(void)
219{
220 unsigned int irq;
221 struct irq_desc *desc;
222
223 for_each_irq_desc(irq, desc) {
224 const struct cpumask *affinity;
225
226 if (!desc)
227 continue;
228 if (irq == 2)
229 continue;
230
231 affinity = desc->affinity;
232 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
233 printk("Breaking affinity for irq %i\n", irq);
234 affinity = cpu_all_mask;
235 }
236 if (desc->chip->set_affinity)
237 desc->chip->set_affinity(irq, affinity);
238 else if (desc->action)
239 printk_once("Cannot set affinity for irq %i\n", irq);
240 }
241
242#if 0
243 barrier();
244 /* Ingo Molnar says: "after the IO-APIC masks have been redirected
245 [note the nop - the interrupt-enable boundary on x86 is two
246 instructions from sti] - to flush out pending hardirqs and
247 IPIs. After this point nothing is supposed to reach this CPU." */
248 __asm__ __volatile__("sti; nop; cli");
249 barrier();
250#else
251 /* That doesn't seem sufficient. Give it 1ms. */
252 local_irq_enable();
253 mdelay(1);
254 local_irq_disable();
255#endif
256}
257#endif
258
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 977d8b43a0dd..acf8fbf8fbda 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -62,64 +62,6 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
62 return true; 62 return true;
63} 63}
64 64
65#ifdef CONFIG_HOTPLUG_CPU
66/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
67void fixup_irqs(void)
68{
69 unsigned int irq;
70 static int warned;
71 struct irq_desc *desc;
72
73 for_each_irq_desc(irq, desc) {
74 int break_affinity = 0;
75 int set_affinity = 1;
76 const struct cpumask *affinity;
77
78 if (!desc)
79 continue;
80 if (irq == 2)
81 continue;
82
83 /* interrupt's are disabled at this point */
84 spin_lock(&desc->lock);
85
86 affinity = desc->affinity;
87 if (!irq_has_action(irq) ||
88 cpumask_equal(affinity, cpu_online_mask)) {
89 spin_unlock(&desc->lock);
90 continue;
91 }
92
93 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
94 break_affinity = 1;
95 affinity = cpu_all_mask;
96 }
97
98 if (desc->chip->mask)
99 desc->chip->mask(irq);
100
101 if (desc->chip->set_affinity)
102 desc->chip->set_affinity(irq, affinity);
103 else if (!(warned++))
104 set_affinity = 0;
105
106 if (desc->chip->unmask)
107 desc->chip->unmask(irq);
108
109 spin_unlock(&desc->lock);
110
111 if (break_affinity && set_affinity)
112 printk("Broke affinity for irq %i\n", irq);
113 else if (!set_affinity)
114 printk("Cannot set affinity for irq %i\n", irq);
115 }
116
117 /* That doesn't seem sufficient. Give it 1ms. */
118 local_irq_enable();
119 mdelay(1);
120 local_irq_disable();
121}
122#endif
123 65
124extern void call_softirq(void); 66extern void call_softirq(void);
125 67
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 40f30773fb29..d5932226614f 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -200,8 +200,8 @@ static void __init apic_intr_init(void)
200 /* self generated IPI for local APIC timer */ 200 /* self generated IPI for local APIC timer */
201 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); 201 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
202 202
203 /* generic IPI for platform specific use */ 203 /* IPI for X86 platform specific use */
204 alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); 204 alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi);
205 205
206 /* IPI vectors for APIC spurious and error interrupts */ 206 /* IPI vectors for APIC spurious and error interrupts */
207 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 207 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 8d82a77a3f3b..dd74fe7273b1 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -43,6 +43,7 @@
43#include <linux/smp.h> 43#include <linux/smp.h>
44#include <linux/nmi.h> 44#include <linux/nmi.h>
45 45
46#include <asm/debugreg.h>
46#include <asm/apicdef.h> 47#include <asm/apicdef.h>
47#include <asm/system.h> 48#include <asm/system.h>
48 49
@@ -85,10 +86,15 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
85 gdb_regs[GDB_DS] = regs->ds; 86 gdb_regs[GDB_DS] = regs->ds;
86 gdb_regs[GDB_ES] = regs->es; 87 gdb_regs[GDB_ES] = regs->es;
87 gdb_regs[GDB_CS] = regs->cs; 88 gdb_regs[GDB_CS] = regs->cs;
88 gdb_regs[GDB_SS] = __KERNEL_DS;
89 gdb_regs[GDB_FS] = 0xFFFF; 89 gdb_regs[GDB_FS] = 0xFFFF;
90 gdb_regs[GDB_GS] = 0xFFFF; 90 gdb_regs[GDB_GS] = 0xFFFF;
91 gdb_regs[GDB_SP] = (int)&regs->sp; 91 if (user_mode_vm(regs)) {
92 gdb_regs[GDB_SS] = regs->ss;
93 gdb_regs[GDB_SP] = regs->sp;
94 } else {
95 gdb_regs[GDB_SS] = __KERNEL_DS;
96 gdb_regs[GDB_SP] = kernel_stack_pointer(regs);
97 }
92#else 98#else
93 gdb_regs[GDB_R8] = regs->r8; 99 gdb_regs[GDB_R8] = regs->r8;
94 gdb_regs[GDB_R9] = regs->r9; 100 gdb_regs[GDB_R9] = regs->r9;
@@ -101,7 +107,7 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
101 gdb_regs32[GDB_PS] = regs->flags; 107 gdb_regs32[GDB_PS] = regs->flags;
102 gdb_regs32[GDB_CS] = regs->cs; 108 gdb_regs32[GDB_CS] = regs->cs;
103 gdb_regs32[GDB_SS] = regs->ss; 109 gdb_regs32[GDB_SS] = regs->ss;
104 gdb_regs[GDB_SP] = regs->sp; 110 gdb_regs[GDB_SP] = kernel_stack_pointer(regs);
105#endif 111#endif
106} 112}
107 113
@@ -220,8 +226,7 @@ static void kgdb_correct_hw_break(void)
220 dr7 |= ((breakinfo[breakno].len << 2) | 226 dr7 |= ((breakinfo[breakno].len << 2) |
221 breakinfo[breakno].type) << 227 breakinfo[breakno].type) <<
222 ((breakno << 2) + 16); 228 ((breakno << 2) + 16);
223 if (breakno >= 0 && breakno <= 3) 229 set_debugreg(breakinfo[breakno].addr, breakno);
224 set_debugreg(breakinfo[breakno].addr, breakno);
225 230
226 } else { 231 } else {
227 if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { 232 if ((dr7 & breakbit) && !breakinfo[breakno].enabled) {
@@ -395,7 +400,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
395 /* set the trace bit if we're stepping */ 400 /* set the trace bit if we're stepping */
396 if (remcomInBuffer[0] == 's') { 401 if (remcomInBuffer[0] == 's') {
397 linux_regs->flags |= X86_EFLAGS_TF; 402 linux_regs->flags |= X86_EFLAGS_TF;
398 kgdb_single_step = 1;
399 atomic_set(&kgdb_cpu_doing_single_step, 403 atomic_set(&kgdb_cpu_doing_single_step,
400 raw_smp_processor_id()); 404 raw_smp_processor_id());
401 } 405 }
@@ -434,6 +438,11 @@ single_step_cont(struct pt_regs *regs, struct die_args *args)
434 "resuming...\n"); 438 "resuming...\n");
435 kgdb_arch_handle_exception(args->trapnr, args->signr, 439 kgdb_arch_handle_exception(args->trapnr, args->signr,
436 args->err, "c", "", regs); 440 args->err, "c", "", regs);
441 /*
442 * Reset the BS bit in dr6 (pointed by args->err) to
443 * denote completion of processing
444 */
445 (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
437 446
438 return NOTIFY_STOP; 447 return NOTIFY_STOP;
439} 448}
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 7b5169d2b000..5b8c7505b3bc 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -48,31 +48,22 @@
48#include <linux/preempt.h> 48#include <linux/preempt.h>
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/kdebug.h> 50#include <linux/kdebug.h>
51#include <linux/kallsyms.h>
51 52
52#include <asm/cacheflush.h> 53#include <asm/cacheflush.h>
53#include <asm/desc.h> 54#include <asm/desc.h>
54#include <asm/pgtable.h> 55#include <asm/pgtable.h>
55#include <asm/uaccess.h> 56#include <asm/uaccess.h>
56#include <asm/alternative.h> 57#include <asm/alternative.h>
58#include <asm/insn.h>
59#include <asm/debugreg.h>
57 60
58void jprobe_return_end(void); 61void jprobe_return_end(void);
59 62
60DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; 63DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
61DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); 64DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
62 65
63#ifdef CONFIG_X86_64 66#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs))
64#define stack_addr(regs) ((unsigned long *)regs->sp)
65#else
66/*
67 * "&regs->sp" looks wrong, but it's correct for x86_32. x86_32 CPUs
68 * don't save the ss and esp registers if the CPU is already in kernel
69 * mode when it traps. So for kprobes, regs->sp and regs->ss are not
70 * the [nonexistent] saved stack pointer and ss register, but rather
71 * the top 8 bytes of the pre-int3 stack. So &regs->sp happens to
72 * point to the top of the pre-int3 stack.
73 */
74#define stack_addr(regs) ((unsigned long *)&regs->sp)
75#endif
76 67
77#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ 68#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
78 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ 69 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
@@ -106,50 +97,6 @@ static const u32 twobyte_is_boostable[256 / 32] = {
106 /* ----------------------------------------------- */ 97 /* ----------------------------------------------- */
107 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 98 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
108}; 99};
109static const u32 onebyte_has_modrm[256 / 32] = {
110 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
111 /* ----------------------------------------------- */
112 W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */
113 W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */
114 W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */
115 W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */
116 W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
117 W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
118 W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */
119 W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */
120 W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
121 W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */
122 W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */
123 W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */
124 W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */
125 W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
126 W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */
127 W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) /* f0 */
128 /* ----------------------------------------------- */
129 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
130};
131static const u32 twobyte_has_modrm[256 / 32] = {
132 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
133 /* ----------------------------------------------- */
134 W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */
135 W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */
136 W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */
137 W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */
138 W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */
139 W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */
140 W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */
141 W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */
142 W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */
143 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */
144 W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */
145 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */
146 W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */
147 W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */
148 W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */
149 W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* ff */
150 /* ----------------------------------------------- */
151 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
152};
153#undef W 100#undef W
154 101
155struct kretprobe_blackpoint kretprobe_blacklist[] = { 102struct kretprobe_blackpoint kretprobe_blacklist[] = {
@@ -244,6 +191,75 @@ retry:
244 } 191 }
245} 192}
246 193
194/* Recover the probed instruction at addr for further analysis. */
195static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
196{
197 struct kprobe *kp;
198 kp = get_kprobe((void *)addr);
199 if (!kp)
200 return -EINVAL;
201
202 /*
203 * Basically, kp->ainsn.insn has an original instruction.
204 * However, RIP-relative instruction can not do single-stepping
205 * at different place, fix_riprel() tweaks the displacement of
206 * that instruction. In that case, we can't recover the instruction
207 * from the kp->ainsn.insn.
208 *
209 * On the other hand, kp->opcode has a copy of the first byte of
210 * the probed instruction, which is overwritten by int3. And
211 * the instruction at kp->addr is not modified by kprobes except
212 * for the first byte, we can recover the original instruction
213 * from it and kp->opcode.
214 */
215 memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
216 buf[0] = kp->opcode;
217 return 0;
218}
219
220/* Dummy buffers for kallsyms_lookup */
221static char __dummy_buf[KSYM_NAME_LEN];
222
223/* Check if paddr is at an instruction boundary */
224static int __kprobes can_probe(unsigned long paddr)
225{
226 int ret;
227 unsigned long addr, offset = 0;
228 struct insn insn;
229 kprobe_opcode_t buf[MAX_INSN_SIZE];
230
231 if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf))
232 return 0;
233
234 /* Decode instructions */
235 addr = paddr - offset;
236 while (addr < paddr) {
237 kernel_insn_init(&insn, (void *)addr);
238 insn_get_opcode(&insn);
239
240 /*
241 * Check if the instruction has been modified by another
242 * kprobe, in which case we replace the breakpoint by the
243 * original instruction in our buffer.
244 */
245 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
246 ret = recover_probed_instruction(buf, addr);
247 if (ret)
248 /*
249 * Another debugging subsystem might insert
250 * this breakpoint. In that case, we can't
251 * recover it.
252 */
253 return 0;
254 kernel_insn_init(&insn, buf);
255 }
256 insn_get_length(&insn);
257 addr += insn.length;
258 }
259
260 return (addr == paddr);
261}
262
247/* 263/*
248 * Returns non-zero if opcode modifies the interrupt flag. 264 * Returns non-zero if opcode modifies the interrupt flag.
249 */ 265 */
@@ -277,68 +293,30 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
277static void __kprobes fix_riprel(struct kprobe *p) 293static void __kprobes fix_riprel(struct kprobe *p)
278{ 294{
279#ifdef CONFIG_X86_64 295#ifdef CONFIG_X86_64
280 u8 *insn = p->ainsn.insn; 296 struct insn insn;
281 s64 disp; 297 kernel_insn_init(&insn, p->ainsn.insn);
282 int need_modrm;
283
284 /* Skip legacy instruction prefixes. */
285 while (1) {
286 switch (*insn) {
287 case 0x66:
288 case 0x67:
289 case 0x2e:
290 case 0x3e:
291 case 0x26:
292 case 0x64:
293 case 0x65:
294 case 0x36:
295 case 0xf0:
296 case 0xf3:
297 case 0xf2:
298 ++insn;
299 continue;
300 }
301 break;
302 }
303 298
304 /* Skip REX instruction prefix. */ 299 if (insn_rip_relative(&insn)) {
305 if (is_REX_prefix(insn)) 300 s64 newdisp;
306 ++insn; 301 u8 *disp;
307 302 insn_get_displacement(&insn);
308 if (*insn == 0x0f) { 303 /*
309 /* Two-byte opcode. */ 304 * The copied instruction uses the %rip-relative addressing
310 ++insn; 305 * mode. Adjust the displacement for the difference between
311 need_modrm = test_bit(*insn, 306 * the original location of this instruction and the location
312 (unsigned long *)twobyte_has_modrm); 307 * of the copy that will actually be run. The tricky bit here
313 } else 308 * is making sure that the sign extension happens correctly in
314 /* One-byte opcode. */ 309 * this calculation, since we need a signed 32-bit result to
315 need_modrm = test_bit(*insn, 310 * be sign-extended to 64 bits when it's added to the %rip
316 (unsigned long *)onebyte_has_modrm); 311 * value and yield the same 64-bit result that the sign-
317 312 * extension of the original signed 32-bit displacement would
318 if (need_modrm) { 313 * have given.
319 u8 modrm = *++insn; 314 */
320 if ((modrm & 0xc7) == 0x05) { 315 newdisp = (u8 *) p->addr + (s64) insn.displacement.value -
321 /* %rip+disp32 addressing mode */ 316 (u8 *) p->ainsn.insn;
322 /* Displacement follows ModRM byte. */ 317 BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */
323 ++insn; 318 disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn);
324 /* 319 *(s32 *) disp = (s32) newdisp;
325 * The copied instruction uses the %rip-relative
326 * addressing mode. Adjust the displacement for the
327 * difference between the original location of this
328 * instruction and the location of the copy that will
329 * actually be run. The tricky bit here is making sure
330 * that the sign extension happens correctly in this
331 * calculation, since we need a signed 32-bit result to
332 * be sign-extended to 64 bits when it's added to the
333 * %rip value and yield the same 64-bit result that the
334 * sign-extension of the original signed 32-bit
335 * displacement would have given.
336 */
337 disp = (u8 *) p->addr + *((s32 *) insn) -
338 (u8 *) p->ainsn.insn;
339 BUG_ON((s64) (s32) disp != disp); /* Sanity check. */
340 *(s32 *)insn = (s32) disp;
341 }
342 } 320 }
343#endif 321#endif
344} 322}
@@ -359,6 +337,8 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p)
359 337
360int __kprobes arch_prepare_kprobe(struct kprobe *p) 338int __kprobes arch_prepare_kprobe(struct kprobe *p)
361{ 339{
340 if (!can_probe((unsigned long)p->addr))
341 return -EILSEQ;
362 /* insn: must be on special executable page on x86. */ 342 /* insn: must be on special executable page on x86. */
363 p->ainsn.insn = get_insn_slot(); 343 p->ainsn.insn = get_insn_slot();
364 if (!p->ainsn.insn) 344 if (!p->ainsn.insn)
@@ -472,17 +452,6 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
472{ 452{
473 switch (kcb->kprobe_status) { 453 switch (kcb->kprobe_status) {
474 case KPROBE_HIT_SSDONE: 454 case KPROBE_HIT_SSDONE:
475#ifdef CONFIG_X86_64
476 /* TODO: Provide re-entrancy from post_kprobes_handler() and
477 * avoid exception stack corruption while single-stepping on
478 * the instruction of the new probe.
479 */
480 arch_disarm_kprobe(p);
481 regs->ip = (unsigned long)p->addr;
482 reset_current_kprobe();
483 preempt_enable_no_resched();
484 break;
485#endif
486 case KPROBE_HIT_ACTIVE: 455 case KPROBE_HIT_ACTIVE:
487 save_previous_kprobe(kcb); 456 save_previous_kprobe(kcb);
488 set_current_kprobe(p, regs, kcb); 457 set_current_kprobe(p, regs, kcb);
@@ -491,18 +460,16 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
491 kcb->kprobe_status = KPROBE_REENTER; 460 kcb->kprobe_status = KPROBE_REENTER;
492 break; 461 break;
493 case KPROBE_HIT_SS: 462 case KPROBE_HIT_SS:
494 if (p == kprobe_running()) { 463 /* A probe has been hit in the codepath leading up to, or just
495 regs->flags &= ~X86_EFLAGS_TF; 464 * after, single-stepping of a probed instruction. This entire
496 regs->flags |= kcb->kprobe_saved_flags; 465 * codepath should strictly reside in .kprobes.text section.
497 return 0; 466 * Raise a BUG or we'll continue in an endless reentering loop
498 } else { 467 * and eventually a stack overflow.
499 /* A probe has been hit in the codepath leading up 468 */
500 * to, or just after, single-stepping of a probed 469 printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n",
501 * instruction. This entire codepath should strictly 470 p->addr);
502 * reside in .kprobes.text section. Raise a warning 471 dump_kprobe(p);
503 * to highlight this peculiar case. 472 BUG();
504 */
505 }
506 default: 473 default:
507 /* impossible cases */ 474 /* impossible cases */
508 WARN_ON(1); 475 WARN_ON(1);
@@ -514,7 +481,7 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
514 481
515/* 482/*
516 * Interrupts are disabled on entry as trap3 is an interrupt gate and they 483 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
517 * remain disabled thorough out this function. 484 * remain disabled throughout this function.
518 */ 485 */
519static int __kprobes kprobe_handler(struct pt_regs *regs) 486static int __kprobes kprobe_handler(struct pt_regs *regs)
520{ 487{
@@ -851,7 +818,7 @@ no_change:
851 818
852/* 819/*
853 * Interrupts are disabled on entry as trap1 is an interrupt gate and they 820 * Interrupts are disabled on entry as trap1 is an interrupt gate and they
854 * remain disabled thoroughout this function. 821 * remain disabled throughout this function.
855 */ 822 */
856static int __kprobes post_kprobe_handler(struct pt_regs *regs) 823static int __kprobes post_kprobe_handler(struct pt_regs *regs)
857{ 824{
@@ -967,8 +934,14 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
967 ret = NOTIFY_STOP; 934 ret = NOTIFY_STOP;
968 break; 935 break;
969 case DIE_DEBUG: 936 case DIE_DEBUG:
970 if (post_kprobe_handler(args->regs)) 937 if (post_kprobe_handler(args->regs)) {
938 /*
939 * Reset the BS bit in dr6 (pointed by args->err) to
940 * denote completion of processing
941 */
942 (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
971 ret = NOTIFY_STOP; 943 ret = NOTIFY_STOP;
944 }
972 break; 945 break;
973 case DIE_GPF: 946 case DIE_GPF:
974 /* 947 /*
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index c1c429d00130..a3fa43ba5d3b 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -25,6 +25,7 @@
25#include <asm/desc.h> 25#include <asm/desc.h>
26#include <asm/system.h> 26#include <asm/system.h>
27#include <asm/cacheflush.h> 27#include <asm/cacheflush.h>
28#include <asm/debugreg.h>
28 29
29static void set_idt(void *newidt, __u16 limit) 30static void set_idt(void *newidt, __u16 limit)
30{ 31{
@@ -157,8 +158,7 @@ int machine_kexec_prepare(struct kimage *image)
157{ 158{
158 int error; 159 int error;
159 160
160 if (nx_enabled) 161 set_pages_x(image->control_code_page, 1);
161 set_pages_x(image->control_code_page, 1);
162 error = machine_kexec_alloc_page_tables(image); 162 error = machine_kexec_alloc_page_tables(image);
163 if (error) 163 if (error)
164 return error; 164 return error;
@@ -172,8 +172,7 @@ int machine_kexec_prepare(struct kimage *image)
172 */ 172 */
173void machine_kexec_cleanup(struct kimage *image) 173void machine_kexec_cleanup(struct kimage *image)
174{ 174{
175 if (nx_enabled) 175 set_pages_nx(image->control_code_page, 1);
176 set_pages_nx(image->control_code_page, 1);
177 machine_kexec_free_page_tables(image); 176 machine_kexec_free_page_tables(image);
178} 177}
179 178
@@ -202,6 +201,7 @@ void machine_kexec(struct kimage *image)
202 201
203 /* Interrupts aren't acceptable while we reboot */ 202 /* Interrupts aren't acceptable while we reboot */
204 local_irq_disable(); 203 local_irq_disable();
204 hw_breakpoint_disable();
205 205
206 if (image->preserve_context) { 206 if (image->preserve_context) {
207#ifdef CONFIG_X86_IO_APIC 207#ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 84c3bf209e98..4a8bb82248ae 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -18,6 +18,7 @@
18#include <asm/pgtable.h> 18#include <asm/pgtable.h>
19#include <asm/tlbflush.h> 19#include <asm/tlbflush.h>
20#include <asm/mmu_context.h> 20#include <asm/mmu_context.h>
21#include <asm/debugreg.h>
21 22
22static int init_one_level2_page(struct kimage *image, pgd_t *pgd, 23static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
23 unsigned long addr) 24 unsigned long addr)
@@ -282,6 +283,7 @@ void machine_kexec(struct kimage *image)
282 283
283 /* Interrupts aren't acceptable while we reboot */ 284 /* Interrupts aren't acceptable while we reboot */
284 local_irq_disable(); 285 local_irq_disable();
286 hw_breakpoint_disable();
285 287
286 if (image->preserve_context) { 288 if (image->preserve_context) {
287#ifdef CONFIG_X86_IO_APIC 289#ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
deleted file mode 100644
index 2a62d843f015..000000000000
--- a/arch/x86/kernel/mfgpt_32.c
+++ /dev/null
@@ -1,410 +0,0 @@
1/*
2 * Driver/API for AMD Geode Multi-Function General Purpose Timers (MFGPT)
3 *
4 * Copyright (C) 2006, Advanced Micro Devices, Inc.
5 * Copyright (C) 2007, Andres Salomon <dilinger@debian.org>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of version 2 of the GNU General Public License
9 * as published by the Free Software Foundation.
10 *
11 * The MFGPTs are documented in AMD Geode CS5536 Companion Device Data Book.
12 */
13
14/*
15 * We are using the 32.768kHz input clock - it's the only one that has the
16 * ranges we find desirable. The following table lists the suitable
17 * divisors and the associated Hz, minimum interval and the maximum interval:
18 *
19 * Divisor Hz Min Delta (s) Max Delta (s)
20 * 1 32768 .00048828125 2.000
21 * 2 16384 .0009765625 4.000
22 * 4 8192 .001953125 8.000
23 * 8 4096 .00390625 16.000
24 * 16 2048 .0078125 32.000
25 * 32 1024 .015625 64.000
26 * 64 512 .03125 128.000
27 * 128 256 .0625 256.000
28 * 256 128 .125 512.000
29 */
30
31#include <linux/kernel.h>
32#include <linux/interrupt.h>
33#include <linux/module.h>
34#include <asm/geode.h>
35
36#define MFGPT_DEFAULT_IRQ 7
37
38static struct mfgpt_timer_t {
39 unsigned int avail:1;
40} mfgpt_timers[MFGPT_MAX_TIMERS];
41
42/* Selected from the table above */
43
44#define MFGPT_DIVISOR 16
45#define MFGPT_SCALE 4 /* divisor = 2^(scale) */
46#define MFGPT_HZ (32768 / MFGPT_DIVISOR)
47#define MFGPT_PERIODIC (MFGPT_HZ / HZ)
48
49/* Allow for disabling of MFGPTs */
50static int disable;
51static int __init mfgpt_disable(char *s)
52{
53 disable = 1;
54 return 1;
55}
56__setup("nomfgpt", mfgpt_disable);
57
58/* Reset the MFGPT timers. This is required by some broken BIOSes which already
59 * do the same and leave the system in an unstable state. TinyBIOS 0.98 is
60 * affected at least (0.99 is OK with MFGPT workaround left to off).
61 */
62static int __init mfgpt_fix(char *s)
63{
64 u32 val, dummy;
65
66 /* The following udocumented bit resets the MFGPT timers */
67 val = 0xFF; dummy = 0;
68 wrmsr(MSR_MFGPT_SETUP, val, dummy);
69 return 1;
70}
71__setup("mfgptfix", mfgpt_fix);
72
73/*
74 * Check whether any MFGPTs are available for the kernel to use. In most
75 * cases, firmware that uses AMD's VSA code will claim all timers during
76 * bootup; we certainly don't want to take them if they're already in use.
77 * In other cases (such as with VSAless OpenFirmware), the system firmware
78 * leaves timers available for us to use.
79 */
80
81
82static int timers = -1;
83
84static void geode_mfgpt_detect(void)
85{
86 int i;
87 u16 val;
88
89 timers = 0;
90
91 if (disable) {
92 printk(KERN_INFO "geode-mfgpt: MFGPT support is disabled\n");
93 goto done;
94 }
95
96 if (!geode_get_dev_base(GEODE_DEV_MFGPT)) {
97 printk(KERN_INFO "geode-mfgpt: MFGPT LBAR is not set up\n");
98 goto done;
99 }
100
101 for (i = 0; i < MFGPT_MAX_TIMERS; i++) {
102 val = geode_mfgpt_read(i, MFGPT_REG_SETUP);
103 if (!(val & MFGPT_SETUP_SETUP)) {
104 mfgpt_timers[i].avail = 1;
105 timers++;
106 }
107 }
108
109done:
110 printk(KERN_INFO "geode-mfgpt: %d MFGPT timers available.\n", timers);
111}
112
113int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable)
114{
115 u32 msr, mask, value, dummy;
116 int shift = (cmp == MFGPT_CMP1) ? 0 : 8;
117
118 if (timer < 0 || timer >= MFGPT_MAX_TIMERS)
119 return -EIO;
120
121 /*
122 * The register maps for these are described in sections 6.17.1.x of
123 * the AMD Geode CS5536 Companion Device Data Book.
124 */
125 switch (event) {
126 case MFGPT_EVENT_RESET:
127 /*
128 * XXX: According to the docs, we cannot reset timers above
129 * 6; that is, resets for 7 and 8 will be ignored. Is this
130 * a problem? -dilinger
131 */
132 msr = MSR_MFGPT_NR;
133 mask = 1 << (timer + 24);
134 break;
135
136 case MFGPT_EVENT_NMI:
137 msr = MSR_MFGPT_NR;
138 mask = 1 << (timer + shift);
139 break;
140
141 case MFGPT_EVENT_IRQ:
142 msr = MSR_MFGPT_IRQ;
143 mask = 1 << (timer + shift);
144 break;
145
146 default:
147 return -EIO;
148 }
149
150 rdmsr(msr, value, dummy);
151
152 if (enable)
153 value |= mask;
154 else
155 value &= ~mask;
156
157 wrmsr(msr, value, dummy);
158 return 0;
159}
160EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event);
161
162int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable)
163{
164 u32 zsel, lpc, dummy;
165 int shift;
166
167 if (timer < 0 || timer >= MFGPT_MAX_TIMERS)
168 return -EIO;
169
170 /*
171 * Unfortunately, MFGPTs come in pairs sharing their IRQ lines. If VSA
172 * is using the same CMP of the timer's Siamese twin, the IRQ is set to
173 * 2, and we mustn't use nor change it.
174 * XXX: Likewise, 2 Linux drivers might clash if the 2nd overwrites the
175 * IRQ of the 1st. This can only happen if forcing an IRQ, calling this
176 * with *irq==0 is safe. Currently there _are_ no 2 drivers.
177 */
178 rdmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
179 shift = ((cmp == MFGPT_CMP1 ? 0 : 4) + timer % 4) * 4;
180 if (((zsel >> shift) & 0xF) == 2)
181 return -EIO;
182
183 /* Choose IRQ: if none supplied, keep IRQ already set or use default */
184 if (!*irq)
185 *irq = (zsel >> shift) & 0xF;
186 if (!*irq)
187 *irq = MFGPT_DEFAULT_IRQ;
188
189 /* Can't use IRQ if it's 0 (=disabled), 2, or routed to LPC */
190 if (*irq < 1 || *irq == 2 || *irq > 15)
191 return -EIO;
192 rdmsr(MSR_PIC_IRQM_LPC, lpc, dummy);
193 if (lpc & (1 << *irq))
194 return -EIO;
195
196 /* All chosen and checked - go for it */
197 if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable))
198 return -EIO;
199 if (enable) {
200 zsel = (zsel & ~(0xF << shift)) | (*irq << shift);
201 wrmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
202 }
203
204 return 0;
205}
206
207static int mfgpt_get(int timer)
208{
209 mfgpt_timers[timer].avail = 0;
210 printk(KERN_INFO "geode-mfgpt: Registered timer %d\n", timer);
211 return timer;
212}
213
214int geode_mfgpt_alloc_timer(int timer, int domain)
215{
216 int i;
217
218 if (timers == -1) {
219 /* timers haven't been detected yet */
220 geode_mfgpt_detect();
221 }
222
223 if (!timers)
224 return -1;
225
226 if (timer >= MFGPT_MAX_TIMERS)
227 return -1;
228
229 if (timer < 0) {
230 /* Try to find an available timer */
231 for (i = 0; i < MFGPT_MAX_TIMERS; i++) {
232 if (mfgpt_timers[i].avail)
233 return mfgpt_get(i);
234
235 if (i == 5 && domain == MFGPT_DOMAIN_WORKING)
236 break;
237 }
238 } else {
239 /* If they requested a specific timer, try to honor that */
240 if (mfgpt_timers[timer].avail)
241 return mfgpt_get(timer);
242 }
243
244 /* No timers available - too bad */
245 return -1;
246}
247EXPORT_SYMBOL_GPL(geode_mfgpt_alloc_timer);
248
249
250#ifdef CONFIG_GEODE_MFGPT_TIMER
251
252/*
253 * The MFPGT timers on the CS5536 provide us with suitable timers to use
254 * as clock event sources - not as good as a HPET or APIC, but certainly
255 * better than the PIT. This isn't a general purpose MFGPT driver, but
256 * a simplified one designed specifically to act as a clock event source.
257 * For full details about the MFGPT, please consult the CS5536 data sheet.
258 */
259
260#include <linux/clocksource.h>
261#include <linux/clockchips.h>
262
263static unsigned int mfgpt_tick_mode = CLOCK_EVT_MODE_SHUTDOWN;
264static u16 mfgpt_event_clock;
265
266static int irq;
267static int __init mfgpt_setup(char *str)
268{
269 get_option(&str, &irq);
270 return 1;
271}
272__setup("mfgpt_irq=", mfgpt_setup);
273
274static void mfgpt_disable_timer(u16 clock)
275{
276 /* avoid races by clearing CMP1 and CMP2 unconditionally */
277 geode_mfgpt_write(clock, MFGPT_REG_SETUP, (u16) ~MFGPT_SETUP_CNTEN |
278 MFGPT_SETUP_CMP1 | MFGPT_SETUP_CMP2);
279}
280
281static int mfgpt_next_event(unsigned long, struct clock_event_device *);
282static void mfgpt_set_mode(enum clock_event_mode, struct clock_event_device *);
283
284static struct clock_event_device mfgpt_clockevent = {
285 .name = "mfgpt-timer",
286 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
287 .set_mode = mfgpt_set_mode,
288 .set_next_event = mfgpt_next_event,
289 .rating = 250,
290 .cpumask = cpu_all_mask,
291 .shift = 32
292};
293
294static void mfgpt_start_timer(u16 delta)
295{
296 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_CMP2, (u16) delta);
297 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_COUNTER, 0);
298
299 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP,
300 MFGPT_SETUP_CNTEN | MFGPT_SETUP_CMP2);
301}
302
303static void mfgpt_set_mode(enum clock_event_mode mode,
304 struct clock_event_device *evt)
305{
306 mfgpt_disable_timer(mfgpt_event_clock);
307
308 if (mode == CLOCK_EVT_MODE_PERIODIC)
309 mfgpt_start_timer(MFGPT_PERIODIC);
310
311 mfgpt_tick_mode = mode;
312}
313
314static int mfgpt_next_event(unsigned long delta, struct clock_event_device *evt)
315{
316 mfgpt_start_timer(delta);
317 return 0;
318}
319
320static irqreturn_t mfgpt_tick(int irq, void *dev_id)
321{
322 u16 val = geode_mfgpt_read(mfgpt_event_clock, MFGPT_REG_SETUP);
323
324 /* See if the interrupt was for us */
325 if (!(val & (MFGPT_SETUP_SETUP | MFGPT_SETUP_CMP2 | MFGPT_SETUP_CMP1)))
326 return IRQ_NONE;
327
328 /* Turn off the clock (and clear the event) */
329 mfgpt_disable_timer(mfgpt_event_clock);
330
331 if (mfgpt_tick_mode == CLOCK_EVT_MODE_SHUTDOWN)
332 return IRQ_HANDLED;
333
334 /* Clear the counter */
335 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_COUNTER, 0);
336
337 /* Restart the clock in periodic mode */
338
339 if (mfgpt_tick_mode == CLOCK_EVT_MODE_PERIODIC) {
340 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP,
341 MFGPT_SETUP_CNTEN | MFGPT_SETUP_CMP2);
342 }
343
344 mfgpt_clockevent.event_handler(&mfgpt_clockevent);
345 return IRQ_HANDLED;
346}
347
348static struct irqaction mfgptirq = {
349 .handler = mfgpt_tick,
350 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
351 .name = "mfgpt-timer"
352};
353
354int __init mfgpt_timer_setup(void)
355{
356 int timer, ret;
357 u16 val;
358
359 timer = geode_mfgpt_alloc_timer(MFGPT_TIMER_ANY, MFGPT_DOMAIN_WORKING);
360 if (timer < 0) {
361 printk(KERN_ERR
362 "mfgpt-timer: Could not allocate a MFPGT timer\n");
363 return -ENODEV;
364 }
365
366 mfgpt_event_clock = timer;
367
368 /* Set up the IRQ on the MFGPT side */
369 if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, &irq)) {
370 printk(KERN_ERR "mfgpt-timer: Could not set up IRQ %d\n", irq);
371 return -EIO;
372 }
373
374 /* And register it with the kernel */
375 ret = setup_irq(irq, &mfgptirq);
376
377 if (ret) {
378 printk(KERN_ERR
379 "mfgpt-timer: Unable to set up the interrupt.\n");
380 goto err;
381 }
382
383 /* Set the clock scale and enable the event mode for CMP2 */
384 val = MFGPT_SCALE | (3 << 8);
385
386 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP, val);
387
388 /* Set up the clock event */
389 mfgpt_clockevent.mult = div_sc(MFGPT_HZ, NSEC_PER_SEC,
390 mfgpt_clockevent.shift);
391 mfgpt_clockevent.min_delta_ns = clockevent_delta2ns(0xF,
392 &mfgpt_clockevent);
393 mfgpt_clockevent.max_delta_ns = clockevent_delta2ns(0xFFFE,
394 &mfgpt_clockevent);
395
396 printk(KERN_INFO
397 "mfgpt-timer: Registering MFGPT timer %d as a clock event, using IRQ %d\n",
398 timer, irq);
399 clockevents_register_device(&mfgpt_clockevent);
400
401 return 0;
402
403err:
404 geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, &irq);
405 printk(KERN_ERR
406 "mfgpt-timer: Unable to set up the MFGPT clock source\n");
407 return -EIO;
408}
409
410#endif
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 366baa179913..37542b67c57e 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -13,6 +13,9 @@
13 * Licensed under the terms of the GNU General Public 13 * Licensed under the terms of the GNU General Public
14 * License version 2. See file COPYING for details. 14 * License version 2. See file COPYING for details.
15 */ 15 */
16
17#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
18
16#include <linux/firmware.h> 19#include <linux/firmware.h>
17#include <linux/pci_ids.h> 20#include <linux/pci_ids.h>
18#include <linux/uaccess.h> 21#include <linux/uaccess.h>
@@ -33,6 +36,9 @@ MODULE_LICENSE("GPL v2");
33#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 36#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
34#define UCODE_UCODE_TYPE 0x00000001 37#define UCODE_UCODE_TYPE 0x00000001
35 38
39const struct firmware *firmware;
40static int supported_cpu;
41
36struct equiv_cpu_entry { 42struct equiv_cpu_entry {
37 u32 installed_cpu; 43 u32 installed_cpu;
38 u32 fixed_errata_mask; 44 u32 fixed_errata_mask;
@@ -71,17 +77,14 @@ static struct equiv_cpu_entry *equiv_cpu_table;
71 77
72static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) 78static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
73{ 79{
74 struct cpuinfo_x86 *c = &cpu_data(cpu);
75 u32 dummy; 80 u32 dummy;
76 81
77 memset(csig, 0, sizeof(*csig)); 82 if (!supported_cpu)
78 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
79 printk(KERN_WARNING "microcode: CPU%d: AMD CPU family 0x%x not "
80 "supported\n", cpu, c->x86);
81 return -1; 83 return -1;
82 } 84
85 memset(csig, 0, sizeof(*csig));
83 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); 86 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
84 printk(KERN_INFO "microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev); 87 pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev);
85 return 0; 88 return 0;
86} 89}
87 90
@@ -103,23 +106,16 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
103 i++; 106 i++;
104 } 107 }
105 108
106 if (!equiv_cpu_id) { 109 if (!equiv_cpu_id)
107 printk(KERN_WARNING "microcode: CPU%d: cpu revision "
108 "not listed in equivalent cpu table\n", cpu);
109 return 0; 110 return 0;
110 }
111 111
112 if (mc_header->processor_rev_id != equiv_cpu_id) { 112 if (mc_header->processor_rev_id != equiv_cpu_id)
113 printk(KERN_ERR "microcode: CPU%d: patch mismatch "
114 "(processor_rev_id: %x, equiv_cpu_id: %x)\n",
115 cpu, mc_header->processor_rev_id, equiv_cpu_id);
116 return 0; 113 return 0;
117 }
118 114
119 /* ucode might be chipset specific -- currently we don't support this */ 115 /* ucode might be chipset specific -- currently we don't support this */
120 if (mc_header->nb_dev_id || mc_header->sb_dev_id) { 116 if (mc_header->nb_dev_id || mc_header->sb_dev_id) {
121 printk(KERN_ERR "microcode: CPU%d: loading of chipset " 117 pr_err("CPU%d: loading of chipset specific code not yet supported\n",
122 "specific code not yet supported\n", cpu); 118 cpu);
123 return 0; 119 return 0;
124 } 120 }
125 121
@@ -148,14 +144,12 @@ static int apply_microcode_amd(int cpu)
148 144
149 /* check current patch id and patch's id for match */ 145 /* check current patch id and patch's id for match */
150 if (rev != mc_amd->hdr.patch_id) { 146 if (rev != mc_amd->hdr.patch_id) {
151 printk(KERN_ERR "microcode: CPU%d: update failed " 147 pr_err("CPU%d: update failed (for patch_level=0x%x)\n",
152 "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); 148 cpu, mc_amd->hdr.patch_id);
153 return -1; 149 return -1;
154 } 150 }
155 151
156 printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n", 152 pr_info("CPU%d: updated (new patch_level=0x%x)\n", cpu, rev);
157 cpu, rev);
158
159 uci->cpu_sig.rev = rev; 153 uci->cpu_sig.rev = rev;
160 154
161 return 0; 155 return 0;
@@ -178,18 +172,14 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
178 return NULL; 172 return NULL;
179 173
180 if (section_hdr[0] != UCODE_UCODE_TYPE) { 174 if (section_hdr[0] != UCODE_UCODE_TYPE) {
181 printk(KERN_ERR "microcode: error: invalid type field in " 175 pr_err("error: invalid type field in container file section header\n");
182 "container file section header\n");
183 return NULL; 176 return NULL;
184 } 177 }
185 178
186 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); 179 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8));
187 180
188 printk(KERN_DEBUG "microcode: size %u, total_size %u\n",
189 size, total_size);
190
191 if (total_size > size || total_size > UCODE_MAX_SIZE) { 181 if (total_size > size || total_size > UCODE_MAX_SIZE) {
192 printk(KERN_ERR "microcode: error: size mismatch\n"); 182 pr_err("error: size mismatch\n");
193 return NULL; 183 return NULL;
194 } 184 }
195 185
@@ -218,15 +208,13 @@ static int install_equiv_cpu_table(const u8 *buf)
218 size = buf_pos[2]; 208 size = buf_pos[2];
219 209
220 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { 210 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
221 printk(KERN_ERR "microcode: error: invalid type field in " 211 pr_err("error: invalid type field in container file section header\n");
222 "container file section header\n");
223 return 0; 212 return 0;
224 } 213 }
225 214
226 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); 215 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size);
227 if (!equiv_cpu_table) { 216 if (!equiv_cpu_table) {
228 printk(KERN_ERR "microcode: failed to allocate " 217 pr_err("failed to allocate equivalent CPU table\n");
229 "equivalent CPU table\n");
230 return 0; 218 return 0;
231 } 219 }
232 220
@@ -259,8 +247,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
259 247
260 offset = install_equiv_cpu_table(ucode_ptr); 248 offset = install_equiv_cpu_table(ucode_ptr);
261 if (!offset) { 249 if (!offset) {
262 printk(KERN_ERR "microcode: failed to create " 250 pr_err("failed to create equivalent cpu table\n");
263 "equivalent cpu table\n");
264 return UCODE_ERROR; 251 return UCODE_ERROR;
265 } 252 }
266 253
@@ -291,8 +278,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
291 if (!leftover) { 278 if (!leftover) {
292 vfree(uci->mc); 279 vfree(uci->mc);
293 uci->mc = new_mc; 280 uci->mc = new_mc;
294 pr_debug("microcode: CPU%d found a matching microcode " 281 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
295 "update with version 0x%x (current=0x%x)\n",
296 cpu, new_rev, uci->cpu_sig.rev); 282 cpu, new_rev, uci->cpu_sig.rev);
297 } else { 283 } else {
298 vfree(new_mc); 284 vfree(new_mc);
@@ -308,27 +294,26 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
308 294
309static enum ucode_state request_microcode_fw(int cpu, struct device *device) 295static enum ucode_state request_microcode_fw(int cpu, struct device *device)
310{ 296{
311 const char *fw_name = "amd-ucode/microcode_amd.bin";
312 const struct firmware *firmware;
313 enum ucode_state ret; 297 enum ucode_state ret;
314 298
315 if (request_firmware(&firmware, fw_name, device)) { 299 if (firmware == NULL)
316 printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
317 return UCODE_NFOUND; 300 return UCODE_NFOUND;
301
302 if (*(u32 *)firmware->data != UCODE_MAGIC) {
303 pr_err("invalid UCODE_MAGIC (0x%08x)\n",
304 *(u32 *)firmware->data);
305 return UCODE_ERROR;
318 } 306 }
319 307
320 ret = generic_load_microcode(cpu, firmware->data, firmware->size); 308 ret = generic_load_microcode(cpu, firmware->data, firmware->size);
321 309
322 release_firmware(firmware);
323
324 return ret; 310 return ret;
325} 311}
326 312
327static enum ucode_state 313static enum ucode_state
328request_microcode_user(int cpu, const void __user *buf, size_t size) 314request_microcode_user(int cpu, const void __user *buf, size_t size)
329{ 315{
330 printk(KERN_INFO "microcode: AMD microcode update via " 316 pr_info("AMD microcode update via /dev/cpu/microcode not supported\n");
331 "/dev/cpu/microcode not supported\n");
332 return UCODE_ERROR; 317 return UCODE_ERROR;
333} 318}
334 319
@@ -340,7 +325,31 @@ static void microcode_fini_cpu_amd(int cpu)
340 uci->mc = NULL; 325 uci->mc = NULL;
341} 326}
342 327
328void init_microcode_amd(struct device *device)
329{
330 const char *fw_name = "amd-ucode/microcode_amd.bin";
331 struct cpuinfo_x86 *c = &boot_cpu_data;
332
333 WARN_ON(c->x86_vendor != X86_VENDOR_AMD);
334
335 if (c->x86 < 0x10) {
336 pr_warning("AMD CPU family 0x%x not supported\n", c->x86);
337 return;
338 }
339 supported_cpu = 1;
340
341 if (request_firmware(&firmware, fw_name, device))
342 pr_err("failed to load file %s\n", fw_name);
343}
344
345void fini_microcode_amd(void)
346{
347 release_firmware(firmware);
348}
349
343static struct microcode_ops microcode_amd_ops = { 350static struct microcode_ops microcode_amd_ops = {
351 .init = init_microcode_amd,
352 .fini = fini_microcode_amd,
344 .request_microcode_user = request_microcode_user, 353 .request_microcode_user = request_microcode_user,
345 .request_microcode_fw = request_microcode_fw, 354 .request_microcode_fw = request_microcode_fw,
346 .collect_cpu_info = collect_cpu_info_amd, 355 .collect_cpu_info = collect_cpu_info_amd,
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 378e9a8f1bf8..0c8632433090 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -70,10 +70,12 @@
70 * Fix sigmatch() macro to handle old CPUs with pf == 0. 70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug. 71 * Thanks to Stuart Swales for pointing out this bug.
72 */ 72 */
73
74#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
75
73#include <linux/platform_device.h> 76#include <linux/platform_device.h>
74#include <linux/miscdevice.h> 77#include <linux/miscdevice.h>
75#include <linux/capability.h> 78#include <linux/capability.h>
76#include <linux/smp_lock.h>
77#include <linux/kernel.h> 79#include <linux/kernel.h>
78#include <linux/module.h> 80#include <linux/module.h>
79#include <linux/mutex.h> 81#include <linux/mutex.h>
@@ -201,7 +203,6 @@ static int do_microcode_update(const void __user *buf, size_t size)
201 203
202static int microcode_open(struct inode *unused1, struct file *unused2) 204static int microcode_open(struct inode *unused1, struct file *unused2)
203{ 205{
204 cycle_kernel_lock();
205 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; 206 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
206} 207}
207 208
@@ -211,7 +212,7 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
211 ssize_t ret = -EINVAL; 212 ssize_t ret = -EINVAL;
212 213
213 if ((len >> PAGE_SHIFT) > totalram_pages) { 214 if ((len >> PAGE_SHIFT) > totalram_pages) {
214 pr_err("microcode: too much data (max %ld pages)\n", totalram_pages); 215 pr_err("too much data (max %ld pages)\n", totalram_pages);
215 return ret; 216 return ret;
216 } 217 }
217 218
@@ -246,7 +247,7 @@ static int __init microcode_dev_init(void)
246 247
247 error = misc_register(&microcode_dev); 248 error = misc_register(&microcode_dev);
248 if (error) { 249 if (error) {
249 pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR); 250 pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR);
250 return error; 251 return error;
251 } 252 }
252 253
@@ -361,7 +362,7 @@ static enum ucode_state microcode_resume_cpu(int cpu)
361 if (!uci->mc) 362 if (!uci->mc)
362 return UCODE_NFOUND; 363 return UCODE_NFOUND;
363 364
364 pr_debug("microcode: CPU%d updated upon resume\n", cpu); 365 pr_debug("CPU%d updated upon resume\n", cpu);
365 apply_microcode_on_target(cpu); 366 apply_microcode_on_target(cpu);
366 367
367 return UCODE_OK; 368 return UCODE_OK;
@@ -381,7 +382,7 @@ static enum ucode_state microcode_init_cpu(int cpu)
381 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev); 382 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
382 383
383 if (ustate == UCODE_OK) { 384 if (ustate == UCODE_OK) {
384 pr_debug("microcode: CPU%d updated upon init\n", cpu); 385 pr_debug("CPU%d updated upon init\n", cpu);
385 apply_microcode_on_target(cpu); 386 apply_microcode_on_target(cpu);
386 } 387 }
387 388
@@ -408,7 +409,7 @@ static int mc_sysdev_add(struct sys_device *sys_dev)
408 if (!cpu_online(cpu)) 409 if (!cpu_online(cpu))
409 return 0; 410 return 0;
410 411
411 pr_debug("microcode: CPU%d added\n", cpu); 412 pr_debug("CPU%d added\n", cpu);
412 413
413 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); 414 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
414 if (err) 415 if (err)
@@ -427,7 +428,7 @@ static int mc_sysdev_remove(struct sys_device *sys_dev)
427 if (!cpu_online(cpu)) 428 if (!cpu_online(cpu))
428 return 0; 429 return 0;
429 430
430 pr_debug("microcode: CPU%d removed\n", cpu); 431 pr_debug("CPU%d removed\n", cpu);
431 microcode_fini_cpu(cpu); 432 microcode_fini_cpu(cpu);
432 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); 433 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
433 return 0; 434 return 0;
@@ -475,15 +476,15 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
475 microcode_update_cpu(cpu); 476 microcode_update_cpu(cpu);
476 case CPU_DOWN_FAILED: 477 case CPU_DOWN_FAILED:
477 case CPU_DOWN_FAILED_FROZEN: 478 case CPU_DOWN_FAILED_FROZEN:
478 pr_debug("microcode: CPU%d added\n", cpu); 479 pr_debug("CPU%d added\n", cpu);
479 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) 480 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
480 pr_err("microcode: Failed to create group for CPU%d\n", cpu); 481 pr_err("Failed to create group for CPU%d\n", cpu);
481 break; 482 break;
482 case CPU_DOWN_PREPARE: 483 case CPU_DOWN_PREPARE:
483 case CPU_DOWN_PREPARE_FROZEN: 484 case CPU_DOWN_PREPARE_FROZEN:
484 /* Suspend is in progress, only remove the interface */ 485 /* Suspend is in progress, only remove the interface */
485 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); 486 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
486 pr_debug("microcode: CPU%d removed\n", cpu); 487 pr_debug("CPU%d removed\n", cpu);
487 break; 488 break;
488 case CPU_DEAD: 489 case CPU_DEAD:
489 case CPU_UP_CANCELED_FROZEN: 490 case CPU_UP_CANCELED_FROZEN:
@@ -509,7 +510,7 @@ static int __init microcode_init(void)
509 microcode_ops = init_amd_microcode(); 510 microcode_ops = init_amd_microcode();
510 511
511 if (!microcode_ops) { 512 if (!microcode_ops) {
512 pr_err("microcode: no support for this CPU vendor\n"); 513 pr_err("no support for this CPU vendor\n");
513 return -ENODEV; 514 return -ENODEV;
514 } 515 }
515 516
@@ -520,6 +521,9 @@ static int __init microcode_init(void)
520 return PTR_ERR(microcode_pdev); 521 return PTR_ERR(microcode_pdev);
521 } 522 }
522 523
524 if (microcode_ops->init)
525 microcode_ops->init(&microcode_pdev->dev);
526
523 get_online_cpus(); 527 get_online_cpus();
524 mutex_lock(&microcode_mutex); 528 mutex_lock(&microcode_mutex);
525 529
@@ -540,8 +544,7 @@ static int __init microcode_init(void)
540 register_hotcpu_notifier(&mc_cpu_notifier); 544 register_hotcpu_notifier(&mc_cpu_notifier);
541 545
542 pr_info("Microcode Update Driver: v" MICROCODE_VERSION 546 pr_info("Microcode Update Driver: v" MICROCODE_VERSION
543 " <tigran@aivazian.fsnet.co.uk>," 547 " <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n");
544 " Peter Oruba\n");
545 548
546 return 0; 549 return 0;
547} 550}
@@ -563,6 +566,9 @@ static void __exit microcode_exit(void)
563 566
564 platform_device_unregister(microcode_pdev); 567 platform_device_unregister(microcode_pdev);
565 568
569 if (microcode_ops->fini)
570 microcode_ops->fini();
571
566 microcode_ops = NULL; 572 microcode_ops = NULL;
567 573
568 pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); 574 pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 0d334ddd0a96..ebd193e476ca 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -70,6 +70,9 @@
70 * Fix sigmatch() macro to handle old CPUs with pf == 0. 70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug. 71 * Thanks to Stuart Swales for pointing out this bug.
72 */ 72 */
73
74#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
75
73#include <linux/firmware.h> 76#include <linux/firmware.h>
74#include <linux/uaccess.h> 77#include <linux/uaccess.h>
75#include <linux/kernel.h> 78#include <linux/kernel.h>
@@ -146,8 +149,7 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
146 149
147 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || 150 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
148 cpu_has(c, X86_FEATURE_IA64)) { 151 cpu_has(c, X86_FEATURE_IA64)) {
149 printk(KERN_ERR "microcode: CPU%d not a capable Intel " 152 pr_err("CPU%d not a capable Intel processor\n", cpu_num);
150 "processor\n", cpu_num);
151 return -1; 153 return -1;
152 } 154 }
153 155
@@ -165,8 +167,8 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
165 /* get the current revision from MSR 0x8B */ 167 /* get the current revision from MSR 0x8B */
166 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); 168 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
167 169
168 printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", 170 pr_info("CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n",
169 cpu_num, csig->sig, csig->pf, csig->rev); 171 cpu_num, csig->sig, csig->pf, csig->rev);
170 172
171 return 0; 173 return 0;
172} 174}
@@ -194,28 +196,24 @@ static int microcode_sanity_check(void *mc)
194 data_size = get_datasize(mc_header); 196 data_size = get_datasize(mc_header);
195 197
196 if (data_size + MC_HEADER_SIZE > total_size) { 198 if (data_size + MC_HEADER_SIZE > total_size) {
197 printk(KERN_ERR "microcode: error! " 199 pr_err("error! Bad data size in microcode data file\n");
198 "Bad data size in microcode data file\n");
199 return -EINVAL; 200 return -EINVAL;
200 } 201 }
201 202
202 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { 203 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
203 printk(KERN_ERR "microcode: error! " 204 pr_err("error! Unknown microcode update format\n");
204 "Unknown microcode update format\n");
205 return -EINVAL; 205 return -EINVAL;
206 } 206 }
207 ext_table_size = total_size - (MC_HEADER_SIZE + data_size); 207 ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
208 if (ext_table_size) { 208 if (ext_table_size) {
209 if ((ext_table_size < EXT_HEADER_SIZE) 209 if ((ext_table_size < EXT_HEADER_SIZE)
210 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { 210 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
211 printk(KERN_ERR "microcode: error! " 211 pr_err("error! Small exttable size in microcode data file\n");
212 "Small exttable size in microcode data file\n");
213 return -EINVAL; 212 return -EINVAL;
214 } 213 }
215 ext_header = mc + MC_HEADER_SIZE + data_size; 214 ext_header = mc + MC_HEADER_SIZE + data_size;
216 if (ext_table_size != exttable_size(ext_header)) { 215 if (ext_table_size != exttable_size(ext_header)) {
217 printk(KERN_ERR "microcode: error! " 216 pr_err("error! Bad exttable size in microcode data file\n");
218 "Bad exttable size in microcode data file\n");
219 return -EFAULT; 217 return -EFAULT;
220 } 218 }
221 ext_sigcount = ext_header->count; 219 ext_sigcount = ext_header->count;
@@ -230,8 +228,7 @@ static int microcode_sanity_check(void *mc)
230 while (i--) 228 while (i--)
231 ext_table_sum += ext_tablep[i]; 229 ext_table_sum += ext_tablep[i];
232 if (ext_table_sum) { 230 if (ext_table_sum) {
233 printk(KERN_WARNING "microcode: aborting, " 231 pr_warning("aborting, bad extended signature table checksum\n");
234 "bad extended signature table checksum\n");
235 return -EINVAL; 232 return -EINVAL;
236 } 233 }
237 } 234 }
@@ -242,7 +239,7 @@ static int microcode_sanity_check(void *mc)
242 while (i--) 239 while (i--)
243 orig_sum += ((int *)mc)[i]; 240 orig_sum += ((int *)mc)[i];
244 if (orig_sum) { 241 if (orig_sum) {
245 printk(KERN_ERR "microcode: aborting, bad checksum\n"); 242 pr_err("aborting, bad checksum\n");
246 return -EINVAL; 243 return -EINVAL;
247 } 244 }
248 if (!ext_table_size) 245 if (!ext_table_size)
@@ -255,7 +252,7 @@ static int microcode_sanity_check(void *mc)
255 - (mc_header->sig + mc_header->pf + mc_header->cksum) 252 - (mc_header->sig + mc_header->pf + mc_header->cksum)
256 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); 253 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
257 if (sum) { 254 if (sum) {
258 printk(KERN_ERR "microcode: aborting, bad checksum\n"); 255 pr_err("aborting, bad checksum\n");
259 return -EINVAL; 256 return -EINVAL;
260 } 257 }
261 } 258 }
@@ -327,13 +324,11 @@ static int apply_microcode(int cpu)
327 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); 324 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
328 325
329 if (val[1] != mc_intel->hdr.rev) { 326 if (val[1] != mc_intel->hdr.rev) {
330 printk(KERN_ERR "microcode: CPU%d update " 327 pr_err("CPU%d update to revision 0x%x failed\n",
331 "to revision 0x%x failed\n", 328 cpu_num, mc_intel->hdr.rev);
332 cpu_num, mc_intel->hdr.rev);
333 return -1; 329 return -1;
334 } 330 }
335 printk(KERN_INFO "microcode: CPU%d updated to revision " 331 pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x \n",
336 "0x%x, date = %04x-%02x-%02x \n",
337 cpu_num, val[1], 332 cpu_num, val[1],
338 mc_intel->hdr.date & 0xffff, 333 mc_intel->hdr.date & 0xffff,
339 mc_intel->hdr.date >> 24, 334 mc_intel->hdr.date >> 24,
@@ -362,8 +357,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
362 357
363 mc_size = get_totalsize(&mc_header); 358 mc_size = get_totalsize(&mc_header);
364 if (!mc_size || mc_size > leftover) { 359 if (!mc_size || mc_size > leftover) {
365 printk(KERN_ERR "microcode: error!" 360 pr_err("error! Bad data in microcode data file\n");
366 "Bad data in microcode data file\n");
367 break; 361 break;
368 } 362 }
369 363
@@ -405,9 +399,8 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
405 vfree(uci->mc); 399 vfree(uci->mc);
406 uci->mc = (struct microcode_intel *)new_mc; 400 uci->mc = (struct microcode_intel *)new_mc;
407 401
408 pr_debug("microcode: CPU%d found a matching microcode update with" 402 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
409 " version 0x%x (current=0x%x)\n", 403 cpu, new_rev, uci->cpu_sig.rev);
410 cpu, new_rev, uci->cpu_sig.rev);
411out: 404out:
412 return state; 405 return state;
413} 406}
@@ -429,7 +422,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device)
429 c->x86, c->x86_model, c->x86_mask); 422 c->x86, c->x86_model, c->x86_mask);
430 423
431 if (request_firmware(&firmware, name, device)) { 424 if (request_firmware(&firmware, name, device)) {
432 pr_debug("microcode: data file %s load failed\n", name); 425 pr_debug("data file %s load failed\n", name);
433 return UCODE_NFOUND; 426 return UCODE_NFOUND;
434 } 427 }
435 428
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 5be95ef4ffec..40b54ceb68b5 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -667,36 +667,18 @@ void __init default_get_smp_config(unsigned int early)
667 */ 667 */
668} 668}
669 669
670static void __init smp_reserve_bootmem(struct mpf_intel *mpf) 670static void __init smp_reserve_memory(struct mpf_intel *mpf)
671{ 671{
672 unsigned long size = get_mpc_size(mpf->physptr); 672 unsigned long size = get_mpc_size(mpf->physptr);
673#ifdef CONFIG_X86_32
674 /*
675 * We cannot access to MPC table to compute table size yet,
676 * as only few megabytes from the bottom is mapped now.
677 * PC-9800's MPC table places on the very last of physical
678 * memory; so that simply reserving PAGE_SIZE from mpf->physptr
679 * yields BUG() in reserve_bootmem.
680 * also need to make sure physptr is below than max_low_pfn
681 * we don't need reserve the area above max_low_pfn
682 */
683 unsigned long end = max_low_pfn * PAGE_SIZE;
684 673
685 if (mpf->physptr < end) { 674 reserve_early(mpf->physptr, mpf->physptr+size, "MP-table mpc");
686 if (mpf->physptr + size > end)
687 size = end - mpf->physptr;
688 reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
689 }
690#else
691 reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
692#endif
693} 675}
694 676
695static int __init smp_scan_config(unsigned long base, unsigned long length, 677static int __init smp_scan_config(unsigned long base, unsigned long length)
696 unsigned reserve)
697{ 678{
698 unsigned int *bp = phys_to_virt(base); 679 unsigned int *bp = phys_to_virt(base);
699 struct mpf_intel *mpf; 680 struct mpf_intel *mpf;
681 unsigned long mem;
700 682
701 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n", 683 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
702 bp, length); 684 bp, length);
@@ -717,12 +699,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
717 printk(KERN_INFO "found SMP MP-table at [%p] %llx\n", 699 printk(KERN_INFO "found SMP MP-table at [%p] %llx\n",
718 mpf, (u64)virt_to_phys(mpf)); 700 mpf, (u64)virt_to_phys(mpf));
719 701
720 if (!reserve) 702 mem = virt_to_phys(mpf);
721 return 1; 703 reserve_early(mem, mem + sizeof(*mpf), "MP-table mpf");
722 reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
723 BOOTMEM_DEFAULT);
724 if (mpf->physptr) 704 if (mpf->physptr)
725 smp_reserve_bootmem(mpf); 705 smp_reserve_memory(mpf);
726 706
727 return 1; 707 return 1;
728 } 708 }
@@ -732,7 +712,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
732 return 0; 712 return 0;
733} 713}
734 714
735void __init default_find_smp_config(unsigned int reserve) 715void __init default_find_smp_config(void)
736{ 716{
737 unsigned int address; 717 unsigned int address;
738 718
@@ -744,9 +724,9 @@ void __init default_find_smp_config(unsigned int reserve)
744 * 2) Scan the top 1K of base RAM 724 * 2) Scan the top 1K of base RAM
745 * 3) Scan the 64K of bios 725 * 3) Scan the 64K of bios
746 */ 726 */
747 if (smp_scan_config(0x0, 0x400, reserve) || 727 if (smp_scan_config(0x0, 0x400) ||
748 smp_scan_config(639 * 0x400, 0x400, reserve) || 728 smp_scan_config(639 * 0x400, 0x400) ||
749 smp_scan_config(0xF0000, 0x10000, reserve)) 729 smp_scan_config(0xF0000, 0x10000))
750 return; 730 return;
751 /* 731 /*
752 * If it is an SMP machine we should know now, unless the 732 * If it is an SMP machine we should know now, unless the
@@ -767,7 +747,7 @@ void __init default_find_smp_config(unsigned int reserve)
767 747
768 address = get_bios_ebda(); 748 address = get_bios_ebda();
769 if (address) 749 if (address)
770 smp_scan_config(address, 0x400, reserve); 750 smp_scan_config(address, 0x400);
771} 751}
772 752
773#ifdef CONFIG_X86_IO_APIC 753#ifdef CONFIG_X86_IO_APIC
@@ -965,9 +945,6 @@ void __init early_reserve_e820_mpc_new(void)
965{ 945{
966 if (enable_update_mptable && alloc_mptable) { 946 if (enable_update_mptable && alloc_mptable) {
967 u64 startt = 0; 947 u64 startt = 0;
968#ifdef CONFIG_X86_TRAMPOLINE
969 startt = TRAMPOLINE_BASE;
970#endif
971 mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4); 948 mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
972 } 949 }
973} 950}
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 6a3cefc7dda1..4bd93c9b2b27 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -172,23 +172,18 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
172 172
173static int msr_open(struct inode *inode, struct file *file) 173static int msr_open(struct inode *inode, struct file *file)
174{ 174{
175 unsigned int cpu = iminor(file->f_path.dentry->d_inode); 175 unsigned int cpu;
176 struct cpuinfo_x86 *c = &cpu_data(cpu); 176 struct cpuinfo_x86 *c;
177 int ret = 0;
178 177
179 lock_kernel();
180 cpu = iminor(file->f_path.dentry->d_inode); 178 cpu = iminor(file->f_path.dentry->d_inode);
179 if (cpu >= nr_cpu_ids || !cpu_online(cpu))
180 return -ENXIO; /* No such CPU */
181 181
182 if (cpu >= nr_cpu_ids || !cpu_online(cpu)) {
183 ret = -ENXIO; /* No such CPU */
184 goto out;
185 }
186 c = &cpu_data(cpu); 182 c = &cpu_data(cpu);
187 if (!cpu_has(c, X86_FEATURE_MSR)) 183 if (!cpu_has(c, X86_FEATURE_MSR))
188 ret = -EIO; /* MSR not supported */ 184 return -EIO; /* MSR not supported */
189out: 185
190 unlock_kernel(); 186 return 0;
191 return ret;
192} 187}
193 188
194/* 189/*
@@ -251,7 +246,7 @@ static int __init msr_init(void)
251 int i, err = 0; 246 int i, err = 0;
252 i = 0; 247 i = 0;
253 248
254 if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) { 249 if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) {
255 printk(KERN_ERR "msr: unable to get major %d for msr\n", 250 printk(KERN_ERR "msr: unable to get major %d for msr\n",
256 MSR_MAJOR); 251 MSR_MAJOR);
257 err = -EBUSY; 252 err = -EBUSY;
@@ -279,7 +274,7 @@ out_class:
279 msr_device_destroy(i); 274 msr_device_destroy(i);
280 class_destroy(msr_class); 275 class_destroy(msr_class);
281out_chrdev: 276out_chrdev:
282 unregister_chrdev(MSR_MAJOR, "cpu/msr"); 277 __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
283out: 278out:
284 return err; 279 return err;
285} 280}
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 4006c522adc7..9d1d263f786f 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -212,7 +212,7 @@ static int __init olpc_init(void)
212 unsigned char *romsig; 212 unsigned char *romsig;
213 213
214 /* The ioremap check is dangerous; limit what we run it on */ 214 /* The ioremap check is dangerous; limit what we run it on */
215 if (!is_geode() || geode_has_vsa2()) 215 if (!is_geode() || cs5535_has_vsa2())
216 return 0; 216 return 0;
217 217
218 spin_lock_init(&ec_lock); 218 spin_lock_init(&ec_lock);
@@ -244,7 +244,7 @@ static int __init olpc_init(void)
244 (unsigned char *) &olpc_platform_info.ecver, 1); 244 (unsigned char *) &olpc_platform_info.ecver, 1);
245 245
246 /* check to see if the VSA exists */ 246 /* check to see if the VSA exists */
247 if (geode_has_vsa2()) 247 if (cs5535_has_vsa2())
248 olpc_platform_info.flags |= OLPC_F_VSA; 248 olpc_platform_info.flags |= OLPC_F_VSA;
249 249
250 printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n", 250 printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n",
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 3a7c5a44082e..676b8c77a976 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -8,9 +8,9 @@
8#include <asm/paravirt.h> 8#include <asm/paravirt.h>
9 9
10static inline void 10static inline void
11default_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags) 11default_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
12{ 12{
13 __raw_spin_lock(lock); 13 arch_spin_lock(lock);
14} 14}
15 15
16struct pv_lock_ops pv_lock_ops = { 16struct pv_lock_ops pv_lock_ops = {
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 971a3bec47a8..2bbde6078143 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -31,7 +31,7 @@
31#include <linux/string.h> 31#include <linux/string.h>
32#include <linux/crash_dump.h> 32#include <linux/crash_dump.h>
33#include <linux/dma-mapping.h> 33#include <linux/dma-mapping.h>
34#include <linux/bitops.h> 34#include <linux/bitmap.h>
35#include <linux/pci_ids.h> 35#include <linux/pci_ids.h>
36#include <linux/pci.h> 36#include <linux/pci.h>
37#include <linux/delay.h> 37#include <linux/delay.h>
@@ -46,6 +46,7 @@
46#include <asm/dma.h> 46#include <asm/dma.h>
47#include <asm/rio.h> 47#include <asm/rio.h>
48#include <asm/bios_ebda.h> 48#include <asm/bios_ebda.h>
49#include <asm/x86_init.h>
49 50
50#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT 51#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
51int use_calgary __read_mostly = 1; 52int use_calgary __read_mostly = 1;
@@ -211,7 +212,7 @@ static void iommu_range_reserve(struct iommu_table *tbl,
211 212
212 spin_lock_irqsave(&tbl->it_lock, flags); 213 spin_lock_irqsave(&tbl->it_lock, flags);
213 214
214 iommu_area_reserve(tbl->it_map, index, npages); 215 bitmap_set(tbl->it_map, index, npages);
215 216
216 spin_unlock_irqrestore(&tbl->it_lock, flags); 217 spin_unlock_irqrestore(&tbl->it_lock, flags);
217} 218}
@@ -244,7 +245,7 @@ static unsigned long iommu_range_alloc(struct device *dev,
244 if (panic_on_overflow) 245 if (panic_on_overflow)
245 panic("Calgary: fix the allocator.\n"); 246 panic("Calgary: fix the allocator.\n");
246 else 247 else
247 return bad_dma_address; 248 return DMA_ERROR_CODE;
248 } 249 }
249 } 250 }
250 251
@@ -260,12 +261,15 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
260 void *vaddr, unsigned int npages, int direction) 261 void *vaddr, unsigned int npages, int direction)
261{ 262{
262 unsigned long entry; 263 unsigned long entry;
263 dma_addr_t ret = bad_dma_address; 264 dma_addr_t ret;
264 265
265 entry = iommu_range_alloc(dev, tbl, npages); 266 entry = iommu_range_alloc(dev, tbl, npages);
266 267
267 if (unlikely(entry == bad_dma_address)) 268 if (unlikely(entry == DMA_ERROR_CODE)) {
268 goto error; 269 printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
270 "iommu %p\n", npages, tbl);
271 return DMA_ERROR_CODE;
272 }
269 273
270 /* set the return dma address */ 274 /* set the return dma address */
271 ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK); 275 ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK);
@@ -273,13 +277,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
273 /* put the TCEs in the HW table */ 277 /* put the TCEs in the HW table */
274 tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK, 278 tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK,
275 direction); 279 direction);
276
277 return ret; 280 return ret;
278
279error:
280 printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
281 "iommu %p\n", npages, tbl);
282 return bad_dma_address;
283} 281}
284 282
285static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, 283static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
@@ -290,8 +288,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
290 unsigned long flags; 288 unsigned long flags;
291 289
292 /* were we called with bad_dma_address? */ 290 /* were we called with bad_dma_address? */
293 badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); 291 badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE);
294 if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { 292 if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) {
295 WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " 293 WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA "
296 "address 0x%Lx\n", dma_addr); 294 "address 0x%Lx\n", dma_addr);
297 return; 295 return;
@@ -305,7 +303,7 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
305 303
306 spin_lock_irqsave(&tbl->it_lock, flags); 304 spin_lock_irqsave(&tbl->it_lock, flags);
307 305
308 iommu_area_free(tbl->it_map, entry, npages); 306 bitmap_clear(tbl->it_map, entry, npages);
309 307
310 spin_unlock_irqrestore(&tbl->it_lock, flags); 308 spin_unlock_irqrestore(&tbl->it_lock, flags);
311} 309}
@@ -318,13 +316,15 @@ static inline struct iommu_table *find_iommu_table(struct device *dev)
318 316
319 pdev = to_pci_dev(dev); 317 pdev = to_pci_dev(dev);
320 318
319 /* search up the device tree for an iommu */
321 pbus = pdev->bus; 320 pbus = pdev->bus;
322 321 do {
323 /* is the device behind a bridge? Look for the root bus */ 322 tbl = pci_iommu(pbus);
324 while (pbus->parent) 323 if (tbl && tbl->it_busno == pbus->number)
324 break;
325 tbl = NULL;
325 pbus = pbus->parent; 326 pbus = pbus->parent;
326 327 } while (pbus);
327 tbl = pci_iommu(pbus);
328 328
329 BUG_ON(tbl && (tbl->it_busno != pbus->number)); 329 BUG_ON(tbl && (tbl->it_busno != pbus->number));
330 330
@@ -373,7 +373,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
373 npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE); 373 npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE);
374 374
375 entry = iommu_range_alloc(dev, tbl, npages); 375 entry = iommu_range_alloc(dev, tbl, npages);
376 if (entry == bad_dma_address) { 376 if (entry == DMA_ERROR_CODE) {
377 /* makes sure unmap knows to stop */ 377 /* makes sure unmap knows to stop */
378 s->dma_length = 0; 378 s->dma_length = 0;
379 goto error; 379 goto error;
@@ -391,7 +391,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
391error: 391error:
392 calgary_unmap_sg(dev, sg, nelems, dir, NULL); 392 calgary_unmap_sg(dev, sg, nelems, dir, NULL);
393 for_each_sg(sg, s, nelems, i) { 393 for_each_sg(sg, s, nelems, i) {
394 sg->dma_address = bad_dma_address; 394 sg->dma_address = DMA_ERROR_CODE;
395 sg->dma_length = 0; 395 sg->dma_length = 0;
396 } 396 }
397 return 0; 397 return 0;
@@ -446,7 +446,7 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
446 446
447 /* set up tces to cover the allocated range */ 447 /* set up tces to cover the allocated range */
448 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); 448 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
449 if (mapping == bad_dma_address) 449 if (mapping == DMA_ERROR_CODE)
450 goto free; 450 goto free;
451 *dma_handle = mapping; 451 *dma_handle = mapping;
452 return ret; 452 return ret;
@@ -727,7 +727,7 @@ static void __init calgary_reserve_regions(struct pci_dev *dev)
727 struct iommu_table *tbl = pci_iommu(dev->bus); 727 struct iommu_table *tbl = pci_iommu(dev->bus);
728 728
729 /* reserve EMERGENCY_PAGES from bad_dma_address and up */ 729 /* reserve EMERGENCY_PAGES from bad_dma_address and up */
730 iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES); 730 iommu_range_reserve(tbl, DMA_ERROR_CODE, EMERGENCY_PAGES);
731 731
732 /* avoid the BIOS/VGA first 640KB-1MB region */ 732 /* avoid the BIOS/VGA first 640KB-1MB region */
733 /* for CalIOC2 - avoid the entire first MB */ 733 /* for CalIOC2 - avoid the entire first MB */
@@ -1344,6 +1344,23 @@ static void __init get_tce_space_from_tar(void)
1344 return; 1344 return;
1345} 1345}
1346 1346
1347static int __init calgary_iommu_init(void)
1348{
1349 int ret;
1350
1351 /* ok, we're trying to use Calgary - let's roll */
1352 printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
1353
1354 ret = calgary_init();
1355 if (ret) {
1356 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
1357 "falling back to no_iommu\n", ret);
1358 return ret;
1359 }
1360
1361 return 0;
1362}
1363
1347void __init detect_calgary(void) 1364void __init detect_calgary(void)
1348{ 1365{
1349 int bus; 1366 int bus;
@@ -1357,7 +1374,7 @@ void __init detect_calgary(void)
1357 * if the user specified iommu=off or iommu=soft or we found 1374 * if the user specified iommu=off or iommu=soft or we found
1358 * another HW IOMMU already, bail out. 1375 * another HW IOMMU already, bail out.
1359 */ 1376 */
1360 if (swiotlb || no_iommu || iommu_detected) 1377 if (no_iommu || iommu_detected)
1361 return; 1378 return;
1362 1379
1363 if (!use_calgary) 1380 if (!use_calgary)
@@ -1442,9 +1459,7 @@ void __init detect_calgary(void)
1442 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", 1459 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
1443 specified_table_size); 1460 specified_table_size);
1444 1461
1445 /* swiotlb for devices that aren't behind the Calgary. */ 1462 x86_init.iommu.iommu_init = calgary_iommu_init;
1446 if (max_pfn > MAX_DMA32_PFN)
1447 swiotlb = 1;
1448 } 1463 }
1449 return; 1464 return;
1450 1465
@@ -1457,35 +1472,6 @@ cleanup:
1457 } 1472 }
1458} 1473}
1459 1474
1460int __init calgary_iommu_init(void)
1461{
1462 int ret;
1463
1464 if (no_iommu || (swiotlb && !calgary_detected))
1465 return -ENODEV;
1466
1467 if (!calgary_detected)
1468 return -ENODEV;
1469
1470 /* ok, we're trying to use Calgary - let's roll */
1471 printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
1472
1473 ret = calgary_init();
1474 if (ret) {
1475 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
1476 "falling back to no_iommu\n", ret);
1477 return ret;
1478 }
1479
1480 force_iommu = 1;
1481 bad_dma_address = 0x0;
1482 /* dma_ops is set to swiotlb or nommu */
1483 if (!dma_ops)
1484 dma_ops = &nommu_dma_ops;
1485
1486 return 0;
1487}
1488
1489static int __init calgary_parse_options(char *p) 1475static int __init calgary_parse_options(char *p)
1490{ 1476{
1491 unsigned int bridge; 1477 unsigned int bridge;
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 64b838eac18c..75e14e21f61a 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -11,10 +11,11 @@
11#include <asm/gart.h> 11#include <asm/gart.h>
12#include <asm/calgary.h> 12#include <asm/calgary.h>
13#include <asm/amd_iommu.h> 13#include <asm/amd_iommu.h>
14#include <asm/x86_init.h>
14 15
15static int forbid_dac __read_mostly; 16static int forbid_dac __read_mostly;
16 17
17struct dma_map_ops *dma_ops; 18struct dma_map_ops *dma_ops = &nommu_dma_ops;
18EXPORT_SYMBOL(dma_ops); 19EXPORT_SYMBOL(dma_ops);
19 20
20static int iommu_sac_force __read_mostly; 21static int iommu_sac_force __read_mostly;
@@ -35,22 +36,17 @@ int iommu_detected __read_mostly = 0;
35 36
36/* 37/*
37 * This variable becomes 1 if iommu=pt is passed on the kernel command line. 38 * This variable becomes 1 if iommu=pt is passed on the kernel command line.
38 * If this variable is 1, IOMMU implementations do no DMA ranslation for 39 * If this variable is 1, IOMMU implementations do no DMA translation for
39 * devices and allow every device to access to whole physical memory. This is 40 * devices and allow every device to access to whole physical memory. This is
40 * useful if a user want to use an IOMMU only for KVM device assignment to 41 * useful if a user want to use an IOMMU only for KVM device assignment to
41 * guests and not for driver dma translation. 42 * guests and not for driver dma translation.
42 */ 43 */
43int iommu_pass_through __read_mostly; 44int iommu_pass_through __read_mostly;
44 45
45dma_addr_t bad_dma_address __read_mostly = 0; 46/* Dummy device used for NULL arguments (normally ISA). */
46EXPORT_SYMBOL(bad_dma_address);
47
48/* Dummy device used for NULL arguments (normally ISA). Better would
49 be probably a smaller DMA mask, but this is bug-to-bug compatible
50 to older i386. */
51struct device x86_dma_fallback_dev = { 47struct device x86_dma_fallback_dev = {
52 .init_name = "fallback device", 48 .init_name = "fallback device",
53 .coherent_dma_mask = DMA_BIT_MASK(32), 49 .coherent_dma_mask = ISA_DMA_BIT_MASK,
54 .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask, 50 .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask,
55}; 51};
56EXPORT_SYMBOL(x86_dma_fallback_dev); 52EXPORT_SYMBOL(x86_dma_fallback_dev);
@@ -128,19 +124,18 @@ void __init pci_iommu_alloc(void)
128 /* free the range so iommu could get some range less than 4G */ 124 /* free the range so iommu could get some range less than 4G */
129 dma32_free_bootmem(); 125 dma32_free_bootmem();
130#endif 126#endif
127 if (pci_swiotlb_detect())
128 goto out;
131 129
132 /*
133 * The order of these functions is important for
134 * fall-back/fail-over reasons
135 */
136 gart_iommu_hole_init(); 130 gart_iommu_hole_init();
137 131
138 detect_calgary(); 132 detect_calgary();
139 133
140 detect_intel_iommu(); 134 detect_intel_iommu();
141 135
136 /* needs to be called after gart_iommu_hole_init */
142 amd_iommu_detect(); 137 amd_iommu_detect();
143 138out:
144 pci_swiotlb_init(); 139 pci_swiotlb_init();
145} 140}
146 141
@@ -216,7 +211,7 @@ static __init int iommu_setup(char *p)
216 if (!strncmp(p, "allowdac", 8)) 211 if (!strncmp(p, "allowdac", 8))
217 forbid_dac = 0; 212 forbid_dac = 0;
218 if (!strncmp(p, "nodac", 5)) 213 if (!strncmp(p, "nodac", 5))
219 forbid_dac = -1; 214 forbid_dac = 1;
220 if (!strncmp(p, "usedac", 6)) { 215 if (!strncmp(p, "usedac", 6)) {
221 forbid_dac = -1; 216 forbid_dac = -1;
222 return 1; 217 return 1;
@@ -291,27 +286,19 @@ static int __init pci_iommu_init(void)
291#ifdef CONFIG_PCI 286#ifdef CONFIG_PCI
292 dma_debug_add_bus(&pci_bus_type); 287 dma_debug_add_bus(&pci_bus_type);
293#endif 288#endif
289 x86_init.iommu.iommu_init();
294 290
295 calgary_iommu_init(); 291 if (swiotlb) {
296 292 printk(KERN_INFO "PCI-DMA: "
297 intel_iommu_init(); 293 "Using software bounce buffering for IO (SWIOTLB)\n");
294 swiotlb_print_info();
295 } else
296 swiotlb_free();
298 297
299 amd_iommu_init();
300
301 gart_iommu_init();
302
303 no_iommu_init();
304 return 0; 298 return 0;
305} 299}
306
307void pci_iommu_shutdown(void)
308{
309 gart_iommu_shutdown();
310
311 amd_iommu_shutdown();
312}
313/* Must execute after PCI subsystem */ 300/* Must execute after PCI subsystem */
314fs_initcall(pci_iommu_init); 301rootfs_initcall(pci_iommu_init);
315 302
316#ifdef CONFIG_PCI 303#ifdef CONFIG_PCI
317/* Many VIA bridges seem to corrupt data for DAC. Disable it here */ 304/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 98a827ee9ed7..34de53b46f87 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -16,13 +16,14 @@
16#include <linux/agp_backend.h> 16#include <linux/agp_backend.h>
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/mm.h> 18#include <linux/mm.h>
19#include <linux/sched.h>
19#include <linux/string.h> 20#include <linux/string.h>
20#include <linux/spinlock.h> 21#include <linux/spinlock.h>
21#include <linux/pci.h> 22#include <linux/pci.h>
22#include <linux/module.h> 23#include <linux/module.h>
23#include <linux/topology.h> 24#include <linux/topology.h>
24#include <linux/interrupt.h> 25#include <linux/interrupt.h>
25#include <linux/bitops.h> 26#include <linux/bitmap.h>
26#include <linux/kdebug.h> 27#include <linux/kdebug.h>
27#include <linux/scatterlist.h> 28#include <linux/scatterlist.h>
28#include <linux/iommu-helper.h> 29#include <linux/iommu-helper.h>
@@ -38,6 +39,7 @@
38#include <asm/swiotlb.h> 39#include <asm/swiotlb.h>
39#include <asm/dma.h> 40#include <asm/dma.h>
40#include <asm/k8.h> 41#include <asm/k8.h>
42#include <asm/x86_init.h>
41 43
42static unsigned long iommu_bus_base; /* GART remapping area (physical) */ 44static unsigned long iommu_bus_base; /* GART remapping area (physical) */
43static unsigned long iommu_size; /* size of remapping area bytes */ 45static unsigned long iommu_size; /* size of remapping area bytes */
@@ -45,6 +47,8 @@ static unsigned long iommu_pages; /* .. and in pages */
45 47
46static u32 *iommu_gatt_base; /* Remapping table */ 48static u32 *iommu_gatt_base; /* Remapping table */
47 49
50static dma_addr_t bad_dma_addr;
51
48/* 52/*
49 * If this is disabled the IOMMU will use an optimized flushing strategy 53 * If this is disabled the IOMMU will use an optimized flushing strategy
50 * of only flushing when an mapping is reused. With it true the GART is 54 * of only flushing when an mapping is reused. With it true the GART is
@@ -91,7 +95,7 @@ static unsigned long alloc_iommu(struct device *dev, int size,
91 95
92 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), 96 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
93 PAGE_SIZE) >> PAGE_SHIFT; 97 PAGE_SIZE) >> PAGE_SHIFT;
94 boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1, 98 boundary_size = ALIGN((u64)dma_get_seg_boundary(dev) + 1,
95 PAGE_SIZE) >> PAGE_SHIFT; 99 PAGE_SIZE) >> PAGE_SHIFT;
96 100
97 spin_lock_irqsave(&iommu_bitmap_lock, flags); 101 spin_lock_irqsave(&iommu_bitmap_lock, flags);
@@ -122,7 +126,7 @@ static void free_iommu(unsigned long offset, int size)
122 unsigned long flags; 126 unsigned long flags;
123 127
124 spin_lock_irqsave(&iommu_bitmap_lock, flags); 128 spin_lock_irqsave(&iommu_bitmap_lock, flags);
125 iommu_area_free(iommu_gart_bitmap, offset, size); 129 bitmap_clear(iommu_gart_bitmap, offset, size);
126 if (offset >= next_bit) 130 if (offset >= next_bit)
127 next_bit = offset + size; 131 next_bit = offset + size;
128 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 132 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
@@ -215,7 +219,7 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
215 if (panic_on_overflow) 219 if (panic_on_overflow)
216 panic("dma_map_area overflow %lu bytes\n", size); 220 panic("dma_map_area overflow %lu bytes\n", size);
217 iommu_full(dev, size, dir); 221 iommu_full(dev, size, dir);
218 return bad_dma_address; 222 return bad_dma_addr;
219 } 223 }
220 224
221 for (i = 0; i < npages; i++) { 225 for (i = 0; i < npages; i++) {
@@ -293,7 +297,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
293 int i; 297 int i;
294 298
295#ifdef CONFIG_IOMMU_DEBUG 299#ifdef CONFIG_IOMMU_DEBUG
296 printk(KERN_DEBUG "dma_map_sg overflow\n"); 300 pr_debug("dma_map_sg overflow\n");
297#endif 301#endif
298 302
299 for_each_sg(sg, s, nents, i) { 303 for_each_sg(sg, s, nents, i) {
@@ -301,7 +305,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
301 305
302 if (nonforced_iommu(dev, addr, s->length)) { 306 if (nonforced_iommu(dev, addr, s->length)) {
303 addr = dma_map_area(dev, addr, s->length, dir, 0); 307 addr = dma_map_area(dev, addr, s->length, dir, 0);
304 if (addr == bad_dma_address) { 308 if (addr == bad_dma_addr) {
305 if (i > 0) 309 if (i > 0)
306 gart_unmap_sg(dev, sg, i, dir, NULL); 310 gart_unmap_sg(dev, sg, i, dir, NULL);
307 nents = 0; 311 nents = 0;
@@ -388,12 +392,14 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
388 if (!dev) 392 if (!dev)
389 dev = &x86_dma_fallback_dev; 393 dev = &x86_dma_fallback_dev;
390 394
391 out = 0; 395 out = 0;
392 start = 0; 396 start = 0;
393 start_sg = sgmap = sg; 397 start_sg = sg;
394 seg_size = 0; 398 sgmap = sg;
395 max_seg_size = dma_get_max_seg_size(dev); 399 seg_size = 0;
396 ps = NULL; /* shut up gcc */ 400 max_seg_size = dma_get_max_seg_size(dev);
401 ps = NULL; /* shut up gcc */
402
397 for_each_sg(sg, s, nents, i) { 403 for_each_sg(sg, s, nents, i) {
398 dma_addr_t addr = sg_phys(s); 404 dma_addr_t addr = sg_phys(s);
399 405
@@ -416,11 +422,12 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
416 sgmap, pages, need) < 0) 422 sgmap, pages, need) < 0)
417 goto error; 423 goto error;
418 out++; 424 out++;
419 seg_size = 0; 425
420 sgmap = sg_next(sgmap); 426 seg_size = 0;
421 pages = 0; 427 sgmap = sg_next(sgmap);
422 start = i; 428 pages = 0;
423 start_sg = s; 429 start = i;
430 start_sg = s;
424 } 431 }
425 } 432 }
426 433
@@ -454,7 +461,7 @@ error:
454 461
455 iommu_full(dev, pages << PAGE_SHIFT, dir); 462 iommu_full(dev, pages << PAGE_SHIFT, dir);
456 for_each_sg(sg, s, nents, i) 463 for_each_sg(sg, s, nents, i)
457 s->dma_address = bad_dma_address; 464 s->dma_address = bad_dma_addr;
458 return 0; 465 return 0;
459} 466}
460 467
@@ -478,7 +485,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
478 DMA_BIDIRECTIONAL, align_mask); 485 DMA_BIDIRECTIONAL, align_mask);
479 486
480 flush_gart(); 487 flush_gart();
481 if (paddr != bad_dma_address) { 488 if (paddr != bad_dma_addr) {
482 *dma_addr = paddr; 489 *dma_addr = paddr;
483 return page_address(page); 490 return page_address(page);
484 } 491 }
@@ -498,6 +505,11 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr,
498 free_pages((unsigned long)vaddr, get_order(size)); 505 free_pages((unsigned long)vaddr, get_order(size));
499} 506}
500 507
508static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr)
509{
510 return (dma_addr == bad_dma_addr);
511}
512
501static int no_agp; 513static int no_agp;
502 514
503static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) 515static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
@@ -514,7 +526,7 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
514 iommu_size -= round_up(a, PMD_PAGE_SIZE) - a; 526 iommu_size -= round_up(a, PMD_PAGE_SIZE) - a;
515 527
516 if (iommu_size < 64*1024*1024) { 528 if (iommu_size < 64*1024*1024) {
517 printk(KERN_WARNING 529 pr_warning(
518 "PCI-DMA: Warning: Small IOMMU %luMB." 530 "PCI-DMA: Warning: Small IOMMU %luMB."
519 " Consider increasing the AGP aperture in BIOS\n", 531 " Consider increasing the AGP aperture in BIOS\n",
520 iommu_size >> 20); 532 iommu_size >> 20);
@@ -569,28 +581,32 @@ void set_up_gart_resume(u32 aper_order, u32 aper_alloc)
569 aperture_alloc = aper_alloc; 581 aperture_alloc = aper_alloc;
570} 582}
571 583
572static int gart_resume(struct sys_device *dev) 584static void gart_fixup_northbridges(struct sys_device *dev)
573{ 585{
574 printk(KERN_INFO "PCI-DMA: Resuming GART IOMMU\n"); 586 int i;
575 587
576 if (fix_up_north_bridges) { 588 if (!fix_up_north_bridges)
577 int i; 589 return;
578 590
579 printk(KERN_INFO "PCI-DMA: Restoring GART aperture settings\n"); 591 pr_info("PCI-DMA: Restoring GART aperture settings\n");
580 592
581 for (i = 0; i < num_k8_northbridges; i++) { 593 for (i = 0; i < num_k8_northbridges; i++) {
582 struct pci_dev *dev = k8_northbridges[i]; 594 struct pci_dev *dev = k8_northbridges[i];
583 595
584 /* 596 /*
585 * Don't enable translations just yet. That is the next 597 * Don't enable translations just yet. That is the next
586 * step. Restore the pre-suspend aperture settings. 598 * step. Restore the pre-suspend aperture settings.
587 */ 599 */
588 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, 600 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1);
589 aperture_order << 1); 601 pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25);
590 pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE,
591 aperture_alloc >> 25);
592 }
593 } 602 }
603}
604
605static int gart_resume(struct sys_device *dev)
606{
607 pr_info("PCI-DMA: Resuming GART IOMMU\n");
608
609 gart_fixup_northbridges(dev);
594 610
595 enable_gart_translations(); 611 enable_gart_translations();
596 612
@@ -603,15 +619,14 @@ static int gart_suspend(struct sys_device *dev, pm_message_t state)
603} 619}
604 620
605static struct sysdev_class gart_sysdev_class = { 621static struct sysdev_class gart_sysdev_class = {
606 .name = "gart", 622 .name = "gart",
607 .suspend = gart_suspend, 623 .suspend = gart_suspend,
608 .resume = gart_resume, 624 .resume = gart_resume,
609 625
610}; 626};
611 627
612static struct sys_device device_gart = { 628static struct sys_device device_gart = {
613 .id = 0, 629 .cls = &gart_sysdev_class,
614 .cls = &gart_sysdev_class,
615}; 630};
616 631
617/* 632/*
@@ -626,7 +641,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
626 void *gatt; 641 void *gatt;
627 int i, error; 642 int i, error;
628 643
629 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); 644 pr_info("PCI-DMA: Disabling AGP.\n");
645
630 aper_size = aper_base = info->aper_size = 0; 646 aper_size = aper_base = info->aper_size = 0;
631 dev = NULL; 647 dev = NULL;
632 for (i = 0; i < num_k8_northbridges; i++) { 648 for (i = 0; i < num_k8_northbridges; i++) {
@@ -644,6 +660,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
644 } 660 }
645 if (!aper_base) 661 if (!aper_base)
646 goto nommu; 662 goto nommu;
663
647 info->aper_base = aper_base; 664 info->aper_base = aper_base;
648 info->aper_size = aper_size >> 20; 665 info->aper_size = aper_size >> 20;
649 666
@@ -666,14 +683,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
666 683
667 flush_gart(); 684 flush_gart();
668 685
669 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", 686 pr_info("PCI-DMA: aperture base @ %x size %u KB\n",
670 aper_base, aper_size>>10); 687 aper_base, aper_size>>10);
671 688
672 return 0; 689 return 0;
673 690
674 nommu: 691 nommu:
675 /* Should not happen anymore */ 692 /* Should not happen anymore */
676 printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n" 693 pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n"
677 "falling back to iommu=soft.\n"); 694 "falling back to iommu=soft.\n");
678 return -1; 695 return -1;
679} 696}
@@ -685,14 +702,16 @@ static struct dma_map_ops gart_dma_ops = {
685 .unmap_page = gart_unmap_page, 702 .unmap_page = gart_unmap_page,
686 .alloc_coherent = gart_alloc_coherent, 703 .alloc_coherent = gart_alloc_coherent,
687 .free_coherent = gart_free_coherent, 704 .free_coherent = gart_free_coherent,
705 .mapping_error = gart_mapping_error,
688}; 706};
689 707
690void gart_iommu_shutdown(void) 708static void gart_iommu_shutdown(void)
691{ 709{
692 struct pci_dev *dev; 710 struct pci_dev *dev;
693 int i; 711 int i;
694 712
695 if (no_agp && (dma_ops != &gart_dma_ops)) 713 /* don't shutdown it if there is AGP installed */
714 if (!no_agp)
696 return; 715 return;
697 716
698 for (i = 0; i < num_k8_northbridges; i++) { 717 for (i = 0; i < num_k8_northbridges; i++) {
@@ -707,7 +726,7 @@ void gart_iommu_shutdown(void)
707 } 726 }
708} 727}
709 728
710void __init gart_iommu_init(void) 729int __init gart_iommu_init(void)
711{ 730{
712 struct agp_kern_info info; 731 struct agp_kern_info info;
713 unsigned long iommu_start; 732 unsigned long iommu_start;
@@ -717,7 +736,7 @@ void __init gart_iommu_init(void)
717 long i; 736 long i;
718 737
719 if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) 738 if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0)
720 return; 739 return 0;
721 740
722#ifndef CONFIG_AGP_AMD64 741#ifndef CONFIG_AGP_AMD64
723 no_agp = 1; 742 no_agp = 1;
@@ -729,35 +748,28 @@ void __init gart_iommu_init(void)
729 (agp_copy_info(agp_bridge, &info) < 0); 748 (agp_copy_info(agp_bridge, &info) < 0);
730#endif 749#endif
731 750
732 if (swiotlb)
733 return;
734
735 /* Did we detect a different HW IOMMU? */
736 if (iommu_detected && !gart_iommu_aperture)
737 return;
738
739 if (no_iommu || 751 if (no_iommu ||
740 (!force_iommu && max_pfn <= MAX_DMA32_PFN) || 752 (!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
741 !gart_iommu_aperture || 753 !gart_iommu_aperture ||
742 (no_agp && init_k8_gatt(&info) < 0)) { 754 (no_agp && init_k8_gatt(&info) < 0)) {
743 if (max_pfn > MAX_DMA32_PFN) { 755 if (max_pfn > MAX_DMA32_PFN) {
744 printk(KERN_WARNING "More than 4GB of memory " 756 pr_warning("More than 4GB of memory but GART IOMMU not available.\n");
745 "but GART IOMMU not available.\n"); 757 pr_warning("falling back to iommu=soft.\n");
746 printk(KERN_WARNING "falling back to iommu=soft.\n");
747 } 758 }
748 return; 759 return 0;
749 } 760 }
750 761
751 /* need to map that range */ 762 /* need to map that range */
752 aper_size = info.aper_size << 20; 763 aper_size = info.aper_size << 20;
753 aper_base = info.aper_base; 764 aper_base = info.aper_base;
754 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); 765 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
766
755 if (end_pfn > max_low_pfn_mapped) { 767 if (end_pfn > max_low_pfn_mapped) {
756 start_pfn = (aper_base>>PAGE_SHIFT); 768 start_pfn = (aper_base>>PAGE_SHIFT);
757 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); 769 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
758 } 770 }
759 771
760 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); 772 pr_info("PCI-DMA: using GART IOMMU.\n");
761 iommu_size = check_iommu_size(info.aper_base, aper_size); 773 iommu_size = check_iommu_size(info.aper_base, aper_size);
762 iommu_pages = iommu_size >> PAGE_SHIFT; 774 iommu_pages = iommu_size >> PAGE_SHIFT;
763 775
@@ -772,8 +784,7 @@ void __init gart_iommu_init(void)
772 784
773 ret = dma_debug_resize_entries(iommu_pages); 785 ret = dma_debug_resize_entries(iommu_pages);
774 if (ret) 786 if (ret)
775 printk(KERN_DEBUG 787 pr_debug("PCI-DMA: Cannot trace all the entries\n");
776 "PCI-DMA: Cannot trace all the entries\n");
777 } 788 }
778#endif 789#endif
779 790
@@ -781,17 +792,16 @@ void __init gart_iommu_init(void)
781 * Out of IOMMU space handling. 792 * Out of IOMMU space handling.
782 * Reserve some invalid pages at the beginning of the GART. 793 * Reserve some invalid pages at the beginning of the GART.
783 */ 794 */
784 iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES); 795 bitmap_set(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
785 796
786 agp_memory_reserved = iommu_size; 797 pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
787 printk(KERN_INFO
788 "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
789 iommu_size >> 20); 798 iommu_size >> 20);
790 799
791 iommu_start = aper_size - iommu_size; 800 agp_memory_reserved = iommu_size;
792 iommu_bus_base = info.aper_base + iommu_start; 801 iommu_start = aper_size - iommu_size;
793 bad_dma_address = iommu_bus_base; 802 iommu_bus_base = info.aper_base + iommu_start;
794 iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); 803 bad_dma_addr = iommu_bus_base;
804 iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
795 805
796 /* 806 /*
797 * Unmap the IOMMU part of the GART. The alias of the page is 807 * Unmap the IOMMU part of the GART. The alias of the page is
@@ -813,7 +823,7 @@ void __init gart_iommu_init(void)
813 * the pages as Not-Present: 823 * the pages as Not-Present:
814 */ 824 */
815 wbinvd(); 825 wbinvd();
816 826
817 /* 827 /*
818 * Now all caches are flushed and we can safely enable 828 * Now all caches are flushed and we can safely enable
819 * GART hardware. Doing it early leaves the possibility 829 * GART hardware. Doing it early leaves the possibility
@@ -837,6 +847,10 @@ void __init gart_iommu_init(void)
837 847
838 flush_gart(); 848 flush_gart();
839 dma_ops = &gart_dma_ops; 849 dma_ops = &gart_dma_ops;
850 x86_platform.iommu_shutdown = gart_iommu_shutdown;
851 swiotlb = 0;
852
853 return 0;
840} 854}
841 855
842void __init gart_parse_options(char *p) 856void __init gart_parse_options(char *p)
@@ -855,7 +869,7 @@ void __init gart_parse_options(char *p)
855#endif 869#endif
856 if (isdigit(*p) && get_option(&p, &arg)) 870 if (isdigit(*p) && get_option(&p, &arg))
857 iommu_size = arg; 871 iommu_size = arg;
858 if (!strncmp(p, "fullflush", 8)) 872 if (!strncmp(p, "fullflush", 9))
859 iommu_fullflush = 1; 873 iommu_fullflush = 1;
860 if (!strncmp(p, "nofullflush", 11)) 874 if (!strncmp(p, "nofullflush", 11))
861 iommu_fullflush = 0; 875 iommu_fullflush = 0;
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index a3933d4330cd..22be12b60a8f 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -33,7 +33,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
33 dma_addr_t bus = page_to_phys(page) + offset; 33 dma_addr_t bus = page_to_phys(page) + offset;
34 WARN_ON(size == 0); 34 WARN_ON(size == 0);
35 if (!check_addr("map_single", dev, bus, size)) 35 if (!check_addr("map_single", dev, bus, size))
36 return bad_dma_address; 36 return DMA_ERROR_CODE;
37 flush_write_buffers(); 37 flush_write_buffers();
38 return bus; 38 return bus;
39} 39}
@@ -103,12 +103,3 @@ struct dma_map_ops nommu_dma_ops = {
103 .sync_sg_for_device = nommu_sync_sg_for_device, 103 .sync_sg_for_device = nommu_sync_sg_for_device,
104 .is_phys = 1, 104 .is_phys = 1,
105}; 105};
106
107void __init no_iommu_init(void)
108{
109 if (dma_ops)
110 return;
111
112 force_iommu = 0; /* no HW IOMMU */
113 dma_ops = &nommu_dma_ops;
114}
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index aaa6b7839f1e..7d2829dde20e 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -42,18 +42,31 @@ static struct dma_map_ops swiotlb_dma_ops = {
42 .dma_supported = NULL, 42 .dma_supported = NULL,
43}; 43};
44 44
45void __init pci_swiotlb_init(void) 45/*
46 * pci_swiotlb_detect - set swiotlb to 1 if necessary
47 *
48 * This returns non-zero if we are forced to use swiotlb (by the boot
49 * option).
50 */
51int __init pci_swiotlb_detect(void)
46{ 52{
53 int use_swiotlb = swiotlb | swiotlb_force;
54
47 /* don't initialize swiotlb if iommu=off (no_iommu=1) */ 55 /* don't initialize swiotlb if iommu=off (no_iommu=1) */
48#ifdef CONFIG_X86_64 56#ifdef CONFIG_X86_64
49 if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)) 57 if (!no_iommu && max_pfn > MAX_DMA32_PFN)
50 swiotlb = 1; 58 swiotlb = 1;
51#endif 59#endif
52 if (swiotlb_force) 60 if (swiotlb_force)
53 swiotlb = 1; 61 swiotlb = 1;
62
63 return use_swiotlb;
64}
65
66void __init pci_swiotlb_init(void)
67{
54 if (swiotlb) { 68 if (swiotlb) {
55 printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); 69 swiotlb_init(0);
56 swiotlb_init();
57 dma_ops = &swiotlb_dma_ops; 70 dma_ops = &swiotlb_dma_ops;
58 } 71 }
59} 72}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 5284cd2b5776..c6ee241c8a98 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,7 +9,11 @@
9#include <linux/pm.h> 9#include <linux/pm.h>
10#include <linux/clockchips.h> 10#include <linux/clockchips.h>
11#include <linux/random.h> 11#include <linux/random.h>
12#include <linux/user-return-notifier.h>
13#include <linux/dmi.h>
14#include <linux/utsname.h>
12#include <trace/events/power.h> 15#include <trace/events/power.h>
16#include <linux/hw_breakpoint.h>
13#include <asm/system.h> 17#include <asm/system.h>
14#include <asm/apic.h> 18#include <asm/apic.h>
15#include <asm/syscalls.h> 19#include <asm/syscalls.h>
@@ -17,6 +21,7 @@
17#include <asm/uaccess.h> 21#include <asm/uaccess.h>
18#include <asm/i387.h> 22#include <asm/i387.h>
19#include <asm/ds.h> 23#include <asm/ds.h>
24#include <asm/debugreg.h>
20 25
21unsigned long idle_halt; 26unsigned long idle_halt;
22EXPORT_SYMBOL(idle_halt); 27EXPORT_SYMBOL(idle_halt);
@@ -87,6 +92,25 @@ void exit_thread(void)
87 } 92 }
88} 93}
89 94
95void show_regs_common(void)
96{
97 const char *board, *product;
98
99 board = dmi_get_system_info(DMI_BOARD_NAME);
100 if (!board)
101 board = "";
102 product = dmi_get_system_info(DMI_PRODUCT_NAME);
103 if (!product)
104 product = "";
105
106 printk(KERN_CONT "\n");
107 printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n",
108 current->pid, current->comm, print_tainted(),
109 init_utsname()->release,
110 (int)strcspn(init_utsname()->version, " "),
111 init_utsname()->version, board, product);
112}
113
90void flush_thread(void) 114void flush_thread(void)
91{ 115{
92 struct task_struct *tsk = current; 116 struct task_struct *tsk = current;
@@ -103,14 +127,7 @@ void flush_thread(void)
103 } 127 }
104#endif 128#endif
105 129
106 clear_tsk_thread_flag(tsk, TIF_DEBUG); 130 flush_ptrace_hw_breakpoint(tsk);
107
108 tsk->thread.debugreg0 = 0;
109 tsk->thread.debugreg1 = 0;
110 tsk->thread.debugreg2 = 0;
111 tsk->thread.debugreg3 = 0;
112 tsk->thread.debugreg6 = 0;
113 tsk->thread.debugreg7 = 0;
114 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); 131 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
115 /* 132 /*
116 * Forget coprocessor state.. 133 * Forget coprocessor state..
@@ -192,16 +209,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
192 else if (next->debugctlmsr != prev->debugctlmsr) 209 else if (next->debugctlmsr != prev->debugctlmsr)
193 update_debugctlmsr(next->debugctlmsr); 210 update_debugctlmsr(next->debugctlmsr);
194 211
195 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
196 set_debugreg(next->debugreg0, 0);
197 set_debugreg(next->debugreg1, 1);
198 set_debugreg(next->debugreg2, 2);
199 set_debugreg(next->debugreg3, 3);
200 /* no 4 and 5 */
201 set_debugreg(next->debugreg6, 6);
202 set_debugreg(next->debugreg7, 7);
203 }
204
205 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ 212 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
206 test_tsk_thread_flag(next_p, TIF_NOTSC)) { 213 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
207 /* prev and next are different */ 214 /* prev and next are different */
@@ -224,6 +231,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
224 */ 231 */
225 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); 232 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
226 } 233 }
234 propagate_user_return_notify(prev_p, next_p);
227} 235}
228 236
229int sys_fork(struct pt_regs *regs) 237int sys_fork(struct pt_regs *regs)
@@ -247,6 +255,76 @@ int sys_vfork(struct pt_regs *regs)
247 NULL, NULL); 255 NULL, NULL);
248} 256}
249 257
258long
259sys_clone(unsigned long clone_flags, unsigned long newsp,
260 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
261{
262 if (!newsp)
263 newsp = regs->sp;
264 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
265}
266
267/*
268 * This gets run with %si containing the
269 * function to call, and %di containing
270 * the "args".
271 */
272extern void kernel_thread_helper(void);
273
274/*
275 * Create a kernel thread
276 */
277int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
278{
279 struct pt_regs regs;
280
281 memset(&regs, 0, sizeof(regs));
282
283 regs.si = (unsigned long) fn;
284 regs.di = (unsigned long) arg;
285
286#ifdef CONFIG_X86_32
287 regs.ds = __USER_DS;
288 regs.es = __USER_DS;
289 regs.fs = __KERNEL_PERCPU;
290 regs.gs = __KERNEL_STACK_CANARY;
291#endif
292
293 regs.orig_ax = -1;
294 regs.ip = (unsigned long) kernel_thread_helper;
295 regs.cs = __KERNEL_CS | get_kernel_rpl();
296 regs.flags = X86_EFLAGS_IF | 0x2;
297
298 /* Ok, create the new process.. */
299 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
300}
301EXPORT_SYMBOL(kernel_thread);
302
303/*
304 * sys_execve() executes a new program.
305 */
306long sys_execve(char __user *name, char __user * __user *argv,
307 char __user * __user *envp, struct pt_regs *regs)
308{
309 long error;
310 char *filename;
311
312 filename = getname(name);
313 error = PTR_ERR(filename);
314 if (IS_ERR(filename))
315 return error;
316 error = do_execve(filename, argv, envp, regs);
317
318#ifdef CONFIG_X86_32
319 if (error == 0) {
320 /* Make sure we don't return using sysenter.. */
321 set_thread_flag(TIF_IRET);
322 }
323#endif
324
325 putname(filename);
326 return error;
327}
250 328
251/* 329/*
252 * Idle related variables and functions 330 * Idle related variables and functions
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 4cf79567cdab..37ad1e046aae 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -23,7 +23,6 @@
23#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
24#include <linux/user.h> 24#include <linux/user.h>
25#include <linux/interrupt.h> 25#include <linux/interrupt.h>
26#include <linux/utsname.h>
27#include <linux/delay.h> 26#include <linux/delay.h>
28#include <linux/reboot.h> 27#include <linux/reboot.h>
29#include <linux/init.h> 28#include <linux/init.h>
@@ -35,7 +34,6 @@
35#include <linux/tick.h> 34#include <linux/tick.h>
36#include <linux/percpu.h> 35#include <linux/percpu.h>
37#include <linux/prctl.h> 36#include <linux/prctl.h>
38#include <linux/dmi.h>
39#include <linux/ftrace.h> 37#include <linux/ftrace.h>
40#include <linux/uaccess.h> 38#include <linux/uaccess.h>
41#include <linux/io.h> 39#include <linux/io.h>
@@ -58,6 +56,7 @@
58#include <asm/idle.h> 56#include <asm/idle.h>
59#include <asm/syscalls.h> 57#include <asm/syscalls.h>
60#include <asm/ds.h> 58#include <asm/ds.h>
59#include <asm/debugreg.h>
61 60
62asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 61asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
63 62
@@ -127,39 +126,29 @@ void __show_regs(struct pt_regs *regs, int all)
127 unsigned long d0, d1, d2, d3, d6, d7; 126 unsigned long d0, d1, d2, d3, d6, d7;
128 unsigned long sp; 127 unsigned long sp;
129 unsigned short ss, gs; 128 unsigned short ss, gs;
130 const char *board;
131 129
132 if (user_mode_vm(regs)) { 130 if (user_mode_vm(regs)) {
133 sp = regs->sp; 131 sp = regs->sp;
134 ss = regs->ss & 0xffff; 132 ss = regs->ss & 0xffff;
135 gs = get_user_gs(regs); 133 gs = get_user_gs(regs);
136 } else { 134 } else {
137 sp = (unsigned long) (&regs->sp); 135 sp = kernel_stack_pointer(regs);
138 savesegment(ss, ss); 136 savesegment(ss, ss);
139 savesegment(gs, gs); 137 savesegment(gs, gs);
140 } 138 }
141 139
142 printk("\n"); 140 show_regs_common();
143 141
144 board = dmi_get_system_info(DMI_PRODUCT_NAME); 142 printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
145 if (!board)
146 board = "";
147 printk("Pid: %d, comm: %s %s (%s %.*s) %s\n",
148 task_pid_nr(current), current->comm,
149 print_tainted(), init_utsname()->release,
150 (int)strcspn(init_utsname()->version, " "),
151 init_utsname()->version, board);
152
153 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
154 (u16)regs->cs, regs->ip, regs->flags, 143 (u16)regs->cs, regs->ip, regs->flags,
155 smp_processor_id()); 144 smp_processor_id());
156 print_symbol("EIP is at %s\n", regs->ip); 145 print_symbol("EIP is at %s\n", regs->ip);
157 146
158 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", 147 printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
159 regs->ax, regs->bx, regs->cx, regs->dx); 148 regs->ax, regs->bx, regs->cx, regs->dx);
160 printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", 149 printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
161 regs->si, regs->di, regs->bp, sp); 150 regs->si, regs->di, regs->bp, sp);
162 printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", 151 printk(KERN_DEFAULT " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
163 (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss); 152 (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss);
164 153
165 if (!all) 154 if (!all)
@@ -169,61 +158,28 @@ void __show_regs(struct pt_regs *regs, int all)
169 cr2 = read_cr2(); 158 cr2 = read_cr2();
170 cr3 = read_cr3(); 159 cr3 = read_cr3();
171 cr4 = read_cr4_safe(); 160 cr4 = read_cr4_safe();
172 printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", 161 printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
173 cr0, cr2, cr3, cr4); 162 cr0, cr2, cr3, cr4);
174 163
175 get_debugreg(d0, 0); 164 get_debugreg(d0, 0);
176 get_debugreg(d1, 1); 165 get_debugreg(d1, 1);
177 get_debugreg(d2, 2); 166 get_debugreg(d2, 2);
178 get_debugreg(d3, 3); 167 get_debugreg(d3, 3);
179 printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", 168 printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
180 d0, d1, d2, d3); 169 d0, d1, d2, d3);
181 170
182 get_debugreg(d6, 6); 171 get_debugreg(d6, 6);
183 get_debugreg(d7, 7); 172 get_debugreg(d7, 7);
184 printk("DR6: %08lx DR7: %08lx\n", 173 printk(KERN_DEFAULT "DR6: %08lx DR7: %08lx\n",
185 d6, d7); 174 d6, d7);
186} 175}
187 176
188void show_regs(struct pt_regs *regs) 177void show_regs(struct pt_regs *regs)
189{ 178{
190 __show_regs(regs, 1); 179 show_registers(regs);
191 show_trace(NULL, regs, &regs->sp, regs->bp); 180 show_trace(NULL, regs, &regs->sp, regs->bp);
192} 181}
193 182
194/*
195 * This gets run with %bx containing the
196 * function to call, and %dx containing
197 * the "args".
198 */
199extern void kernel_thread_helper(void);
200
201/*
202 * Create a kernel thread
203 */
204int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
205{
206 struct pt_regs regs;
207
208 memset(&regs, 0, sizeof(regs));
209
210 regs.bx = (unsigned long) fn;
211 regs.dx = (unsigned long) arg;
212
213 regs.ds = __USER_DS;
214 regs.es = __USER_DS;
215 regs.fs = __KERNEL_PERCPU;
216 regs.gs = __KERNEL_STACK_CANARY;
217 regs.orig_ax = -1;
218 regs.ip = (unsigned long) kernel_thread_helper;
219 regs.cs = __KERNEL_CS | get_kernel_rpl();
220 regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
221
222 /* Ok, create the new process.. */
223 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
224}
225EXPORT_SYMBOL(kernel_thread);
226
227void release_thread(struct task_struct *dead_task) 183void release_thread(struct task_struct *dead_task)
228{ 184{
229 BUG_ON(dead_task->mm); 185 BUG_ON(dead_task->mm);
@@ -259,7 +215,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
259 215
260 task_user_gs(p) = get_user_gs(regs); 216 task_user_gs(p) = get_user_gs(regs);
261 217
218 p->thread.io_bitmap_ptr = NULL;
262 tsk = current; 219 tsk = current;
220 err = -ENOMEM;
221
222 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
223
263 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { 224 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
264 p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, 225 p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
265 IO_BITMAP_BYTES, GFP_KERNEL); 226 IO_BITMAP_BYTES, GFP_KERNEL);
@@ -430,46 +391,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
430 return prev_p; 391 return prev_p;
431} 392}
432 393
433int sys_clone(struct pt_regs *regs)
434{
435 unsigned long clone_flags;
436 unsigned long newsp;
437 int __user *parent_tidptr, *child_tidptr;
438
439 clone_flags = regs->bx;
440 newsp = regs->cx;
441 parent_tidptr = (int __user *)regs->dx;
442 child_tidptr = (int __user *)regs->di;
443 if (!newsp)
444 newsp = regs->sp;
445 return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr);
446}
447
448/*
449 * sys_execve() executes a new program.
450 */
451int sys_execve(struct pt_regs *regs)
452{
453 int error;
454 char *filename;
455
456 filename = getname((char __user *) regs->bx);
457 error = PTR_ERR(filename);
458 if (IS_ERR(filename))
459 goto out;
460 error = do_execve(filename,
461 (char __user * __user *) regs->cx,
462 (char __user * __user *) regs->dx,
463 regs);
464 if (error == 0) {
465 /* Make sure we don't return using sysenter.. */
466 set_thread_flag(TIF_IRET);
467 }
468 putname(filename);
469out:
470 return error;
471}
472
473#define top_esp (THREAD_SIZE - sizeof(unsigned long)) 394#define top_esp (THREAD_SIZE - sizeof(unsigned long))
474#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) 395#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long))
475 396
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ad535b683170..f9e033150cdf 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -26,7 +26,6 @@
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/user.h> 27#include <linux/user.h>
28#include <linux/interrupt.h> 28#include <linux/interrupt.h>
29#include <linux/utsname.h>
30#include <linux/delay.h> 29#include <linux/delay.h>
31#include <linux/module.h> 30#include <linux/module.h>
32#include <linux/ptrace.h> 31#include <linux/ptrace.h>
@@ -38,7 +37,6 @@
38#include <linux/uaccess.h> 37#include <linux/uaccess.h>
39#include <linux/io.h> 38#include <linux/io.h>
40#include <linux/ftrace.h> 39#include <linux/ftrace.h>
41#include <linux/dmi.h>
42 40
43#include <asm/pgtable.h> 41#include <asm/pgtable.h>
44#include <asm/system.h> 42#include <asm/system.h>
@@ -52,14 +50,13 @@
52#include <asm/idle.h> 50#include <asm/idle.h>
53#include <asm/syscalls.h> 51#include <asm/syscalls.h>
54#include <asm/ds.h> 52#include <asm/ds.h>
53#include <asm/debugreg.h>
55 54
56asmlinkage extern void ret_from_fork(void); 55asmlinkage extern void ret_from_fork(void);
57 56
58DEFINE_PER_CPU(unsigned long, old_rsp); 57DEFINE_PER_CPU(unsigned long, old_rsp);
59static DEFINE_PER_CPU(unsigned char, is_idle); 58static DEFINE_PER_CPU(unsigned char, is_idle);
60 59
61unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
62
63static ATOMIC_NOTIFIER_HEAD(idle_notifier); 60static ATOMIC_NOTIFIER_HEAD(idle_notifier);
64 61
65void idle_notifier_register(struct notifier_block *n) 62void idle_notifier_register(struct notifier_block *n)
@@ -162,31 +159,21 @@ void __show_regs(struct pt_regs *regs, int all)
162 unsigned long d0, d1, d2, d3, d6, d7; 159 unsigned long d0, d1, d2, d3, d6, d7;
163 unsigned int fsindex, gsindex; 160 unsigned int fsindex, gsindex;
164 unsigned int ds, cs, es; 161 unsigned int ds, cs, es;
165 const char *board; 162
166 163 show_regs_common();
167 printk("\n"); 164 printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
168 print_modules();
169 board = dmi_get_system_info(DMI_PRODUCT_NAME);
170 if (!board)
171 board = "";
172 printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
173 current->pid, current->comm, print_tainted(),
174 init_utsname()->release,
175 (int)strcspn(init_utsname()->version, " "),
176 init_utsname()->version, board);
177 printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
178 printk_address(regs->ip, 1); 165 printk_address(regs->ip, 1);
179 printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, 166 printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
180 regs->sp, regs->flags); 167 regs->sp, regs->flags);
181 printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n", 168 printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
182 regs->ax, regs->bx, regs->cx); 169 regs->ax, regs->bx, regs->cx);
183 printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n", 170 printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
184 regs->dx, regs->si, regs->di); 171 regs->dx, regs->si, regs->di);
185 printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n", 172 printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
186 regs->bp, regs->r8, regs->r9); 173 regs->bp, regs->r8, regs->r9);
187 printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n", 174 printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
188 regs->r10, regs->r11, regs->r12); 175 regs->r10, regs->r11, regs->r12);
189 printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n", 176 printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
190 regs->r13, regs->r14, regs->r15); 177 regs->r13, regs->r14, regs->r15);
191 178
192 asm("movl %%ds,%0" : "=r" (ds)); 179 asm("movl %%ds,%0" : "=r" (ds));
@@ -207,27 +194,26 @@ void __show_regs(struct pt_regs *regs, int all)
207 cr3 = read_cr3(); 194 cr3 = read_cr3();
208 cr4 = read_cr4(); 195 cr4 = read_cr4();
209 196
210 printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 197 printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
211 fs, fsindex, gs, gsindex, shadowgs); 198 fs, fsindex, gs, gsindex, shadowgs);
212 printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, 199 printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
213 es, cr0); 200 es, cr0);
214 printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, 201 printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
215 cr4); 202 cr4);
216 203
217 get_debugreg(d0, 0); 204 get_debugreg(d0, 0);
218 get_debugreg(d1, 1); 205 get_debugreg(d1, 1);
219 get_debugreg(d2, 2); 206 get_debugreg(d2, 2);
220 printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); 207 printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
221 get_debugreg(d3, 3); 208 get_debugreg(d3, 3);
222 get_debugreg(d6, 6); 209 get_debugreg(d6, 6);
223 get_debugreg(d7, 7); 210 get_debugreg(d7, 7);
224 printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); 211 printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
225} 212}
226 213
227void show_regs(struct pt_regs *regs) 214void show_regs(struct pt_regs *regs)
228{ 215{
229 printk(KERN_INFO "CPU %d:", smp_processor_id()); 216 show_registers(regs);
230 __show_regs(regs, 1);
231 show_trace(NULL, regs, (void *)(regs + 1), regs->bp); 217 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
232} 218}
233 219
@@ -285,8 +271,9 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
285 *childregs = *regs; 271 *childregs = *regs;
286 272
287 childregs->ax = 0; 273 childregs->ax = 0;
288 childregs->sp = sp; 274 if (user_mode(regs))
289 if (sp == ~0UL) 275 childregs->sp = sp;
276 else
290 childregs->sp = (unsigned long)childregs; 277 childregs->sp = (unsigned long)childregs;
291 278
292 p->thread.sp = (unsigned long) childregs; 279 p->thread.sp = (unsigned long) childregs;
@@ -297,12 +284,16 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
297 284
298 p->thread.fs = me->thread.fs; 285 p->thread.fs = me->thread.fs;
299 p->thread.gs = me->thread.gs; 286 p->thread.gs = me->thread.gs;
287 p->thread.io_bitmap_ptr = NULL;
300 288
301 savesegment(gs, p->thread.gsindex); 289 savesegment(gs, p->thread.gsindex);
302 savesegment(fs, p->thread.fsindex); 290 savesegment(fs, p->thread.fsindex);
303 savesegment(es, p->thread.es); 291 savesegment(es, p->thread.es);
304 savesegment(ds, p->thread.ds); 292 savesegment(ds, p->thread.ds);
305 293
294 err = -ENOMEM;
295 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
296
306 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { 297 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
307 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); 298 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
308 if (!p->thread.io_bitmap_ptr) { 299 if (!p->thread.io_bitmap_ptr) {
@@ -341,29 +332,46 @@ out:
341 kfree(p->thread.io_bitmap_ptr); 332 kfree(p->thread.io_bitmap_ptr);
342 p->thread.io_bitmap_max = 0; 333 p->thread.io_bitmap_max = 0;
343 } 334 }
335
344 return err; 336 return err;
345} 337}
346 338
347void 339static void
348start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) 340start_thread_common(struct pt_regs *regs, unsigned long new_ip,
341 unsigned long new_sp,
342 unsigned int _cs, unsigned int _ss, unsigned int _ds)
349{ 343{
350 loadsegment(fs, 0); 344 loadsegment(fs, 0);
351 loadsegment(es, 0); 345 loadsegment(es, _ds);
352 loadsegment(ds, 0); 346 loadsegment(ds, _ds);
353 load_gs_index(0); 347 load_gs_index(0);
354 regs->ip = new_ip; 348 regs->ip = new_ip;
355 regs->sp = new_sp; 349 regs->sp = new_sp;
356 percpu_write(old_rsp, new_sp); 350 percpu_write(old_rsp, new_sp);
357 regs->cs = __USER_CS; 351 regs->cs = _cs;
358 regs->ss = __USER_DS; 352 regs->ss = _ss;
359 regs->flags = 0x200; 353 regs->flags = X86_EFLAGS_IF;
360 set_fs(USER_DS); 354 set_fs(USER_DS);
361 /* 355 /*
362 * Free the old FP and other extended state 356 * Free the old FP and other extended state
363 */ 357 */
364 free_thread_xstate(current); 358 free_thread_xstate(current);
365} 359}
366EXPORT_SYMBOL_GPL(start_thread); 360
361void
362start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
363{
364 start_thread_common(regs, new_ip, new_sp,
365 __USER_CS, __USER_DS, 0);
366}
367
368#ifdef CONFIG_IA32_EMULATION
369void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
370{
371 start_thread_common(regs, new_ip, new_sp,
372 __USER32_CS, __USER32_DS, __USER32_DS);
373}
374#endif
367 375
368/* 376/*
369 * switch_to(x,y) should switch tasks from x to y. 377 * switch_to(x,y) should switch tasks from x to y.
@@ -495,26 +503,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
495 */ 503 */
496 if (preload_fpu) 504 if (preload_fpu)
497 __math_state_restore(); 505 __math_state_restore();
498 return prev_p;
499}
500 506
501/* 507 return prev_p;
502 * sys_execve() executes a new program.
503 */
504asmlinkage
505long sys_execve(char __user *name, char __user * __user *argv,
506 char __user * __user *envp, struct pt_regs *regs)
507{
508 long error;
509 char *filename;
510
511 filename = getname(name);
512 error = PTR_ERR(filename);
513 if (IS_ERR(filename))
514 return error;
515 error = do_execve(filename, argv, envp, regs);
516 putname(filename);
517 return error;
518} 508}
519 509
520void set_personality_64bit(void) 510void set_personality_64bit(void)
@@ -531,15 +521,6 @@ void set_personality_64bit(void)
531 current->personality &= ~READ_IMPLIES_EXEC; 521 current->personality &= ~READ_IMPLIES_EXEC;
532} 522}
533 523
534asmlinkage long
535sys_clone(unsigned long clone_flags, unsigned long newsp,
536 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
537{
538 if (!newsp)
539 newsp = regs->sp;
540 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
541}
542
543unsigned long get_wchan(struct task_struct *p) 524unsigned long get_wchan(struct task_struct *p)
544{ 525{
545 unsigned long stack; 526 unsigned long stack;
@@ -664,3 +645,8 @@ long sys_arch_prctl(int code, unsigned long addr)
664 return do_arch_prctl(current, code, addr); 645 return do_arch_prctl(current, code, addr);
665} 646}
666 647
648unsigned long KSTK_ESP(struct task_struct *task)
649{
650 return (test_tsk_thread_flag(task, TIF_IA32)) ?
651 (task_pt_regs(task)->sp) : ((task)->thread.usersp);
652}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 7b058a2dc66a..017d937639fe 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -22,6 +22,8 @@
22#include <linux/seccomp.h> 22#include <linux/seccomp.h>
23#include <linux/signal.h> 23#include <linux/signal.h>
24#include <linux/workqueue.h> 24#include <linux/workqueue.h>
25#include <linux/perf_event.h>
26#include <linux/hw_breakpoint.h>
25 27
26#include <asm/uaccess.h> 28#include <asm/uaccess.h>
27#include <asm/pgtable.h> 29#include <asm/pgtable.h>
@@ -34,6 +36,7 @@
34#include <asm/prctl.h> 36#include <asm/prctl.h>
35#include <asm/proto.h> 37#include <asm/proto.h>
36#include <asm/ds.h> 38#include <asm/ds.h>
39#include <asm/hw_breakpoint.h>
37 40
38#include "tls.h" 41#include "tls.h"
39 42
@@ -49,6 +52,118 @@ enum x86_regset {
49 REGSET_IOPERM32, 52 REGSET_IOPERM32,
50}; 53};
51 54
55struct pt_regs_offset {
56 const char *name;
57 int offset;
58};
59
60#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)}
61#define REG_OFFSET_END {.name = NULL, .offset = 0}
62
63static const struct pt_regs_offset regoffset_table[] = {
64#ifdef CONFIG_X86_64
65 REG_OFFSET_NAME(r15),
66 REG_OFFSET_NAME(r14),
67 REG_OFFSET_NAME(r13),
68 REG_OFFSET_NAME(r12),
69 REG_OFFSET_NAME(r11),
70 REG_OFFSET_NAME(r10),
71 REG_OFFSET_NAME(r9),
72 REG_OFFSET_NAME(r8),
73#endif
74 REG_OFFSET_NAME(bx),
75 REG_OFFSET_NAME(cx),
76 REG_OFFSET_NAME(dx),
77 REG_OFFSET_NAME(si),
78 REG_OFFSET_NAME(di),
79 REG_OFFSET_NAME(bp),
80 REG_OFFSET_NAME(ax),
81#ifdef CONFIG_X86_32
82 REG_OFFSET_NAME(ds),
83 REG_OFFSET_NAME(es),
84 REG_OFFSET_NAME(fs),
85 REG_OFFSET_NAME(gs),
86#endif
87 REG_OFFSET_NAME(orig_ax),
88 REG_OFFSET_NAME(ip),
89 REG_OFFSET_NAME(cs),
90 REG_OFFSET_NAME(flags),
91 REG_OFFSET_NAME(sp),
92 REG_OFFSET_NAME(ss),
93 REG_OFFSET_END,
94};
95
96/**
97 * regs_query_register_offset() - query register offset from its name
98 * @name: the name of a register
99 *
100 * regs_query_register_offset() returns the offset of a register in struct
101 * pt_regs from its name. If the name is invalid, this returns -EINVAL;
102 */
103int regs_query_register_offset(const char *name)
104{
105 const struct pt_regs_offset *roff;
106 for (roff = regoffset_table; roff->name != NULL; roff++)
107 if (!strcmp(roff->name, name))
108 return roff->offset;
109 return -EINVAL;
110}
111
112/**
113 * regs_query_register_name() - query register name from its offset
114 * @offset: the offset of a register in struct pt_regs.
115 *
116 * regs_query_register_name() returns the name of a register from its
117 * offset in struct pt_regs. If the @offset is invalid, this returns NULL;
118 */
119const char *regs_query_register_name(unsigned int offset)
120{
121 const struct pt_regs_offset *roff;
122 for (roff = regoffset_table; roff->name != NULL; roff++)
123 if (roff->offset == offset)
124 return roff->name;
125 return NULL;
126}
127
128static const int arg_offs_table[] = {
129#ifdef CONFIG_X86_32
130 [0] = offsetof(struct pt_regs, ax),
131 [1] = offsetof(struct pt_regs, dx),
132 [2] = offsetof(struct pt_regs, cx)
133#else /* CONFIG_X86_64 */
134 [0] = offsetof(struct pt_regs, di),
135 [1] = offsetof(struct pt_regs, si),
136 [2] = offsetof(struct pt_regs, dx),
137 [3] = offsetof(struct pt_regs, cx),
138 [4] = offsetof(struct pt_regs, r8),
139 [5] = offsetof(struct pt_regs, r9)
140#endif
141};
142
143/**
144 * regs_get_argument_nth() - get Nth argument at function call
145 * @regs: pt_regs which contains registers at function entry.
146 * @n: argument number.
147 *
148 * regs_get_argument_nth() returns @n th argument of a function call.
149 * Since usually the kernel stack will be changed right after function entry,
150 * you must use this at function entry. If the @n th entry is NOT in the
151 * kernel stack or pt_regs, this returns 0.
152 */
153unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned int n)
154{
155 if (n < ARRAY_SIZE(arg_offs_table))
156 return *(unsigned long *)((char *)regs + arg_offs_table[n]);
157 else {
158 /*
159 * The typical case: arg n is on the stack.
160 * (Note: stack[0] = return address, so skip it)
161 */
162 n -= ARRAY_SIZE(arg_offs_table);
163 return regs_get_kernel_stack_nth(regs, 1 + n);
164 }
165}
166
52/* 167/*
53 * does not yet catch signals sent when the child dies. 168 * does not yet catch signals sent when the child dies.
54 * in exit.c or in signal.c. 169 * in exit.c or in signal.c.
@@ -137,11 +252,6 @@ static int set_segment_reg(struct task_struct *task,
137 return 0; 252 return 0;
138} 253}
139 254
140static unsigned long debugreg_addr_limit(struct task_struct *task)
141{
142 return TASK_SIZE - 3;
143}
144
145#else /* CONFIG_X86_64 */ 255#else /* CONFIG_X86_64 */
146 256
147#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT) 257#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT)
@@ -266,15 +376,6 @@ static int set_segment_reg(struct task_struct *task,
266 return 0; 376 return 0;
267} 377}
268 378
269static unsigned long debugreg_addr_limit(struct task_struct *task)
270{
271#ifdef CONFIG_IA32_EMULATION
272 if (test_tsk_thread_flag(task, TIF_IA32))
273 return IA32_PAGE_OFFSET - 3;
274#endif
275 return TASK_SIZE_MAX - 7;
276}
277
278#endif /* CONFIG_X86_32 */ 379#endif /* CONFIG_X86_32 */
279 380
280static unsigned long get_flags(struct task_struct *task) 381static unsigned long get_flags(struct task_struct *task)
@@ -408,14 +509,14 @@ static int genregs_get(struct task_struct *target,
408{ 509{
409 if (kbuf) { 510 if (kbuf) {
410 unsigned long *k = kbuf; 511 unsigned long *k = kbuf;
411 while (count > 0) { 512 while (count >= sizeof(*k)) {
412 *k++ = getreg(target, pos); 513 *k++ = getreg(target, pos);
413 count -= sizeof(*k); 514 count -= sizeof(*k);
414 pos += sizeof(*k); 515 pos += sizeof(*k);
415 } 516 }
416 } else { 517 } else {
417 unsigned long __user *u = ubuf; 518 unsigned long __user *u = ubuf;
418 while (count > 0) { 519 while (count >= sizeof(*u)) {
419 if (__put_user(getreg(target, pos), u++)) 520 if (__put_user(getreg(target, pos), u++))
420 return -EFAULT; 521 return -EFAULT;
421 count -= sizeof(*u); 522 count -= sizeof(*u);
@@ -434,14 +535,14 @@ static int genregs_set(struct task_struct *target,
434 int ret = 0; 535 int ret = 0;
435 if (kbuf) { 536 if (kbuf) {
436 const unsigned long *k = kbuf; 537 const unsigned long *k = kbuf;
437 while (count > 0 && !ret) { 538 while (count >= sizeof(*k) && !ret) {
438 ret = putreg(target, pos, *k++); 539 ret = putreg(target, pos, *k++);
439 count -= sizeof(*k); 540 count -= sizeof(*k);
440 pos += sizeof(*k); 541 pos += sizeof(*k);
441 } 542 }
442 } else { 543 } else {
443 const unsigned long __user *u = ubuf; 544 const unsigned long __user *u = ubuf;
444 while (count > 0 && !ret) { 545 while (count >= sizeof(*u) && !ret) {
445 unsigned long word; 546 unsigned long word;
446 ret = __get_user(word, u++); 547 ret = __get_user(word, u++);
447 if (ret) 548 if (ret)
@@ -454,99 +555,237 @@ static int genregs_set(struct task_struct *target,
454 return ret; 555 return ret;
455} 556}
456 557
558static void ptrace_triggered(struct perf_event *bp, int nmi,
559 struct perf_sample_data *data,
560 struct pt_regs *regs)
561{
562 int i;
563 struct thread_struct *thread = &(current->thread);
564
565 /*
566 * Store in the virtual DR6 register the fact that the breakpoint
567 * was hit so the thread's debugger will see it.
568 */
569 for (i = 0; i < HBP_NUM; i++) {
570 if (thread->ptrace_bps[i] == bp)
571 break;
572 }
573
574 thread->debugreg6 |= (DR_TRAP0 << i);
575}
576
457/* 577/*
458 * This function is trivial and will be inlined by the compiler. 578 * Walk through every ptrace breakpoints for this thread and
459 * Having it separates the implementation details of debug 579 * build the dr7 value on top of their attributes.
460 * registers from the interface details of ptrace. 580 *
461 */ 581 */
462static unsigned long ptrace_get_debugreg(struct task_struct *child, int n) 582static unsigned long ptrace_get_dr7(struct perf_event *bp[])
463{ 583{
464 switch (n) { 584 int i;
465 case 0: return child->thread.debugreg0; 585 int dr7 = 0;
466 case 1: return child->thread.debugreg1; 586 struct arch_hw_breakpoint *info;
467 case 2: return child->thread.debugreg2; 587
468 case 3: return child->thread.debugreg3; 588 for (i = 0; i < HBP_NUM; i++) {
469 case 6: return child->thread.debugreg6; 589 if (bp[i] && !bp[i]->attr.disabled) {
470 case 7: return child->thread.debugreg7; 590 info = counter_arch_bp(bp[i]);
591 dr7 |= encode_dr7(i, info->len, info->type);
592 }
471 } 593 }
472 return 0; 594
595 return dr7;
473} 596}
474 597
475static int ptrace_set_debugreg(struct task_struct *child, 598static int
476 int n, unsigned long data) 599ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
600 struct task_struct *tsk, int disabled)
477{ 601{
478 int i; 602 int err;
603 int gen_len, gen_type;
604 struct perf_event_attr attr;
479 605
480 if (unlikely(n == 4 || n == 5)) 606 /*
481 return -EIO; 607 * We shoud have at least an inactive breakpoint at this
608 * slot. It means the user is writing dr7 without having
609 * written the address register first
610 */
611 if (!bp)
612 return -EINVAL;
482 613
483 if (n < 4 && unlikely(data >= debugreg_addr_limit(child))) 614 err = arch_bp_generic_fields(len, type, &gen_len, &gen_type);
484 return -EIO; 615 if (err)
616 return err;
485 617
486 switch (n) { 618 attr = bp->attr;
487 case 0: child->thread.debugreg0 = data; break; 619 attr.bp_len = gen_len;
488 case 1: child->thread.debugreg1 = data; break; 620 attr.bp_type = gen_type;
489 case 2: child->thread.debugreg2 = data; break; 621 attr.disabled = disabled;
490 case 3: child->thread.debugreg3 = data; break;
491 622
492 case 6: 623 return modify_user_hw_breakpoint(bp, &attr);
493 if ((data & ~0xffffffffUL) != 0) 624}
494 return -EIO;
495 child->thread.debugreg6 = data;
496 break;
497 625
498 case 7: 626/*
627 * Handle ptrace writes to debug register 7.
628 */
629static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
630{
631 struct thread_struct *thread = &(tsk->thread);
632 unsigned long old_dr7;
633 int i, orig_ret = 0, rc = 0;
634 int enabled, second_pass = 0;
635 unsigned len, type;
636 struct perf_event *bp;
637
638 data &= ~DR_CONTROL_RESERVED;
639 old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
640restore:
641 /*
642 * Loop through all the hardware breakpoints, making the
643 * appropriate changes to each.
644 */
645 for (i = 0; i < HBP_NUM; i++) {
646 enabled = decode_dr7(data, i, &len, &type);
647 bp = thread->ptrace_bps[i];
648
649 if (!enabled) {
650 if (bp) {
651 /*
652 * Don't unregister the breakpoints right-away,
653 * unless all register_user_hw_breakpoint()
654 * requests have succeeded. This prevents
655 * any window of opportunity for debug
656 * register grabbing by other users.
657 */
658 if (!second_pass)
659 continue;
660
661 rc = ptrace_modify_breakpoint(bp, len, type,
662 tsk, 1);
663 if (rc)
664 break;
665 }
666 continue;
667 }
668
669 rc = ptrace_modify_breakpoint(bp, len, type, tsk, 0);
670 if (rc)
671 break;
672 }
673 /*
674 * Make a second pass to free the remaining unused breakpoints
675 * or to restore the original breakpoints if an error occurred.
676 */
677 if (!second_pass) {
678 second_pass = 1;
679 if (rc < 0) {
680 orig_ret = rc;
681 data = old_dr7;
682 }
683 goto restore;
684 }
685 return ((orig_ret < 0) ? orig_ret : rc);
686}
687
688/*
689 * Handle PTRACE_PEEKUSR calls for the debug register area.
690 */
691static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
692{
693 struct thread_struct *thread = &(tsk->thread);
694 unsigned long val = 0;
695
696 if (n < HBP_NUM) {
697 struct perf_event *bp;
698 bp = thread->ptrace_bps[n];
699 if (!bp)
700 return 0;
701 val = bp->hw.info.address;
702 } else if (n == 6) {
703 val = thread->debugreg6;
704 } else if (n == 7) {
705 val = ptrace_get_dr7(thread->ptrace_bps);
706 }
707 return val;
708}
709
710static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
711 unsigned long addr)
712{
713 struct perf_event *bp;
714 struct thread_struct *t = &tsk->thread;
715 struct perf_event_attr attr;
716
717 if (!t->ptrace_bps[nr]) {
718 hw_breakpoint_init(&attr);
499 /* 719 /*
500 * Sanity-check data. Take one half-byte at once with 720 * Put stub len and type to register (reserve) an inactive but
501 * check = (val >> (16 + 4*i)) & 0xf. It contains the 721 * correct bp
502 * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
503 * 2 and 3 are LENi. Given a list of invalid values,
504 * we do mask |= 1 << invalid_value, so that
505 * (mask >> check) & 1 is a correct test for invalid
506 * values.
507 *
508 * R/Wi contains the type of the breakpoint /
509 * watchpoint, LENi contains the length of the watched
510 * data in the watchpoint case.
511 *
512 * The invalid values are:
513 * - LENi == 0x10 (undefined), so mask |= 0x0f00. [32-bit]
514 * - R/Wi == 0x10 (break on I/O reads or writes), so
515 * mask |= 0x4444.
516 * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
517 * 0x1110.
518 *
519 * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
520 *
521 * See the Intel Manual "System Programming Guide",
522 * 15.2.4
523 *
524 * Note that LENi == 0x10 is defined on x86_64 in long
525 * mode (i.e. even for 32-bit userspace software, but
526 * 64-bit kernel), so the x86_64 mask value is 0x5454.
527 * See the AMD manual no. 24593 (AMD64 System Programming)
528 */ 722 */
529#ifdef CONFIG_X86_32 723 attr.bp_addr = addr;
530#define DR7_MASK 0x5f54 724 attr.bp_len = HW_BREAKPOINT_LEN_1;
531#else 725 attr.bp_type = HW_BREAKPOINT_W;
532#define DR7_MASK 0x5554 726 attr.disabled = 1;
533#endif 727
534 data &= ~DR_CONTROL_RESERVED; 728 bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk);
535 for (i = 0; i < 4; i++) 729
536 if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1) 730 /*
537 return -EIO; 731 * CHECKME: the previous code returned -EIO if the addr wasn't
538 child->thread.debugreg7 = data; 732 * a valid task virtual addr. The new one will return -EINVAL in
539 if (data) 733 * this case.
540 set_tsk_thread_flag(child, TIF_DEBUG); 734 * -EINVAL may be what we want for in-kernel breakpoints users,
541 else 735 * but -EIO looks better for ptrace, since we refuse a register
542 clear_tsk_thread_flag(child, TIF_DEBUG); 736 * writing for the user. And anyway this is the previous
543 break; 737 * behaviour.
738 */
739 if (IS_ERR(bp))
740 return PTR_ERR(bp);
741
742 t->ptrace_bps[nr] = bp;
743 } else {
744 int err;
745
746 bp = t->ptrace_bps[nr];
747
748 attr = bp->attr;
749 attr.bp_addr = addr;
750 err = modify_user_hw_breakpoint(bp, &attr);
751 if (err)
752 return err;
544 } 753 }
545 754
755
546 return 0; 756 return 0;
547} 757}
548 758
549/* 759/*
760 * Handle PTRACE_POKEUSR calls for the debug register area.
761 */
762int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
763{
764 struct thread_struct *thread = &(tsk->thread);
765 int rc = 0;
766
767 /* There are no DR4 or DR5 registers */
768 if (n == 4 || n == 5)
769 return -EIO;
770
771 if (n == 6) {
772 thread->debugreg6 = val;
773 goto ret_path;
774 }
775 if (n < HBP_NUM) {
776 rc = ptrace_set_breakpoint_addr(tsk, n, val);
777 if (rc)
778 return rc;
779 }
780 /* All that's left is DR7 */
781 if (n == 7)
782 rc = ptrace_write_dr7(tsk, val);
783
784ret_path:
785 return rc;
786}
787
788/*
550 * These access the current or another (stopped) task's io permission 789 * These access the current or another (stopped) task's io permission
551 * bitmap for debugging or core dump. 790 * bitmap for debugging or core dump.
552 */ 791 */
@@ -1219,14 +1458,14 @@ static int genregs32_get(struct task_struct *target,
1219{ 1458{
1220 if (kbuf) { 1459 if (kbuf) {
1221 compat_ulong_t *k = kbuf; 1460 compat_ulong_t *k = kbuf;
1222 while (count > 0) { 1461 while (count >= sizeof(*k)) {
1223 getreg32(target, pos, k++); 1462 getreg32(target, pos, k++);
1224 count -= sizeof(*k); 1463 count -= sizeof(*k);
1225 pos += sizeof(*k); 1464 pos += sizeof(*k);
1226 } 1465 }
1227 } else { 1466 } else {
1228 compat_ulong_t __user *u = ubuf; 1467 compat_ulong_t __user *u = ubuf;
1229 while (count > 0) { 1468 while (count >= sizeof(*u)) {
1230 compat_ulong_t word; 1469 compat_ulong_t word;
1231 getreg32(target, pos, &word); 1470 getreg32(target, pos, &word);
1232 if (__put_user(word, u++)) 1471 if (__put_user(word, u++))
@@ -1247,14 +1486,14 @@ static int genregs32_set(struct task_struct *target,
1247 int ret = 0; 1486 int ret = 0;
1248 if (kbuf) { 1487 if (kbuf) {
1249 const compat_ulong_t *k = kbuf; 1488 const compat_ulong_t *k = kbuf;
1250 while (count > 0 && !ret) { 1489 while (count >= sizeof(*k) && !ret) {
1251 ret = putreg32(target, pos, *k++); 1490 ret = putreg32(target, pos, *k++);
1252 count -= sizeof(*k); 1491 count -= sizeof(*k);
1253 pos += sizeof(*k); 1492 pos += sizeof(*k);
1254 } 1493 }
1255 } else { 1494 } else {
1256 const compat_ulong_t __user *u = ubuf; 1495 const compat_ulong_t __user *u = ubuf;
1257 while (count > 0 && !ret) { 1496 while (count >= sizeof(*u) && !ret) {
1258 compat_ulong_t word; 1497 compat_ulong_t word;
1259 ret = __get_user(word, u++); 1498 ret = __get_user(word, u++);
1260 if (ret) 1499 if (ret)
@@ -1437,21 +1676,33 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
1437#endif 1676#endif
1438} 1677}
1439 1678
1440void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, 1679static void fill_sigtrap_info(struct task_struct *tsk,
1441 int error_code, int si_code) 1680 struct pt_regs *regs,
1681 int error_code, int si_code,
1682 struct siginfo *info)
1442{ 1683{
1443 struct siginfo info;
1444
1445 tsk->thread.trap_no = 1; 1684 tsk->thread.trap_no = 1;
1446 tsk->thread.error_code = error_code; 1685 tsk->thread.error_code = error_code;
1447 1686
1448 memset(&info, 0, sizeof(info)); 1687 memset(info, 0, sizeof(*info));
1449 info.si_signo = SIGTRAP; 1688 info->si_signo = SIGTRAP;
1450 info.si_code = si_code; 1689 info->si_code = si_code;
1690 info->si_addr = user_mode_vm(regs) ? (void __user *)regs->ip : NULL;
1691}
1451 1692
1452 /* User-mode ip? */ 1693void user_single_step_siginfo(struct task_struct *tsk,
1453 info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL; 1694 struct pt_regs *regs,
1695 struct siginfo *info)
1696{
1697 fill_sigtrap_info(tsk, regs, 0, TRAP_BRKPT, info);
1698}
1454 1699
1700void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
1701 int error_code, int si_code)
1702{
1703 struct siginfo info;
1704
1705 fill_sigtrap_info(tsk, regs, error_code, si_code, &info);
1455 /* Send us the fake SIGTRAP */ 1706 /* Send us the fake SIGTRAP */
1456 force_sig_info(SIGTRAP, &info, tsk); 1707 force_sig_info(SIGTRAP, &info, tsk);
1457} 1708}
@@ -1516,29 +1767,22 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
1516 1767
1517asmregparm void syscall_trace_leave(struct pt_regs *regs) 1768asmregparm void syscall_trace_leave(struct pt_regs *regs)
1518{ 1769{
1770 bool step;
1771
1519 if (unlikely(current->audit_context)) 1772 if (unlikely(current->audit_context))
1520 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); 1773 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1521 1774
1522 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) 1775 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
1523 trace_sys_exit(regs, regs->ax); 1776 trace_sys_exit(regs, regs->ax);
1524 1777
1525 if (test_thread_flag(TIF_SYSCALL_TRACE))
1526 tracehook_report_syscall_exit(regs, 0);
1527
1528 /* 1778 /*
1529 * If TIF_SYSCALL_EMU is set, we only get here because of 1779 * If TIF_SYSCALL_EMU is set, we only get here because of
1530 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). 1780 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
1531 * We already reported this syscall instruction in 1781 * We already reported this syscall instruction in
1532 * syscall_trace_enter(), so don't do any more now. 1782 * syscall_trace_enter().
1533 */
1534 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
1535 return;
1536
1537 /*
1538 * If we are single-stepping, synthesize a trap to follow the
1539 * system call instruction.
1540 */ 1783 */
1541 if (test_thread_flag(TIF_SINGLESTEP) && 1784 step = unlikely(test_thread_flag(TIF_SINGLESTEP)) &&
1542 tracehook_consider_fatal_signal(current, SIGTRAP)) 1785 !test_thread_flag(TIF_SYSCALL_EMU);
1543 send_sigtrap(current, regs, 0, TRAP_BRKPT); 1786 if (step || test_thread_flag(TIF_SYSCALL_TRACE))
1787 tracehook_report_syscall_exit(regs, step);
1544} 1788}
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 6c3b2c6fd772..18093d7498f0 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -499,6 +499,7 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev)
499{ 499{
500 struct pci_dev *nb_ht; 500 struct pci_dev *nb_ht;
501 unsigned int devfn; 501 unsigned int devfn;
502 u32 node;
502 u32 val; 503 u32 val;
503 504
504 devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0); 505 devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0);
@@ -507,7 +508,13 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev)
507 return; 508 return;
508 509
509 pci_read_config_dword(nb_ht, 0x60, &val); 510 pci_read_config_dword(nb_ht, 0x60, &val);
510 set_dev_node(&dev->dev, val & 7); 511 node = val & 7;
512 /*
513 * Some hardware may return an invalid node ID,
514 * so check it first:
515 */
516 if (node_online(node))
517 set_dev_node(&dev->dev, node);
511 pci_dev_put(nb_ht); 518 pci_dev_put(nb_ht);
512} 519}
513 520
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 27349f92a6d7..1545bc0c9845 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -4,6 +4,7 @@
4#include <linux/pm.h> 4#include <linux/pm.h>
5#include <linux/efi.h> 5#include <linux/efi.h>
6#include <linux/dmi.h> 6#include <linux/dmi.h>
7#include <linux/sched.h>
7#include <linux/tboot.h> 8#include <linux/tboot.h>
8#include <acpi/reboot.h> 9#include <acpi/reboot.h>
9#include <asm/io.h> 10#include <asm/io.h>
@@ -22,7 +23,7 @@
22# include <linux/ctype.h> 23# include <linux/ctype.h>
23# include <linux/mc146818rtc.h> 24# include <linux/mc146818rtc.h>
24#else 25#else
25# include <asm/iommu.h> 26# include <asm/x86_init.h>
26#endif 27#endif
27 28
28/* 29/*
@@ -258,6 +259,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
258 DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"), 259 DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"),
259 }, 260 },
260 }, 261 },
262 { /* Handle problems with rebooting on ASUS P4S800 */
263 .callback = set_bios_reboot,
264 .ident = "ASUS P4S800",
265 .matches = {
266 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
267 DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
268 },
269 },
261 { } 270 { }
262}; 271};
263 272
@@ -435,6 +444,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
435 DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"), 444 DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"),
436 }, 445 },
437 }, 446 },
447 { /* Handle problems with rebooting on Apple Macmini3,1 */
448 .callback = set_pci_reboot,
449 .ident = "Apple Macmini3,1",
450 .matches = {
451 DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
452 DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"),
453 },
454 },
438 { } 455 { }
439}; 456};
440 457
@@ -613,7 +630,7 @@ void native_machine_shutdown(void)
613#endif 630#endif
614 631
615#ifdef CONFIG_X86_64 632#ifdef CONFIG_X86_64
616 pci_iommu_shutdown(); 633 x86_platform.iommu_shutdown();
617#endif 634#endif
618} 635}
619 636
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
index 61a837743fe5..fda313ebbb03 100644
--- a/arch/x86/kernel/reboot_fixups_32.c
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -12,7 +12,7 @@
12#include <linux/interrupt.h> 12#include <linux/interrupt.h>
13#include <asm/reboot_fixups.h> 13#include <asm/reboot_fixups.h>
14#include <asm/msr.h> 14#include <asm/msr.h>
15#include <asm/geode.h> 15#include <linux/cs5535.h>
16 16
17static void cs5530a_warm_reset(struct pci_dev *dev) 17static void cs5530a_warm_reset(struct pci_dev *dev)
18{ 18{
@@ -80,6 +80,7 @@ void mach_reboot_fixups(void)
80 continue; 80 continue;
81 81
82 cur->reboot_fixup(dev); 82 cur->reboot_fixup(dev);
83 pci_dev_put(dev);
83 } 84 }
84} 85}
85 86
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index e09f0e2c14b5..f7b8b9894b22 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -73,6 +73,7 @@
73 73
74#include <asm/mtrr.h> 74#include <asm/mtrr.h>
75#include <asm/apic.h> 75#include <asm/apic.h>
76#include <asm/trampoline.h>
76#include <asm/e820.h> 77#include <asm/e820.h>
77#include <asm/mpspec.h> 78#include <asm/mpspec.h>
78#include <asm/setup.h> 79#include <asm/setup.h>
@@ -106,9 +107,11 @@
106#include <asm/percpu.h> 107#include <asm/percpu.h>
107#include <asm/topology.h> 108#include <asm/topology.h>
108#include <asm/apicdef.h> 109#include <asm/apicdef.h>
110#include <asm/k8.h>
109#ifdef CONFIG_X86_64 111#ifdef CONFIG_X86_64
110#include <asm/numa_64.h> 112#include <asm/numa_64.h>
111#endif 113#endif
114#include <asm/mce.h>
112 115
113/* 116/*
114 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 117 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -247,7 +250,7 @@ EXPORT_SYMBOL(edd);
247 * from boot_params into a safe place. 250 * from boot_params into a safe place.
248 * 251 *
249 */ 252 */
250static inline void copy_edd(void) 253static inline void __init copy_edd(void)
251{ 254{
252 memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer, 255 memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
253 sizeof(edd.mbr_signature)); 256 sizeof(edd.mbr_signature));
@@ -256,7 +259,7 @@ static inline void copy_edd(void)
256 edd.edd_info_nr = boot_params.eddbuf_entries; 259 edd.edd_info_nr = boot_params.eddbuf_entries;
257} 260}
258#else 261#else
259static inline void copy_edd(void) 262static inline void __init copy_edd(void)
260{ 263{
261} 264}
262#endif 265#endif
@@ -486,42 +489,11 @@ static void __init reserve_early_setup_data(void)
486 489
487#ifdef CONFIG_KEXEC 490#ifdef CONFIG_KEXEC
488 491
489/**
490 * Reserve @size bytes of crashkernel memory at any suitable offset.
491 *
492 * @size: Size of the crashkernel memory to reserve.
493 * Returns the base address on success, and -1ULL on failure.
494 */
495static
496unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
497{
498 const unsigned long long alignment = 16<<20; /* 16M */
499 unsigned long long start = 0LL;
500
501 while (1) {
502 int ret;
503
504 start = find_e820_area(start, ULONG_MAX, size, alignment);
505 if (start == -1ULL)
506 return start;
507
508 /* try to reserve it */
509 ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE);
510 if (ret >= 0)
511 return start;
512
513 start += alignment;
514 }
515}
516
517static inline unsigned long long get_total_mem(void) 492static inline unsigned long long get_total_mem(void)
518{ 493{
519 unsigned long long total; 494 unsigned long long total;
520 495
521 total = max_low_pfn - min_low_pfn; 496 total = max_pfn - min_low_pfn;
522#ifdef CONFIG_HIGHMEM
523 total += highend_pfn - highstart_pfn;
524#endif
525 497
526 return total << PAGE_SHIFT; 498 return total << PAGE_SHIFT;
527} 499}
@@ -541,21 +513,25 @@ static void __init reserve_crashkernel(void)
541 513
542 /* 0 means: find the address automatically */ 514 /* 0 means: find the address automatically */
543 if (crash_base <= 0) { 515 if (crash_base <= 0) {
544 crash_base = find_and_reserve_crashkernel(crash_size); 516 const unsigned long long alignment = 16<<20; /* 16M */
517
518 crash_base = find_e820_area(alignment, ULONG_MAX, crash_size,
519 alignment);
545 if (crash_base == -1ULL) { 520 if (crash_base == -1ULL) {
546 pr_info("crashkernel reservation failed. " 521 pr_info("crashkernel reservation failed - No suitable area found.\n");
547 "No suitable area found.\n");
548 return; 522 return;
549 } 523 }
550 } else { 524 } else {
551 ret = reserve_bootmem_generic(crash_base, crash_size, 525 unsigned long long start;
552 BOOTMEM_EXCLUSIVE); 526
553 if (ret < 0) { 527 start = find_e820_area(crash_base, ULONG_MAX, crash_size,
554 pr_info("crashkernel reservation failed - " 528 1<<20);
555 "memory is in use\n"); 529 if (start != crash_base) {
530 pr_info("crashkernel reservation failed - memory is in use.\n");
556 return; 531 return;
557 } 532 }
558 } 533 }
534 reserve_early(crash_base, crash_base + crash_size, "CRASH KERNEL");
559 535
560 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " 536 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
561 "for crashkernel (System RAM: %ldMB)\n", 537 "for crashkernel (System RAM: %ldMB)\n",
@@ -660,6 +636,13 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
660 }, 636 },
661 }, 637 },
662 { 638 {
639 .callback = dmi_low_memory_corruption,
640 .ident = "Phoenix/MSC BIOS",
641 .matches = {
642 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"),
643 },
644 },
645 {
663 /* 646 /*
664 * AMI BIOS with low memory corruption was found on Intel DG45ID board. 647 * AMI BIOS with low memory corruption was found on Intel DG45ID board.
665 * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will 648 * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
@@ -691,6 +674,9 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
691 674
692void __init setup_arch(char **cmdline_p) 675void __init setup_arch(char **cmdline_p)
693{ 676{
677 int acpi = 0;
678 int k8 = 0;
679
694#ifdef CONFIG_X86_32 680#ifdef CONFIG_X86_32
695 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); 681 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
696 visws_early_detect(); 682 visws_early_detect();
@@ -783,21 +769,18 @@ void __init setup_arch(char **cmdline_p)
783 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); 769 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
784 *cmdline_p = command_line; 770 *cmdline_p = command_line;
785 771
786#ifdef CONFIG_X86_64
787 /* 772 /*
788 * Must call this twice: Once just to detect whether hardware doesn't 773 * x86_configure_nx() is called before parse_early_param() to detect
789 * support NX (so that the early EHCI debug console setup can safely 774 * whether hardware doesn't support NX (so that the early EHCI debug
790 * call set_fixmap(), and then again after parsing early parameters to 775 * console setup can safely call set_fixmap()). It may then be called
791 * honor the respective command line option. 776 * again from within noexec_setup() during parsing early parameters
777 * to honor the respective command line option.
792 */ 778 */
793 check_efer(); 779 x86_configure_nx();
794#endif
795 780
796 parse_early_param(); 781 parse_early_param();
797 782
798#ifdef CONFIG_X86_64 783 x86_report_nx();
799 check_efer();
800#endif
801 784
802 /* Must be before kernel pagetables are setup */ 785 /* Must be before kernel pagetables are setup */
803 vmi_activate(); 786 vmi_activate();
@@ -893,6 +876,20 @@ void __init setup_arch(char **cmdline_p)
893 876
894 reserve_brk(); 877 reserve_brk();
895 878
879 /*
880 * Find and reserve possible boot-time SMP configuration:
881 */
882 find_smp_config();
883
884 reserve_trampoline_memory();
885
886#ifdef CONFIG_ACPI_SLEEP
887 /*
888 * Reserve low memory region for sleep support.
889 * even before init_memory_mapping
890 */
891 acpi_reserve_wakeup_memory();
892#endif
896 init_gbpages(); 893 init_gbpages();
897 894
898 /* max_pfn_mapped is updated here */ 895 /* max_pfn_mapped is updated here */
@@ -919,6 +916,8 @@ void __init setup_arch(char **cmdline_p)
919 916
920 reserve_initrd(); 917 reserve_initrd();
921 918
919 reserve_crashkernel();
920
922 vsmp_init(); 921 vsmp_init();
923 922
924 io_delay_init(); 923 io_delay_init();
@@ -934,23 +933,15 @@ void __init setup_arch(char **cmdline_p)
934 /* 933 /*
935 * Parse SRAT to discover nodes. 934 * Parse SRAT to discover nodes.
936 */ 935 */
937 acpi_numa_init(); 936 acpi = acpi_numa_init();
938#endif 937#endif
939 938
940 initmem_init(0, max_pfn); 939#ifdef CONFIG_K8_NUMA
941 940 if (!acpi)
942#ifdef CONFIG_ACPI_SLEEP 941 k8 = !k8_numa_init(0, max_pfn);
943 /*
944 * Reserve low memory region for sleep support.
945 */
946 acpi_reserve_bootmem();
947#endif 942#endif
948 /*
949 * Find and reserve possible boot-time SMP configuration:
950 */
951 find_smp_config();
952 943
953 reserve_crashkernel(); 944 initmem_init(0, max_pfn, acpi, k8);
954 945
955#ifdef CONFIG_X86_64 946#ifdef CONFIG_X86_64
956 /* 947 /*
@@ -1024,6 +1015,8 @@ void __init setup_arch(char **cmdline_p)
1024#endif 1015#endif
1025#endif 1016#endif
1026 x86_init.oem.banner(); 1017 x86_init.oem.banner();
1018
1019 mcheck_init();
1027} 1020}
1028 1021
1029#ifdef CONFIG_X86_32 1022#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index d559af913e1f..35abcb8b00e9 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -1,3 +1,5 @@
1#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2
1#include <linux/kernel.h> 3#include <linux/kernel.h>
2#include <linux/module.h> 4#include <linux/module.h>
3#include <linux/init.h> 5#include <linux/init.h>
@@ -20,9 +22,9 @@
20#include <asm/stackprotector.h> 22#include <asm/stackprotector.h>
21 23
22#ifdef CONFIG_DEBUG_PER_CPU_MAPS 24#ifdef CONFIG_DEBUG_PER_CPU_MAPS
23# define DBG(x...) printk(KERN_DEBUG x) 25# define DBG(fmt, ...) pr_dbg(fmt, ##__VA_ARGS__)
24#else 26#else
25# define DBG(x...) 27# define DBG(fmt, ...) do { if (0) pr_dbg(fmt, ##__VA_ARGS__); } while (0)
26#endif 28#endif
27 29
28DEFINE_PER_CPU(int, cpu_number); 30DEFINE_PER_CPU(int, cpu_number);
@@ -116,8 +118,8 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
116 } else { 118 } else {
117 ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node), 119 ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
118 size, align, goal); 120 size, align, goal);
119 pr_debug("per cpu data for cpu%d %lu bytes on node%d at " 121 pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
120 "%016lx\n", cpu, size, node, __pa(ptr)); 122 cpu, size, node, __pa(ptr));
121 } 123 }
122 return ptr; 124 return ptr;
123#else 125#else
@@ -198,8 +200,7 @@ void __init setup_per_cpu_areas(void)
198 pcpu_cpu_distance, 200 pcpu_cpu_distance,
199 pcpu_fc_alloc, pcpu_fc_free); 201 pcpu_fc_alloc, pcpu_fc_free);
200 if (rc < 0) 202 if (rc < 0)
201 pr_warning("PERCPU: %s allocator failed (%d), " 203 pr_warning("%s allocator failed (%d), falling back to page size\n",
202 "falling back to page size\n",
203 pcpu_fc_names[pcpu_chosen_fc], rc); 204 pcpu_fc_names[pcpu_chosen_fc], rc);
204 } 205 }
205 if (rc < 0) 206 if (rc < 0)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 6a44a76055ad..4fd173cd8e57 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -19,6 +19,7 @@
19#include <linux/stddef.h> 19#include <linux/stddef.h>
20#include <linux/personality.h> 20#include <linux/personality.h>
21#include <linux/uaccess.h> 21#include <linux/uaccess.h>
22#include <linux/user-return-notifier.h>
22 23
23#include <asm/processor.h> 24#include <asm/processor.h>
24#include <asm/ucontext.h> 25#include <asm/ucontext.h>
@@ -544,22 +545,12 @@ sys_sigaction(int sig, const struct old_sigaction __user *act,
544} 545}
545#endif /* CONFIG_X86_32 */ 546#endif /* CONFIG_X86_32 */
546 547
547#ifdef CONFIG_X86_32 548long
548int sys_sigaltstack(struct pt_regs *regs)
549{
550 const stack_t __user *uss = (const stack_t __user *)regs->bx;
551 stack_t __user *uoss = (stack_t __user *)regs->cx;
552
553 return do_sigaltstack(uss, uoss, regs->sp);
554}
555#else /* !CONFIG_X86_32 */
556asmlinkage long
557sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, 549sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
558 struct pt_regs *regs) 550 struct pt_regs *regs)
559{ 551{
560 return do_sigaltstack(uss, uoss, regs->sp); 552 return do_sigaltstack(uss, uoss, regs->sp);
561} 553}
562#endif /* CONFIG_X86_32 */
563 554
564/* 555/*
565 * Do a signal return; undo the signal stack. 556 * Do a signal return; undo the signal stack.
@@ -799,15 +790,6 @@ static void do_signal(struct pt_regs *regs)
799 790
800 signr = get_signal_to_deliver(&info, &ka, regs, NULL); 791 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
801 if (signr > 0) { 792 if (signr > 0) {
802 /*
803 * Re-enable any watchpoints before delivering the
804 * signal to user space. The processor register will
805 * have been cleared if the watchpoint triggered
806 * inside the kernel.
807 */
808 if (current->thread.debugreg7)
809 set_debugreg(current->thread.debugreg7, 7);
810
811 /* Whee! Actually deliver the signal. */ 793 /* Whee! Actually deliver the signal. */
812 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { 794 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
813 /* 795 /*
@@ -872,6 +854,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
872 if (current->replacement_session_keyring) 854 if (current->replacement_session_keyring)
873 key_replace_session_keyring(); 855 key_replace_session_keyring();
874 } 856 }
857 if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
858 fire_user_return_notifiers();
875 859
876#ifdef CONFIG_X86_32 860#ifdef CONFIG_X86_32
877 clear_thread_flag(TIF_IRET); 861 clear_thread_flag(TIF_IRET);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 565ebc65920e..678d0b8c26f3 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -671,6 +671,26 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
671 complete(&c_idle->done); 671 complete(&c_idle->done);
672} 672}
673 673
674/* reduce the number of lines printed when booting a large cpu count system */
675static void __cpuinit announce_cpu(int cpu, int apicid)
676{
677 static int current_node = -1;
678 int node = cpu_to_node(cpu);
679
680 if (system_state == SYSTEM_BOOTING) {
681 if (node != current_node) {
682 if (current_node > (-1))
683 pr_cont(" Ok.\n");
684 current_node = node;
685 pr_info("Booting Node %3d, Processors ", node);
686 }
687 pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " Ok.\n" : "");
688 return;
689 } else
690 pr_info("Booting Node %d Processor %d APIC 0x%x\n",
691 node, cpu, apicid);
692}
693
674/* 694/*
675 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad 695 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
676 * (ie clustered apic addressing mode), this is a LOGICAL apic ID. 696 * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
@@ -687,7 +707,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
687 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), 707 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
688 }; 708 };
689 709
690 INIT_WORK(&c_idle.work, do_fork_idle); 710 INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle);
691 711
692 alternatives_smp_switch(1); 712 alternatives_smp_switch(1);
693 713
@@ -713,6 +733,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
713 733
714 if (IS_ERR(c_idle.idle)) { 734 if (IS_ERR(c_idle.idle)) {
715 printk("failed fork for CPU %d\n", cpu); 735 printk("failed fork for CPU %d\n", cpu);
736 destroy_work_on_stack(&c_idle.work);
716 return PTR_ERR(c_idle.idle); 737 return PTR_ERR(c_idle.idle);
717 } 738 }
718 739
@@ -736,9 +757,8 @@ do_rest:
736 /* start_ip had better be page-aligned! */ 757 /* start_ip had better be page-aligned! */
737 start_ip = setup_trampoline(); 758 start_ip = setup_trampoline();
738 759
739 /* So we see what's up */ 760 /* So we see what's up */
740 printk(KERN_INFO "Booting processor %d APIC 0x%x ip 0x%lx\n", 761 announce_cpu(cpu, apicid);
741 cpu, apicid, start_ip);
742 762
743 /* 763 /*
744 * This grunge runs the startup process for 764 * This grunge runs the startup process for
@@ -787,21 +807,17 @@ do_rest:
787 udelay(100); 807 udelay(100);
788 } 808 }
789 809
790 if (cpumask_test_cpu(cpu, cpu_callin_mask)) { 810 if (cpumask_test_cpu(cpu, cpu_callin_mask))
791 /* number CPUs logically, starting from 1 (BSP is 0) */ 811 pr_debug("CPU%d: has booted.\n", cpu);
792 pr_debug("OK.\n"); 812 else {
793 printk(KERN_INFO "CPU%d: ", cpu);
794 print_cpu_info(&cpu_data(cpu));
795 pr_debug("CPU has booted.\n");
796 } else {
797 boot_error = 1; 813 boot_error = 1;
798 if (*((volatile unsigned char *)trampoline_base) 814 if (*((volatile unsigned char *)trampoline_base)
799 == 0xA5) 815 == 0xA5)
800 /* trampoline started but...? */ 816 /* trampoline started but...? */
801 printk(KERN_ERR "Stuck ??\n"); 817 pr_err("CPU%d: Stuck ??\n", cpu);
802 else 818 else
803 /* trampoline code not run */ 819 /* trampoline code not run */
804 printk(KERN_ERR "Not responding.\n"); 820 pr_err("CPU%d: Not responding.\n", cpu);
805 if (apic->inquire_remote_apic) 821 if (apic->inquire_remote_apic)
806 apic->inquire_remote_apic(apicid); 822 apic->inquire_remote_apic(apicid);
807 } 823 }
@@ -831,6 +847,7 @@ do_rest:
831 smpboot_restore_warm_reset_vector(); 847 smpboot_restore_warm_reset_vector();
832 } 848 }
833 849
850 destroy_work_on_stack(&c_idle.work);
834 return boot_error; 851 return boot_error;
835} 852}
836 853
@@ -1250,16 +1267,7 @@ static void __ref remove_cpu_from_maps(int cpu)
1250void cpu_disable_common(void) 1267void cpu_disable_common(void)
1251{ 1268{
1252 int cpu = smp_processor_id(); 1269 int cpu = smp_processor_id();
1253 /*
1254 * HACK:
1255 * Allow any queued timer interrupts to get serviced
1256 * This is only a temporary solution until we cleanup
1257 * fixup_irqs as we do for IA64.
1258 */
1259 local_irq_enable();
1260 mdelay(1);
1261 1270
1262 local_irq_disable();
1263 remove_siblinginfo(cpu); 1271 remove_siblinginfo(cpu);
1264 1272
1265 /* It's now safe to remove this processor from the online map */ 1273 /* It's now safe to remove this processor from the online map */
@@ -1300,14 +1308,16 @@ void native_cpu_die(unsigned int cpu)
1300 for (i = 0; i < 10; i++) { 1308 for (i = 0; i < 10; i++) {
1301 /* They ack this in play_dead by setting CPU_DEAD */ 1309 /* They ack this in play_dead by setting CPU_DEAD */
1302 if (per_cpu(cpu_state, cpu) == CPU_DEAD) { 1310 if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
1303 printk(KERN_INFO "CPU %d is now offline\n", cpu); 1311 if (system_state == SYSTEM_RUNNING)
1312 pr_info("CPU %u is now offline\n", cpu);
1313
1304 if (1 == num_online_cpus()) 1314 if (1 == num_online_cpus())
1305 alternatives_smp_switch(0); 1315 alternatives_smp_switch(0);
1306 return; 1316 return;
1307 } 1317 }
1308 msleep(100); 1318 msleep(100);
1309 } 1319 }
1310 printk(KERN_ERR "CPU %u didn't die...\n", cpu); 1320 pr_err("CPU %u didn't die...\n", cpu);
1311} 1321}
1312 1322
1313void play_dead_common(void) 1323void play_dead_common(void)
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index c3eb207181fe..922eefbb3f6c 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -53,17 +53,19 @@ save_stack_address_nosched(void *data, unsigned long addr, int reliable)
53} 53}
54 54
55static const struct stacktrace_ops save_stack_ops = { 55static const struct stacktrace_ops save_stack_ops = {
56 .warning = save_stack_warning, 56 .warning = save_stack_warning,
57 .warning_symbol = save_stack_warning_symbol, 57 .warning_symbol = save_stack_warning_symbol,
58 .stack = save_stack_stack, 58 .stack = save_stack_stack,
59 .address = save_stack_address, 59 .address = save_stack_address,
60 .walk_stack = print_context_stack,
60}; 61};
61 62
62static const struct stacktrace_ops save_stack_ops_nosched = { 63static const struct stacktrace_ops save_stack_ops_nosched = {
63 .warning = save_stack_warning, 64 .warning = save_stack_warning,
64 .warning_symbol = save_stack_warning_symbol, 65 .warning_symbol = save_stack_warning_symbol,
65 .stack = save_stack_stack, 66 .stack = save_stack_stack,
66 .address = save_stack_address_nosched, 67 .address = save_stack_address_nosched,
68 .walk_stack = print_context_stack,
67}; 69};
68 70
69/* 71/*
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index 1884a8d12bfa..dee1ff7cba58 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -24,31 +24,6 @@
24 24
25#include <asm/syscalls.h> 25#include <asm/syscalls.h>
26 26
27asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
28 unsigned long prot, unsigned long flags,
29 unsigned long fd, unsigned long pgoff)
30{
31 int error = -EBADF;
32 struct file *file = NULL;
33 struct mm_struct *mm = current->mm;
34
35 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
36 if (!(flags & MAP_ANONYMOUS)) {
37 file = fget(fd);
38 if (!file)
39 goto out;
40 }
41
42 down_write(&mm->mmap_sem);
43 error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
44 up_write(&mm->mmap_sem);
45
46 if (file)
47 fput(file);
48out:
49 return error;
50}
51
52/* 27/*
53 * Perform the select(nd, in, out, ex, tv) and mmap() system 28 * Perform the select(nd, in, out, ex, tv) and mmap() system
54 * calls. Linux/i386 didn't use to be able to handle more than 29 * calls. Linux/i386 didn't use to be able to handle more than
@@ -77,7 +52,7 @@ asmlinkage int old_mmap(struct mmap_arg_struct __user *arg)
77 if (a.offset & ~PAGE_MASK) 52 if (a.offset & ~PAGE_MASK)
78 goto out; 53 goto out;
79 54
80 err = sys_mmap2(a.addr, a.len, a.prot, a.flags, 55 err = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags,
81 a.fd, a.offset >> PAGE_SHIFT); 56 a.fd, a.offset >> PAGE_SHIFT);
82out: 57out:
83 return err; 58 return err;
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 45e00eb09c3a..8aa2057efd12 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -23,26 +23,11 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
23 unsigned long, fd, unsigned long, off) 23 unsigned long, fd, unsigned long, off)
24{ 24{
25 long error; 25 long error;
26 struct file *file;
27
28 error = -EINVAL; 26 error = -EINVAL;
29 if (off & ~PAGE_MASK) 27 if (off & ~PAGE_MASK)
30 goto out; 28 goto out;
31 29
32 error = -EBADF; 30 error = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
33 file = NULL;
34 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
35 if (!(flags & MAP_ANONYMOUS)) {
36 file = fget(fd);
37 if (!file)
38 goto out;
39 }
40 down_write(&current->mm->mmap_sem);
41 error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT);
42 up_write(&current->mm->mmap_sem);
43
44 if (file)
45 fput(file);
46out: 31out:
47 return error; 32 return error;
48} 33}
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 0157cd26d7cc..15228b5d3eb7 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -191,7 +191,7 @@ ENTRY(sys_call_table)
191 .long sys_ni_syscall /* reserved for streams2 */ 191 .long sys_ni_syscall /* reserved for streams2 */
192 .long ptregs_vfork /* 190 */ 192 .long ptregs_vfork /* 190 */
193 .long sys_getrlimit 193 .long sys_getrlimit
194 .long sys_mmap2 194 .long sys_mmap_pgoff
195 .long sys_truncate64 195 .long sys_truncate64
196 .long sys_ftruncate64 196 .long sys_ftruncate64
197 .long sys_stat64 /* 195 */ 197 .long sys_stat64 /* 195 */
@@ -336,3 +336,4 @@ ENTRY(sys_call_table)
336 .long sys_pwritev 336 .long sys_pwritev
337 .long sys_rt_tgsigqueueinfo /* 335 */ 337 .long sys_rt_tgsigqueueinfo /* 335 */
338 .long sys_perf_event_open 338 .long sys_perf_event_open
339 .long sys_recvmmsg
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index dcb00d278512..be2573448ed9 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -38,7 +38,8 @@ unsigned long profile_pc(struct pt_regs *regs)
38#ifdef CONFIG_FRAME_POINTER 38#ifdef CONFIG_FRAME_POINTER
39 return *(unsigned long *)(regs->bp + sizeof(long)); 39 return *(unsigned long *)(regs->bp + sizeof(long));
40#else 40#else
41 unsigned long *sp = (unsigned long *)regs->sp; 41 unsigned long *sp =
42 (unsigned long *)kernel_stack_pointer(regs);
42 /* 43 /*
43 * Return address is either directly at stack pointer 44 * Return address is either directly at stack pointer
44 * or above a saved flags. Eflags has bits 22-31 zero, 45 * or above a saved flags. Eflags has bits 22-31 zero,
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 503c1f2e8835..364d015efebc 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -23,8 +23,6 @@
23static struct bau_control **uv_bau_table_bases __read_mostly; 23static struct bau_control **uv_bau_table_bases __read_mostly;
24static int uv_bau_retry_limit __read_mostly; 24static int uv_bau_retry_limit __read_mostly;
25 25
26/* position of pnode (which is nasid>>1): */
27static int uv_nshift __read_mostly;
28/* base pnode in this partition */ 26/* base pnode in this partition */
29static int uv_partition_base_pnode __read_mostly; 27static int uv_partition_base_pnode __read_mostly;
30 28
@@ -723,7 +721,7 @@ uv_activation_descriptor_init(int node, int pnode)
723 BUG_ON(!adp); 721 BUG_ON(!adp);
724 722
725 pa = uv_gpa(adp); /* need the real nasid*/ 723 pa = uv_gpa(adp); /* need the real nasid*/
726 n = pa >> uv_nshift; 724 n = uv_gpa_to_pnode(pa);
727 m = pa & uv_mmask; 725 m = pa & uv_mmask;
728 726
729 uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, 727 uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
@@ -778,7 +776,7 @@ uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp)
778 * need the pnode of where the memory was really allocated 776 * need the pnode of where the memory was really allocated
779 */ 777 */
780 pa = uv_gpa(pqp); 778 pa = uv_gpa(pqp);
781 pn = pa >> uv_nshift; 779 pn = uv_gpa_to_pnode(pa);
782 uv_write_global_mmr64(pnode, 780 uv_write_global_mmr64(pnode,
783 UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, 781 UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
784 ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | 782 ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) |
@@ -819,10 +817,8 @@ static int __init uv_init_blade(int blade)
819 */ 817 */
820 apicid = blade_to_first_apicid(blade); 818 apicid = blade_to_first_apicid(blade);
821 pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG); 819 pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
822 if ((pa & 0xff) != UV_BAU_MESSAGE) { 820 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
823 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
824 ((apicid << 32) | UV_BAU_MESSAGE)); 821 ((apicid << 32) | UV_BAU_MESSAGE));
825 }
826 return 0; 822 return 0;
827} 823}
828 824
@@ -843,8 +839,7 @@ static int __init uv_bau_init(void)
843 GFP_KERNEL, cpu_to_node(cur_cpu)); 839 GFP_KERNEL, cpu_to_node(cur_cpu));
844 840
845 uv_bau_retry_limit = 1; 841 uv_bau_retry_limit = 1;
846 uv_nshift = uv_hub_info->n_val; 842 uv_mmask = (1UL << uv_hub_info->m_val) - 1;
847 uv_mmask = (1UL << uv_hub_info->n_val) - 1;
848 nblades = uv_num_possible_blades(); 843 nblades = uv_num_possible_blades();
849 844
850 uv_bau_table_bases = (struct bau_control **) 845 uv_bau_table_bases = (struct bau_control **)
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index 699f7eeb896a..c652ef62742d 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -3,22 +3,28 @@
3#include <asm/trampoline.h> 3#include <asm/trampoline.h>
4#include <asm/e820.h> 4#include <asm/e820.h>
5 5
6#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP)
7#define __trampinit
8#define __trampinitdata
9#else
10#define __trampinit __cpuinit
11#define __trampinitdata __cpuinitdata
12#endif
13
6/* ready for x86_64 and x86 */ 14/* ready for x86_64 and x86 */
7unsigned char *__cpuinitdata trampoline_base = __va(TRAMPOLINE_BASE); 15unsigned char *__trampinitdata trampoline_base;
8 16
9void __init reserve_trampoline_memory(void) 17void __init reserve_trampoline_memory(void)
10{ 18{
11#ifdef CONFIG_X86_32 19 unsigned long mem;
12 /* 20
13 * But first pinch a few for the stack/trampoline stuff
14 * FIXME: Don't need the extra page at 4K, but need to fix
15 * trampoline before removing it. (see the GDT stuff)
16 */
17 reserve_early(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE");
18#endif
19 /* Has to be in very low memory so we can execute real-mode AP code. */ 21 /* Has to be in very low memory so we can execute real-mode AP code. */
20 reserve_early(TRAMPOLINE_BASE, TRAMPOLINE_BASE + TRAMPOLINE_SIZE, 22 mem = find_e820_area(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE);
21 "TRAMPOLINE"); 23 if (mem == -1L)
24 panic("Cannot allocate trampoline\n");
25
26 trampoline_base = __va(mem);
27 reserve_early(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE");
22} 28}
23 29
24/* 30/*
@@ -26,7 +32,7 @@ void __init reserve_trampoline_memory(void)
26 * bootstrap into the page concerned. The caller 32 * bootstrap into the page concerned. The caller
27 * has made sure it's suitably aligned. 33 * has made sure it's suitably aligned.
28 */ 34 */
29unsigned long __cpuinit setup_trampoline(void) 35unsigned long __trampinit setup_trampoline(void)
30{ 36{
31 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); 37 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE);
32 return virt_to_phys(trampoline_base); 38 return virt_to_phys(trampoline_base);
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
index 596d54c660a5..3af2dff58b21 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/kernel/trampoline_64.S
@@ -32,8 +32,12 @@
32#include <asm/segment.h> 32#include <asm/segment.h>
33#include <asm/processor-flags.h> 33#include <asm/processor-flags.h>
34 34
35#ifdef CONFIG_ACPI_SLEEP
36.section .rodata, "a", @progbits
37#else
35/* We can free up the trampoline after bootup if cpu hotplug is not supported. */ 38/* We can free up the trampoline after bootup if cpu hotplug is not supported. */
36__CPUINITRODATA 39__CPUINITRODATA
40#endif
37.code16 41.code16
38 42
39ENTRY(trampoline_data) 43ENTRY(trampoline_data)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 7e37dcee0cc3..33399176512a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -529,77 +529,56 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
529dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) 529dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
530{ 530{
531 struct task_struct *tsk = current; 531 struct task_struct *tsk = current;
532 unsigned long condition; 532 unsigned long dr6;
533 int si_code; 533 int si_code;
534 534
535 get_debugreg(condition, 6); 535 get_debugreg(dr6, 6);
536 536
537 /* Catch kmemcheck conditions first of all! */ 537 /* Catch kmemcheck conditions first of all! */
538 if (condition & DR_STEP && kmemcheck_trap(regs)) 538 if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
539 return; 539 return;
540 540
541 /* DR6 may or may not be cleared by the CPU */
542 set_debugreg(0, 6);
541 /* 543 /*
542 * The processor cleared BTF, so don't mark that we need it set. 544 * The processor cleared BTF, so don't mark that we need it set.
543 */ 545 */
544 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); 546 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
545 tsk->thread.debugctlmsr = 0; 547 tsk->thread.debugctlmsr = 0;
546 548
547 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, 549 /* Store the virtualized DR6 value */
548 SIGTRAP) == NOTIFY_STOP) 550 tsk->thread.debugreg6 = dr6;
551
552 if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code,
553 SIGTRAP) == NOTIFY_STOP)
549 return; 554 return;
550 555
551 /* It's safe to allow irq's after DR6 has been saved */ 556 /* It's safe to allow irq's after DR6 has been saved */
552 preempt_conditional_sti(regs); 557 preempt_conditional_sti(regs);
553 558
554 /* Mask out spurious debug traps due to lazy DR7 setting */ 559 if (regs->flags & X86_VM_MASK) {
555 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { 560 handle_vm86_trap((struct kernel_vm86_regs *) regs,
556 if (!tsk->thread.debugreg7) 561 error_code, 1);
557 goto clear_dr7; 562 return;
558 } 563 }
559 564
560#ifdef CONFIG_X86_32
561 if (regs->flags & X86_VM_MASK)
562 goto debug_vm86;
563#endif
564
565 /* Save debug status register where ptrace can see it */
566 tsk->thread.debugreg6 = condition;
567
568 /* 565 /*
569 * Single-stepping through TF: make sure we ignore any events in 566 * Single-stepping through system calls: ignore any exceptions in
570 * kernel space (but re-enable TF when returning to user mode). 567 * kernel space, but re-enable TF when returning to user mode.
568 *
569 * We already checked v86 mode above, so we can check for kernel mode
570 * by just checking the CPL of CS.
571 */ 571 */
572 if (condition & DR_STEP) { 572 if ((dr6 & DR_STEP) && !user_mode(regs)) {
573 if (!user_mode(regs)) 573 tsk->thread.debugreg6 &= ~DR_STEP;
574 goto clear_TF_reenable; 574 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
575 regs->flags &= ~X86_EFLAGS_TF;
575 } 576 }
576 577 si_code = get_si_code(tsk->thread.debugreg6);
577 si_code = get_si_code(condition); 578 if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS))
578 /* Ok, finally something we can handle */ 579 send_sigtrap(tsk, regs, error_code, si_code);
579 send_sigtrap(tsk, regs, error_code, si_code);
580
581 /*
582 * Disable additional traps. They'll be re-enabled when
583 * the signal is delivered.
584 */
585clear_dr7:
586 set_debugreg(0, 7);
587 preempt_conditional_cli(regs); 580 preempt_conditional_cli(regs);
588 return;
589 581
590#ifdef CONFIG_X86_32
591debug_vm86:
592 /* reenable preemption: handle_vm86_trap() might sleep */
593 dec_preempt_count();
594 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
595 conditional_cli(regs);
596 return;
597#endif
598
599clear_TF_reenable:
600 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
601 regs->flags &= ~X86_EFLAGS_TF;
602 preempt_conditional_cli(regs);
603 return; 582 return;
604} 583}
605 584
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index cd982f48e23e..597683aa5ba0 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -763,6 +763,7 @@ void mark_tsc_unstable(char *reason)
763{ 763{
764 if (!tsc_unstable) { 764 if (!tsc_unstable) {
765 tsc_unstable = 1; 765 tsc_unstable = 1;
766 sched_clock_stable = 0;
766 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); 767 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
767 /* Change only the rating, when not registered */ 768 /* Change only the rating, when not registered */
768 if (clocksource_tsc.mult) 769 if (clocksource_tsc.mult)
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index f37930954d15..0aa5fed8b9e6 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -33,7 +33,7 @@ static __cpuinitdata atomic_t stop_count;
33 * we want to have the fastest, inlined, non-debug version 33 * we want to have the fastest, inlined, non-debug version
34 * of a critical section, to be able to prove TSC time-warps: 34 * of a critical section, to be able to prove TSC time-warps:
35 */ 35 */
36static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; 36static __cpuinitdata arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED;
37 37
38static __cpuinitdata cycles_t last_tsc; 38static __cpuinitdata cycles_t last_tsc;
39static __cpuinitdata cycles_t max_warp; 39static __cpuinitdata cycles_t max_warp;
@@ -62,13 +62,13 @@ static __cpuinit void check_tsc_warp(void)
62 * previous TSC that was measured (possibly on 62 * previous TSC that was measured (possibly on
63 * another CPU) and update the previous TSC timestamp. 63 * another CPU) and update the previous TSC timestamp.
64 */ 64 */
65 __raw_spin_lock(&sync_lock); 65 arch_spin_lock(&sync_lock);
66 prev = last_tsc; 66 prev = last_tsc;
67 rdtsc_barrier(); 67 rdtsc_barrier();
68 now = get_cycles(); 68 now = get_cycles();
69 rdtsc_barrier(); 69 rdtsc_barrier();
70 last_tsc = now; 70 last_tsc = now;
71 __raw_spin_unlock(&sync_lock); 71 arch_spin_unlock(&sync_lock);
72 72
73 /* 73 /*
74 * Be nice every now and then (and also check whether 74 * Be nice every now and then (and also check whether
@@ -87,10 +87,10 @@ static __cpuinit void check_tsc_warp(void)
87 * we saw a time-warp of the TSC going backwards: 87 * we saw a time-warp of the TSC going backwards:
88 */ 88 */
89 if (unlikely(prev > now)) { 89 if (unlikely(prev > now)) {
90 __raw_spin_lock(&sync_lock); 90 arch_spin_lock(&sync_lock);
91 max_warp = max(max_warp, prev - now); 91 max_warp = max(max_warp, prev - now);
92 nr_warps++; 92 nr_warps++;
93 __raw_spin_unlock(&sync_lock); 93 arch_spin_unlock(&sync_lock);
94 } 94 }
95 } 95 }
96 WARN(!(now-start), 96 WARN(!(now-start),
@@ -114,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu)
114 return; 114 return;
115 115
116 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { 116 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
117 printk_once(KERN_INFO "Skipping synchronization checks as TSC is reliable.\n"); 117 if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING)
118 pr_info(
119 "Skipped synchronization checks as TSC is reliable.\n");
118 return; 120 return;
119 } 121 }
120 122
121 pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:",
122 smp_processor_id(), cpu);
123
124 /* 123 /*
125 * Reset it - in case this is a second bootup: 124 * Reset it - in case this is a second bootup:
126 */ 125 */
@@ -142,12 +141,14 @@ void __cpuinit check_tsc_sync_source(int cpu)
142 cpu_relax(); 141 cpu_relax();
143 142
144 if (nr_warps) { 143 if (nr_warps) {
145 printk("\n"); 144 pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n",
145 smp_processor_id(), cpu);
146 pr_warning("Measured %Ld cycles TSC warp between CPUs, " 146 pr_warning("Measured %Ld cycles TSC warp between CPUs, "
147 "turning off TSC clock.\n", max_warp); 147 "turning off TSC clock.\n", max_warp);
148 mark_tsc_unstable("check_tsc_sync_source failed"); 148 mark_tsc_unstable("check_tsc_sync_source failed");
149 } else { 149 } else {
150 printk(" passed.\n"); 150 pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n",
151 smp_processor_id(), cpu);
151 } 152 }
152 153
153 /* 154 /*
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
index aeef529917e4..ece73d8e3240 100644
--- a/arch/x86/kernel/uv_irq.c
+++ b/arch/x86/kernel/uv_irq.c
@@ -9,10 +9,25 @@
9 */ 9 */
10 10
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/rbtree.h>
12#include <linux/irq.h> 13#include <linux/irq.h>
13 14
14#include <asm/apic.h> 15#include <asm/apic.h>
15#include <asm/uv/uv_irq.h> 16#include <asm/uv/uv_irq.h>
17#include <asm/uv/uv_hub.h>
18
19/* MMR offset and pnode of hub sourcing interrupts for a given irq */
20struct uv_irq_2_mmr_pnode{
21 struct rb_node list;
22 unsigned long offset;
23 int pnode;
24 int irq;
25};
26
27static spinlock_t uv_irq_lock;
28static struct rb_root uv_irq_root;
29
30static int uv_set_irq_affinity(unsigned int, const struct cpumask *);
16 31
17static void uv_noop(unsigned int irq) 32static void uv_noop(unsigned int irq)
18{ 33{
@@ -39,25 +54,213 @@ struct irq_chip uv_irq_chip = {
39 .unmask = uv_noop, 54 .unmask = uv_noop,
40 .eoi = uv_ack_apic, 55 .eoi = uv_ack_apic,
41 .end = uv_noop, 56 .end = uv_noop,
57 .set_affinity = uv_set_irq_affinity,
42}; 58};
43 59
44/* 60/*
61 * Add offset and pnode information of the hub sourcing interrupts to the
62 * rb tree for a specific irq.
63 */
64static int uv_set_irq_2_mmr_info(int irq, unsigned long offset, unsigned blade)
65{
66 struct rb_node **link = &uv_irq_root.rb_node;
67 struct rb_node *parent = NULL;
68 struct uv_irq_2_mmr_pnode *n;
69 struct uv_irq_2_mmr_pnode *e;
70 unsigned long irqflags;
71
72 n = kmalloc_node(sizeof(struct uv_irq_2_mmr_pnode), GFP_KERNEL,
73 uv_blade_to_memory_nid(blade));
74 if (!n)
75 return -ENOMEM;
76
77 n->irq = irq;
78 n->offset = offset;
79 n->pnode = uv_blade_to_pnode(blade);
80 spin_lock_irqsave(&uv_irq_lock, irqflags);
81 /* Find the right place in the rbtree: */
82 while (*link) {
83 parent = *link;
84 e = rb_entry(parent, struct uv_irq_2_mmr_pnode, list);
85
86 if (unlikely(irq == e->irq)) {
87 /* irq entry exists */
88 e->pnode = uv_blade_to_pnode(blade);
89 e->offset = offset;
90 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
91 kfree(n);
92 return 0;
93 }
94
95 if (irq < e->irq)
96 link = &(*link)->rb_left;
97 else
98 link = &(*link)->rb_right;
99 }
100
101 /* Insert the node into the rbtree. */
102 rb_link_node(&n->list, parent, link);
103 rb_insert_color(&n->list, &uv_irq_root);
104
105 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
106 return 0;
107}
108
109/* Retrieve offset and pnode information from the rb tree for a specific irq */
110int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode)
111{
112 struct uv_irq_2_mmr_pnode *e;
113 struct rb_node *n;
114 unsigned long irqflags;
115
116 spin_lock_irqsave(&uv_irq_lock, irqflags);
117 n = uv_irq_root.rb_node;
118 while (n) {
119 e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
120
121 if (e->irq == irq) {
122 *offset = e->offset;
123 *pnode = e->pnode;
124 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
125 return 0;
126 }
127
128 if (irq < e->irq)
129 n = n->rb_left;
130 else
131 n = n->rb_right;
132 }
133 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
134 return -1;
135}
136
137/*
138 * Re-target the irq to the specified CPU and enable the specified MMR located
139 * on the specified blade to allow the sending of MSIs to the specified CPU.
140 */
141static int
142arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
143 unsigned long mmr_offset, int restrict)
144{
145 const struct cpumask *eligible_cpu = cpumask_of(cpu);
146 struct irq_desc *desc = irq_to_desc(irq);
147 struct irq_cfg *cfg;
148 int mmr_pnode;
149 unsigned long mmr_value;
150 struct uv_IO_APIC_route_entry *entry;
151 int err;
152
153 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
154 sizeof(unsigned long));
155
156 cfg = irq_cfg(irq);
157
158 err = assign_irq_vector(irq, cfg, eligible_cpu);
159 if (err != 0)
160 return err;
161
162 if (restrict == UV_AFFINITY_CPU)
163 desc->status |= IRQ_NO_BALANCING;
164 else
165 desc->status |= IRQ_MOVE_PCNTXT;
166
167 set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
168 irq_name);
169
170 mmr_value = 0;
171 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
172 entry->vector = cfg->vector;
173 entry->delivery_mode = apic->irq_delivery_mode;
174 entry->dest_mode = apic->irq_dest_mode;
175 entry->polarity = 0;
176 entry->trigger = 0;
177 entry->mask = 0;
178 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
179
180 mmr_pnode = uv_blade_to_pnode(mmr_blade);
181 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
182
183 if (cfg->move_in_progress)
184 send_cleanup_vector(cfg);
185
186 return irq;
187}
188
189/*
190 * Disable the specified MMR located on the specified blade so that MSIs are
191 * longer allowed to be sent.
192 */
193static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset)
194{
195 unsigned long mmr_value;
196 struct uv_IO_APIC_route_entry *entry;
197
198 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
199 sizeof(unsigned long));
200
201 mmr_value = 0;
202 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
203 entry->mask = 1;
204
205 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
206}
207
208static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
209{
210 struct irq_desc *desc = irq_to_desc(irq);
211 struct irq_cfg *cfg = desc->chip_data;
212 unsigned int dest;
213 unsigned long mmr_value;
214 struct uv_IO_APIC_route_entry *entry;
215 unsigned long mmr_offset;
216 unsigned mmr_pnode;
217
218 if (set_desc_affinity(desc, mask, &dest))
219 return -1;
220
221 mmr_value = 0;
222 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
223
224 entry->vector = cfg->vector;
225 entry->delivery_mode = apic->irq_delivery_mode;
226 entry->dest_mode = apic->irq_dest_mode;
227 entry->polarity = 0;
228 entry->trigger = 0;
229 entry->mask = 0;
230 entry->dest = dest;
231
232 /* Get previously stored MMR and pnode of hub sourcing interrupts */
233 if (uv_irq_2_mmr_info(irq, &mmr_offset, &mmr_pnode))
234 return -1;
235
236 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
237
238 if (cfg->move_in_progress)
239 send_cleanup_vector(cfg);
240
241 return 0;
242}
243
244/*
45 * Set up a mapping of an available irq and vector, and enable the specified 245 * Set up a mapping of an available irq and vector, and enable the specified
46 * MMR that defines the MSI that is to be sent to the specified CPU when an 246 * MMR that defines the MSI that is to be sent to the specified CPU when an
47 * interrupt is raised. 247 * interrupt is raised.
48 */ 248 */
49int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, 249int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
50 unsigned long mmr_offset) 250 unsigned long mmr_offset, int restrict)
51{ 251{
52 int irq; 252 int irq, ret;
53 int ret; 253
254 irq = create_irq_nr(NR_IRQS_LEGACY, uv_blade_to_memory_nid(mmr_blade));
54 255
55 irq = create_irq();
56 if (irq <= 0) 256 if (irq <= 0)
57 return -EBUSY; 257 return -EBUSY;
58 258
59 ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset); 259 ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset,
60 if (ret != irq) 260 restrict);
261 if (ret == irq)
262 uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade);
263 else
61 destroy_irq(irq); 264 destroy_irq(irq);
62 265
63 return ret; 266 return ret;
@@ -71,9 +274,28 @@ EXPORT_SYMBOL_GPL(uv_setup_irq);
71 * 274 *
72 * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq(). 275 * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq().
73 */ 276 */
74void uv_teardown_irq(unsigned int irq, int mmr_blade, unsigned long mmr_offset) 277void uv_teardown_irq(unsigned int irq)
75{ 278{
76 arch_disable_uv_irq(mmr_blade, mmr_offset); 279 struct uv_irq_2_mmr_pnode *e;
280 struct rb_node *n;
281 unsigned long irqflags;
282
283 spin_lock_irqsave(&uv_irq_lock, irqflags);
284 n = uv_irq_root.rb_node;
285 while (n) {
286 e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
287 if (e->irq == irq) {
288 arch_disable_uv_irq(e->pnode, e->offset);
289 rb_erase(n, &uv_irq_root);
290 kfree(e);
291 break;
292 }
293 if (irq < e->irq)
294 n = n->rb_left;
295 else
296 n = n->rb_right;
297 }
298 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
77 destroy_irq(irq); 299 destroy_irq(irq);
78} 300}
79EXPORT_SYMBOL_GPL(uv_teardown_irq); 301EXPORT_SYMBOL_GPL(uv_teardown_irq);
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
index 583f11d5c480..3c84aa001c11 100644
--- a/arch/x86/kernel/uv_time.c
+++ b/arch/x86/kernel/uv_time.c
@@ -74,7 +74,7 @@ struct uv_rtc_timer_head {
74 */ 74 */
75static struct uv_rtc_timer_head **blade_info __read_mostly; 75static struct uv_rtc_timer_head **blade_info __read_mostly;
76 76
77static int uv_rtc_enable; 77static int uv_rtc_evt_enable;
78 78
79/* 79/*
80 * Hardware interface routines 80 * Hardware interface routines
@@ -90,7 +90,7 @@ static void uv_rtc_send_IPI(int cpu)
90 pnode = uv_apicid_to_pnode(apicid); 90 pnode = uv_apicid_to_pnode(apicid);
91 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 91 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
92 (apicid << UVH_IPI_INT_APIC_ID_SHFT) | 92 (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
93 (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT); 93 (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
94 94
95 uv_write_global_mmr64(pnode, UVH_IPI_INT, val); 95 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
96} 96}
@@ -115,7 +115,7 @@ static int uv_setup_intr(int cpu, u64 expires)
115 uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, 115 uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
116 UVH_EVENT_OCCURRED0_RTC1_MASK); 116 UVH_EVENT_OCCURRED0_RTC1_MASK);
117 117
118 val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | 118 val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
119 ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); 119 ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
120 120
121 /* Set configuration */ 121 /* Set configuration */
@@ -123,7 +123,10 @@ static int uv_setup_intr(int cpu, u64 expires)
123 /* Initialize comparator value */ 123 /* Initialize comparator value */
124 uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires); 124 uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires);
125 125
126 return (expires < uv_read_rtc(NULL) && !uv_intr_pending(pnode)); 126 if (uv_read_rtc(NULL) <= expires)
127 return 0;
128
129 return !uv_intr_pending(pnode);
127} 130}
128 131
129/* 132/*
@@ -223,6 +226,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
223 226
224 next_cpu = head->next_cpu; 227 next_cpu = head->next_cpu;
225 *t = expires; 228 *t = expires;
229
226 /* Will this one be next to go off? */ 230 /* Will this one be next to go off? */
227 if (next_cpu < 0 || bcpu == next_cpu || 231 if (next_cpu < 0 || bcpu == next_cpu ||
228 expires < head->cpu[next_cpu].expires) { 232 expires < head->cpu[next_cpu].expires) {
@@ -231,7 +235,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
231 *t = ULLONG_MAX; 235 *t = ULLONG_MAX;
232 uv_rtc_find_next_timer(head, pnode); 236 uv_rtc_find_next_timer(head, pnode);
233 spin_unlock_irqrestore(&head->lock, flags); 237 spin_unlock_irqrestore(&head->lock, flags);
234 return 1; 238 return -ETIME;
235 } 239 }
236 } 240 }
237 241
@@ -244,7 +248,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
244 * 248 *
245 * Returns 1 if this timer was pending. 249 * Returns 1 if this timer was pending.
246 */ 250 */
247static int uv_rtc_unset_timer(int cpu) 251static int uv_rtc_unset_timer(int cpu, int force)
248{ 252{
249 int pnode = uv_cpu_to_pnode(cpu); 253 int pnode = uv_cpu_to_pnode(cpu);
250 int bid = uv_cpu_to_blade_id(cpu); 254 int bid = uv_cpu_to_blade_id(cpu);
@@ -256,14 +260,15 @@ static int uv_rtc_unset_timer(int cpu)
256 260
257 spin_lock_irqsave(&head->lock, flags); 261 spin_lock_irqsave(&head->lock, flags);
258 262
259 if (head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) 263 if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
260 rc = 1; 264 rc = 1;
261 265
262 *t = ULLONG_MAX; 266 if (rc) {
263 267 *t = ULLONG_MAX;
264 /* Was the hardware setup for this timer? */ 268 /* Was the hardware setup for this timer? */
265 if (head->next_cpu == bcpu) 269 if (head->next_cpu == bcpu)
266 uv_rtc_find_next_timer(head, pnode); 270 uv_rtc_find_next_timer(head, pnode);
271 }
267 272
268 spin_unlock_irqrestore(&head->lock, flags); 273 spin_unlock_irqrestore(&head->lock, flags);
269 274
@@ -310,32 +315,32 @@ static void uv_rtc_timer_setup(enum clock_event_mode mode,
310 break; 315 break;
311 case CLOCK_EVT_MODE_UNUSED: 316 case CLOCK_EVT_MODE_UNUSED:
312 case CLOCK_EVT_MODE_SHUTDOWN: 317 case CLOCK_EVT_MODE_SHUTDOWN:
313 uv_rtc_unset_timer(ced_cpu); 318 uv_rtc_unset_timer(ced_cpu, 1);
314 break; 319 break;
315 } 320 }
316} 321}
317 322
318static void uv_rtc_interrupt(void) 323static void uv_rtc_interrupt(void)
319{ 324{
320 struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
321 int cpu = smp_processor_id(); 325 int cpu = smp_processor_id();
326 struct clock_event_device *ced = &per_cpu(cpu_ced, cpu);
322 327
323 if (!ced || !ced->event_handler) 328 if (!ced || !ced->event_handler)
324 return; 329 return;
325 330
326 if (uv_rtc_unset_timer(cpu) != 1) 331 if (uv_rtc_unset_timer(cpu, 0) != 1)
327 return; 332 return;
328 333
329 ced->event_handler(ced); 334 ced->event_handler(ced);
330} 335}
331 336
332static int __init uv_enable_rtc(char *str) 337static int __init uv_enable_evt_rtc(char *str)
333{ 338{
334 uv_rtc_enable = 1; 339 uv_rtc_evt_enable = 1;
335 340
336 return 1; 341 return 1;
337} 342}
338__setup("uvrtc", uv_enable_rtc); 343__setup("uvrtcevt", uv_enable_evt_rtc);
339 344
340static __init void uv_rtc_register_clockevents(struct work_struct *dummy) 345static __init void uv_rtc_register_clockevents(struct work_struct *dummy)
341{ 346{
@@ -350,27 +355,32 @@ static __init int uv_rtc_setup_clock(void)
350{ 355{
351 int rc; 356 int rc;
352 357
353 if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension) 358 if (!is_uv_system())
354 return -ENODEV; 359 return -ENODEV;
355 360
356 generic_interrupt_extension = uv_rtc_interrupt;
357
358 clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, 361 clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
359 clocksource_uv.shift); 362 clocksource_uv.shift);
360 363
364 /* If single blade, prefer tsc */
365 if (uv_num_possible_blades() == 1)
366 clocksource_uv.rating = 250;
367
361 rc = clocksource_register(&clocksource_uv); 368 rc = clocksource_register(&clocksource_uv);
362 if (rc) { 369 if (rc)
363 generic_interrupt_extension = NULL; 370 printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc);
371 else
372 printk(KERN_INFO "UV RTC clocksource registered freq %lu MHz\n",
373 sn_rtc_cycles_per_second/(unsigned long)1E6);
374
375 if (rc || !uv_rtc_evt_enable || x86_platform_ipi_callback)
364 return rc; 376 return rc;
365 }
366 377
367 /* Setup and register clockevents */ 378 /* Setup and register clockevents */
368 rc = uv_rtc_allocate_timers(); 379 rc = uv_rtc_allocate_timers();
369 if (rc) { 380 if (rc)
370 clocksource_unregister(&clocksource_uv); 381 goto error;
371 generic_interrupt_extension = NULL; 382
372 return rc; 383 x86_platform_ipi_callback = uv_rtc_interrupt;
373 }
374 384
375 clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second, 385 clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second,
376 NSEC_PER_SEC, clock_event_device_uv.shift); 386 NSEC_PER_SEC, clock_event_device_uv.shift);
@@ -383,11 +393,19 @@ static __init int uv_rtc_setup_clock(void)
383 393
384 rc = schedule_on_each_cpu(uv_rtc_register_clockevents); 394 rc = schedule_on_each_cpu(uv_rtc_register_clockevents);
385 if (rc) { 395 if (rc) {
386 clocksource_unregister(&clocksource_uv); 396 x86_platform_ipi_callback = NULL;
387 generic_interrupt_extension = NULL;
388 uv_rtc_deallocate_timers(); 397 uv_rtc_deallocate_timers();
398 goto error;
389 } 399 }
390 400
401 printk(KERN_INFO "UV RTC clockevents registered\n");
402
403 return 0;
404
405error:
406 clocksource_unregister(&clocksource_uv);
407 printk(KERN_INFO "UV RTC clockevents failed rc %d\n", rc);
408
391 return rc; 409 return rc;
392} 410}
393arch_initcall(uv_rtc_setup_clock); 411arch_initcall(uv_rtc_setup_clock);
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index f068553a1b17..34a279a7471d 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -183,7 +183,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
183 return; 183 return;
184 } 184 }
185 185
186 apic_cpus = apic->apicid_to_cpu_present(m->apicid); 186 apic->apicid_to_cpu_present(m->apicid, &apic_cpus);
187 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus); 187 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
188 /* 188 /*
189 * Validate version 189 * Validate version
@@ -197,7 +197,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
197 apic_version[m->apicid] = ver; 197 apic_version[m->apicid] = ver;
198} 198}
199 199
200static void __init visws_find_smp_config(unsigned int reserve) 200static void __init visws_find_smp_config(void)
201{ 201{
202 struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS); 202 struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS);
203 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); 203 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
@@ -486,7 +486,7 @@ static void end_cobalt_irq(unsigned int irq)
486} 486}
487 487
488static struct irq_chip cobalt_irq_type = { 488static struct irq_chip cobalt_irq_type = {
489 .typename = "Cobalt-APIC", 489 .name = "Cobalt-APIC",
490 .startup = startup_cobalt_irq, 490 .startup = startup_cobalt_irq,
491 .shutdown = disable_cobalt_irq, 491 .shutdown = disable_cobalt_irq,
492 .enable = enable_cobalt_irq, 492 .enable = enable_cobalt_irq,
@@ -523,7 +523,7 @@ static void end_piix4_master_irq(unsigned int irq)
523} 523}
524 524
525static struct irq_chip piix4_master_irq_type = { 525static struct irq_chip piix4_master_irq_type = {
526 .typename = "PIIX4-master", 526 .name = "PIIX4-master",
527 .startup = startup_piix4_master_irq, 527 .startup = startup_piix4_master_irq,
528 .ack = ack_cobalt_irq, 528 .ack = ack_cobalt_irq,
529 .end = end_piix4_master_irq, 529 .end = end_piix4_master_irq,
@@ -531,7 +531,7 @@ static struct irq_chip piix4_master_irq_type = {
531 531
532 532
533static struct irq_chip piix4_virtual_irq_type = { 533static struct irq_chip piix4_virtual_irq_type = {
534 .typename = "PIIX4-virtual", 534 .name = "PIIX4-virtual",
535 .shutdown = disable_8259A_irq, 535 .shutdown = disable_8259A_irq,
536 .enable = enable_8259A_irq, 536 .enable = enable_8259A_irq,
537 .disable = disable_8259A_irq, 537 .disable = disable_8259A_irq,
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 9c4e62539058..5ffb5622f793 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -197,9 +197,8 @@ out:
197static int do_vm86_irq_handling(int subfunction, int irqnumber); 197static int do_vm86_irq_handling(int subfunction, int irqnumber);
198static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk); 198static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
199 199
200int sys_vm86old(struct pt_regs *regs) 200int sys_vm86old(struct vm86_struct __user *v86, struct pt_regs *regs)
201{ 201{
202 struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs->bx;
203 struct kernel_vm86_struct info; /* declare this _on top_, 202 struct kernel_vm86_struct info; /* declare this _on top_,
204 * this avoids wasting of stack space. 203 * this avoids wasting of stack space.
205 * This remains on the stack until we 204 * This remains on the stack until we
@@ -227,7 +226,7 @@ out:
227} 226}
228 227
229 228
230int sys_vm86(struct pt_regs *regs) 229int sys_vm86(unsigned long cmd, unsigned long arg, struct pt_regs *regs)
231{ 230{
232 struct kernel_vm86_struct info; /* declare this _on top_, 231 struct kernel_vm86_struct info; /* declare this _on top_,
233 * this avoids wasting of stack space. 232 * this avoids wasting of stack space.
@@ -239,12 +238,12 @@ int sys_vm86(struct pt_regs *regs)
239 struct vm86plus_struct __user *v86; 238 struct vm86plus_struct __user *v86;
240 239
241 tsk = current; 240 tsk = current;
242 switch (regs->bx) { 241 switch (cmd) {
243 case VM86_REQUEST_IRQ: 242 case VM86_REQUEST_IRQ:
244 case VM86_FREE_IRQ: 243 case VM86_FREE_IRQ:
245 case VM86_GET_IRQ_BITS: 244 case VM86_GET_IRQ_BITS:
246 case VM86_GET_AND_RESET_IRQ: 245 case VM86_GET_AND_RESET_IRQ:
247 ret = do_vm86_irq_handling(regs->bx, (int)regs->cx); 246 ret = do_vm86_irq_handling(cmd, (int)arg);
248 goto out; 247 goto out;
249 case VM86_PLUS_INSTALL_CHECK: 248 case VM86_PLUS_INSTALL_CHECK:
250 /* 249 /*
@@ -261,7 +260,7 @@ int sys_vm86(struct pt_regs *regs)
261 ret = -EPERM; 260 ret = -EPERM;
262 if (tsk->thread.saved_sp0) 261 if (tsk->thread.saved_sp0)
263 goto out; 262 goto out;
264 v86 = (struct vm86plus_struct __user *)regs->cx; 263 v86 = (struct vm86plus_struct __user *)arg;
265 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, 264 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
266 offsetof(struct kernel_vm86_struct, regs32) - 265 offsetof(struct kernel_vm86_struct, regs32) -
267 sizeof(info.regs)); 266 sizeof(info.regs));
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 31e6f6cfe53e..d430e4c30193 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -648,7 +648,7 @@ static inline int __init activate_vmi(void)
648 648
649 pv_info.paravirt_enabled = 1; 649 pv_info.paravirt_enabled = 1;
650 pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK; 650 pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
651 pv_info.name = "vmi"; 651 pv_info.name = "vmi [deprecated]";
652 652
653 pv_init_ops.patch = vmi_patch; 653 pv_init_ops.patch = vmi_patch;
654 654
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 611b9e2360d3..74c92bb194df 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -226,7 +226,7 @@ static void __devinit vmi_time_init_clockevent(void)
226 evt->min_delta_ns = clockevent_delta2ns(1, evt); 226 evt->min_delta_ns = clockevent_delta2ns(1, evt);
227 evt->cpumask = cpumask_of(cpu); 227 evt->cpumask = cpumask_of(cpu);
228 228
229 printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n", 229 printk(KERN_WARNING "vmi: registering clock event %s. mult=%u shift=%u\n",
230 evt->name, evt->mult, evt->shift); 230 evt->name, evt->mult, evt->shift);
231 clockevents_register_device(evt); 231 clockevents_register_device(evt);
232} 232}
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index ecb92717c412..44879df55696 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -41,6 +41,32 @@ ENTRY(phys_startup_64)
41jiffies_64 = jiffies; 41jiffies_64 = jiffies;
42#endif 42#endif
43 43
44#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
45/*
46 * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA
47 * we retain large page mappings for boundaries spanning kernel text, rodata
48 * and data sections.
49 *
50 * However, kernel identity mappings will have different RWX permissions
51 * to the pages mapping to text and to the pages padding (which are freed) the
52 * text section. Hence kernel identity mappings will be broken to smaller
53 * pages. For 64-bit, kernel text and kernel identity mappings are different,
54 * so we can enable protection checks that come with CONFIG_DEBUG_RODATA,
55 * as well as retain 2MB large page mappings for kernel text.
56 */
57#define X64_ALIGN_DEBUG_RODATA_BEGIN . = ALIGN(HPAGE_SIZE);
58
59#define X64_ALIGN_DEBUG_RODATA_END \
60 . = ALIGN(HPAGE_SIZE); \
61 __end_rodata_hpage_align = .;
62
63#else
64
65#define X64_ALIGN_DEBUG_RODATA_BEGIN
66#define X64_ALIGN_DEBUG_RODATA_END
67
68#endif
69
44PHDRS { 70PHDRS {
45 text PT_LOAD FLAGS(5); /* R_E */ 71 text PT_LOAD FLAGS(5); /* R_E */
46 data PT_LOAD FLAGS(7); /* RWE */ 72 data PT_LOAD FLAGS(7); /* RWE */
@@ -90,7 +116,9 @@ SECTIONS
90 116
91 EXCEPTION_TABLE(16) :text = 0x9090 117 EXCEPTION_TABLE(16) :text = 0x9090
92 118
119 X64_ALIGN_DEBUG_RODATA_BEGIN
93 RO_DATA(PAGE_SIZE) 120 RO_DATA(PAGE_SIZE)
121 X64_ALIGN_DEBUG_RODATA_END
94 122
95 /* Data */ 123 /* Data */
96 .data : AT(ADDR(.data) - LOAD_OFFSET) { 124 .data : AT(ADDR(.data) - LOAD_OFFSET) {
@@ -107,13 +135,13 @@ SECTIONS
107 135
108 PAGE_ALIGNED_DATA(PAGE_SIZE) 136 PAGE_ALIGNED_DATA(PAGE_SIZE)
109 137
110 CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES) 138 CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES)
111 139
112 DATA_DATA 140 DATA_DATA
113 CONSTRUCTORS 141 CONSTRUCTORS
114 142
115 /* rarely changed data like cpu maps */ 143 /* rarely changed data like cpu maps */
116 READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES) 144 READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES)
117 145
118 /* End of data section */ 146 /* End of data section */
119 _edata = .; 147 _edata = .;
@@ -137,12 +165,12 @@ SECTIONS
137 *(.vsyscall_0) 165 *(.vsyscall_0)
138 } :user 166 } :user
139 167
140 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 168 . = ALIGN(L1_CACHE_BYTES);
141 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { 169 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
142 *(.vsyscall_fn) 170 *(.vsyscall_fn)
143 } 171 }
144 172
145 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 173 . = ALIGN(L1_CACHE_BYTES);
146 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { 174 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
147 *(.vsyscall_gtod_data) 175 *(.vsyscall_gtod_data)
148 } 176 }
@@ -166,7 +194,7 @@ SECTIONS
166 } 194 }
167 vgetcpu_mode = VVIRT(.vgetcpu_mode); 195 vgetcpu_mode = VVIRT(.vgetcpu_mode);
168 196
169 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 197 . = ALIGN(L1_CACHE_BYTES);
170 .jiffies : AT(VLOAD(.jiffies)) { 198 .jiffies : AT(VLOAD(.jiffies)) {
171 *(.jiffies) 199 *(.jiffies)
172 } 200 }
@@ -291,9 +319,7 @@ SECTIONS
291 __brk_limit = .; 319 __brk_limit = .;
292 } 320 }
293 321
294 .end : AT(ADDR(.end) - LOAD_OFFSET) { 322 _end = .;
295 _end = .;
296 }
297 323
298 STABS_DEBUG 324 STABS_DEBUG
299 DWARF_DEBUG 325 DWARF_DEBUG
@@ -305,6 +331,9 @@ SECTIONS
305 331
306 332
307#ifdef CONFIG_X86_32 333#ifdef CONFIG_X86_32
334/*
335 * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
336 */
308. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), 337. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
309 "kernel image bigger than KERNEL_IMAGE_SIZE"); 338 "kernel image bigger than KERNEL_IMAGE_SIZE");
310#else 339#else
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 8cb4974ff599..9055e5872ff0 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -73,7 +73,8 @@ void update_vsyscall_tz(void)
73 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); 73 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
74} 74}
75 75
76void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) 76void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
77 u32 mult)
77{ 78{
78 unsigned long flags; 79 unsigned long flags;
79 80
@@ -82,7 +83,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
82 vsyscall_gtod_data.clock.vread = clock->vread; 83 vsyscall_gtod_data.clock.vread = clock->vread;
83 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; 84 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
84 vsyscall_gtod_data.clock.mask = clock->mask; 85 vsyscall_gtod_data.clock.mask = clock->mask;
85 vsyscall_gtod_data.clock.mult = clock->mult; 86 vsyscall_gtod_data.clock.mult = mult;
86 vsyscall_gtod_data.clock.shift = clock->shift; 87 vsyscall_gtod_data.clock.shift = clock->shift;
87 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; 88 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
88 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; 89 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
@@ -237,7 +238,7 @@ static ctl_table kernel_table2[] = {
237}; 238};
238 239
239static ctl_table kernel_root_table2[] = { 240static ctl_table kernel_root_table2[] = {
240 { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, 241 { .procname = "kernel", .mode = 0555,
241 .child = kernel_table2 }, 242 .child = kernel_table2 },
242 {} 243 {}
243}; 244};
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 3909e3ba5ce3..619f7f88b8cc 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -17,8 +17,6 @@
17EXPORT_SYMBOL(mcount); 17EXPORT_SYMBOL(mcount);
18#endif 18#endif
19 19
20EXPORT_SYMBOL(kernel_thread);
21
22EXPORT_SYMBOL(__get_user_1); 20EXPORT_SYMBOL(__get_user_1);
23EXPORT_SYMBOL(__get_user_2); 21EXPORT_SYMBOL(__get_user_2);
24EXPORT_SYMBOL(__get_user_4); 22EXPORT_SYMBOL(__get_user_4);
@@ -30,9 +28,8 @@ EXPORT_SYMBOL(__put_user_8);
30 28
31EXPORT_SYMBOL(copy_user_generic); 29EXPORT_SYMBOL(copy_user_generic);
32EXPORT_SYMBOL(__copy_user_nocache); 30EXPORT_SYMBOL(__copy_user_nocache);
33EXPORT_SYMBOL(copy_from_user); 31EXPORT_SYMBOL(_copy_from_user);
34EXPORT_SYMBOL(copy_to_user); 32EXPORT_SYMBOL(_copy_to_user);
35EXPORT_SYMBOL(__copy_from_user_inatomic);
36 33
37EXPORT_SYMBOL(copy_page); 34EXPORT_SYMBOL(copy_page);
38EXPORT_SYMBOL(clear_page); 35EXPORT_SYMBOL(clear_page);
@@ -57,4 +54,6 @@ EXPORT_SYMBOL(__memcpy);
57 54
58EXPORT_SYMBOL(empty_zero_page); 55EXPORT_SYMBOL(empty_zero_page);
59EXPORT_SYMBOL(init_level4_pgt); 56EXPORT_SYMBOL(init_level4_pgt);
60EXPORT_SYMBOL(load_gs_index); 57#ifndef CONFIG_PARAVIRT
58EXPORT_SYMBOL(native_load_gs_index);
59#endif
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 4449a4a2c2ed..ccd179dec36e 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -13,11 +13,15 @@
13#include <asm/e820.h> 13#include <asm/e820.h>
14#include <asm/time.h> 14#include <asm/time.h>
15#include <asm/irq.h> 15#include <asm/irq.h>
16#include <asm/pat.h>
16#include <asm/tsc.h> 17#include <asm/tsc.h>
18#include <asm/iommu.h>
17 19
18void __cpuinit x86_init_noop(void) { } 20void __cpuinit x86_init_noop(void) { }
19void __init x86_init_uint_noop(unsigned int unused) { } 21void __init x86_init_uint_noop(unsigned int unused) { }
20void __init x86_init_pgd_noop(pgd_t *unused) { } 22void __init x86_init_pgd_noop(pgd_t *unused) { }
23int __init iommu_init_noop(void) { return 0; }
24void iommu_shutdown_noop(void) { }
21 25
22/* 26/*
23 * The platform setup functions are preset with the default functions 27 * The platform setup functions are preset with the default functions
@@ -62,6 +66,10 @@ struct x86_init_ops x86_init __initdata = {
62 .tsc_pre_init = x86_init_noop, 66 .tsc_pre_init = x86_init_noop,
63 .timer_init = hpet_time_init, 67 .timer_init = hpet_time_init,
64 }, 68 },
69
70 .iommu = {
71 .iommu_init = iommu_init_noop,
72 },
65}; 73};
66 74
67struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { 75struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
@@ -72,4 +80,6 @@ struct x86_platform_ops x86_platform = {
72 .calibrate_tsc = native_calibrate_tsc, 80 .calibrate_tsc = native_calibrate_tsc,
73 .get_wallclock = mach_get_cmos_time, 81 .get_wallclock = mach_get_cmos_time,
74 .set_wallclock = mach_set_rtc_mmss, 82 .set_wallclock = mach_set_rtc_mmss,
83 .iommu_shutdown = iommu_shutdown_noop,
84 .is_untracked_pat_range = is_ISA_range,
75}; 85};