author     Dmitry Torokhov <dmitry.torokhov@gmail.com>  2012-03-19 20:02:01 -0400
committer  Dmitry Torokhov <dmitry.torokhov@gmail.com>  2012-03-19 20:02:01 -0400
commit     10ce3cc919f50c2043b41ca968b43c26a3672600 (patch)
tree       ea409366a5208aced495bc0516a08b81fd43222e /arch/x86/kernel
parent     24e3e5ae1e4c2a3a32f5b1f96b4e3fd721806acd (diff)
parent     5c6a7a62c130afef3d61c1dee153012231ff5cd9 (diff)
Merge branch 'next' into for-linus
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile | 4
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 10
-rw-r--r--  arch/x86/kernel/amd_nb.c | 39
-rw-r--r--  arch/x86/kernel/aperture_64.c | 4
-rw-r--r--  arch/x86/kernel/apic/Makefile | 1
-rw-r--r--  arch/x86/kernel/apic/apic.c | 113
-rw-r--r--  arch/x86/kernel/apic/apic_flat_64.c | 9
-rw-r--r--  arch/x86/kernel/apic/apic_numachip.c | 294
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 6
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 9
-rw-r--r--  arch/x86/kernel/apm_32.c | 16
-rw-r--r--  arch/x86/kernel/asm-offsets.c | 2
-rw-r--r--  arch/x86/kernel/asm-offsets_32.c | 8
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c | 19
-rw-r--r--  arch/x86/kernel/check.c | 34
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 17
-rw-r--r--  arch/x86/kernel/cpu/centaur.c | 2
-rw-r--r--  arch/x86/kernel/cpu/common.c | 43
-rw-r--r--  arch/x86/kernel/cpu/cpu.h | 5
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 2
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 69
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-inject.c | 34
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-internal.h | 4
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 204
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c | 24
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c | 94
-rw-r--r--  arch/x86/kernel/cpu/mcheck/threshold.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 2
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 278
-rw-r--r--  arch/x86/kernel/cpu/perf_event.h | 59
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd.c | 39
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd_ibs.c | 29
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 94
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_ds.c | 7
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_lbr.c | 2
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p4.c | 2
-rw-r--r--  arch/x86/kernel/cpu/powerflags.c | 3
-rw-r--r--  arch/x86/kernel/cpu/proc.c | 4
-rw-r--r--  arch/x86/kernel/cpuid.c | 2
-rw-r--r--  arch/x86/kernel/dumpstack.c | 3
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 8
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 16
-rw-r--r--  arch/x86/kernel/e820.c | 121
-rw-r--r--  arch/x86/kernel/early_printk.c | 4
-rw-r--r--  arch/x86/kernel/entry_32.S | 51
-rw-r--r--  arch/x86/kernel/entry_64.S | 270
-rw-r--r--  arch/x86/kernel/head.c | 2
-rw-r--r--  arch/x86/kernel/head32.c | 7
-rw-r--r--  arch/x86/kernel/head64.c | 7
-rw-r--r--  arch/x86/kernel/head_64.S | 4
-rw-r--r--  arch/x86/kernel/hpet.c | 29
-rw-r--r--  arch/x86/kernel/irq.c | 11
-rw-r--r--  arch/x86/kernel/irq_32.c | 5
-rw-r--r--  arch/x86/kernel/irq_64.c | 38
-rw-r--r--  arch/x86/kernel/irqinit.c | 2
-rw-r--r--  arch/x86/kernel/jump_label.c | 2
-rw-r--r--  arch/x86/kernel/kvm.c | 181
-rw-r--r--  arch/x86/kernel/microcode_amd.c | 234
-rw-r--r--  arch/x86/kernel/microcode_core.c | 91
-rw-r--r--  arch/x86/kernel/mpparse.c | 14
-rw-r--r--  arch/x86/kernel/msr.c | 2
-rw-r--r--  arch/x86/kernel/nmi.c | 102
-rw-r--r--  arch/x86/kernel/nmi_selftest.c | 180
-rw-r--r--  arch/x86/kernel/pci-dma.c | 11
-rw-r--r--  arch/x86/kernel/process.c | 10
-rw-r--r--  arch/x86/kernel/process_32.c | 32
-rw-r--r--  arch/x86/kernel/process_64.c | 45
-rw-r--r--  arch/x86/kernel/ptrace.c | 28
-rw-r--r--  arch/x86/kernel/quirks.c | 13
-rw-r--r--  arch/x86/kernel/reboot.c | 57
-rw-r--r--  arch/x86/kernel/rtc.c | 5
-rw-r--r--  arch/x86/kernel/setup.c | 28
-rw-r--r--  arch/x86/kernel/signal.c | 6
-rw-r--r--  arch/x86/kernel/smp.c | 72
-rw-r--r--  arch/x86/kernel/smpboot.c | 20
-rw-r--r--  arch/x86/kernel/syscall_32.c | 25
-rw-r--r--  arch/x86/kernel/syscall_64.c | 20
-rw-r--r--  arch/x86/kernel/syscall_table_32.S | 350
-rw-r--r--  arch/x86/kernel/trampoline.c | 4
-rw-r--r--  arch/x86/kernel/traps.c | 70
-rw-r--r--  arch/x86/kernel/tsc.c | 40
-rw-r--r--  arch/x86/kernel/tsc_sync.c | 4
-rw-r--r--  arch/x86/kernel/vm86_32.c | 6
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 77
-rw-r--r--  arch/x86/kernel/x86_init.c | 2
-rw-r--r--  arch/x86/kernel/xsave.c | 12
86 files changed, 2462 insertions, 1448 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8baca3c4871c..5369059c07a9 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -25,7 +25,8 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o
25obj-y += probe_roms.o 25obj-y += probe_roms.o
26obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o 26obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
27obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o 27obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
28obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o 28obj-y += syscall_$(BITS).o
29obj-$(CONFIG_X86_64) += vsyscall_64.o
29obj-$(CONFIG_X86_64) += vsyscall_emu_64.o 30obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
30obj-y += bootflag.o e820.o 31obj-y += bootflag.o e820.o
31obj-y += pci-dma.o quirks.o topology.o kdebugfs.o 32obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
@@ -80,6 +81,7 @@ obj-$(CONFIG_APB_TIMER) += apb_timer.o
80obj-$(CONFIG_AMD_NB) += amd_nb.o 81obj-$(CONFIG_AMD_NB) += amd_nb.o
81obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o 82obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
82obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o 83obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
84obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
83 85
84obj-$(CONFIG_KVM_GUEST) += kvm.o 86obj-$(CONFIG_KVM_GUEST) += kvm.o
85obj-$(CONFIG_KVM_CLOCK) += kvmclock.o 87obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 4558f0d0822d..ce664f33ea8e 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -219,6 +219,8 @@ static int __init
219acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) 219acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
220{ 220{
221 struct acpi_madt_local_x2apic *processor = NULL; 221 struct acpi_madt_local_x2apic *processor = NULL;
222 int apic_id;
223 u8 enabled;
222 224
223 processor = (struct acpi_madt_local_x2apic *)header; 225 processor = (struct acpi_madt_local_x2apic *)header;
224 226
@@ -227,6 +229,8 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
227 229
228 acpi_table_print_madt_entry(header); 230 acpi_table_print_madt_entry(header);
229 231
232 apic_id = processor->local_apic_id;
233 enabled = processor->lapic_flags & ACPI_MADT_ENABLED;
230#ifdef CONFIG_X86_X2APIC 234#ifdef CONFIG_X86_X2APIC
231 /* 235 /*
232 * We need to register disabled CPU as well to permit 236 * We need to register disabled CPU as well to permit
@@ -235,8 +239,10 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
235 * to not preallocating memory for all NR_CPUS 239 * to not preallocating memory for all NR_CPUS
236 * when we use CPU hotplug. 240 * when we use CPU hotplug.
237 */ 241 */
238 acpi_register_lapic(processor->local_apic_id, /* APIC ID */ 242 if (!cpu_has_x2apic && (apic_id >= 0xff) && enabled)
239 processor->lapic_flags & ACPI_MADT_ENABLED); 243 printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
244 else
245 acpi_register_lapic(apic_id, enabled);
240#else 246#else
241 printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); 247 printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
242#endif 248#endif
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 4c39baa8facc..be16854591cc 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -119,20 +119,49 @@ bool __init early_is_amd_nb(u32 device)
119 return false; 119 return false;
120} 120}
121 121
122struct resource *amd_get_mmconfig_range(struct resource *res)
123{
124 u32 address;
125 u64 base, msr;
126 unsigned segn_busn_bits;
127
128 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
129 return NULL;
130
131 /* assume all cpus from fam10h have mmconfig */
132 if (boot_cpu_data.x86 < 0x10)
133 return NULL;
134
135 address = MSR_FAM10H_MMIO_CONF_BASE;
136 rdmsrl(address, msr);
137
138 /* mmconfig is not enabled */
139 if (!(msr & FAM10H_MMIO_CONF_ENABLE))
140 return NULL;
141
142 base = msr & (FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT);
143
144 segn_busn_bits = (msr >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) &
145 FAM10H_MMIO_CONF_BUSRANGE_MASK;
146
147 res->flags = IORESOURCE_MEM;
148 res->start = base;
149 res->end = base + (1ULL<<(segn_busn_bits + 20)) - 1;
150 return res;
151}
152
122int amd_get_subcaches(int cpu) 153int amd_get_subcaches(int cpu)
123{ 154{
124 struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link; 155 struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link;
125 unsigned int mask; 156 unsigned int mask;
126 int cuid = 0; 157 int cuid;
127 158
128 if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) 159 if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
129 return 0; 160 return 0;
130 161
131 pci_read_config_dword(link, 0x1d4, &mask); 162 pci_read_config_dword(link, 0x1d4, &mask);
132 163
133#ifdef CONFIG_SMP
134 cuid = cpu_data(cpu).compute_unit_id; 164 cuid = cpu_data(cpu).compute_unit_id;
135#endif
136 return (mask >> (4 * cuid)) & 0xf; 165 return (mask >> (4 * cuid)) & 0xf;
137} 166}
138 167
@@ -141,7 +170,7 @@ int amd_set_subcaches(int cpu, int mask)
141 static unsigned int reset, ban; 170 static unsigned int reset, ban;
142 struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu)); 171 struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu));
143 unsigned int reg; 172 unsigned int reg;
144 int cuid = 0; 173 int cuid;
145 174
146 if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf) 175 if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf)
147 return -EINVAL; 176 return -EINVAL;
@@ -159,9 +188,7 @@ int amd_set_subcaches(int cpu, int mask)
159 pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000); 188 pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000);
160 } 189 }
161 190
162#ifdef CONFIG_SMP
163 cuid = cpu_data(cpu).compute_unit_id; 191 cuid = cpu_data(cpu).compute_unit_id;
164#endif
165 mask <<= 4 * cuid; 192 mask <<= 4 * cuid;
166 mask |= (0xf ^ (1 << cuid)) << 26; 193 mask |= (0xf ^ (1 << cuid)) << 26;
167 194
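For reference, a minimal user-space sketch (not kernel code) of the field decoding that the new amd_get_mmconfig_range() performs on MSR_FAM10H_MMIO_CONF_BASE. The mask/shift constants are meant to mirror the kernel's msr-index.h definitions, and the MSR value below is purely hypothetical; the window spans 1 MiB per bus for 2^busbits buses above the decoded base.

#include <stdio.h>
#include <stdint.h>

#define FAM10H_MMIO_CONF_ENABLE         (1ULL << 0)
#define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2
#define FAM10H_MMIO_CONF_BUSRANGE_MASK  0xfULL
#define FAM10H_MMIO_CONF_BASE_SHIFT     20
#define FAM10H_MMIO_CONF_BASE_MASK      0xfffffffULL

int main(void)
{
	/* hypothetical MSR value: enabled, bus range 2^8 buses, base 0xe0000000 */
	uint64_t msr = 0xe0000021ULL;

	if (!(msr & FAM10H_MMIO_CONF_ENABLE))
		return 1;	/* mmconfig not enabled */

	uint64_t base = msr & (FAM10H_MMIO_CONF_BASE_MASK << FAM10H_MMIO_CONF_BASE_SHIFT);
	unsigned busbits = (msr >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) &
			   FAM10H_MMIO_CONF_BUSRANGE_MASK;

	/* prints 0xe0000000 - 0xefffffff: a 256 MiB window covering 256 buses */
	printf("MMCONFIG window: %#llx - %#llx\n",
	       (unsigned long long)base,
	       (unsigned long long)(base + (1ULL << (busbits + 20)) - 1));
	return 0;
}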
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 3d2661ca6542..6e76c191a835 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -88,13 +88,13 @@ static u32 __init allocate_aperture(void)
88 */ 88 */
89 addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR, 89 addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,
90 aper_size, aper_size); 90 aper_size, aper_size);
91 if (addr == MEMBLOCK_ERROR || addr + aper_size > GART_MAX_ADDR) { 91 if (!addr || addr + aper_size > GART_MAX_ADDR) {
92 printk(KERN_ERR 92 printk(KERN_ERR
93 "Cannot allocate aperture memory hole (%lx,%uK)\n", 93 "Cannot allocate aperture memory hole (%lx,%uK)\n",
94 addr, aper_size>>10); 94 addr, aper_size>>10);
95 return 0; 95 return 0;
96 } 96 }
97 memblock_x86_reserve_range(addr, addr + aper_size, "aperture64"); 97 memblock_reserve(addr, aper_size);
98 /* 98 /*
99 * Kmemleak should not scan this block as it may not be mapped via the 99 * Kmemleak should not scan this block as it may not be mapped via the
100 * kernel direct mapping. 100 * kernel direct mapping.
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 767fd04f2843..0ae0323b1f9c 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -10,6 +10,7 @@ obj-$(CONFIG_SMP) += ipi.o
10 10
11ifeq ($(CONFIG_X86_64),y) 11ifeq ($(CONFIG_X86_64),y)
12# APIC probe will depend on the listing order here 12# APIC probe will depend on the listing order here
13obj-$(CONFIG_X86_NUMACHIP) += apic_numachip.o
13obj-$(CONFIG_X86_UV) += x2apic_uv_x.o 14obj-$(CONFIG_X86_UV) += x2apic_uv_x.o
14obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o 15obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
15obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o 16obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index f98d84caf94c..2eec05b6d1b8 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -146,16 +146,26 @@ __setup("apicpmtimer", setup_apicpmtimer);
146int x2apic_mode; 146int x2apic_mode;
147#ifdef CONFIG_X86_X2APIC 147#ifdef CONFIG_X86_X2APIC
148/* x2apic enabled before OS handover */ 148/* x2apic enabled before OS handover */
149static int x2apic_preenabled; 149int x2apic_preenabled;
150static int x2apic_disabled;
151static int nox2apic;
150static __init int setup_nox2apic(char *str) 152static __init int setup_nox2apic(char *str)
151{ 153{
152 if (x2apic_enabled()) { 154 if (x2apic_enabled()) {
153 pr_warning("Bios already enabled x2apic, " 155 int apicid = native_apic_msr_read(APIC_ID);
154 "can't enforce nox2apic"); 156
155 return 0; 157 if (apicid >= 255) {
156 } 158 pr_warning("Apicid: %08x, cannot enforce nox2apic\n",
159 apicid);
160 return 0;
161 }
162
163 pr_warning("x2apic already enabled. will disable it\n");
164 } else
165 setup_clear_cpu_cap(X86_FEATURE_X2APIC);
166
167 nox2apic = 1;
157 168
158 setup_clear_cpu_cap(X86_FEATURE_X2APIC);
159 return 0; 169 return 0;
160} 170}
161early_param("nox2apic", setup_nox2apic); 171early_param("nox2apic", setup_nox2apic);
@@ -250,6 +260,7 @@ u32 native_safe_apic_wait_icr_idle(void)
250 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; 260 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
251 if (!send_status) 261 if (!send_status)
252 break; 262 break;
263 inc_irq_stat(icr_read_retry_count);
253 udelay(100); 264 udelay(100);
254 } while (timeout++ < 1000); 265 } while (timeout++ < 1000);
255 266
@@ -876,8 +887,8 @@ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
876 * Besides, if we don't timer interrupts ignore the global 887 * Besides, if we don't timer interrupts ignore the global
877 * interrupt lock, which is the WrongThing (tm) to do. 888 * interrupt lock, which is the WrongThing (tm) to do.
878 */ 889 */
879 exit_idle();
880 irq_enter(); 890 irq_enter();
891 exit_idle();
881 local_apic_timer_interrupt(); 892 local_apic_timer_interrupt();
882 irq_exit(); 893 irq_exit();
883 894
@@ -1431,6 +1442,45 @@ void __init bsp_end_local_APIC_setup(void)
1431} 1442}
1432 1443
1433#ifdef CONFIG_X86_X2APIC 1444#ifdef CONFIG_X86_X2APIC
1445/*
1446 * Need to disable xapic and x2apic at the same time and then enable xapic mode
1447 */
1448static inline void __disable_x2apic(u64 msr)
1449{
1450 wrmsrl(MSR_IA32_APICBASE,
1451 msr & ~(X2APIC_ENABLE | XAPIC_ENABLE));
1452 wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE);
1453}
1454
1455static __init void disable_x2apic(void)
1456{
1457 u64 msr;
1458
1459 if (!cpu_has_x2apic)
1460 return;
1461
1462 rdmsrl(MSR_IA32_APICBASE, msr);
1463 if (msr & X2APIC_ENABLE) {
1464 u32 x2apic_id = read_apic_id();
1465
1466 if (x2apic_id >= 255)
1467 panic("Cannot disable x2apic, id: %08x\n", x2apic_id);
1468
1469 pr_info("Disabling x2apic\n");
1470 __disable_x2apic(msr);
1471
1472 if (nox2apic) {
1473 clear_cpu_cap(&cpu_data(0), X86_FEATURE_X2APIC);
1474 setup_clear_cpu_cap(X86_FEATURE_X2APIC);
1475 }
1476
1477 x2apic_disabled = 1;
1478 x2apic_mode = 0;
1479
1480 register_lapic_address(mp_lapic_addr);
1481 }
1482}
1483
1434void check_x2apic(void) 1484void check_x2apic(void)
1435{ 1485{
1436 if (x2apic_enabled()) { 1486 if (x2apic_enabled()) {
@@ -1441,15 +1491,20 @@ void check_x2apic(void)
1441 1491
1442void enable_x2apic(void) 1492void enable_x2apic(void)
1443{ 1493{
1444 int msr, msr2; 1494 u64 msr;
1495
1496 rdmsrl(MSR_IA32_APICBASE, msr);
1497 if (x2apic_disabled) {
1498 __disable_x2apic(msr);
1499 return;
1500 }
1445 1501
1446 if (!x2apic_mode) 1502 if (!x2apic_mode)
1447 return; 1503 return;
1448 1504
1449 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1450 if (!(msr & X2APIC_ENABLE)) { 1505 if (!(msr & X2APIC_ENABLE)) {
1451 printk_once(KERN_INFO "Enabling x2apic\n"); 1506 printk_once(KERN_INFO "Enabling x2apic\n");
1452 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, msr2); 1507 wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE);
1453 } 1508 }
1454} 1509}
1455#endif /* CONFIG_X86_X2APIC */ 1510#endif /* CONFIG_X86_X2APIC */
@@ -1486,25 +1541,34 @@ void __init enable_IR_x2apic(void)
1486 ret = save_ioapic_entries(); 1541 ret = save_ioapic_entries();
1487 if (ret) { 1542 if (ret) {
1488 pr_info("Saving IO-APIC state failed: %d\n", ret); 1543 pr_info("Saving IO-APIC state failed: %d\n", ret);
1489 goto out; 1544 return;
1490 } 1545 }
1491 1546
1492 local_irq_save(flags); 1547 local_irq_save(flags);
1493 legacy_pic->mask_all(); 1548 legacy_pic->mask_all();
1494 mask_ioapic_entries(); 1549 mask_ioapic_entries();
1495 1550
1551 if (x2apic_preenabled && nox2apic)
1552 disable_x2apic();
1553
1496 if (dmar_table_init_ret) 1554 if (dmar_table_init_ret)
1497 ret = -1; 1555 ret = -1;
1498 else 1556 else
1499 ret = enable_IR(); 1557 ret = enable_IR();
1500 1558
1559 if (!x2apic_supported())
1560 goto skip_x2apic;
1561
1501 if (ret < 0) { 1562 if (ret < 0) {
1502 /* IR is required if there is APIC ID > 255 even when running 1563 /* IR is required if there is APIC ID > 255 even when running
1503 * under KVM 1564 * under KVM
1504 */ 1565 */
1505 if (max_physical_apicid > 255 || 1566 if (max_physical_apicid > 255 ||
1506 !hypervisor_x2apic_available()) 1567 !hypervisor_x2apic_available()) {
1507 goto nox2apic; 1568 if (x2apic_preenabled)
1569 disable_x2apic();
1570 goto skip_x2apic;
1571 }
1508 /* 1572 /*
1509 * without IR all CPUs can be addressed by IOAPIC/MSI 1573 * without IR all CPUs can be addressed by IOAPIC/MSI
1510 * only in physical mode 1574 * only in physical mode
@@ -1512,8 +1576,10 @@ void __init enable_IR_x2apic(void)
1512 x2apic_force_phys(); 1576 x2apic_force_phys();
1513 } 1577 }
1514 1578
1515 if (ret == IRQ_REMAP_XAPIC_MODE) 1579 if (ret == IRQ_REMAP_XAPIC_MODE) {
1516 goto nox2apic; 1580 pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n");
1581 goto skip_x2apic;
1582 }
1517 1583
1518 x2apic_enabled = 1; 1584 x2apic_enabled = 1;
1519 1585
@@ -1523,22 +1589,11 @@ void __init enable_IR_x2apic(void)
1523 pr_info("Enabled x2apic\n"); 1589 pr_info("Enabled x2apic\n");
1524 } 1590 }
1525 1591
1526nox2apic: 1592skip_x2apic:
1527 if (ret < 0) /* IR enabling failed */ 1593 if (ret < 0) /* IR enabling failed */
1528 restore_ioapic_entries(); 1594 restore_ioapic_entries();
1529 legacy_pic->restore_mask(); 1595 legacy_pic->restore_mask();
1530 local_irq_restore(flags); 1596 local_irq_restore(flags);
1531
1532out:
1533 if (x2apic_enabled || !x2apic_supported())
1534 return;
1535
1536 if (x2apic_preenabled)
1537 panic("x2apic: enabled by BIOS but kernel init failed.");
1538 else if (ret == IRQ_REMAP_XAPIC_MODE)
1539 pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n");
1540 else if (ret < 0)
1541 pr_info("x2apic not enabled, IRQ remapping init failed\n");
1542} 1597}
1543 1598
1544#ifdef CONFIG_X86_64 1599#ifdef CONFIG_X86_64
@@ -1809,8 +1864,8 @@ void smp_spurious_interrupt(struct pt_regs *regs)
1809{ 1864{
1810 u32 v; 1865 u32 v;
1811 1866
1812 exit_idle();
1813 irq_enter(); 1867 irq_enter();
1868 exit_idle();
1814 /* 1869 /*
1815 * Check if this really is a spurious interrupt and ACK it 1870 * Check if this really is a spurious interrupt and ACK it
1816 * if it is a vectored one. Just in case... 1871 * if it is a vectored one. Just in case...
@@ -1846,8 +1901,8 @@ void smp_error_interrupt(struct pt_regs *regs)
1846 "Illegal register address", /* APIC Error Bit 7 */ 1901 "Illegal register address", /* APIC Error Bit 7 */
1847 }; 1902 };
1848 1903
1849 exit_idle();
1850 irq_enter(); 1904 irq_enter();
1905 exit_idle();
1851 /* First tickle the hardware, only then report what went on. -- REW */ 1906 /* First tickle the hardware, only then report what went on. -- REW */
1852 v0 = apic_read(APIC_ESR); 1907 v0 = apic_read(APIC_ESR);
1853 apic_write(APIC_ESR, 0); 1908 apic_write(APIC_ESR, 0);
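As a side note, here is a minimal sketch (plain C arithmetic, not kernel code) of the IA32_APIC_BASE bit handling behind the new __disable_x2apic()/enable_x2apic() pair above: the EXTD (x2APIC) bit cannot be cleared while the APIC enable bit stays set, so the MSR is written twice, first disabling the APIC entirely and then re-enabling it in plain xAPIC mode. The starting MSR value here is hypothetical.

#include <stdio.h>
#include <stdint.h>

#define XAPIC_ENABLE	(1ULL << 11)	/* IA32_APIC_BASE.EN   */
#define X2APIC_ENABLE	(1ULL << 10)	/* IA32_APIC_BASE.EXTD */

int main(void)
{
	/* hypothetical IA32_APIC_BASE: EN=1, EXTD=1, base 0xfee00000 */
	uint64_t apicbase = 0xfee00c00ULL;

	/* first write: leave x2APIC mode by disabling the APIC (EN=0, EXTD=0) */
	uint64_t step1 = apicbase & ~(X2APIC_ENABLE | XAPIC_ENABLE);
	/* second write: re-enable the APIC in xAPIC (MMIO) mode (EN=1, EXTD=0) */
	uint64_t step2 = apicbase & ~X2APIC_ENABLE;

	printf("write 1: %#llx\nwrite 2: %#llx\n",
	       (unsigned long long)step1, (unsigned long long)step2);
	return 0;
}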
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index f7a41e4cae47..8c3cdded6f2b 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -62,7 +62,7 @@ static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
62 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel 62 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
63 * document number 292116). So here it goes... 63 * document number 292116). So here it goes...
64 */ 64 */
65static void flat_init_apic_ldr(void) 65void flat_init_apic_ldr(void)
66{ 66{
67 unsigned long val; 67 unsigned long val;
68 unsigned long num, id; 68 unsigned long num, id;
@@ -171,9 +171,14 @@ static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
171 return initial_apic_id >> index_msb; 171 return initial_apic_id >> index_msb;
172} 172}
173 173
174static int flat_probe(void)
175{
176 return 1;
177}
178
174static struct apic apic_flat = { 179static struct apic apic_flat = {
175 .name = "flat", 180 .name = "flat",
176 .probe = NULL, 181 .probe = flat_probe,
177 .acpi_madt_oem_check = flat_acpi_madt_oem_check, 182 .acpi_madt_oem_check = flat_acpi_madt_oem_check,
178 .apic_id_registered = flat_apic_id_registered, 183 .apic_id_registered = flat_apic_id_registered,
179 184
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
new file mode 100644
index 000000000000..09d3d8c1cd99
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -0,0 +1,294 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Numascale NumaConnect-Specific APIC Code
7 *
8 * Copyright (C) 2011 Numascale AS. All rights reserved.
9 *
10 * Send feedback to <support@numascale.com>
11 *
12 */
13
14#include <linux/errno.h>
15#include <linux/threads.h>
16#include <linux/cpumask.h>
17#include <linux/string.h>
18#include <linux/kernel.h>
19#include <linux/module.h>
20#include <linux/ctype.h>
21#include <linux/init.h>
22#include <linux/hardirq.h>
23#include <linux/delay.h>
24
25#include <asm/numachip/numachip_csr.h>
26#include <asm/smp.h>
27#include <asm/apic.h>
28#include <asm/ipi.h>
29#include <asm/apic_flat_64.h>
30
31static int numachip_system __read_mostly;
32
33static struct apic apic_numachip __read_mostly;
34
35static unsigned int get_apic_id(unsigned long x)
36{
37 unsigned long value;
38 unsigned int id;
39
40 rdmsrl(MSR_FAM10H_NODE_ID, value);
41 id = ((x >> 24) & 0xffU) | ((value << 2) & 0x3f00U);
42
43 return id;
44}
45
46static unsigned long set_apic_id(unsigned int id)
47{
48 unsigned long x;
49
50 x = ((id & 0xffU) << 24);
51 return x;
52}
53
54static unsigned int read_xapic_id(void)
55{
56 return get_apic_id(apic_read(APIC_ID));
57}
58
59static int numachip_apic_id_registered(void)
60{
61 return physid_isset(read_xapic_id(), phys_cpu_present_map);
62}
63
64static int numachip_phys_pkg_id(int initial_apic_id, int index_msb)
65{
66 return initial_apic_id >> index_msb;
67}
68
69static const struct cpumask *numachip_target_cpus(void)
70{
71 return cpu_online_mask;
72}
73
74static void numachip_vector_allocation_domain(int cpu, struct cpumask *retmask)
75{
76 cpumask_clear(retmask);
77 cpumask_set_cpu(cpu, retmask);
78}
79
80static int __cpuinit numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip)
81{
82 union numachip_csr_g3_ext_irq_gen int_gen;
83
84 int_gen.s._destination_apic_id = phys_apicid;
85 int_gen.s._vector = 0;
86 int_gen.s._msgtype = APIC_DM_INIT >> 8;
87 int_gen.s._index = 0;
88
89 write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
90
91 int_gen.s._msgtype = APIC_DM_STARTUP >> 8;
92 int_gen.s._vector = start_rip >> 12;
93
94 write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
95
96 atomic_set(&init_deasserted, 1);
97 return 0;
98}
99
100static void numachip_send_IPI_one(int cpu, int vector)
101{
102 union numachip_csr_g3_ext_irq_gen int_gen;
103 int apicid = per_cpu(x86_cpu_to_apicid, cpu);
104
105 int_gen.s._destination_apic_id = apicid;
106 int_gen.s._vector = vector;
107 int_gen.s._msgtype = (vector == NMI_VECTOR ? APIC_DM_NMI : APIC_DM_FIXED) >> 8;
108 int_gen.s._index = 0;
109
110 write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
111}
112
113static void numachip_send_IPI_mask(const struct cpumask *mask, int vector)
114{
115 unsigned int cpu;
116
117 for_each_cpu(cpu, mask)
118 numachip_send_IPI_one(cpu, vector);
119}
120
121static void numachip_send_IPI_mask_allbutself(const struct cpumask *mask,
122 int vector)
123{
124 unsigned int this_cpu = smp_processor_id();
125 unsigned int cpu;
126
127 for_each_cpu(cpu, mask) {
128 if (cpu != this_cpu)
129 numachip_send_IPI_one(cpu, vector);
130 }
131}
132
133static void numachip_send_IPI_allbutself(int vector)
134{
135 unsigned int this_cpu = smp_processor_id();
136 unsigned int cpu;
137
138 for_each_online_cpu(cpu) {
139 if (cpu != this_cpu)
140 numachip_send_IPI_one(cpu, vector);
141 }
142}
143
144static void numachip_send_IPI_all(int vector)
145{
146 numachip_send_IPI_mask(cpu_online_mask, vector);
147}
148
149static void numachip_send_IPI_self(int vector)
150{
151 __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
152}
153
154static unsigned int numachip_cpu_mask_to_apicid(const struct cpumask *cpumask)
155{
156 int cpu;
157
158 /*
159 * We're using fixed IRQ delivery, can only return one phys APIC ID.
160 * May as well be the first.
161 */
162 cpu = cpumask_first(cpumask);
163 if (likely((unsigned)cpu < nr_cpu_ids))
164 return per_cpu(x86_cpu_to_apicid, cpu);
165
166 return BAD_APICID;
167}
168
169static unsigned int
170numachip_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
171 const struct cpumask *andmask)
172{
173 int cpu;
174
175 /*
176 * We're using fixed IRQ delivery, can only return one phys APIC ID.
177 * May as well be the first.
178 */
179 for_each_cpu_and(cpu, cpumask, andmask) {
180 if (cpumask_test_cpu(cpu, cpu_online_mask))
181 break;
182 }
183 return per_cpu(x86_cpu_to_apicid, cpu);
184}
185
186static int __init numachip_probe(void)
187{
188 return apic == &apic_numachip;
189}
190
191static void __init map_csrs(void)
192{
193 printk(KERN_INFO "NumaChip: Mapping local CSR space (%016llx - %016llx)\n",
194 NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_BASE + NUMACHIP_LCSR_SIZE - 1);
195 init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE);
196
197 printk(KERN_INFO "NumaChip: Mapping global CSR space (%016llx - %016llx)\n",
198 NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_BASE + NUMACHIP_GCSR_SIZE - 1);
199 init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE);
200}
201
202static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
203{
204 c->phys_proc_id = node;
205 per_cpu(cpu_llc_id, smp_processor_id()) = node;
206}
207
208static int __init numachip_system_init(void)
209{
210 unsigned int val;
211
212 if (!numachip_system)
213 return 0;
214
215 x86_cpuinit.fixup_cpu_id = fixup_cpu_id;
216
217 map_csrs();
218
219 val = read_lcsr(CSR_G0_NODE_IDS);
220 printk(KERN_INFO "NumaChip: Local NodeID = %08x\n", val);
221
222 return 0;
223}
224early_initcall(numachip_system_init);
225
226static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
227{
228 if (!strncmp(oem_id, "NUMASC", 6)) {
229 numachip_system = 1;
230 return 1;
231 }
232
233 return 0;
234}
235
236static struct apic apic_numachip __refconst = {
237
238 .name = "NumaConnect system",
239 .probe = numachip_probe,
240 .acpi_madt_oem_check = numachip_acpi_madt_oem_check,
241 .apic_id_registered = numachip_apic_id_registered,
242
243 .irq_delivery_mode = dest_Fixed,
244 .irq_dest_mode = 0, /* physical */
245
246 .target_cpus = numachip_target_cpus,
247 .disable_esr = 0,
248 .dest_logical = 0,
249 .check_apicid_used = NULL,
250 .check_apicid_present = NULL,
251
252 .vector_allocation_domain = numachip_vector_allocation_domain,
253 .init_apic_ldr = flat_init_apic_ldr,
254
255 .ioapic_phys_id_map = NULL,
256 .setup_apic_routing = NULL,
257 .multi_timer_check = NULL,
258 .cpu_present_to_apicid = default_cpu_present_to_apicid,
259 .apicid_to_cpu_present = NULL,
260 .setup_portio_remap = NULL,
261 .check_phys_apicid_present = default_check_phys_apicid_present,
262 .enable_apic_mode = NULL,
263 .phys_pkg_id = numachip_phys_pkg_id,
264 .mps_oem_check = NULL,
265
266 .get_apic_id = get_apic_id,
267 .set_apic_id = set_apic_id,
268 .apic_id_mask = 0xffU << 24,
269
270 .cpu_mask_to_apicid = numachip_cpu_mask_to_apicid,
271 .cpu_mask_to_apicid_and = numachip_cpu_mask_to_apicid_and,
272
273 .send_IPI_mask = numachip_send_IPI_mask,
274 .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself,
275 .send_IPI_allbutself = numachip_send_IPI_allbutself,
276 .send_IPI_all = numachip_send_IPI_all,
277 .send_IPI_self = numachip_send_IPI_self,
278
279 .wakeup_secondary_cpu = numachip_wakeup_secondary,
280 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
281 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
282 .wait_for_init_deassert = NULL,
283 .smp_callin_clear_local_apic = NULL,
284 .inquire_remote_apic = NULL, /* REMRD not supported */
285
286 .read = native_apic_mem_read,
287 .write = native_apic_mem_write,
288 .icr_read = native_apic_icr_read,
289 .icr_write = native_apic_icr_write,
290 .wait_icr_idle = native_apic_wait_icr_idle,
291 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
292};
293apic_driver(apic_numachip);
294
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 6d939d7847e2..fb072754bc1d 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2421,8 +2421,8 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2421 unsigned vector, me; 2421 unsigned vector, me;
2422 2422
2423 ack_APIC_irq(); 2423 ack_APIC_irq();
2424 exit_idle();
2425 irq_enter(); 2424 irq_enter();
2425 exit_idle();
2426 2426
2427 me = smp_processor_id(); 2427 me = smp_processor_id();
2428 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { 2428 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
@@ -2948,6 +2948,10 @@ static inline void __init check_timer(void)
2948 } 2948 }
2949 local_irq_disable(); 2949 local_irq_disable();
2950 apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); 2950 apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
2951 if (x2apic_preenabled)
2952 apic_printk(APIC_QUIET, KERN_INFO
2953 "Perhaps problem with the pre-enabled x2apic mode\n"
2954 "Try booting with x2apic and interrupt-remapping disabled in the bios.\n");
2951 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " 2955 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
2952 "report. Then try booting with the 'noapic' option.\n"); 2956 "report. Then try booting with the 'noapic' option.\n");
2953out: 2957out:
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 62ae3001ae02..79b05b88aa19 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -93,6 +93,8 @@ static int __init early_get_pnodeid(void)
93 93
94 if (node_id.s.part_number == UV2_HUB_PART_NUMBER) 94 if (node_id.s.part_number == UV2_HUB_PART_NUMBER)
95 uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1; 95 uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1;
96 if (node_id.s.part_number == UV2_HUB_PART_NUMBER_X)
97 uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1;
96 98
97 uv_hub_info->hub_revision = uv_min_hub_revision_id; 99 uv_hub_info->hub_revision = uv_min_hub_revision_id;
98 pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1); 100 pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1);
@@ -767,7 +769,12 @@ void __init uv_system_init(void)
767 for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) 769 for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
768 uv_possible_blades += 770 uv_possible_blades +=
769 hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8)); 771 hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8));
770 printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades()); 772
773 /* uv_num_possible_blades() is really the hub count */
774 printk(KERN_INFO "UV: Found %d blades, %d hubs\n",
775 is_uv1_hub() ? uv_num_possible_blades() :
776 (uv_num_possible_blades() + 1) / 2,
777 uv_num_possible_blades());
771 778
772 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); 779 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
773 uv_blade_info = kzalloc(bytes, GFP_KERNEL); 780 uv_blade_info = kzalloc(bytes, GFP_KERNEL);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index a46bd383953c..f76623cbe263 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -383,21 +383,21 @@ static int ignore_sys_suspend;
383static int ignore_normal_resume; 383static int ignore_normal_resume;
384static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL; 384static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL;
385 385
386static int debug __read_mostly; 386static bool debug __read_mostly;
387static int smp __read_mostly; 387static bool smp __read_mostly;
388static int apm_disabled = -1; 388static int apm_disabled = -1;
389#ifdef CONFIG_SMP 389#ifdef CONFIG_SMP
390static int power_off; 390static bool power_off;
391#else 391#else
392static int power_off = 1; 392static bool power_off = 1;
393#endif 393#endif
394static int realmode_power_off; 394static bool realmode_power_off;
395#ifdef CONFIG_APM_ALLOW_INTS 395#ifdef CONFIG_APM_ALLOW_INTS
396static int allow_ints = 1; 396static bool allow_ints = 1;
397#else 397#else
398static int allow_ints; 398static bool allow_ints;
399#endif 399#endif
400static int broken_psr; 400static bool broken_psr;
401 401
402static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); 402static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
403static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); 403static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 4f13fafc5264..68de2dc962ec 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -67,4 +67,6 @@ void common(void) {
67 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); 67 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
68 OFFSET(BP_version, boot_params, hdr.version); 68 OFFSET(BP_version, boot_params, hdr.version);
69 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); 69 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
70 OFFSET(BP_pref_address, boot_params, hdr.pref_address);
71 OFFSET(BP_code32_start, boot_params, hdr.code32_start);
70} 72}
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 395a10e68067..85d98ab15cdc 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -3,6 +3,11 @@
3#include <linux/lguest.h> 3#include <linux/lguest.h>
4#include "../../../drivers/lguest/lg.h" 4#include "../../../drivers/lguest/lg.h"
5 5
6#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
7static char syscalls[] = {
8#include <asm/syscalls_32.h>
9};
10
6/* workaround for a warning with -Wmissing-prototypes */ 11/* workaround for a warning with -Wmissing-prototypes */
7void foo(void); 12void foo(void);
8 13
@@ -76,4 +81,7 @@ void foo(void)
76 OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); 81 OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
77 OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); 82 OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
78#endif 83#endif
84 BLANK();
85 DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
86 DEFINE(NR_syscalls, sizeof(syscalls));
79} 87}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index e72a1194af22..834e897b1e25 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -1,11 +1,12 @@
1#include <asm/ia32.h> 1#include <asm/ia32.h>
2 2
3#define __NO_STUBS 1 3#define __SYSCALL_64(nr, sym, compat) [nr] = 1,
4#undef __SYSCALL 4static char syscalls_64[] = {
5#undef _ASM_X86_UNISTD_64_H 5#include <asm/syscalls_64.h>
6#define __SYSCALL(nr, sym) [nr] = 1, 6};
7static char syscalls[] = { 7#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
8#include <asm/unistd.h> 8static char syscalls_ia32[] = {
9#include <asm/syscalls_32.h>
9}; 10};
10 11
11int main(void) 12int main(void)
@@ -72,7 +73,11 @@ int main(void)
72 OFFSET(TSS_ist, tss_struct, x86_tss.ist); 73 OFFSET(TSS_ist, tss_struct, x86_tss.ist);
73 BLANK(); 74 BLANK();
74 75
75 DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); 76 DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
77 DEFINE(NR_syscalls, sizeof(syscalls_64));
78
79 DEFINE(__NR_ia32_syscall_max, sizeof(syscalls_ia32) - 1);
80 DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32));
76 81
77 return 0; 82 return 0;
78} 83}
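For illustration, a small hypothetical standalone program showing the designated-initializer trick used by the syscalls_64[]/syscalls_ia32[] tables above: each __SYSCALL_64(nr, ...) entry expands to "[nr] = 1,", so sizeof() of the array equals the highest syscall number plus one, which is exactly what __NR_syscall_max and NR_syscalls are derived from.

#include <stdio.h>

#define __SYSCALL_64(nr, sym, compat) [nr] = 1,

static char syscalls_64[] = {
	__SYSCALL_64(0, sys_read, sys_read)
	__SYSCALL_64(1, sys_write, sys_write)
	__SYSCALL_64(59, sys_execve, stub_execve)	/* gaps are fine: [59] = 1 alone fixes the size */
};

int main(void)
{
	printf("__NR_syscall_max = %zu\n", sizeof(syscalls_64) - 1);	/* 59 */
	printf("NR_syscalls      = %zu\n", sizeof(syscalls_64));	/* 60 */
	return 0;
}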
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 452932d34730..5da1269e8ddc 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -62,7 +62,8 @@ early_param("memory_corruption_check_size", set_corruption_check_size);
62 62
63void __init setup_bios_corruption_check(void) 63void __init setup_bios_corruption_check(void)
64{ 64{
65 u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */ 65 phys_addr_t start, end;
66 u64 i;
66 67
67 if (memory_corruption_check == -1) { 68 if (memory_corruption_check == -1) {
68 memory_corruption_check = 69 memory_corruption_check =
@@ -82,28 +83,23 @@ void __init setup_bios_corruption_check(void)
82 83
83 corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); 84 corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
84 85
85 while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { 86 for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) {
86 u64 size; 87 start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE),
87 addr = memblock_x86_find_in_range_size(addr, &size, PAGE_SIZE); 88 PAGE_SIZE, corruption_check_size);
89 end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE),
90 PAGE_SIZE, corruption_check_size);
91 if (start >= end)
92 continue;
88 93
89 if (addr == MEMBLOCK_ERROR) 94 memblock_reserve(start, end - start);
90 break; 95 scan_areas[num_scan_areas].addr = start;
91 96 scan_areas[num_scan_areas].size = end - start;
92 if (addr >= corruption_check_size)
93 break;
94
95 if ((addr + size) > corruption_check_size)
96 size = corruption_check_size - addr;
97
98 memblock_x86_reserve_range(addr, addr + size, "SCAN RAM");
99 scan_areas[num_scan_areas].addr = addr;
100 scan_areas[num_scan_areas].size = size;
101 num_scan_areas++;
102 97
103 /* Assume we've already mapped this early memory */ 98 /* Assume we've already mapped this early memory */
104 memset(__va(addr), 0, size); 99 memset(__va(start), 0, end - start);
105 100
106 addr += size; 101 if (++num_scan_areas >= MAX_SCAN_AREAS)
102 break;
107 } 103 }
108 104
109 if (num_scan_areas) 105 if (num_scan_areas)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index c7e46cb35327..f4773f4aae35 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -148,7 +148,6 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
148 148
149static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) 149static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
150{ 150{
151#ifdef CONFIG_SMP
152 /* calling is from identify_secondary_cpu() ? */ 151 /* calling is from identify_secondary_cpu() ? */
153 if (!c->cpu_index) 152 if (!c->cpu_index)
154 return; 153 return;
@@ -192,7 +191,6 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
192 191
193valid_k7: 192valid_k7:
194 ; 193 ;
195#endif
196} 194}
197 195
198static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) 196static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
@@ -353,6 +351,13 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
353 if (node == NUMA_NO_NODE) 351 if (node == NUMA_NO_NODE)
354 node = per_cpu(cpu_llc_id, cpu); 352 node = per_cpu(cpu_llc_id, cpu);
355 353
354 /*
355 * If core numbers are inconsistent, it's likely a multi-fabric platform,
356 * so invoke platform-specific handler
357 */
358 if (c->phys_proc_id != node)
359 x86_cpuinit.fixup_cpu_id(c, node);
360
356 if (!node_online(node)) { 361 if (!node_online(node)) {
357 /* 362 /*
358 * Two possibilities here: 363 * Two possibilities here:
@@ -442,8 +447,6 @@ static void __cpuinit bsp_init_amd(struct cpuinfo_x86 *c)
442 447
443static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) 448static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
444{ 449{
445 u32 dummy;
446
447 early_init_amd_mc(c); 450 early_init_amd_mc(c);
448 451
449 /* 452 /*
@@ -473,12 +476,12 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
473 set_cpu_cap(c, X86_FEATURE_EXTD_APICID); 476 set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
474 } 477 }
475#endif 478#endif
476
477 rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
478} 479}
479 480
480static void __cpuinit init_amd(struct cpuinfo_x86 *c) 481static void __cpuinit init_amd(struct cpuinfo_x86 *c)
481{ 482{
483 u32 dummy;
484
482#ifdef CONFIG_SMP 485#ifdef CONFIG_SMP
483 unsigned long long value; 486 unsigned long long value;
484 487
@@ -657,6 +660,8 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
657 checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask); 660 checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask);
658 } 661 }
659 } 662 }
663
664 rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
660} 665}
661 666
662#ifdef CONFIG_X86_32 667#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index e58d978e0758..159103c0b1f4 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -278,7 +278,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
278 } 278 }
279#ifdef CONFIG_X86_32 279#ifdef CONFIG_X86_32
280 /* Cyrix III family needs CX8 & PGE explicitly enabled. */ 280 /* Cyrix III family needs CX8 & PGE explicitly enabled. */
281 if (c->x86_model >= 6 && c->x86_model <= 9) { 281 if (c->x86_model >= 6 && c->x86_model <= 13) {
282 rdmsr(MSR_VIA_FCR, lo, hi); 282 rdmsr(MSR_VIA_FCR, lo, hi);
283 lo |= (1<<1 | 1<<7); 283 lo |= (1<<1 | 1<<7);
284 wrmsr(MSR_VIA_FCR, lo, hi); 284 wrmsr(MSR_VIA_FCR, lo, hi);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index aa003b13a831..c0f7d68d318f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -676,9 +676,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
676 if (this_cpu->c_early_init) 676 if (this_cpu->c_early_init)
677 this_cpu->c_early_init(c); 677 this_cpu->c_early_init(c);
678 678
679#ifdef CONFIG_SMP
680 c->cpu_index = 0; 679 c->cpu_index = 0;
681#endif
682 filter_cpuid_features(c, false); 680 filter_cpuid_features(c, false);
683 681
684 setup_smep(c); 682 setup_smep(c);
@@ -764,10 +762,7 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
764 c->apicid = c->initial_apicid; 762 c->apicid = c->initial_apicid;
765# endif 763# endif
766#endif 764#endif
767
768#ifdef CONFIG_X86_HT
769 c->phys_proc_id = c->initial_apicid; 765 c->phys_proc_id = c->initial_apicid;
770#endif
771 } 766 }
772 767
773 setup_smep(c); 768 setup_smep(c);
@@ -1026,6 +1021,8 @@ __setup("clearcpuid=", setup_disablecpuid);
1026 1021
1027#ifdef CONFIG_X86_64 1022#ifdef CONFIG_X86_64
1028struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; 1023struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
1024struct desc_ptr nmi_idt_descr = { NR_VECTORS * 16 - 1,
1025 (unsigned long) nmi_idt_table };
1029 1026
1030DEFINE_PER_CPU_FIRST(union irq_stack_union, 1027DEFINE_PER_CPU_FIRST(union irq_stack_union,
1031 irq_stack_union) __aligned(PAGE_SIZE); 1028 irq_stack_union) __aligned(PAGE_SIZE);
@@ -1047,6 +1044,9 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) =
1047 1044
1048DEFINE_PER_CPU(unsigned int, irq_count) = -1; 1045DEFINE_PER_CPU(unsigned int, irq_count) = -1;
1049 1046
1047DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
1048EXPORT_PER_CPU_SYMBOL(fpu_owner_task);
1049
1050/* 1050/*
1051 * Special IST stacks which the CPU switches to when it calls 1051 * Special IST stacks which the CPU switches to when it calls
1052 * an IST-marked descriptor entry. Up to 7 stacks (hardware 1052 * an IST-marked descriptor entry. Up to 7 stacks (hardware
@@ -1090,10 +1090,32 @@ unsigned long kernel_eflags;
1090 */ 1090 */
1091DEFINE_PER_CPU(struct orig_ist, orig_ist); 1091DEFINE_PER_CPU(struct orig_ist, orig_ist);
1092 1092
1093static DEFINE_PER_CPU(unsigned long, debug_stack_addr);
1094DEFINE_PER_CPU(int, debug_stack_usage);
1095
1096int is_debug_stack(unsigned long addr)
1097{
1098 return __get_cpu_var(debug_stack_usage) ||
1099 (addr <= __get_cpu_var(debug_stack_addr) &&
1100 addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ));
1101}
1102
1103void debug_stack_set_zero(void)
1104{
1105 load_idt((const struct desc_ptr *)&nmi_idt_descr);
1106}
1107
1108void debug_stack_reset(void)
1109{
1110 load_idt((const struct desc_ptr *)&idt_descr);
1111}
1112
1093#else /* CONFIG_X86_64 */ 1113#else /* CONFIG_X86_64 */
1094 1114
1095DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; 1115DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
1096EXPORT_PER_CPU_SYMBOL(current_task); 1116EXPORT_PER_CPU_SYMBOL(current_task);
1117DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
1118EXPORT_PER_CPU_SYMBOL(fpu_owner_task);
1097 1119
1098#ifdef CONFIG_CC_STACKPROTECTOR 1120#ifdef CONFIG_CC_STACKPROTECTOR
1099DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); 1121DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
@@ -1141,6 +1163,15 @@ static void dbg_restore_debug_regs(void)
1141#endif /* ! CONFIG_KGDB */ 1163#endif /* ! CONFIG_KGDB */
1142 1164
1143/* 1165/*
1166 * Prints an error where the NUMA and configured core-number mismatch and the
1167 * platform didn't override this to fix it up
1168 */
1169void __cpuinit x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node)
1170{
1171 pr_err("NUMA core number %d differs from configured core number %d\n", node, c->phys_proc_id);
1172}
1173
1174/*
1144 * cpu_init() initializes state that is per-CPU. Some data is already 1175 * cpu_init() initializes state that is per-CPU. Some data is already
1145 * initialized (naturally) in the bootstrap process, such as the GDT 1176 * initialized (naturally) in the bootstrap process, such as the GDT
1146 * and IDT. We reload them nevertheless, this function acts as a 1177 * and IDT. We reload them nevertheless, this function acts as a
@@ -1208,6 +1239,8 @@ void __cpuinit cpu_init(void)
1208 estacks += exception_stack_sizes[v]; 1239 estacks += exception_stack_sizes[v];
1209 oist->ist[v] = t->x86_tss.ist[v] = 1240 oist->ist[v] = t->x86_tss.ist[v] =
1210 (unsigned long)estacks; 1241 (unsigned long)estacks;
1242 if (v == DEBUG_STACK-1)
1243 per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks;
1211 } 1244 }
1212 } 1245 }
1213 1246
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 1b22dcc51af4..8bacc7826fb3 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -1,5 +1,4 @@
1#ifndef ARCH_X86_CPU_H 1#ifndef ARCH_X86_CPU_H
2
3#define ARCH_X86_CPU_H 2#define ARCH_X86_CPU_H
4 3
5struct cpu_model_info { 4struct cpu_model_info {
@@ -35,6 +34,4 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[],
35 34
36extern void get_cpu_cap(struct cpuinfo_x86 *c); 35extern void get_cpu_cap(struct cpuinfo_x86 *c);
37extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); 36extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
38extern void get_cpu_cap(struct cpuinfo_x86 *c); 37#endif /* ARCH_X86_CPU_H */
39
40#endif
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 523131213f08..3e6ff6cbf42a 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -181,7 +181,6 @@ static void __cpuinit trap_init_f00f_bug(void)
181 181
182static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) 182static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
183{ 183{
184#ifdef CONFIG_SMP
185 /* calling is from identify_secondary_cpu() ? */ 184 /* calling is from identify_secondary_cpu() ? */
186 if (!c->cpu_index) 185 if (!c->cpu_index)
187 return; 186 return;
@@ -198,7 +197,6 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
198 WARN_ONCE(1, "WARNING: SMP operation may be unreliable" 197 WARN_ONCE(1, "WARNING: SMP operation may be unreliable"
199 "with B stepping processors.\n"); 198 "with B stepping processors.\n");
200 } 199 }
201#endif
202} 200}
203 201
204static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) 202static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index a3b0811693c9..73d08ed98a64 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -326,8 +326,7 @@ static void __cpuinit amd_calc_l3_indices(struct amd_northbridge *nb)
326 l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; 326 l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
327} 327}
328 328
329static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, 329static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
330 int index)
331{ 330{
332 int node; 331 int node;
333 332
@@ -725,14 +724,16 @@ static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
725#define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y])) 724#define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y]))
726 725
727#ifdef CONFIG_SMP 726#ifdef CONFIG_SMP
728static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) 727
728static int __cpuinit cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)
729{ 729{
730 struct _cpuid4_info *this_leaf, *sibling_leaf; 730 struct _cpuid4_info *this_leaf;
731 unsigned long num_threads_sharing; 731 int ret, i, sibling;
732 int index_msb, i, sibling;
733 struct cpuinfo_x86 *c = &cpu_data(cpu); 732 struct cpuinfo_x86 *c = &cpu_data(cpu);
734 733
735 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { 734 ret = 0;
735 if (index == 3) {
736 ret = 1;
736 for_each_cpu(i, cpu_llc_shared_mask(cpu)) { 737 for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
737 if (!per_cpu(ici_cpuid4_info, i)) 738 if (!per_cpu(ici_cpuid4_info, i))
738 continue; 739 continue;
@@ -743,8 +744,35 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
743 set_bit(sibling, this_leaf->shared_cpu_map); 744 set_bit(sibling, this_leaf->shared_cpu_map);
744 } 745 }
745 } 746 }
746 return; 747 } else if ((c->x86 == 0x15) && ((index == 1) || (index == 2))) {
748 ret = 1;
749 for_each_cpu(i, cpu_sibling_mask(cpu)) {
750 if (!per_cpu(ici_cpuid4_info, i))
751 continue;
752 this_leaf = CPUID4_INFO_IDX(i, index);
753 for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
754 if (!cpu_online(sibling))
755 continue;
756 set_bit(sibling, this_leaf->shared_cpu_map);
757 }
758 }
747 } 759 }
760
761 return ret;
762}
763
764static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
765{
766 struct _cpuid4_info *this_leaf, *sibling_leaf;
767 unsigned long num_threads_sharing;
768 int index_msb, i;
769 struct cpuinfo_x86 *c = &cpu_data(cpu);
770
771 if (c->x86_vendor == X86_VENDOR_AMD) {
772 if (cache_shared_amd_cpu_map_setup(cpu, index))
773 return;
774 }
775
748 this_leaf = CPUID4_INFO_IDX(cpu, index); 776 this_leaf = CPUID4_INFO_IDX(cpu, index);
749 num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing; 777 num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing;
750 778
@@ -844,8 +872,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
844 872
845#include <linux/kobject.h> 873#include <linux/kobject.h>
846#include <linux/sysfs.h> 874#include <linux/sysfs.h>
847 875#include <linux/cpu.h>
848extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */
849 876
850/* pointer to kobject for cpuX/cache */ 877/* pointer to kobject for cpuX/cache */
851static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject); 878static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject);
@@ -1073,9 +1100,9 @@ err_out:
1073static DECLARE_BITMAP(cache_dev_map, NR_CPUS); 1100static DECLARE_BITMAP(cache_dev_map, NR_CPUS);
1074 1101
1075/* Add/Remove cache interface for CPU device */ 1102/* Add/Remove cache interface for CPU device */
1076static int __cpuinit cache_add_dev(struct sys_device * sys_dev) 1103static int __cpuinit cache_add_dev(struct device *dev)
1077{ 1104{
1078 unsigned int cpu = sys_dev->id; 1105 unsigned int cpu = dev->id;
1079 unsigned long i, j; 1106 unsigned long i, j;
1080 struct _index_kobject *this_object; 1107 struct _index_kobject *this_object;
1081 struct _cpuid4_info *this_leaf; 1108 struct _cpuid4_info *this_leaf;
@@ -1087,7 +1114,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
1087 1114
1088 retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu), 1115 retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu),
1089 &ktype_percpu_entry, 1116 &ktype_percpu_entry,
1090 &sys_dev->kobj, "%s", "cache"); 1117 &dev->kobj, "%s", "cache");
1091 if (retval < 0) { 1118 if (retval < 0) {
1092 cpuid4_cache_sysfs_exit(cpu); 1119 cpuid4_cache_sysfs_exit(cpu);
1093 return retval; 1120 return retval;
@@ -1124,9 +1151,9 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
1124 return 0; 1151 return 0;
1125} 1152}
1126 1153
1127static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) 1154static void __cpuinit cache_remove_dev(struct device *dev)
1128{ 1155{
1129 unsigned int cpu = sys_dev->id; 1156 unsigned int cpu = dev->id;
1130 unsigned long i; 1157 unsigned long i;
1131 1158
1132 if (per_cpu(ici_cpuid4_info, cpu) == NULL) 1159 if (per_cpu(ici_cpuid4_info, cpu) == NULL)
@@ -1145,17 +1172,17 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb,
1145 unsigned long action, void *hcpu) 1172 unsigned long action, void *hcpu)
1146{ 1173{
1147 unsigned int cpu = (unsigned long)hcpu; 1174 unsigned int cpu = (unsigned long)hcpu;
1148 struct sys_device *sys_dev; 1175 struct device *dev;
1149 1176
1150 sys_dev = get_cpu_sysdev(cpu); 1177 dev = get_cpu_device(cpu);
1151 switch (action) { 1178 switch (action) {
1152 case CPU_ONLINE: 1179 case CPU_ONLINE:
1153 case CPU_ONLINE_FROZEN: 1180 case CPU_ONLINE_FROZEN:
1154 cache_add_dev(sys_dev); 1181 cache_add_dev(dev);
1155 break; 1182 break;
1156 case CPU_DEAD: 1183 case CPU_DEAD:
1157 case CPU_DEAD_FROZEN: 1184 case CPU_DEAD_FROZEN:
1158 cache_remove_dev(sys_dev); 1185 cache_remove_dev(dev);
1159 break; 1186 break;
1160 } 1187 }
1161 return NOTIFY_OK; 1188 return NOTIFY_OK;
@@ -1174,9 +1201,9 @@ static int __cpuinit cache_sysfs_init(void)
1174 1201
1175 for_each_online_cpu(i) { 1202 for_each_online_cpu(i) {
1176 int err; 1203 int err;
1177 struct sys_device *sys_dev = get_cpu_sysdev(i); 1204 struct device *dev = get_cpu_device(i);
1178 1205
1179 err = cache_add_dev(sys_dev); 1206 err = cache_add_dev(dev);
1180 if (err) 1207 if (err)
1181 return err; 1208 return err;
1182 } 1209 }
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index 319882ef848d..fc4beb393577 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -17,6 +17,7 @@
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/string.h> 18#include <linux/string.h>
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/preempt.h>
20#include <linux/smp.h> 21#include <linux/smp.h>
21#include <linux/notifier.h> 22#include <linux/notifier.h>
22#include <linux/kdebug.h> 23#include <linux/kdebug.h>
@@ -92,6 +93,18 @@ static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs)
92 return NMI_HANDLED; 93 return NMI_HANDLED;
93} 94}
94 95
96static void mce_irq_ipi(void *info)
97{
98 int cpu = smp_processor_id();
99 struct mce *m = &__get_cpu_var(injectm);
100
101 if (cpumask_test_cpu(cpu, mce_inject_cpumask) &&
102 m->inject_flags & MCJ_EXCEPTION) {
103 cpumask_clear_cpu(cpu, mce_inject_cpumask);
104 raise_exception(m, NULL);
105 }
106}
107
95/* Inject mce on current CPU */ 108/* Inject mce on current CPU */
96static int raise_local(void) 109static int raise_local(void)
97{ 110{
@@ -139,9 +152,10 @@ static void raise_mce(struct mce *m)
139 return; 152 return;
140 153
141#ifdef CONFIG_X86_LOCAL_APIC 154#ifdef CONFIG_X86_LOCAL_APIC
142 if (m->inject_flags & MCJ_NMI_BROADCAST) { 155 if (m->inject_flags & (MCJ_IRQ_BRAODCAST | MCJ_NMI_BROADCAST)) {
143 unsigned long start; 156 unsigned long start;
144 int cpu; 157 int cpu;
158
145 get_online_cpus(); 159 get_online_cpus();
146 cpumask_copy(mce_inject_cpumask, cpu_online_mask); 160 cpumask_copy(mce_inject_cpumask, cpu_online_mask);
147 cpumask_clear_cpu(get_cpu(), mce_inject_cpumask); 161 cpumask_clear_cpu(get_cpu(), mce_inject_cpumask);
@@ -151,13 +165,25 @@ static void raise_mce(struct mce *m)
151 MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) 165 MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)
152 cpumask_clear_cpu(cpu, mce_inject_cpumask); 166 cpumask_clear_cpu(cpu, mce_inject_cpumask);
153 } 167 }
154 if (!cpumask_empty(mce_inject_cpumask)) 168 if (!cpumask_empty(mce_inject_cpumask)) {
155 apic->send_IPI_mask(mce_inject_cpumask, NMI_VECTOR); 169 if (m->inject_flags & MCJ_IRQ_BRAODCAST) {
170 /*
171 * do not wait here: mce_irq_ipi has to run
172 * concurrently with the raise_local() below
173 */
174 preempt_disable();
175 smp_call_function_many(mce_inject_cpumask,
176 mce_irq_ipi, NULL, 0);
177 preempt_enable();
178 } else if (m->inject_flags & MCJ_NMI_BROADCAST)
179 apic->send_IPI_mask(mce_inject_cpumask,
180 NMI_VECTOR);
181 }
156 start = jiffies; 182 start = jiffies;
157 while (!cpumask_empty(mce_inject_cpumask)) { 183 while (!cpumask_empty(mce_inject_cpumask)) {
158 if (!time_before(jiffies, start + 2*HZ)) { 184 if (!time_before(jiffies, start + 2*HZ)) {
159 printk(KERN_ERR 185 printk(KERN_ERR
160 "Timeout waiting for mce inject NMI %lx\n", 186 "Timeout waiting for mce inject %lx\n",
161 *cpumask_bits(mce_inject_cpumask)); 187 *cpumask_bits(mce_inject_cpumask));
162 break; 188 break;
163 } 189 }
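Aside (not from the patch): the new IRQ-broadcast path reuses the existing handshake — the injector sets one bit per target CPU, kicks the CPUs, each handler clears its own bit, and the injector polls the mask with a 2*HZ timeout. A stand-alone user-space model of that handshake follows; the names are invented and threads stand in for CPUs (build with cc -pthread).

/* Stand-alone model of the broadcast handshake (invented names).
 * Build: cc -pthread -o inject-model inject-model.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

#define NCPUS 4

static atomic_ulong inject_mask;

static void *cpu_thread(void *arg)
{
	unsigned long bit = 1UL << (long)arg;

	usleep(10000 * ((long)arg + 1));	/* pretend to handle the event */
	atomic_fetch_and(&inject_mask, ~bit);	/* "this CPU is done" */
	return NULL;
}

int main(void)
{
	pthread_t tid[NCPUS];
	time_t start;
	long i;

	atomic_store(&inject_mask, (1UL << NCPUS) - 1);
	for (i = 0; i < NCPUS; i++)
		pthread_create(&tid[i], NULL, cpu_thread, (void *)i);

	/* injector side: poll the mask, give up after ~2 seconds */
	start = time(NULL);
	while (atomic_load(&inject_mask)) {
		if (time(NULL) - start >= 2) {
			printf("timeout, mask=%lx\n", atomic_load(&inject_mask));
			break;
		}
		usleep(1000);
	}
	if (!atomic_load(&inject_mask))
		printf("all cpus checked in\n");

	for (i = 0; i < NCPUS; i++)
		pthread_join(tid[i], NULL);
	return 0;
}

The kernel version clears the bits from NMI or IRQ context and spins on jiffies rather than time(), but the completion-by-cpumask idea is the same.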
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index fefcc69ee8b5..ed44c8a65858 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -1,4 +1,4 @@
1#include <linux/sysdev.h> 1#include <linux/device.h>
2#include <asm/mce.h> 2#include <asm/mce.h>
3 3
4enum severity_level { 4enum severity_level {
@@ -17,7 +17,7 @@ enum severity_level {
17struct mce_bank { 17struct mce_bank {
18 u64 ctl; /* subevents to enable */ 18 u64 ctl; /* subevents to enable */
19 unsigned char init; /* initialise bank? */ 19 unsigned char init; /* initialise bank? */
20 struct sysdev_attribute attr; /* sysdev attribute */ 20 struct device_attribute attr; /* device attribute */
21 char attrname[ATTR_LEN]; /* attribute name */ 21 char attrname[ATTR_LEN]; /* attribute name */
22}; 22};
23 23
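Aside: struct mce_bank now embeds a struct device_attribute, and mce.c recovers the bank from the attribute pointer with container_of() (see attr_to_bank() further down). A tiny stand-alone illustration of that idiom, with invented names and a simplified container_of():

/* Stand-alone illustration; user space, simplified container_of(). */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct fake_attr {
	const char *name;
};

struct fake_bank {
	unsigned long long ctl;
	struct fake_attr attr;		/* embedded, like mce_bank.attr */
};

static struct fake_bank *attr_to_bank(struct fake_attr *attr)
{
	return container_of(attr, struct fake_bank, attr);
}

int main(void)
{
	struct fake_bank bank = { .ctl = 0xff, .attr = { .name = "bank0" } };
	struct fake_attr *a = &bank.attr;	/* what a show/store callback gets */

	printf("%s: ctl=%llx\n", a->name, attr_to_bank(a)->ctl);
	return 0;
}

The kernel's container_of() adds a type check on top; the pointer arithmetic is the same.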
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 2af127d4c3d1..5a11ae2e9e91 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -19,7 +19,7 @@
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/percpu.h> 20#include <linux/percpu.h>
21#include <linux/string.h> 21#include <linux/string.h>
22#include <linux/sysdev.h> 22#include <linux/device.h>
23#include <linux/syscore_ops.h> 23#include <linux/syscore_ops.h>
24#include <linux/delay.h> 24#include <linux/delay.h>
25#include <linux/ctype.h> 25#include <linux/ctype.h>
@@ -95,13 +95,6 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
95static DEFINE_PER_CPU(struct mce, mces_seen); 95static DEFINE_PER_CPU(struct mce, mces_seen);
96static int cpu_missing; 96static int cpu_missing;
97 97
98/*
99 * CPU/chipset specific EDAC code can register a notifier call here to print
100 * MCE errors in a human-readable form.
101 */
102ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
103EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
104
105/* MCA banks polled by the period polling timer for corrected events */ 98/* MCA banks polled by the period polling timer for corrected events */
106DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 99DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
107 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL 100 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
@@ -109,6 +102,12 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
109 102
110static DEFINE_PER_CPU(struct work_struct, mce_work); 103static DEFINE_PER_CPU(struct work_struct, mce_work);
111 104
105/*
106 * CPU/chipset specific EDAC code can register a notifier call here to print
107 * MCE errors in a human-readable form.
108 */
109ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
110
112/* Do initial initialization of a struct mce */ 111/* Do initial initialization of a struct mce */
113void mce_setup(struct mce *m) 112void mce_setup(struct mce *m)
114{ 113{
@@ -119,9 +118,7 @@ void mce_setup(struct mce *m)
119 m->time = get_seconds(); 118 m->time = get_seconds();
120 m->cpuvendor = boot_cpu_data.x86_vendor; 119 m->cpuvendor = boot_cpu_data.x86_vendor;
121 m->cpuid = cpuid_eax(1); 120 m->cpuid = cpuid_eax(1);
122#ifdef CONFIG_SMP
123 m->socketid = cpu_data(m->extcpu).phys_proc_id; 121 m->socketid = cpu_data(m->extcpu).phys_proc_id;
124#endif
125 m->apicid = cpu_data(m->extcpu).initial_apicid; 122 m->apicid = cpu_data(m->extcpu).initial_apicid;
126 rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); 123 rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
127} 124}
@@ -190,6 +187,57 @@ void mce_log(struct mce *mce)
190 set_bit(0, &mce_need_notify); 187 set_bit(0, &mce_need_notify);
191} 188}
192 189
190static void drain_mcelog_buffer(void)
191{
192 unsigned int next, i, prev = 0;
193
194 next = rcu_dereference_check_mce(mcelog.next);
195
196 do {
197 struct mce *m;
198
199 /* drain what was logged during boot */
200 for (i = prev; i < next; i++) {
201 unsigned long start = jiffies;
202 unsigned retries = 1;
203
204 m = &mcelog.entry[i];
205
206 while (!m->finished) {
207 if (time_after_eq(jiffies, start + 2*retries))
208 retries++;
209
210 cpu_relax();
211
212 if (!m->finished && retries >= 4) {
213 pr_err("MCE: skipping error being logged currently!\n");
214 break;
215 }
216 }
217 smp_rmb();
218 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
219 }
220
221 memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
222 prev = next;
223 next = cmpxchg(&mcelog.next, prev, 0);
224 } while (next != prev);
225}
226
227
228void mce_register_decode_chain(struct notifier_block *nb)
229{
230 atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
231 drain_mcelog_buffer();
232}
233EXPORT_SYMBOL_GPL(mce_register_decode_chain);
234
235void mce_unregister_decode_chain(struct notifier_block *nb)
236{
237 atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
238}
239EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
240
193static void print_mce(struct mce *m) 241static void print_mce(struct mce *m)
194{ 242{
195 int ret = 0; 243 int ret = 0;
@@ -1770,7 +1818,7 @@ static struct syscore_ops mce_syscore_ops = {
1770}; 1818};
1771 1819
1772/* 1820/*
1773 * mce_sysdev: Sysfs support 1821 * mce_device: Sysfs support
1774 */ 1822 */
1775 1823
1776static void mce_cpu_restart(void *data) 1824static void mce_cpu_restart(void *data)
@@ -1806,27 +1854,28 @@ static void mce_enable_ce(void *all)
1806 __mcheck_cpu_init_timer(); 1854 __mcheck_cpu_init_timer();
1807} 1855}
1808 1856
1809static struct sysdev_class mce_sysdev_class = { 1857static struct bus_type mce_subsys = {
1810 .name = "machinecheck", 1858 .name = "machinecheck",
1859 .dev_name = "machinecheck",
1811}; 1860};
1812 1861
1813DEFINE_PER_CPU(struct sys_device, mce_sysdev); 1862struct device *mce_device[CONFIG_NR_CPUS];
1814 1863
1815__cpuinitdata 1864__cpuinitdata
1816void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 1865void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
1817 1866
1818static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr) 1867static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
1819{ 1868{
1820 return container_of(attr, struct mce_bank, attr); 1869 return container_of(attr, struct mce_bank, attr);
1821} 1870}
1822 1871
1823static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, 1872static ssize_t show_bank(struct device *s, struct device_attribute *attr,
1824 char *buf) 1873 char *buf)
1825{ 1874{
1826 return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); 1875 return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
1827} 1876}
1828 1877
1829static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, 1878static ssize_t set_bank(struct device *s, struct device_attribute *attr,
1830 const char *buf, size_t size) 1879 const char *buf, size_t size)
1831{ 1880{
1832 u64 new; 1881 u64 new;
@@ -1841,14 +1890,14 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
1841} 1890}
1842 1891
1843static ssize_t 1892static ssize_t
1844show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) 1893show_trigger(struct device *s, struct device_attribute *attr, char *buf)
1845{ 1894{
1846 strcpy(buf, mce_helper); 1895 strcpy(buf, mce_helper);
1847 strcat(buf, "\n"); 1896 strcat(buf, "\n");
1848 return strlen(mce_helper) + 1; 1897 return strlen(mce_helper) + 1;
1849} 1898}
1850 1899
1851static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, 1900static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
1852 const char *buf, size_t siz) 1901 const char *buf, size_t siz)
1853{ 1902{
1854 char *p; 1903 char *p;
@@ -1863,8 +1912,8 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
1863 return strlen(mce_helper) + !!p; 1912 return strlen(mce_helper) + !!p;
1864} 1913}
1865 1914
1866static ssize_t set_ignore_ce(struct sys_device *s, 1915static ssize_t set_ignore_ce(struct device *s,
1867 struct sysdev_attribute *attr, 1916 struct device_attribute *attr,
1868 const char *buf, size_t size) 1917 const char *buf, size_t size)
1869{ 1918{
1870 u64 new; 1919 u64 new;
@@ -1887,8 +1936,8 @@ static ssize_t set_ignore_ce(struct sys_device *s,
1887 return size; 1936 return size;
1888} 1937}
1889 1938
1890static ssize_t set_cmci_disabled(struct sys_device *s, 1939static ssize_t set_cmci_disabled(struct device *s,
1891 struct sysdev_attribute *attr, 1940 struct device_attribute *attr,
1892 const char *buf, size_t size) 1941 const char *buf, size_t size)
1893{ 1942{
1894 u64 new; 1943 u64 new;
@@ -1910,108 +1959,117 @@ static ssize_t set_cmci_disabled(struct sys_device *s,
1910 return size; 1959 return size;
1911} 1960}
1912 1961
1913static ssize_t store_int_with_restart(struct sys_device *s, 1962static ssize_t store_int_with_restart(struct device *s,
1914 struct sysdev_attribute *attr, 1963 struct device_attribute *attr,
1915 const char *buf, size_t size) 1964 const char *buf, size_t size)
1916{ 1965{
1917 ssize_t ret = sysdev_store_int(s, attr, buf, size); 1966 ssize_t ret = device_store_int(s, attr, buf, size);
1918 mce_restart(); 1967 mce_restart();
1919 return ret; 1968 return ret;
1920} 1969}
1921 1970
1922static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); 1971static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
1923static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); 1972static DEVICE_INT_ATTR(tolerant, 0644, tolerant);
1924static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); 1973static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
1925static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); 1974static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
1926 1975
1927static struct sysdev_ext_attribute attr_check_interval = { 1976static struct dev_ext_attribute dev_attr_check_interval = {
1928 _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, 1977 __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
1929 store_int_with_restart),
1930 &check_interval 1978 &check_interval
1931}; 1979};
1932 1980
1933static struct sysdev_ext_attribute attr_ignore_ce = { 1981static struct dev_ext_attribute dev_attr_ignore_ce = {
1934 _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce), 1982 __ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce),
1935 &mce_ignore_ce 1983 &mce_ignore_ce
1936}; 1984};
1937 1985
1938static struct sysdev_ext_attribute attr_cmci_disabled = { 1986static struct dev_ext_attribute dev_attr_cmci_disabled = {
1939 _SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled), 1987 __ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled),
1940 &mce_cmci_disabled 1988 &mce_cmci_disabled
1941}; 1989};
1942 1990
1943static struct sysdev_attribute *mce_sysdev_attrs[] = { 1991static struct device_attribute *mce_device_attrs[] = {
1944 &attr_tolerant.attr, 1992 &dev_attr_tolerant.attr,
1945 &attr_check_interval.attr, 1993 &dev_attr_check_interval.attr,
1946 &attr_trigger, 1994 &dev_attr_trigger,
1947 &attr_monarch_timeout.attr, 1995 &dev_attr_monarch_timeout.attr,
1948 &attr_dont_log_ce.attr, 1996 &dev_attr_dont_log_ce.attr,
1949 &attr_ignore_ce.attr, 1997 &dev_attr_ignore_ce.attr,
1950 &attr_cmci_disabled.attr, 1998 &dev_attr_cmci_disabled.attr,
1951 NULL 1999 NULL
1952}; 2000};
1953 2001
1954static cpumask_var_t mce_sysdev_initialized; 2002static cpumask_var_t mce_device_initialized;
2003
2004static void mce_device_release(struct device *dev)
2005{
2006 kfree(dev);
2007}
1955 2008
1956/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ 2009/* Per cpu device init. All of the cpus still share the same ctrl bank: */
1957static __cpuinit int mce_sysdev_create(unsigned int cpu) 2010static __cpuinit int mce_device_create(unsigned int cpu)
1958{ 2011{
1959 struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu); 2012 struct device *dev;
1960 int err; 2013 int err;
1961 int i, j; 2014 int i, j;
1962 2015
1963 if (!mce_available(&boot_cpu_data)) 2016 if (!mce_available(&boot_cpu_data))
1964 return -EIO; 2017 return -EIO;
1965 2018
1966 memset(&sysdev->kobj, 0, sizeof(struct kobject)); 2019 dev = kzalloc(sizeof *dev, GFP_KERNEL);
1967 sysdev->id = cpu; 2020 if (!dev)
1968 sysdev->cls = &mce_sysdev_class; 2021 return -ENOMEM;
2022 dev->id = cpu;
2023 dev->bus = &mce_subsys;
2024 dev->release = &mce_device_release;
1969 2025
1970 err = sysdev_register(sysdev); 2026 err = device_register(dev);
1971 if (err) 2027 if (err)
1972 return err; 2028 return err;
1973 2029
1974 for (i = 0; mce_sysdev_attrs[i]; i++) { 2030 for (i = 0; mce_device_attrs[i]; i++) {
1975 err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]); 2031 err = device_create_file(dev, mce_device_attrs[i]);
1976 if (err) 2032 if (err)
1977 goto error; 2033 goto error;
1978 } 2034 }
1979 for (j = 0; j < banks; j++) { 2035 for (j = 0; j < banks; j++) {
1980 err = sysdev_create_file(sysdev, &mce_banks[j].attr); 2036 err = device_create_file(dev, &mce_banks[j].attr);
1981 if (err) 2037 if (err)
1982 goto error2; 2038 goto error2;
1983 } 2039 }
1984 cpumask_set_cpu(cpu, mce_sysdev_initialized); 2040 cpumask_set_cpu(cpu, mce_device_initialized);
2041 mce_device[cpu] = dev;
1985 2042
1986 return 0; 2043 return 0;
1987error2: 2044error2:
1988 while (--j >= 0) 2045 while (--j >= 0)
1989 sysdev_remove_file(sysdev, &mce_banks[j].attr); 2046 device_remove_file(dev, &mce_banks[j].attr);
1990error: 2047error:
1991 while (--i >= 0) 2048 while (--i >= 0)
1992 sysdev_remove_file(sysdev, mce_sysdev_attrs[i]); 2049 device_remove_file(dev, mce_device_attrs[i]);
1993 2050
1994 sysdev_unregister(sysdev); 2051 device_unregister(dev);
1995 2052
1996 return err; 2053 return err;
1997} 2054}
1998 2055
1999static __cpuinit void mce_sysdev_remove(unsigned int cpu) 2056static __cpuinit void mce_device_remove(unsigned int cpu)
2000{ 2057{
2001 struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu); 2058 struct device *dev = mce_device[cpu];
2002 int i; 2059 int i;
2003 2060
2004 if (!cpumask_test_cpu(cpu, mce_sysdev_initialized)) 2061 if (!cpumask_test_cpu(cpu, mce_device_initialized))
2005 return; 2062 return;
2006 2063
2007 for (i = 0; mce_sysdev_attrs[i]; i++) 2064 for (i = 0; mce_device_attrs[i]; i++)
2008 sysdev_remove_file(sysdev, mce_sysdev_attrs[i]); 2065 device_remove_file(dev, mce_device_attrs[i]);
2009 2066
2010 for (i = 0; i < banks; i++) 2067 for (i = 0; i < banks; i++)
2011 sysdev_remove_file(sysdev, &mce_banks[i].attr); 2068 device_remove_file(dev, &mce_banks[i].attr);
2012 2069
2013 sysdev_unregister(sysdev); 2070 device_unregister(dev);
2014 cpumask_clear_cpu(cpu, mce_sysdev_initialized); 2071 cpumask_clear_cpu(cpu, mce_device_initialized);
2072 mce_device[cpu] = NULL;
2015} 2073}
2016 2074
2017/* Make sure there are no machine checks on offlined CPUs. */ 2075/* Make sure there are no machine checks on offlined CPUs. */
@@ -2061,7 +2119,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2061 switch (action) { 2119 switch (action) {
2062 case CPU_ONLINE: 2120 case CPU_ONLINE:
2063 case CPU_ONLINE_FROZEN: 2121 case CPU_ONLINE_FROZEN:
2064 mce_sysdev_create(cpu); 2122 mce_device_create(cpu);
2065 if (threshold_cpu_callback) 2123 if (threshold_cpu_callback)
2066 threshold_cpu_callback(action, cpu); 2124 threshold_cpu_callback(action, cpu);
2067 break; 2125 break;
@@ -2069,7 +2127,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2069 case CPU_DEAD_FROZEN: 2127 case CPU_DEAD_FROZEN:
2070 if (threshold_cpu_callback) 2128 if (threshold_cpu_callback)
2071 threshold_cpu_callback(action, cpu); 2129 threshold_cpu_callback(action, cpu);
2072 mce_sysdev_remove(cpu); 2130 mce_device_remove(cpu);
2073 break; 2131 break;
2074 case CPU_DOWN_PREPARE: 2132 case CPU_DOWN_PREPARE:
2075 case CPU_DOWN_PREPARE_FROZEN: 2133 case CPU_DOWN_PREPARE_FROZEN:
@@ -2103,7 +2161,7 @@ static __init void mce_init_banks(void)
2103 2161
2104 for (i = 0; i < banks; i++) { 2162 for (i = 0; i < banks; i++) {
2105 struct mce_bank *b = &mce_banks[i]; 2163 struct mce_bank *b = &mce_banks[i];
2106 struct sysdev_attribute *a = &b->attr; 2164 struct device_attribute *a = &b->attr;
2107 2165
2108 sysfs_attr_init(&a->attr); 2166 sysfs_attr_init(&a->attr);
2109 a->attr.name = b->attrname; 2167 a->attr.name = b->attrname;
@@ -2123,16 +2181,16 @@ static __init int mcheck_init_device(void)
2123 if (!mce_available(&boot_cpu_data)) 2181 if (!mce_available(&boot_cpu_data))
2124 return -EIO; 2182 return -EIO;
2125 2183
2126 zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL); 2184 zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
2127 2185
2128 mce_init_banks(); 2186 mce_init_banks();
2129 2187
2130 err = sysdev_class_register(&mce_sysdev_class); 2188 err = subsys_system_register(&mce_subsys, NULL);
2131 if (err) 2189 if (err)
2132 return err; 2190 return err;
2133 2191
2134 for_each_online_cpu(i) { 2192 for_each_online_cpu(i) {
2135 err = mce_sysdev_create(i); 2193 err = mce_device_create(i);
2136 if (err) 2194 if (err)
2137 return err; 2195 return err;
2138 } 2196 }
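Aside: beyond the sysdev-to-device rename, the interesting new piece above is mce_register_decode_chain() calling drain_mcelog_buffer(), so a decoder that registers late (EDAC, for instance) is still fed whatever was logged during boot. A simplified stand-alone model of that register-then-replay idea, with invented names and no locking:

/* Stand-alone model: a consumer that registers late is fed the backlog. */
#include <stdio.h>

#define LOG_LEN 8

struct record { int finished; int data; };

static struct record logbuf[LOG_LEN];
static int log_next;

typedef void (*decoder_t)(const struct record *r);
static decoder_t chain[4];
static int nchain;

static void log_event(int data)
{
	if (log_next < LOG_LEN) {
		logbuf[log_next].data = data;
		logbuf[log_next].finished = 1;	/* record fully written */
		log_next++;
	}
}

static void drain_backlog(decoder_t dec)
{
	int i;

	for (i = 0; i < log_next; i++)
		if (logbuf[i].finished)
			dec(&logbuf[i]);
	log_next = 0;				/* backlog handed over */
}

static void register_decoder(decoder_t dec)
{
	if (nchain < 4)
		chain[nchain++] = dec;
	drain_backlog(dec);			/* replay what was logged earlier */
}

static void print_decoder(const struct record *r)
{
	printf("decoded record %d\n", r->data);
}

int main(void)
{
	log_event(1);				/* logged before any decoder exists */
	log_event(2);
	register_decoder(print_decoder);	/* still sees records 1 and 2 */
	return 0;
}

The real drain_mcelog_buffer() also waits briefly for records still being written (the !m->finished retry loop) and resets mcelog.next with cmpxchg(); the replay-on-register idea is what the model shows.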
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index f5474218cffe..e4eeaaf58a47 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -17,7 +17,6 @@
17#include <linux/notifier.h> 17#include <linux/notifier.h>
18#include <linux/kobject.h> 18#include <linux/kobject.h>
19#include <linux/percpu.h> 19#include <linux/percpu.h>
20#include <linux/sysdev.h>
21#include <linux/errno.h> 20#include <linux/errno.h>
22#include <linux/sched.h> 21#include <linux/sched.h>
23#include <linux/sysfs.h> 22#include <linux/sysfs.h>
@@ -64,11 +63,9 @@ struct threshold_bank {
64}; 63};
65static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); 64static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks);
66 65
67#ifdef CONFIG_SMP
68static unsigned char shared_bank[NR_BANKS] = { 66static unsigned char shared_bank[NR_BANKS] = {
69 0, 0, 0, 0, 1 67 0, 0, 0, 0, 1
70}; 68};
71#endif
72 69
73static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ 70static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
74 71
@@ -202,10 +199,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
202 199
203 if (!block) 200 if (!block)
204 per_cpu(bank_map, cpu) |= (1 << bank); 201 per_cpu(bank_map, cpu) |= (1 << bank);
205#ifdef CONFIG_SMP
206 if (shared_bank[bank] && c->cpu_core_id) 202 if (shared_bank[bank] && c->cpu_core_id)
207 break; 203 break;
208#endif 204
209 offset = setup_APIC_mce(offset, 205 offset = setup_APIC_mce(offset,
210 (high & MASK_LVTOFF_HI) >> 20); 206 (high & MASK_LVTOFF_HI) >> 20);
211 207
@@ -527,6 +523,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
527{ 523{
528 int i, err = 0; 524 int i, err = 0;
529 struct threshold_bank *b = NULL; 525 struct threshold_bank *b = NULL;
526 struct device *dev = mce_device[cpu];
530 char name[32]; 527 char name[32];
531 528
532 sprintf(name, "threshold_bank%i", bank); 529 sprintf(name, "threshold_bank%i", bank);
@@ -548,8 +545,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
548 if (!b) 545 if (!b)
549 goto out; 546 goto out;
550 547
551 err = sysfs_create_link(&per_cpu(mce_sysdev, cpu).kobj, 548 err = sysfs_create_link(&dev->kobj, b->kobj, name);
552 b->kobj, name);
553 if (err) 549 if (err)
554 goto out; 550 goto out;
555 551
@@ -571,7 +567,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
571 goto out; 567 goto out;
572 } 568 }
573 569
574 b->kobj = kobject_create_and_add(name, &per_cpu(mce_sysdev, cpu).kobj); 570 b->kobj = kobject_create_and_add(name, &dev->kobj);
575 if (!b->kobj) 571 if (!b->kobj)
576 goto out_free; 572 goto out_free;
577 573
@@ -591,8 +587,9 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
591 if (i == cpu) 587 if (i == cpu)
592 continue; 588 continue;
593 589
594 err = sysfs_create_link(&per_cpu(mce_sysdev, i).kobj, 590 dev = mce_device[i];
595 b->kobj, name); 591 if (dev)
592 err = sysfs_create_link(&dev->kobj,b->kobj, name);
596 if (err) 593 if (err)
597 goto out; 594 goto out;
598 595
@@ -655,6 +652,7 @@ static void deallocate_threshold_block(unsigned int cpu,
655static void threshold_remove_bank(unsigned int cpu, int bank) 652static void threshold_remove_bank(unsigned int cpu, int bank)
656{ 653{
657 struct threshold_bank *b; 654 struct threshold_bank *b;
655 struct device *dev;
658 char name[32]; 656 char name[32];
659 int i = 0; 657 int i = 0;
660 658
@@ -669,7 +667,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
669#ifdef CONFIG_SMP 667#ifdef CONFIG_SMP
670 /* sibling symlink */ 668 /* sibling symlink */
671 if (shared_bank[bank] && b->blocks->cpu != cpu) { 669 if (shared_bank[bank] && b->blocks->cpu != cpu) {
672 sysfs_remove_link(&per_cpu(mce_sysdev, cpu).kobj, name); 670 sysfs_remove_link(&mce_device[cpu]->kobj, name);
673 per_cpu(threshold_banks, cpu)[bank] = NULL; 671 per_cpu(threshold_banks, cpu)[bank] = NULL;
674 672
675 return; 673 return;
@@ -681,7 +679,9 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
681 if (i == cpu) 679 if (i == cpu)
682 continue; 680 continue;
683 681
684 sysfs_remove_link(&per_cpu(mce_sysdev, i).kobj, name); 682 dev = mce_device[i];
683 if (dev)
684 sysfs_remove_link(&dev->kobj, name);
685 per_cpu(threshold_banks, i)[bank] = NULL; 685 per_cpu(threshold_banks, i)[bank] = NULL;
686 } 686 }
687 687
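Aside: with per_cpu(mce_sysdev) gone, the AMD threshold code now goes through the mce_device[] array introduced in mce.c and has to tolerate NULL slots for CPUs whose device is not registered. A hypothetical fragment (kernel context assumed, error handling and locking trimmed) of the resulting "one owner kobject, symlinks for siblings" layout:

/* Hypothetical fragment; assumes kernel context and the mce_device[] array
 * introduced by this patch. Error handling and locking trimmed. */
static int demo_share_bank(unsigned int owner, unsigned int sibling,
			   const char *name, struct kobject **bank_kobj)
{
	struct device *dev = mce_device[owner];

	/* the first CPU on the node owns the real kobject ... */
	*bank_kobj = kobject_create_and_add(name, &dev->kobj);
	if (!*bank_kobj)
		return -ENOMEM;

	/* ... every sibling only gets a symlink under its own device */
	dev = mce_device[sibling];
	if (dev)	/* may be NULL if that CPU's device is not registered */
		return sysfs_create_link(&dev->kobj, *bank_kobj, name);
	return 0;
}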
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 787e06c84ea6..67bb17a37a0a 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -19,7 +19,6 @@
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/percpu.h> 20#include <linux/percpu.h>
21#include <linux/export.h> 21#include <linux/export.h>
22#include <linux/sysdev.h>
23#include <linux/types.h> 22#include <linux/types.h>
24#include <linux/init.h> 23#include <linux/init.h>
25#include <linux/smp.h> 24#include <linux/smp.h>
@@ -69,16 +68,16 @@ static atomic_t therm_throt_en = ATOMIC_INIT(0);
69static u32 lvtthmr_init __read_mostly; 68static u32 lvtthmr_init __read_mostly;
70 69
71#ifdef CONFIG_SYSFS 70#ifdef CONFIG_SYSFS
72#define define_therm_throt_sysdev_one_ro(_name) \ 71#define define_therm_throt_device_one_ro(_name) \
73 static SYSDEV_ATTR(_name, 0444, \ 72 static DEVICE_ATTR(_name, 0444, \
74 therm_throt_sysdev_show_##_name, \ 73 therm_throt_device_show_##_name, \
75 NULL) \ 74 NULL) \
76 75
77#define define_therm_throt_sysdev_show_func(event, name) \ 76#define define_therm_throt_device_show_func(event, name) \
78 \ 77 \
79static ssize_t therm_throt_sysdev_show_##event##_##name( \ 78static ssize_t therm_throt_device_show_##event##_##name( \
80 struct sys_device *dev, \ 79 struct device *dev, \
81 struct sysdev_attribute *attr, \ 80 struct device_attribute *attr, \
82 char *buf) \ 81 char *buf) \
83{ \ 82{ \
84 unsigned int cpu = dev->id; \ 83 unsigned int cpu = dev->id; \
@@ -95,20 +94,20 @@ static ssize_t therm_throt_sysdev_show_##event##_##name( \
95 return ret; \ 94 return ret; \
96} 95}
97 96
98define_therm_throt_sysdev_show_func(core_throttle, count); 97define_therm_throt_device_show_func(core_throttle, count);
99define_therm_throt_sysdev_one_ro(core_throttle_count); 98define_therm_throt_device_one_ro(core_throttle_count);
100 99
101define_therm_throt_sysdev_show_func(core_power_limit, count); 100define_therm_throt_device_show_func(core_power_limit, count);
102define_therm_throt_sysdev_one_ro(core_power_limit_count); 101define_therm_throt_device_one_ro(core_power_limit_count);
103 102
104define_therm_throt_sysdev_show_func(package_throttle, count); 103define_therm_throt_device_show_func(package_throttle, count);
105define_therm_throt_sysdev_one_ro(package_throttle_count); 104define_therm_throt_device_one_ro(package_throttle_count);
106 105
107define_therm_throt_sysdev_show_func(package_power_limit, count); 106define_therm_throt_device_show_func(package_power_limit, count);
108define_therm_throt_sysdev_one_ro(package_power_limit_count); 107define_therm_throt_device_one_ro(package_power_limit_count);
109 108
110static struct attribute *thermal_throttle_attrs[] = { 109static struct attribute *thermal_throttle_attrs[] = {
111 &attr_core_throttle_count.attr, 110 &dev_attr_core_throttle_count.attr,
112 NULL 111 NULL
113}; 112};
114 113
@@ -223,36 +222,36 @@ static int thresh_event_valid(int event)
223 222
224#ifdef CONFIG_SYSFS 223#ifdef CONFIG_SYSFS
225/* Add/Remove thermal_throttle interface for CPU device: */ 224/* Add/Remove thermal_throttle interface for CPU device: */
226static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, 225static __cpuinit int thermal_throttle_add_dev(struct device *dev,
227 unsigned int cpu) 226 unsigned int cpu)
228{ 227{
229 int err; 228 int err;
230 struct cpuinfo_x86 *c = &cpu_data(cpu); 229 struct cpuinfo_x86 *c = &cpu_data(cpu);
231 230
232 err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group); 231 err = sysfs_create_group(&dev->kobj, &thermal_attr_group);
233 if (err) 232 if (err)
234 return err; 233 return err;
235 234
236 if (cpu_has(c, X86_FEATURE_PLN)) 235 if (cpu_has(c, X86_FEATURE_PLN))
237 err = sysfs_add_file_to_group(&sys_dev->kobj, 236 err = sysfs_add_file_to_group(&dev->kobj,
238 &attr_core_power_limit_count.attr, 237 &dev_attr_core_power_limit_count.attr,
239 thermal_attr_group.name); 238 thermal_attr_group.name);
240 if (cpu_has(c, X86_FEATURE_PTS)) { 239 if (cpu_has(c, X86_FEATURE_PTS)) {
241 err = sysfs_add_file_to_group(&sys_dev->kobj, 240 err = sysfs_add_file_to_group(&dev->kobj,
242 &attr_package_throttle_count.attr, 241 &dev_attr_package_throttle_count.attr,
243 thermal_attr_group.name); 242 thermal_attr_group.name);
244 if (cpu_has(c, X86_FEATURE_PLN)) 243 if (cpu_has(c, X86_FEATURE_PLN))
245 err = sysfs_add_file_to_group(&sys_dev->kobj, 244 err = sysfs_add_file_to_group(&dev->kobj,
246 &attr_package_power_limit_count.attr, 245 &dev_attr_package_power_limit_count.attr,
247 thermal_attr_group.name); 246 thermal_attr_group.name);
248 } 247 }
249 248
250 return err; 249 return err;
251} 250}
252 251
253static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) 252static __cpuinit void thermal_throttle_remove_dev(struct device *dev)
254{ 253{
255 sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group); 254 sysfs_remove_group(&dev->kobj, &thermal_attr_group);
256} 255}
257 256
258/* Mutex protecting device creation against CPU hotplug: */ 257/* Mutex protecting device creation against CPU hotplug: */
@@ -265,16 +264,16 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
265 void *hcpu) 264 void *hcpu)
266{ 265{
267 unsigned int cpu = (unsigned long)hcpu; 266 unsigned int cpu = (unsigned long)hcpu;
268 struct sys_device *sys_dev; 267 struct device *dev;
269 int err = 0; 268 int err = 0;
270 269
271 sys_dev = get_cpu_sysdev(cpu); 270 dev = get_cpu_device(cpu);
272 271
273 switch (action) { 272 switch (action) {
274 case CPU_UP_PREPARE: 273 case CPU_UP_PREPARE:
275 case CPU_UP_PREPARE_FROZEN: 274 case CPU_UP_PREPARE_FROZEN:
276 mutex_lock(&therm_cpu_lock); 275 mutex_lock(&therm_cpu_lock);
277 err = thermal_throttle_add_dev(sys_dev, cpu); 276 err = thermal_throttle_add_dev(dev, cpu);
278 mutex_unlock(&therm_cpu_lock); 277 mutex_unlock(&therm_cpu_lock);
279 WARN_ON(err); 278 WARN_ON(err);
280 break; 279 break;
@@ -283,7 +282,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
283 case CPU_DEAD: 282 case CPU_DEAD:
284 case CPU_DEAD_FROZEN: 283 case CPU_DEAD_FROZEN:
285 mutex_lock(&therm_cpu_lock); 284 mutex_lock(&therm_cpu_lock);
286 thermal_throttle_remove_dev(sys_dev); 285 thermal_throttle_remove_dev(dev);
287 mutex_unlock(&therm_cpu_lock); 286 mutex_unlock(&therm_cpu_lock);
288 break; 287 break;
289 } 288 }
@@ -310,7 +309,7 @@ static __init int thermal_throttle_init_device(void)
310#endif 309#endif
311 /* connect live CPUs to sysfs */ 310 /* connect live CPUs to sysfs */
312 for_each_online_cpu(cpu) { 311 for_each_online_cpu(cpu) {
313 err = thermal_throttle_add_dev(get_cpu_sysdev(cpu), cpu); 312 err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu);
314 WARN_ON(err); 313 WARN_ON(err);
315 } 314 }
316#ifdef CONFIG_HOTPLUG_CPU 315#ifdef CONFIG_HOTPLUG_CPU
@@ -323,17 +322,6 @@ device_initcall(thermal_throttle_init_device);
323 322
324#endif /* CONFIG_SYSFS */ 323#endif /* CONFIG_SYSFS */
325 324
326/*
327 * Set up the most two significant bit to notify mce log that this thermal
328 * event type.
329 * This is a temp solution. May be changed in the future with mce log
330 * infrasture.
331 */
332#define CORE_THROTTLED (0)
333#define CORE_POWER_LIMIT ((__u64)1 << 62)
334#define PACKAGE_THROTTLED ((__u64)2 << 62)
335#define PACKAGE_POWER_LIMIT ((__u64)3 << 62)
336
337static void notify_thresholds(__u64 msr_val) 325static void notify_thresholds(__u64 msr_val)
338{ 326{
339 /* check whether the interrupt handler is defined; 327 /* check whether the interrupt handler is defined;
@@ -363,27 +351,23 @@ static void intel_thermal_interrupt(void)
363 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, 351 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
364 THERMAL_THROTTLING_EVENT, 352 THERMAL_THROTTLING_EVENT,
365 CORE_LEVEL) != 0) 353 CORE_LEVEL) != 0)
366 mce_log_therm_throt_event(CORE_THROTTLED | msr_val); 354 mce_log_therm_throt_event(msr_val);
367 355
368 if (this_cpu_has(X86_FEATURE_PLN)) 356 if (this_cpu_has(X86_FEATURE_PLN))
369 if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, 357 therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
370 POWER_LIMIT_EVENT, 358 POWER_LIMIT_EVENT,
371 CORE_LEVEL) != 0) 359 CORE_LEVEL);
372 mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
373 360
374 if (this_cpu_has(X86_FEATURE_PTS)) { 361 if (this_cpu_has(X86_FEATURE_PTS)) {
375 rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); 362 rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
376 if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, 363 therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
377 THERMAL_THROTTLING_EVENT, 364 THERMAL_THROTTLING_EVENT,
378 PACKAGE_LEVEL) != 0) 365 PACKAGE_LEVEL);
379 mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
380 if (this_cpu_has(X86_FEATURE_PLN)) 366 if (this_cpu_has(X86_FEATURE_PLN))
381 if (therm_throt_process(msr_val & 367 therm_throt_process(msr_val &
382 PACKAGE_THERM_STATUS_POWER_LIMIT, 368 PACKAGE_THERM_STATUS_POWER_LIMIT,
383 POWER_LIMIT_EVENT, 369 POWER_LIMIT_EVENT,
384 PACKAGE_LEVEL) != 0) 370 PACKAGE_LEVEL);
385 mce_log_therm_throt_event(PACKAGE_POWER_LIMIT
386 | msr_val);
387 } 371 }
388} 372}
389 373
@@ -397,8 +381,8 @@ static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
397 381
398asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) 382asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
399{ 383{
400 exit_idle();
401 irq_enter(); 384 irq_enter();
385 exit_idle();
402 inc_irq_stat(irq_thermal_count); 386 inc_irq_stat(irq_thermal_count);
403 smp_thermal_vector(); 387 smp_thermal_vector();
404 irq_exit(); 388 irq_exit();
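Aside: the therm_throt conversion is mostly renaming, but the macro-generated show functions are worth calling out — one macro stamps out a show routine per counter, and DEVICE_ATTR() then names the attribute variable dev_attr_<name>, which is why the attribute lists above change from attr_* to dev_attr_*. A stand-alone user-space model of the stamping pattern, with invented names:

/* Stand-alone model of macro-stamped show functions; invented names. */
#include <stdio.h>

struct thermal_state {
	unsigned long core_throttle_count;
	unsigned long core_power_limit_count;
};

static struct thermal_state state = { 42, 7 };

#define define_show_func(name)						\
static int show_##name(char *buf, int len)				\
{									\
	return snprintf(buf, len, "%lu\n", state.name);			\
}

define_show_func(core_throttle_count)
define_show_func(core_power_limit_count)

int main(void)
{
	char buf[32];

	show_core_throttle_count(buf, sizeof(buf));
	printf("core_throttle_count: %s", buf);
	show_core_power_limit_count(buf, sizeof(buf));
	printf("core_power_limit_count: %s", buf);
	return 0;
}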
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index d746df2909c9..aa578cadb940 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -19,8 +19,8 @@ void (*mce_threshold_vector)(void) = default_threshold_interrupt;
19 19
20asmlinkage void smp_threshold_interrupt(void) 20asmlinkage void smp_threshold_interrupt(void)
21{ 21{
22 exit_idle();
23 irq_enter(); 22 irq_enter();
23 exit_idle();
24 inc_irq_stat(irq_threshold_count); 24 inc_irq_stat(irq_threshold_count);
25 mce_threshold_vector(); 25 mce_threshold_vector();
26 irq_exit(); 26 irq_exit();
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index a71efcdbb092..97b26356e9ee 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -547,6 +547,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
547 547
548 if (tmp != mask_lo) { 548 if (tmp != mask_lo) {
549 printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n"); 549 printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
550 add_taint(TAINT_FIRMWARE_WORKAROUND);
550 mask_lo = tmp; 551 mask_lo = tmp;
551 } 552 }
552 } 553 }
@@ -693,6 +694,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
693 694
694 /* Disable MTRRs, and set the default type to uncached */ 695 /* Disable MTRRs, and set the default type to uncached */
695 mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); 696 mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
697 wbinvd();
696} 698}
697 699
698static void post_set(void) __releases(set_atomicity_lock) 700static void post_set(void) __releases(set_atomicity_lock)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 640891014b2a..5adce1040b11 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -312,12 +312,8 @@ int x86_setup_perfctr(struct perf_event *event)
312 return -EOPNOTSUPP; 312 return -EOPNOTSUPP;
313 } 313 }
314 314
315 /*
316 * Do not allow config1 (extended registers) to propagate,
317 * there's no sane user-space generalization yet:
318 */
319 if (attr->type == PERF_TYPE_RAW) 315 if (attr->type == PERF_TYPE_RAW)
320 return 0; 316 return x86_pmu_extra_regs(event->attr.config, event);
321 317
322 if (attr->type == PERF_TYPE_HW_CACHE) 318 if (attr->type == PERF_TYPE_HW_CACHE)
323 return set_ext_hw_attr(hwc, event); 319 return set_ext_hw_attr(hwc, event);
@@ -488,18 +484,195 @@ static inline int is_x86_event(struct perf_event *event)
488 return event->pmu == &pmu; 484 return event->pmu == &pmu;
489} 485}
490 486
487/*
488 * Event scheduler state:
489 *
490 * Assign events iterating over all events and counters, beginning
491 * with events with least weights first. Keep the current iterator
492 * state in struct sched_state.
493 */
494struct sched_state {
495 int weight;
496 int event; /* event index */
497 int counter; /* counter index */
498 int unassigned; /* number of events to be assigned left */
499 unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
500};
501
502/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
503#define SCHED_STATES_MAX 2
504
505struct perf_sched {
506 int max_weight;
507 int max_events;
508 struct event_constraint **constraints;
509 struct sched_state state;
510 int saved_states;
511 struct sched_state saved[SCHED_STATES_MAX];
512};
513
514/*
515 * Initialize the iterator that runs through all events and counters.
516 */
517static void perf_sched_init(struct perf_sched *sched, struct event_constraint **c,
518 int num, int wmin, int wmax)
519{
520 int idx;
521
522 memset(sched, 0, sizeof(*sched));
523 sched->max_events = num;
524 sched->max_weight = wmax;
525 sched->constraints = c;
526
527 for (idx = 0; idx < num; idx++) {
528 if (c[idx]->weight == wmin)
529 break;
530 }
531
532 sched->state.event = idx; /* start with min weight */
533 sched->state.weight = wmin;
534 sched->state.unassigned = num;
535}
536
537static void perf_sched_save_state(struct perf_sched *sched)
538{
539 if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
540 return;
541
542 sched->saved[sched->saved_states] = sched->state;
543 sched->saved_states++;
544}
545
546static bool perf_sched_restore_state(struct perf_sched *sched)
547{
548 if (!sched->saved_states)
549 return false;
550
551 sched->saved_states--;
552 sched->state = sched->saved[sched->saved_states];
553
554 /* continue with next counter: */
555 clear_bit(sched->state.counter++, sched->state.used);
556
557 return true;
558}
559
560/*
561 * Select a counter for the current event to schedule. Return true on
562 * success.
563 */
564static bool __perf_sched_find_counter(struct perf_sched *sched)
565{
566 struct event_constraint *c;
567 int idx;
568
569 if (!sched->state.unassigned)
570 return false;
571
572 if (sched->state.event >= sched->max_events)
573 return false;
574
575 c = sched->constraints[sched->state.event];
576
577 /* Prefer fixed purpose counters */
578 if (x86_pmu.num_counters_fixed) {
579 idx = X86_PMC_IDX_FIXED;
580 for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) {
581 if (!__test_and_set_bit(idx, sched->state.used))
582 goto done;
583 }
584 }
585 /* Grab the first unused counter starting with idx */
586 idx = sched->state.counter;
587 for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_FIXED) {
588 if (!__test_and_set_bit(idx, sched->state.used))
589 goto done;
590 }
591
592 return false;
593
594done:
595 sched->state.counter = idx;
596
597 if (c->overlap)
598 perf_sched_save_state(sched);
599
600 return true;
601}
602
603static bool perf_sched_find_counter(struct perf_sched *sched)
604{
605 while (!__perf_sched_find_counter(sched)) {
606 if (!perf_sched_restore_state(sched))
607 return false;
608 }
609
610 return true;
611}
612
613/*
614 * Go through all unassigned events and find the next one to schedule.
615 * Take events with the least weight first. Return true on success.
616 */
617static bool perf_sched_next_event(struct perf_sched *sched)
618{
619 struct event_constraint *c;
620
621 if (!sched->state.unassigned || !--sched->state.unassigned)
622 return false;
623
624 do {
625 /* next event */
626 sched->state.event++;
627 if (sched->state.event >= sched->max_events) {
628 /* next weight */
629 sched->state.event = 0;
630 sched->state.weight++;
631 if (sched->state.weight > sched->max_weight)
632 return false;
633 }
634 c = sched->constraints[sched->state.event];
635 } while (c->weight != sched->state.weight);
636
637 sched->state.counter = 0; /* start with first counter */
638
639 return true;
640}
641
642/*
643 * Assign a counter for each event.
644 */
645static int perf_assign_events(struct event_constraint **constraints, int n,
646 int wmin, int wmax, int *assign)
647{
648 struct perf_sched sched;
649
650 perf_sched_init(&sched, constraints, n, wmin, wmax);
651
652 do {
653 if (!perf_sched_find_counter(&sched))
654 break; /* failed */
655 if (assign)
656 assign[sched.state.event] = sched.state.counter;
657 } while (perf_sched_next_event(&sched));
658
659 return sched.state.unassigned;
660}
661
491int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) 662int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
492{ 663{
493 struct event_constraint *c, *constraints[X86_PMC_IDX_MAX]; 664 struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
494 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 665 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
495 int i, j, w, wmax, num = 0; 666 int i, wmin, wmax, num = 0;
496 struct hw_perf_event *hwc; 667 struct hw_perf_event *hwc;
497 668
498 bitmap_zero(used_mask, X86_PMC_IDX_MAX); 669 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
499 670
500 for (i = 0; i < n; i++) { 671 for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
501 c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); 672 c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
502 constraints[i] = c; 673 constraints[i] = c;
674 wmin = min(wmin, c->weight);
675 wmax = max(wmax, c->weight);
503 } 676 }
504 677
505 /* 678 /*
@@ -525,59 +698,11 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
525 if (assign) 698 if (assign)
526 assign[i] = hwc->idx; 699 assign[i] = hwc->idx;
527 } 700 }
528 if (i == n)
529 goto done;
530
531 /*
532 * begin slow path
533 */
534
535 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
536
537 /*
538 * weight = number of possible counters
539 *
540 * 1 = most constrained, only works on one counter
541 * wmax = least constrained, works on any counter
542 *
543 * assign events to counters starting with most
544 * constrained events.
545 */
546 wmax = x86_pmu.num_counters;
547
548 /*
549 * when fixed event counters are present,
550 * wmax is incremented by 1 to account
551 * for one more choice
552 */
553 if (x86_pmu.num_counters_fixed)
554 wmax++;
555
556 for (w = 1, num = n; num && w <= wmax; w++) {
557 /* for each event */
558 for (i = 0; num && i < n; i++) {
559 c = constraints[i];
560 hwc = &cpuc->event_list[i]->hw;
561
562 if (c->weight != w)
563 continue;
564 701
565 for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) { 702 /* slow path */
566 if (!test_bit(j, used_mask)) 703 if (i != n)
567 break; 704 num = perf_assign_events(constraints, n, wmin, wmax, assign);
568 }
569
570 if (j == X86_PMC_IDX_MAX)
571 break;
572 705
573 __set_bit(j, used_mask);
574
575 if (assign)
576 assign[i] = j;
577 num--;
578 }
579 }
580done:
581 /* 706 /*
582 * scheduling failed or is just a simulation, 707 * scheduling failed or is just a simulation,
583 * free resources if necessary 708 * free resources if necessary
@@ -588,7 +713,7 @@ done:
588 x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]); 713 x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
589 } 714 }
590 } 715 }
591 return num ? -ENOSPC : 0; 716 return num ? -EINVAL : 0;
592} 717}
593 718
594/* 719/*
@@ -607,7 +732,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
607 732
608 if (is_x86_event(leader)) { 733 if (is_x86_event(leader)) {
609 if (n >= max_count) 734 if (n >= max_count)
610 return -ENOSPC; 735 return -EINVAL;
611 cpuc->event_list[n] = leader; 736 cpuc->event_list[n] = leader;
612 n++; 737 n++;
613 } 738 }
@@ -620,7 +745,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
620 continue; 745 continue;
621 746
622 if (n >= max_count) 747 if (n >= max_count)
623 return -ENOSPC; 748 return -EINVAL;
624 749
625 cpuc->event_list[n] = event; 750 cpuc->event_list[n] = event;
626 n++; 751 n++;
@@ -1123,6 +1248,7 @@ static void __init pmu_check_apic(void)
1123 1248
1124static int __init init_hw_perf_events(void) 1249static int __init init_hw_perf_events(void)
1125{ 1250{
1251 struct x86_pmu_quirk *quirk;
1126 struct event_constraint *c; 1252 struct event_constraint *c;
1127 int err; 1253 int err;
1128 1254
@@ -1151,8 +1277,8 @@ static int __init init_hw_perf_events(void)
1151 1277
1152 pr_cont("%s PMU driver.\n", x86_pmu.name); 1278 pr_cont("%s PMU driver.\n", x86_pmu.name);
1153 1279
1154 if (x86_pmu.quirks) 1280 for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
1155 x86_pmu.quirks(); 1281 quirk->func();
1156 1282
1157 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { 1283 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
1158 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", 1284 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
@@ -1175,12 +1301,18 @@ static int __init init_hw_perf_events(void)
1175 1301
1176 unconstrained = (struct event_constraint) 1302 unconstrained = (struct event_constraint)
1177 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, 1303 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
1178 0, x86_pmu.num_counters); 1304 0, x86_pmu.num_counters, 0);
1179 1305
1180 if (x86_pmu.event_constraints) { 1306 if (x86_pmu.event_constraints) {
1307 /*
1308 * event on fixed counter2 (REF_CYCLES) only works on this
1309 * counter, so do not extend mask to generic counters
1310 */
1181 for_each_event_constraint(c, x86_pmu.event_constraints) { 1311 for_each_event_constraint(c, x86_pmu.event_constraints) {
1182 if (c->cmask != X86_RAW_EVENT_MASK) 1312 if (c->cmask != X86_RAW_EVENT_MASK
1313 || c->idxmsk64 == X86_PMC_MSK_FIXED_REF_CYCLES) {
1183 continue; 1314 continue;
1315 }
1184 1316
1185 c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; 1317 c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
1186 c->weight += x86_pmu.num_counters; 1318 c->weight += x86_pmu.num_counters;
@@ -1316,7 +1448,7 @@ static int validate_event(struct perf_event *event)
1316 c = x86_pmu.get_event_constraints(fake_cpuc, event); 1448 c = x86_pmu.get_event_constraints(fake_cpuc, event);
1317 1449
1318 if (!c || !c->weight) 1450 if (!c || !c->weight)
1319 ret = -ENOSPC; 1451 ret = -EINVAL;
1320 1452
1321 if (x86_pmu.put_event_constraints) 1453 if (x86_pmu.put_event_constraints)
1322 x86_pmu.put_event_constraints(fake_cpuc, event); 1454 x86_pmu.put_event_constraints(fake_cpuc, event);
@@ -1341,7 +1473,7 @@ static int validate_group(struct perf_event *event)
1341{ 1473{
1342 struct perf_event *leader = event->group_leader; 1474 struct perf_event *leader = event->group_leader;
1343 struct cpu_hw_events *fake_cpuc; 1475 struct cpu_hw_events *fake_cpuc;
1344 int ret = -ENOSPC, n; 1476 int ret = -EINVAL, n;
1345 1477
1346 fake_cpuc = allocate_fake_cpuc(); 1478 fake_cpuc = allocate_fake_cpuc();
1347 if (IS_ERR(fake_cpuc)) 1479 if (IS_ERR(fake_cpuc))
@@ -1570,3 +1702,15 @@ unsigned long perf_misc_flags(struct pt_regs *regs)
1570 1702
1571 return misc; 1703 return misc;
1572} 1704}
1705
1706void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
1707{
1708 cap->version = x86_pmu.version;
1709 cap->num_counters_gp = x86_pmu.num_counters;
1710 cap->num_counters_fixed = x86_pmu.num_counters_fixed;
1711 cap->bit_width_gp = x86_pmu.cntval_bits;
1712 cap->bit_width_fixed = x86_pmu.cntval_bits;
1713 cap->events_mask = (unsigned int)x86_pmu.events_maskl;
1714 cap->events_mask_len = x86_pmu.events_mask_len;
1715}
1716EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
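Aside: perf_assign_events() above replaces the old open-coded loop with an explicit scheduler — compute the minimum and maximum constraint weight, then place events in order of increasing weight, giving each one the first free counter its constraint mask allows, with save/restore backtracking for "overlap" constraints. A stand-alone model of the weight-ordered greedy core (the backtracking refinement is left out; names are invented):

/* Stand-alone model of weight-ordered greedy counter assignment. */
#include <stdio.h>

#define NCOUNTERS 4

static int weight_of(unsigned mask)
{
	int n = 0;

	for (; mask; mask &= mask - 1)
		n++;
	return n;
}

/* returns the number of events left unassigned */
static int assign_events(const unsigned *constraint, int n, int *assign)
{
	unsigned used = 0;
	int unassigned = n, w, i, c;

	for (w = 1; w <= NCOUNTERS; w++) {		/* least constrained last */
		for (i = 0; i < n; i++) {
			if (weight_of(constraint[i]) != w || assign[i] >= 0)
				continue;
			for (c = 0; c < NCOUNTERS; c++) {
				if ((constraint[i] & (1u << c)) &&
				    !(used & (1u << c))) {
					used |= 1u << c;
					assign[i] = c;
					unassigned--;
					break;
				}
			}
		}
	}
	return unassigned;
}

int main(void)
{
	/* event 0 may use any counter, event 1 only counter 0,
	 * event 2 only counters 0-1 */
	unsigned constraint[] = { 0xf, 0x1, 0x3 };
	int assign[] = { -1, -1, -1 };
	int left = assign_events(constraint, 3, assign);
	int i;

	for (i = 0; i < 3; i++)
		printf("event %d -> counter %d\n", i, assign[i]);
	printf("unassigned: %d\n", left);
	return 0;
}

Running it shows why the weight ordering matters: the fully constrained event 1 claims counter 0 before the unconstrained event 0 gets a chance to take it.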
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index b9698d40ac4b..c30c807ddc72 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -45,6 +45,7 @@ struct event_constraint {
45 u64 code; 45 u64 code;
46 u64 cmask; 46 u64 cmask;
47 int weight; 47 int weight;
48 int overlap;
48}; 49};
49 50
50struct amd_nb { 51struct amd_nb {
@@ -146,20 +147,47 @@ struct cpu_hw_events {
146 /* 147 /*
147 * AMD specific bits 148 * AMD specific bits
148 */ 149 */
149 struct amd_nb *amd_nb; 150 struct amd_nb *amd_nb;
151 /* Inverted mask of bits to clear in the perf_ctr ctrl registers */
152 u64 perf_ctr_virt_mask;
150 153
151 void *kfree_on_online; 154 void *kfree_on_online;
152}; 155};
153 156
154#define __EVENT_CONSTRAINT(c, n, m, w) {\ 157#define __EVENT_CONSTRAINT(c, n, m, w, o) {\
155 { .idxmsk64 = (n) }, \ 158 { .idxmsk64 = (n) }, \
156 .code = (c), \ 159 .code = (c), \
157 .cmask = (m), \ 160 .cmask = (m), \
158 .weight = (w), \ 161 .weight = (w), \
162 .overlap = (o), \
159} 163}
160 164
161#define EVENT_CONSTRAINT(c, n, m) \ 165#define EVENT_CONSTRAINT(c, n, m) \
162 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n)) 166 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0)
167
168/*
169 * The overlap flag marks event constraints with overlapping counter
170 * masks. This is the case if the counter mask of such an event is not
171 * a subset of any other counter mask of a constraint with an equal or
172 * higher weight, e.g.:
173 *
174 * c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
175 * c_another1 = EVENT_CONSTRAINT(0, 0x07, 0);
176 * c_another2 = EVENT_CONSTRAINT(0, 0x38, 0);
177 *
178 * The event scheduler may not select the correct counter in the first
179 * cycle because it needs to know which subsequent events will be
180 * scheduled. It may fail to schedule the events then. So we set the
181 * overlap flag for such constraints to give the scheduler a hint which
182 * events to select for counter rescheduling.
183 *
184 * Care must be taken as the rescheduling algorithm is O(n!) which
185 * will increase scheduling cycles for an over-committed system
186 * dramatically. The number of such EVENT_CONSTRAINT_OVERLAP() macros
187 * and its counter masks must be kept at a minimum.
188 */
189#define EVENT_CONSTRAINT_OVERLAP(c, n, m) \
190 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1)
163 191
164/* 192/*
165 * Constraint on the Event code. 193 * Constraint on the Event code.
@@ -235,6 +263,11 @@ union perf_capabilities {
235 u64 capabilities; 263 u64 capabilities;
236}; 264};
237 265
266struct x86_pmu_quirk {
267 struct x86_pmu_quirk *next;
268 void (*func)(void);
269};
270
238/* 271/*
239 * struct x86_pmu - generic x86 pmu 272 * struct x86_pmu - generic x86 pmu
240 */ 273 */
@@ -259,6 +292,11 @@ struct x86_pmu {
259 int num_counters_fixed; 292 int num_counters_fixed;
260 int cntval_bits; 293 int cntval_bits;
261 u64 cntval_mask; 294 u64 cntval_mask;
295 union {
296 unsigned long events_maskl;
297 unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)];
298 };
299 int events_mask_len;
262 int apic; 300 int apic;
263 u64 max_period; 301 u64 max_period;
264 struct event_constraint * 302 struct event_constraint *
@@ -268,7 +306,7 @@ struct x86_pmu {
268 void (*put_event_constraints)(struct cpu_hw_events *cpuc, 306 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
269 struct perf_event *event); 307 struct perf_event *event);
270 struct event_constraint *event_constraints; 308 struct event_constraint *event_constraints;
271 void (*quirks)(void); 309 struct x86_pmu_quirk *quirks;
272 int perfctr_second_write; 310 int perfctr_second_write;
273 311
274 int (*cpu_prepare)(int cpu); 312 int (*cpu_prepare)(int cpu);
@@ -309,6 +347,15 @@ struct x86_pmu {
309 struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); 347 struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
310}; 348};
311 349
350#define x86_add_quirk(func_) \
351do { \
352 static struct x86_pmu_quirk __quirk __initdata = { \
353 .func = func_, \
354 }; \
355 __quirk.next = x86_pmu.quirks; \
356 x86_pmu.quirks = &__quirk; \
357} while (0)
358
312#define ERF_NO_HT_SHARING 1 359#define ERF_NO_HT_SHARING 1
313#define ERF_HAS_RSP_1 2 360#define ERF_HAS_RSP_1 2
314 361
@@ -372,9 +419,11 @@ void x86_pmu_disable_all(void);
372static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, 419static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
373 u64 enable_mask) 420 u64 enable_mask)
374{ 421{
422 u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask);
423
375 if (hwc->extra_reg.reg) 424 if (hwc->extra_reg.reg)
376 wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config); 425 wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
377 wrmsrl(hwc->config_base, hwc->config | enable_mask); 426 wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask);
378} 427}
379 428
380void x86_pmu_enable_all(int added); 429void x86_pmu_enable_all(int added);
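Aside: x86_add_quirk() replaces the single x86_pmu.quirks callback with a singly linked list built out of statically allocated nodes, which init_hw_perf_events() then walks. A stand-alone user-space model of that pattern (invented names; note that the list runs in LIFO order):

/* Stand-alone model of the static-node quirk list. */
#include <stdio.h>

struct quirk {
	struct quirk *next;
	void (*func)(void);
};

static struct quirk *quirks;

#define add_quirk(func_)					\
do {								\
	static struct quirk __quirk = { .func = func_ };	\
	__quirk.next = quirks;					\
	quirks = &__quirk;					\
} while (0)

static void quirk_a(void) { printf("quirk A applied\n"); }
static void quirk_b(void) { printf("quirk B applied\n"); }

int main(void)
{
	struct quirk *q;

	add_quirk(quirk_a);
	add_quirk(quirk_b);

	for (q = quirks; q; q = q->next)	/* as init_hw_perf_events() does */
		q->func();
	return 0;
}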
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index aeefd45697a2..67250a52430b 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -1,4 +1,5 @@
1#include <linux/perf_event.h> 1#include <linux/perf_event.h>
2#include <linux/export.h>
2#include <linux/types.h> 3#include <linux/types.h>
3#include <linux/init.h> 4#include <linux/init.h>
4#include <linux/slab.h> 5#include <linux/slab.h>
@@ -357,7 +358,9 @@ static void amd_pmu_cpu_starting(int cpu)
357 struct amd_nb *nb; 358 struct amd_nb *nb;
358 int i, nb_id; 359 int i, nb_id;
359 360
360 if (boot_cpu_data.x86_max_cores < 2) 361 cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY;
362
363 if (boot_cpu_data.x86_max_cores < 2 || boot_cpu_data.x86 == 0x15)
361 return; 364 return;
362 365
363 nb_id = amd_get_nb_id(cpu); 366 nb_id = amd_get_nb_id(cpu);
@@ -492,7 +495,7 @@ static __initconst const struct x86_pmu amd_pmu = {
492static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0); 495static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0);
493static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0); 496static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0);
494static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0); 497static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0);
495static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT(0, 0x09, 0); 498static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
496static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0); 499static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
497static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); 500static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
498 501
@@ -587,9 +590,9 @@ static __initconst const struct x86_pmu amd_pmu_f15h = {
587 .put_event_constraints = amd_put_event_constraints, 590 .put_event_constraints = amd_put_event_constraints,
588 591
589 .cpu_prepare = amd_pmu_cpu_prepare, 592 .cpu_prepare = amd_pmu_cpu_prepare,
590 .cpu_starting = amd_pmu_cpu_starting,
591 .cpu_dead = amd_pmu_cpu_dead, 593 .cpu_dead = amd_pmu_cpu_dead,
592#endif 594#endif
595 .cpu_starting = amd_pmu_cpu_starting,
593}; 596};
594 597
595__init int amd_pmu_init(void) 598__init int amd_pmu_init(void)
@@ -621,3 +624,33 @@ __init int amd_pmu_init(void)
621 624
622 return 0; 625 return 0;
623} 626}
627
628void amd_pmu_enable_virt(void)
629{
630 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
631
632 cpuc->perf_ctr_virt_mask = 0;
633
634 /* Reload all events */
635 x86_pmu_disable_all();
636 x86_pmu_enable_all(0);
637}
638EXPORT_SYMBOL_GPL(amd_pmu_enable_virt);
639
640void amd_pmu_disable_virt(void)
641{
642 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
643
644 /*
645 * We only mask out the Host-only bit so that host-only counting works
646 * when SVM is disabled. If someone sets up a guest-only counter when
647 * SVM is disabled the Guest-only bits still gets set and the counter
648 * will not count anything.
649 */
650 cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY;
651
652 /* Reload all events */
653 x86_pmu_disable_all();
654 x86_pmu_enable_all(0);
655}
656EXPORT_SYMBOL_GPL(amd_pmu_disable_virt);
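Aside: perf_ctr_virt_mask works by being cleared out of every event-select write in __x86_pmu_enable_event(); amd_pmu_disable_virt() sets it to the Host-only bit so that bit never reaches the MSR while SVM is off, and amd_pmu_enable_virt() clears the mask again. A stand-alone illustration of that masking follows; the bit positions and event code are illustrative, not taken as authoritative.

/* Stand-alone illustration; bit positions are illustrative only. */
#include <stdio.h>
#include <stdint.h>

#define EVENTSEL_ENABLE   (1ULL << 22)
#define EVENTSEL_HOSTONLY (1ULL << 41)	/* stand-in for the AMD Host-only bit */

static uint64_t build_eventsel(uint64_t config, uint64_t virt_mask)
{
	/* same shape as __x86_pmu_enable_event(): OR in enable, clear the mask */
	return (config | EVENTSEL_ENABLE) & ~virt_mask;
}

int main(void)
{
	uint64_t config = 0x76 | EVENTSEL_HOSTONLY;	/* a host-only event */

	/* SVM off (amd_pmu_disable_virt): the mask strips the Host-only bit */
	printf("virt off: %#llx\n",
	       (unsigned long long)build_eventsel(config, EVENTSEL_HOSTONLY));

	/* SVM on (amd_pmu_enable_virt): mask cleared, the bit reaches the MSR */
	printf("virt on:  %#llx\n",
	       (unsigned long long)build_eventsel(config, 0));
	return 0;
}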
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index ab6343d21825..3b8a2d30d14e 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -199,8 +199,7 @@ static int force_ibs_eilvt_setup(void)
199 goto out; 199 goto out;
200 } 200 }
201 201
202 pr_err(FW_BUG "using offset %d for IBS interrupts\n", offset); 202 pr_info("IBS: LVT offset %d assigned\n", offset);
203 pr_err(FW_BUG "workaround enabled for IBS LVT offset\n");
204 203
205 return 0; 204 return 0;
206out: 205out:
@@ -265,19 +264,23 @@ perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *h
265static __init int amd_ibs_init(void) 264static __init int amd_ibs_init(void)
266{ 265{
267 u32 caps; 266 u32 caps;
268 int ret; 267 int ret = -EINVAL;
269 268
270 caps = __get_ibs_caps(); 269 caps = __get_ibs_caps();
271 if (!caps) 270 if (!caps)
272 return -ENODEV; /* ibs not supported by the cpu */ 271 return -ENODEV; /* ibs not supported by the cpu */
273 272
274 if (!ibs_eilvt_valid()) { 273 /*
275 ret = force_ibs_eilvt_setup(); 274 * Force LVT offset assignment for family 10h: The offsets are
276 if (ret) { 275 * not assigned by the BIOS for this family, so the OS is
277 pr_err("Failed to setup IBS, %d\n", ret); 276 * responsible for doing it. If the OS assignment fails, fall
278 return ret; 277 * back to BIOS settings and try to setup this.
279 } 278 */
280 } 279 if (boot_cpu_data.x86 == 0x10)
280 force_ibs_eilvt_setup();
281
282 if (!ibs_eilvt_valid())
283 goto out;
281 284
282 get_online_cpus(); 285 get_online_cpus();
283 ibs_caps = caps; 286 ibs_caps = caps;
@@ -287,7 +290,11 @@ static __init int amd_ibs_init(void)
287 smp_call_function(setup_APIC_ibs, NULL, 1); 290 smp_call_function(setup_APIC_ibs, NULL, 1);
288 put_online_cpus(); 291 put_online_cpus();
289 292
290 return perf_event_ibs_init(); 293 ret = perf_event_ibs_init();
294out:
295 if (ret)
296 pr_err("Failed to setup IBS, %d\n", ret);
297 return ret;
291} 298}
292 299
293/* Since we need the pci subsystem to init ibs we can't do this earlier: */ 300/* Since we need the pci subsystem to init ibs we can't do this earlier: */
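The reworked amd_ibs_init() path boils down to: on family 10h, try to assign the IBS LVT offset ourselves, then re-check validity and send every failure through one error label. A rough user-space sketch of that control flow, with hypothetical stand-ins for the kernel helpers it calls:

#include <stdio.h>

#define EINVAL 22

/* Hypothetical stand-ins for the helpers used by amd_ibs_init(). */
static int cpu_family = 0x10;
static int force_ibs_eilvt_setup(void) { return 0; }   /* OS assignment worked */
static int ibs_eilvt_valid(void)       { return 1; }   /* offset is now valid  */
static int perf_event_ibs_init(void)   { return 0; }

static int ibs_init_flow(void)
{
        int ret = -EINVAL;

        /* Family 10h BIOSes do not assign the LVT offset; do it ourselves. */
        if (cpu_family == 0x10)
                force_ibs_eilvt_setup();

        if (!ibs_eilvt_valid())
                goto out;               /* still unusable: report -EINVAL */

        ret = perf_event_ibs_init();
out:
        if (ret)
                fprintf(stderr, "Failed to setup IBS, %d\n", ret);
        return ret;
}

int main(void)
{
        return ibs_init_flow() ? 1 : 0;
}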
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 2be5ebe99872..3bd37bdf1b8e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -28,6 +28,7 @@ static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
28 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, 28 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
29 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, 29 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
30 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, 30 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
31 [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */
31}; 32};
32 33
33static struct event_constraint intel_core_event_constraints[] __read_mostly = 34static struct event_constraint intel_core_event_constraints[] __read_mostly =
@@ -45,12 +46,7 @@ static struct event_constraint intel_core2_event_constraints[] __read_mostly =
45{ 46{
46 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 47 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
47 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 48 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
48 /* 49 FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
49 * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event
50 * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed
51 * ratio between these counters.
52 */
53 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
54 INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ 50 INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
55 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ 51 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
56 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ 52 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
@@ -68,7 +64,7 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
68{ 64{
69 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 65 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
70 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 66 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
71 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ 67 FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
72 INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ 68 INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
73 INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ 69 INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
74 INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ 70 INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
@@ -90,7 +86,7 @@ static struct event_constraint intel_westmere_event_constraints[] __read_mostly
90{ 86{
91 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 87 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
92 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 88 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
93 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ 89 FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
94 INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ 90 INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
95 INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ 91 INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
96 INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ 92 INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
@@ -102,7 +98,7 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
102{ 98{
103 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 99 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
104 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 100 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
105 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ 101 FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
106 INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */ 102 INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
107 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ 103 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
108 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ 104 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
@@ -125,7 +121,7 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly =
125{ 121{
126 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 122 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
127 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 123 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
128 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ 124 FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
129 EVENT_CONSTRAINT_END 125 EVENT_CONSTRAINT_END
130}; 126};
131 127
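Each FIXED_EVENT_CONSTRAINT(0x0300, 2) added above ties the CPU_CLK_UNHALTED.REF pseudo-encoding to fixed counter 2 only; 0x0300 is not a real programmable event code, so the scheduler must never place it on a general-purpose counter. A small sketch of how such a constraint narrows counter choice; the fixed-counter base index of 32 is an assumption modelled on the perf code, not copied from it:

#include <stdint.h>
#include <stdio.h>

#define FIXED_BASE 32   /* assumed index of the first fixed counter */

struct constraint {
        unsigned int event;     /* raw event code */
        uint64_t idxmsk;        /* bitmask of counters allowed to run it */
};

/* FIXED_EVENT_CONSTRAINT(0x0300, 2): only fixed counter 2 may count it. */
static const struct constraint ref_cycles = {
        .event  = 0x0300,
        .idxmsk = 1ULL << (FIXED_BASE + 2),
};

int main(void)
{
        /* A scheduler would intersect free counters with idxmsk. */
        for (int idx = 0; idx < 48; idx++)
                if (ref_cycles.idxmsk & (1ULL << idx))
                        printf("event %#x may only use counter index %d\n",
                               ref_cycles.event, idx);
        return 0;
}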
@@ -1169,7 +1165,7 @@ again:
1169 */ 1165 */
1170 c = &unconstrained; 1166 c = &unconstrained;
1171 } else if (intel_try_alt_er(event, orig_idx)) { 1167 } else if (intel_try_alt_er(event, orig_idx)) {
1172 raw_spin_unlock(&era->lock); 1168 raw_spin_unlock_irqrestore(&era->lock, flags);
1173 goto again; 1169 goto again;
1174 } 1170 }
1175 raw_spin_unlock_irqrestore(&era->lock, flags); 1171 raw_spin_unlock_irqrestore(&era->lock, flags);
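The one-line fix above matters because the lock was taken with raw_spin_lock_irqsave(); releasing it with a plain unlock on the retry path would leave interrupts disabled with stale flags. A toy model of the pairing rule, using fake lock helpers that just track the interrupt state:

#include <stdio.h>

static int irqs_enabled = 1;

/* Toy stand-ins: save/disable on lock, restore the saved state on unlock. */
static void lock_irqsave(unsigned long *flags)
{
        *flags = irqs_enabled;
        irqs_enabled = 0;
}

static void unlock_irqrestore(unsigned long flags)
{
        irqs_enabled = (int)flags;
}

int main(void)
{
        unsigned long flags;
        int retry = 1;

again:
        lock_irqsave(&flags);
        if (retry--) {
                /* Every early exit must use the *_irqrestore variant too. */
                unlock_irqrestore(flags);
                goto again;
        }
        unlock_irqrestore(flags);

        printf("interrupts enabled again: %d\n", irqs_enabled);
        return 0;
}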
@@ -1519,7 +1515,7 @@ static __initconst const struct x86_pmu intel_pmu = {
1519 .guest_get_msrs = intel_guest_get_msrs, 1515 .guest_get_msrs = intel_guest_get_msrs,
1520}; 1516};
1521 1517
1522static void intel_clovertown_quirks(void) 1518static __init void intel_clovertown_quirk(void)
1523{ 1519{
1524 /* 1520 /*
1525 * PEBS is unreliable due to: 1521 * PEBS is unreliable due to:
@@ -1545,12 +1541,60 @@ static void intel_clovertown_quirks(void)
1545 x86_pmu.pebs_constraints = NULL; 1541 x86_pmu.pebs_constraints = NULL;
1546} 1542}
1547 1543
1544static __init void intel_sandybridge_quirk(void)
1545{
1546 printk(KERN_WARNING "PEBS disabled due to CPU errata.\n");
1547 x86_pmu.pebs = 0;
1548 x86_pmu.pebs_constraints = NULL;
1549}
1550
1551static const struct { int id; char *name; } intel_arch_events_map[] __initconst = {
1552 { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
1553 { PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
1554 { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
1555 { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
1556 { PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
1557 { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
1558 { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
1559};
1560
1561static __init void intel_arch_events_quirk(void)
1562{
1563 int bit;
1564
 1565 /* disable events that CPUID reports as not present */
1566 for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {
1567 intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0;
1568 printk(KERN_WARNING "CPUID marked event: \'%s\' unavailable\n",
1569 intel_arch_events_map[bit].name);
1570 }
1571}
1572
1573static __init void intel_nehalem_quirk(void)
1574{
1575 union cpuid10_ebx ebx;
1576
1577 ebx.full = x86_pmu.events_maskl;
1578 if (ebx.split.no_branch_misses_retired) {
1579 /*
1580 * Erratum AAJ80 detected, we work it around by using
1581 * the BR_MISP_EXEC.ANY event. This will over-count
1582 * branch-misses, but it's still much better than the
1583 * architectural event which is often completely bogus:
1584 */
1585 intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
1586 ebx.split.no_branch_misses_retired = 0;
1587 x86_pmu.events_maskl = ebx.full;
1588 printk(KERN_INFO "CPU erratum AAJ80 worked around\n");
1589 }
1590}
1591
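Both new quirks read CPUID leaf 0xA: EBX is a bitmap in which a set bit means the corresponding architectural event is not available (bit 6, branch-misses-retired, is what the AAJ80 check looks at), and EAX[31:24] says how many of those bits are meaningful. The following stand-alone program prints that information on an Intel CPU; the event-name table is ordered per the SDM's architectural-event list and is an assumption of this sketch, not taken from the kernel.

#include <cpuid.h>
#include <stdio.h>

static const char *const arch_events[] = {
        "core cycles", "instructions retired", "reference cycles",
        "LLC references", "LLC misses",
        "branch instructions retired", "branch misses retired",
};

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(0xa, &eax, &ebx, &ecx, &edx)) {
                fprintf(stderr, "CPUID leaf 0xA not supported\n");
                return 1;
        }

        unsigned int mask_len = (eax >> 24) & 0xff;     /* events enumerated */

        printf("perfmon version %u, %u architectural events enumerated\n",
               eax & 0xff, mask_len);

        for (unsigned int bit = 0; bit < 7 && bit < mask_len; bit++)
                printf("  %-28s %s\n", arch_events[bit],
                       (ebx & (1u << bit)) ? "NOT available" : "available");
        return 0;
}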
1548__init int intel_pmu_init(void) 1592__init int intel_pmu_init(void)
1549{ 1593{
1550 union cpuid10_edx edx; 1594 union cpuid10_edx edx;
1551 union cpuid10_eax eax; 1595 union cpuid10_eax eax;
1596 union cpuid10_ebx ebx;
1552 unsigned int unused; 1597 unsigned int unused;
1553 unsigned int ebx;
1554 int version; 1598 int version;
1555 1599
1556 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { 1600 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
@@ -1567,8 +1611,8 @@ __init int intel_pmu_init(void)
1567 * Check whether the Architectural PerfMon supports 1611 * Check whether the Architectural PerfMon supports
1568 * Branch Misses Retired hw_event or not. 1612 * Branch Misses Retired hw_event or not.
1569 */ 1613 */
1570 cpuid(10, &eax.full, &ebx, &unused, &edx.full); 1614 cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
1571 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) 1615 if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT)
1572 return -ENODEV; 1616 return -ENODEV;
1573 1617
1574 version = eax.split.version_id; 1618 version = eax.split.version_id;
@@ -1582,6 +1626,9 @@ __init int intel_pmu_init(void)
1582 x86_pmu.cntval_bits = eax.split.bit_width; 1626 x86_pmu.cntval_bits = eax.split.bit_width;
1583 x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1; 1627 x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1;
1584 1628
1629 x86_pmu.events_maskl = ebx.full;
1630 x86_pmu.events_mask_len = eax.split.mask_length;
1631
1585 /* 1632 /*
1586 * Quirk: v2 perfmon does not report fixed-purpose events, so 1633 * Quirk: v2 perfmon does not report fixed-purpose events, so
1587 * assume at least 3 events: 1634 * assume at least 3 events:
@@ -1601,6 +1648,8 @@ __init int intel_pmu_init(void)
1601 1648
1602 intel_ds_init(); 1649 intel_ds_init();
1603 1650
1651 x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */
1652
1604 /* 1653 /*
1605 * Install the hw-cache-events table: 1654 * Install the hw-cache-events table:
1606 */ 1655 */
@@ -1610,7 +1659,7 @@ __init int intel_pmu_init(void)
1610 break; 1659 break;
1611 1660
1612 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ 1661 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
1613 x86_pmu.quirks = intel_clovertown_quirks; 1662 x86_add_quirk(intel_clovertown_quirk);
1614 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ 1663 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
1615 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ 1664 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
1616 case 29: /* six-core 45 nm xeon "Dunnington" */ 1665 case 29: /* six-core 45 nm xeon "Dunnington" */
@@ -1644,17 +1693,8 @@ __init int intel_pmu_init(void)
1644 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ 1693 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
1645 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; 1694 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
1646 1695
1647 if (ebx & 0x40) { 1696 x86_add_quirk(intel_nehalem_quirk);
1648 /*
1649 * Erratum AAJ80 detected, we work it around by using
1650 * the BR_MISP_EXEC.ANY event. This will over-count
1651 * branch-misses, but it's still much better than the
1652 * architectural event which is often completely bogus:
1653 */
1654 intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
1655 1697
1656 pr_cont("erratum AAJ80 worked around, ");
1657 }
1658 pr_cont("Nehalem events, "); 1698 pr_cont("Nehalem events, ");
1659 break; 1699 break;
1660 1700
@@ -1694,6 +1734,7 @@ __init int intel_pmu_init(void)
1694 break; 1734 break;
1695 1735
1696 case 42: /* SandyBridge */ 1736 case 42: /* SandyBridge */
1737 x86_add_quirk(intel_sandybridge_quirk);
1697 case 45: /* SandyBridge, "Romely-EP" */ 1738 case 45: /* SandyBridge, "Romely-EP" */
1698 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, 1739 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
1699 sizeof(hw_cache_event_ids)); 1740 sizeof(hw_cache_event_ids));
@@ -1730,5 +1771,6 @@ __init int intel_pmu_init(void)
1730 break; 1771 break;
1731 } 1772 }
1732 } 1773 }
1774
1733 return 0; 1775 return 0;
1734} 1776}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index c0d238f49db8..d6bd49faa40c 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -439,7 +439,6 @@ void intel_pmu_pebs_enable(struct perf_event *event)
439 hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; 439 hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
440 440
441 cpuc->pebs_enabled |= 1ULL << hwc->idx; 441 cpuc->pebs_enabled |= 1ULL << hwc->idx;
442 WARN_ON_ONCE(cpuc->enabled);
443 442
444 if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1) 443 if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
445 intel_pmu_lbr_enable(event); 444 intel_pmu_lbr_enable(event);
@@ -493,6 +492,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
493 unsigned long from = cpuc->lbr_entries[0].from; 492 unsigned long from = cpuc->lbr_entries[0].from;
494 unsigned long old_to, to = cpuc->lbr_entries[0].to; 493 unsigned long old_to, to = cpuc->lbr_entries[0].to;
495 unsigned long ip = regs->ip; 494 unsigned long ip = regs->ip;
495 int is_64bit = 0;
496 496
497 /* 497 /*
498 * We don't need to fixup if the PEBS assist is fault like 498 * We don't need to fixup if the PEBS assist is fault like
@@ -544,7 +544,10 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
544 } else 544 } else
545 kaddr = (void *)to; 545 kaddr = (void *)to;
546 546
547 kernel_insn_init(&insn, kaddr); 547#ifdef CONFIG_X86_64
548 is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32);
549#endif
550 insn_init(&insn, kaddr, is_64bit);
548 insn_get_length(&insn); 551 insn_get_length(&insn);
549 to += insn.length; 552 to += insn.length;
550 } while (to < ip); 553 } while (to < ip);
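The hunk above decodes instructions with the correct bitness while walking from the last LBR branch target up to the sampled IP; the point of the loop is to land on the instruction before the one PEBS reported. A toy version of that walk, using a hypothetical insn_length() helper in place of the kernel's decoder:

#include <stddef.h>
#include <stdio.h>

/* Hypothetical decoder: returns the length of the instruction at 'off'. */
static size_t insn_length(const unsigned char *buf, size_t off)
{
        return buf[off];        /* toy encoding: first byte is the length */
}

int main(void)
{
        /* Pretend instruction stream with lengths 2, 3, 5, 1. */
        const unsigned char code[] = { 2, 0, 3, 0, 0, 5, 0, 0, 0, 0, 1 };
        size_t ip = 10;         /* offset PEBS reported (one insn too far) */
        size_t to = 0;          /* last LBR branch target */
        size_t old_to = to;

        /* Walk forward; the last start point before 'ip' is the real sample. */
        while (to < ip) {
                old_to = to;
                to += insn_length(code, to);
        }

        printf("reported ip %zu, fixed-up ip %zu\n", ip, old_to);
        return 0;
}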
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 3fab3de3ce96..47a7e63bfe54 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -72,8 +72,6 @@ void intel_pmu_lbr_enable(struct perf_event *event)
72 if (!x86_pmu.lbr_nr) 72 if (!x86_pmu.lbr_nr)
73 return; 73 return;
74 74
75 WARN_ON_ONCE(cpuc->enabled);
76
77 /* 75 /*
78 * Reset the LBR stack if we changed task context to 76 * Reset the LBR stack if we changed task context to
79 * avoid data leaks. 77 * avoid data leaks.
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 492bf1358a7c..ef484d9d0a25 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -1268,7 +1268,7 @@ reserve:
1268 } 1268 }
1269 1269
1270done: 1270done:
1271 return num ? -ENOSPC : 0; 1271 return num ? -EINVAL : 0;
1272} 1272}
1273 1273
1274static __initconst const struct x86_pmu p4_pmu = { 1274static __initconst const struct x86_pmu p4_pmu = {
diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c
index 5abbea297e0c..7b3fe56b1c21 100644
--- a/arch/x86/kernel/cpu/powerflags.c
+++ b/arch/x86/kernel/cpu/powerflags.c
@@ -16,5 +16,6 @@ const char *const x86_power_flags[32] = {
16 "100mhzsteps", 16 "100mhzsteps",
17 "hwpstate", 17 "hwpstate",
18 "", /* tsc invariant mapped to constant_tsc */ 18 "", /* tsc invariant mapped to constant_tsc */
19 /* nothing */ 19 "cpb", /* core performance boost */
20 "eff_freq_ro", /* Readonly aperf/mperf */
20}; 21};
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 14b23140e81f..8022c6681485 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -64,12 +64,10 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
64static int show_cpuinfo(struct seq_file *m, void *v) 64static int show_cpuinfo(struct seq_file *m, void *v)
65{ 65{
66 struct cpuinfo_x86 *c = v; 66 struct cpuinfo_x86 *c = v;
67 unsigned int cpu = 0; 67 unsigned int cpu;
68 int i; 68 int i;
69 69
70#ifdef CONFIG_SMP
71 cpu = c->cpu_index; 70 cpu = c->cpu_index;
72#endif
73 seq_printf(m, "processor\t: %u\n" 71 seq_printf(m, "processor\t: %u\n"
74 "vendor_id\t: %s\n" 72 "vendor_id\t: %s\n"
75 "cpu family\t: %d\n" 73 "cpu family\t: %d\n"
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 212a6a42527c..a524353d93f2 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -177,7 +177,7 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier =
177 .notifier_call = cpuid_class_cpu_callback, 177 .notifier_call = cpuid_class_cpu_callback,
178}; 178};
179 179
180static char *cpuid_devnode(struct device *dev, mode_t *mode) 180static char *cpuid_devnode(struct device *dev, umode_t *mode)
181{ 181{
182 return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); 182 return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
183} 183}
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 1aae78f775fc..4025fe4f928f 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -252,7 +252,8 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
252 unsigned short ss; 252 unsigned short ss;
253 unsigned long sp; 253 unsigned long sp;
254#endif 254#endif
255 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); 255 printk(KERN_DEFAULT
256 "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
256#ifdef CONFIG_PREEMPT 257#ifdef CONFIG_PREEMPT
257 printk("PREEMPT "); 258 printk("PREEMPT ");
258#endif 259#endif
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 3b97a80ce329..c99f9ed013d5 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -116,16 +116,16 @@ void show_registers(struct pt_regs *regs)
116 for (i = 0; i < code_len; i++, ip++) { 116 for (i = 0; i < code_len; i++, ip++) {
117 if (ip < (u8 *)PAGE_OFFSET || 117 if (ip < (u8 *)PAGE_OFFSET ||
118 probe_kernel_address(ip, c)) { 118 probe_kernel_address(ip, c)) {
119 printk(" Bad EIP value."); 119 printk(KERN_CONT " Bad EIP value.");
120 break; 120 break;
121 } 121 }
122 if (ip == (u8 *)regs->ip) 122 if (ip == (u8 *)regs->ip)
123 printk("<%02x> ", c); 123 printk(KERN_CONT "<%02x> ", c);
124 else 124 else
125 printk("%02x ", c); 125 printk(KERN_CONT "%02x ", c);
126 } 126 }
127 } 127 }
128 printk("\n"); 128 printk(KERN_CONT "\n");
129} 129}
130 130
131int is_valid_bugaddr(unsigned long ip) 131int is_valid_bugaddr(unsigned long ip)
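The dumpstack changes add KERN_CONT because a printk() fragment without a continuation marker may start a new log record, splitting the Code: hex dump across lines. A tiny model of that behaviour with a toy emit() function; the prefix handling here is an assumption for illustration, not the real printk implementation:

#include <stdio.h>
#include <string.h>

#define CONT "\001c"    /* toy continuation marker, like KERN_CONT */

/* Toy printk: anything not marked as a continuation starts a new record. */
static void emit(const char *msg)
{
        if (strncmp(msg, CONT, strlen(CONT)) == 0) {
                fputs(msg + strlen(CONT), stdout);
        } else {
                fputs("\nkernel: ", stdout);
                fputs(msg, stdout);
        }
}

int main(void)
{
        emit("Code: ");
        for (int i = 0; i < 4; i++)
                emit(CONT "0f ");       /* stays on the same line */
        emit(CONT "\n");
        return 0;
}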
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 19853ad8afc5..17107bd6e1f0 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -129,7 +129,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
129 if (!stack) { 129 if (!stack) {
130 if (regs) 130 if (regs)
131 stack = (unsigned long *)regs->sp; 131 stack = (unsigned long *)regs->sp;
132 else if (task && task != current) 132 else if (task != current)
133 stack = (unsigned long *)task->thread.sp; 133 stack = (unsigned long *)task->thread.sp;
134 else 134 else
135 stack = &dummy; 135 stack = &dummy;
@@ -269,11 +269,11 @@ void show_registers(struct pt_regs *regs)
269 unsigned char c; 269 unsigned char c;
270 u8 *ip; 270 u8 *ip;
271 271
272 printk(KERN_EMERG "Stack:\n"); 272 printk(KERN_DEFAULT "Stack:\n");
273 show_stack_log_lvl(NULL, regs, (unsigned long *)sp, 273 show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
274 0, KERN_EMERG); 274 0, KERN_DEFAULT);
275 275
276 printk(KERN_EMERG "Code: "); 276 printk(KERN_DEFAULT "Code: ");
277 277
278 ip = (u8 *)regs->ip - code_prologue; 278 ip = (u8 *)regs->ip - code_prologue;
279 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { 279 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
@@ -284,16 +284,16 @@ void show_registers(struct pt_regs *regs)
284 for (i = 0; i < code_len; i++, ip++) { 284 for (i = 0; i < code_len; i++, ip++) {
285 if (ip < (u8 *)PAGE_OFFSET || 285 if (ip < (u8 *)PAGE_OFFSET ||
286 probe_kernel_address(ip, c)) { 286 probe_kernel_address(ip, c)) {
287 printk(" Bad RIP value."); 287 printk(KERN_CONT " Bad RIP value.");
288 break; 288 break;
289 } 289 }
290 if (ip == (u8 *)regs->ip) 290 if (ip == (u8 *)regs->ip)
291 printk("<%02x> ", c); 291 printk(KERN_CONT "<%02x> ", c);
292 else 292 else
293 printk("%02x ", c); 293 printk(KERN_CONT "%02x ", c);
294 } 294 }
295 } 295 }
296 printk("\n"); 296 printk(KERN_CONT "\n");
297} 297}
298 298
299int is_valid_bugaddr(unsigned long ip) 299int is_valid_bugaddr(unsigned long ip)
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 303a0e48f076..62d61e9976eb 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -19,6 +19,7 @@
19#include <linux/acpi.h> 19#include <linux/acpi.h>
20#include <linux/firmware-map.h> 20#include <linux/firmware-map.h>
21#include <linux/memblock.h> 21#include <linux/memblock.h>
22#include <linux/sort.h>
22 23
23#include <asm/e820.h> 24#include <asm/e820.h>
24#include <asm/proto.h> 25#include <asm/proto.h>
@@ -227,22 +228,38 @@ void __init e820_print_map(char *who)
227 * ____________________33__ 228 * ____________________33__
228 * ______________________4_ 229 * ______________________4_
229 */ 230 */
231struct change_member {
232 struct e820entry *pbios; /* pointer to original bios entry */
233 unsigned long long addr; /* address for this change point */
234};
235
236static int __init cpcompare(const void *a, const void *b)
237{
238 struct change_member * const *app = a, * const *bpp = b;
239 const struct change_member *ap = *app, *bp = *bpp;
240
241 /*
242 * Inputs are pointers to two elements of change_point[]. If their
243 * addresses are unequal, their difference dominates. If the addresses
244 * are equal, then consider one that represents the end of its region
245 * to be greater than one that does not.
246 */
247 if (ap->addr != bp->addr)
248 return ap->addr > bp->addr ? 1 : -1;
249
250 return (ap->addr != ap->pbios->addr) - (bp->addr != bp->pbios->addr);
251}
230 252
231int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, 253int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
232 u32 *pnr_map) 254 u32 *pnr_map)
233{ 255{
234 struct change_member {
235 struct e820entry *pbios; /* pointer to original bios entry */
236 unsigned long long addr; /* address for this change point */
237 };
238 static struct change_member change_point_list[2*E820_X_MAX] __initdata; 256 static struct change_member change_point_list[2*E820_X_MAX] __initdata;
239 static struct change_member *change_point[2*E820_X_MAX] __initdata; 257 static struct change_member *change_point[2*E820_X_MAX] __initdata;
240 static struct e820entry *overlap_list[E820_X_MAX] __initdata; 258 static struct e820entry *overlap_list[E820_X_MAX] __initdata;
241 static struct e820entry new_bios[E820_X_MAX] __initdata; 259 static struct e820entry new_bios[E820_X_MAX] __initdata;
242 struct change_member *change_tmp;
243 unsigned long current_type, last_type; 260 unsigned long current_type, last_type;
244 unsigned long long last_addr; 261 unsigned long long last_addr;
245 int chgidx, still_changing; 262 int chgidx;
246 int overlap_entries; 263 int overlap_entries;
247 int new_bios_entry; 264 int new_bios_entry;
248 int old_nr, new_nr, chg_nr; 265 int old_nr, new_nr, chg_nr;
@@ -279,35 +296,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
279 chg_nr = chgidx; 296 chg_nr = chgidx;
280 297
281 /* sort change-point list by memory addresses (low -> high) */ 298 /* sort change-point list by memory addresses (low -> high) */
282 still_changing = 1; 299 sort(change_point, chg_nr, sizeof *change_point, cpcompare, NULL);
283 while (still_changing) {
284 still_changing = 0;
285 for (i = 1; i < chg_nr; i++) {
286 unsigned long long curaddr, lastaddr;
287 unsigned long long curpbaddr, lastpbaddr;
288
289 curaddr = change_point[i]->addr;
290 lastaddr = change_point[i - 1]->addr;
291 curpbaddr = change_point[i]->pbios->addr;
292 lastpbaddr = change_point[i - 1]->pbios->addr;
293
294 /*
295 * swap entries, when:
296 *
297 * curaddr > lastaddr or
298 * curaddr == lastaddr and curaddr == curpbaddr and
299 * lastaddr != lastpbaddr
300 */
301 if (curaddr < lastaddr ||
302 (curaddr == lastaddr && curaddr == curpbaddr &&
303 lastaddr != lastpbaddr)) {
304 change_tmp = change_point[i];
305 change_point[i] = change_point[i-1];
306 change_point[i-1] = change_tmp;
307 still_changing = 1;
308 }
309 }
310 }
311 300
312 /* create a new bios memory map, removing overlaps */ 301 /* create a new bios memory map, removing overlaps */
313 overlap_entries = 0; /* number of entries in the overlap table */ 302 overlap_entries = 0; /* number of entries in the overlap table */
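sort() replaces the open-coded bubble sort with the same ordering: change points sort by address, and at equal addresses a point that ends its region sorts after one that starts a region. The same comparator logic dropped into user space with qsort(), run on a made-up pair of adjacent regions:

#include <stdio.h>
#include <stdlib.h>

struct region { unsigned long long addr, size; };
struct change_member { struct region *pbios; unsigned long long addr; };

static int cpcompare(const void *a, const void *b)
{
        const struct change_member *const *app = a, *const *bpp = b;
        const struct change_member *ap = *app, *bp = *bpp;

        if (ap->addr != bp->addr)
                return ap->addr > bp->addr ? 1 : -1;
        /* Equal addresses: a region end sorts after a region start. */
        return (ap->addr != ap->pbios->addr) - (bp->addr != bp->pbios->addr);
}

int main(void)
{
        struct region r[2] = { { 0x1000, 0x1000 }, { 0x2000, 0x1000 } };
        struct change_member cm[4] = {
                { &r[0], 0x1000 }, { &r[0], 0x2000 },   /* start/end of r[0] */
                { &r[1], 0x2000 }, { &r[1], 0x3000 },   /* start/end of r[1] */
        };
        struct change_member *cp[4] = { &cm[1], &cm[3], &cm[2], &cm[0] };

        qsort(cp, 4, sizeof(cp[0]), cpcompare);

        for (int i = 0; i < 4; i++)
                printf("%#llx (%s)\n", cp[i]->addr,
                       cp[i]->addr == cp[i]->pbios->addr ? "start" : "end");
        return 0;
}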
@@ -714,7 +703,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn)
714} 703}
715#endif 704#endif
716 705
717#ifdef CONFIG_HIBERNATION 706#ifdef CONFIG_ACPI
718/** 707/**
719 * Mark ACPI NVS memory region, so that we can save/restore it during 708 * Mark ACPI NVS memory region, so that we can save/restore it during
720 * hibernation and the subsequent resume. 709 * hibernation and the subsequent resume.
@@ -727,7 +716,7 @@ static int __init e820_mark_nvs_memory(void)
727 struct e820entry *ei = &e820.map[i]; 716 struct e820entry *ei = &e820.map[i];
728 717
729 if (ei->type == E820_NVS) 718 if (ei->type == E820_NVS)
730 suspend_nvs_register(ei->addr, ei->size); 719 acpi_nvs_register(ei->addr, ei->size);
731 } 720 }
732 721
733 return 0; 722 return 0;
@@ -738,35 +727,17 @@ core_initcall(e820_mark_nvs_memory);
738/* 727/*
739 * pre allocated 4k and reserved it in memblock and e820_saved 728 * pre allocated 4k and reserved it in memblock and e820_saved
740 */ 729 */
741u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) 730u64 __init early_reserve_e820(u64 size, u64 align)
742{ 731{
743 u64 size = 0;
744 u64 addr; 732 u64 addr;
745 u64 start;
746 733
747 for (start = startt; ; start += size) { 734 addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
748 start = memblock_x86_find_in_range_size(start, &size, align); 735 if (addr) {
749 if (start == MEMBLOCK_ERROR) 736 e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED);
750 return 0; 737 printk(KERN_INFO "update e820_saved for early_reserve_e820\n");
751 if (size >= sizet) 738 update_e820_saved();
752 break;
753 } 739 }
754 740
755#ifdef CONFIG_X86_32
756 if (start >= MAXMEM)
757 return 0;
758 if (start + size > MAXMEM)
759 size = MAXMEM - start;
760#endif
761
762 addr = round_down(start + size - sizet, align);
763 if (addr < start)
764 return 0;
765 memblock_x86_reserve_range(addr, addr + sizet, "new next");
766 e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
767 printk(KERN_INFO "update e820_saved for early_reserve_e820\n");
768 update_e820_saved();
769
770 return addr; 741 return addr;
771} 742}
772 743
@@ -1090,7 +1061,7 @@ void __init memblock_x86_fill(void)
1090 * We are safe to enable resizing, beause memblock_x86_fill() 1061 * We are safe to enable resizing, beause memblock_x86_fill()
1091 * is rather later for x86 1062 * is rather later for x86
1092 */ 1063 */
1093 memblock_can_resize = 1; 1064 memblock_allow_resize();
1094 1065
1095 for (i = 0; i < e820.nr_map; i++) { 1066 for (i = 0; i < e820.nr_map; i++) {
1096 struct e820entry *ei = &e820.map[i]; 1067 struct e820entry *ei = &e820.map[i];
@@ -1105,22 +1076,36 @@ void __init memblock_x86_fill(void)
1105 memblock_add(ei->addr, ei->size); 1076 memblock_add(ei->addr, ei->size);
1106 } 1077 }
1107 1078
1108 memblock_analyze();
1109 memblock_dump_all(); 1079 memblock_dump_all();
1110} 1080}
1111 1081
1112void __init memblock_find_dma_reserve(void) 1082void __init memblock_find_dma_reserve(void)
1113{ 1083{
1114#ifdef CONFIG_X86_64 1084#ifdef CONFIG_X86_64
1115 u64 free_size_pfn; 1085 u64 nr_pages = 0, nr_free_pages = 0;
1116 u64 mem_size_pfn; 1086 unsigned long start_pfn, end_pfn;
1087 phys_addr_t start, end;
1088 int i;
1089 u64 u;
1090
1117 /* 1091 /*
1118 * need to find out used area below MAX_DMA_PFN 1092 * need to find out used area below MAX_DMA_PFN
1119 * need to use memblock to get free size in [0, MAX_DMA_PFN] 1093 * need to use memblock to get free size in [0, MAX_DMA_PFN]
1120 * at first, and assume boot_mem will not take below MAX_DMA_PFN 1094 * at first, and assume boot_mem will not take below MAX_DMA_PFN
1121 */ 1095 */
1122 mem_size_pfn = memblock_x86_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT; 1096 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
1123 free_size_pfn = memblock_x86_free_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT; 1097 start_pfn = min_t(unsigned long, start_pfn, MAX_DMA_PFN);
1124 set_dma_reserve(mem_size_pfn - free_size_pfn); 1098 end_pfn = min_t(unsigned long, end_pfn, MAX_DMA_PFN);
1099 nr_pages += end_pfn - start_pfn;
1100 }
1101
1102 for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) {
1103 start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
1104 end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
1105 if (start_pfn < end_pfn)
1106 nr_free_pages += end_pfn - start_pfn;
1107 }
1108
1109 set_dma_reserve(nr_pages - nr_free_pages);
1125#endif 1110#endif
1126} 1111}
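The new memblock_find_dma_reserve() clamps every memory range and every free range to MAX_DMA_PFN, sums the page counts, and reserves the difference. The same arithmetic in a stand-alone sketch; the ranges and helpers here are illustrative, not the memblock API:

#include <stdio.h>

#define MAX_DMA_PFN 0x1000ULL   /* assumed clamp: first 16MB worth of PFNs */

struct range { unsigned long long start, end; };        /* [start, end) PFNs */

static unsigned long long pages_below(const struct range *r, int n)
{
        unsigned long long pages = 0;

        for (int i = 0; i < n; i++) {
                unsigned long long s = r[i].start < MAX_DMA_PFN ? r[i].start : MAX_DMA_PFN;
                unsigned long long e = r[i].end   < MAX_DMA_PFN ? r[i].end   : MAX_DMA_PFN;
                if (s < e)
                        pages += e - s;
        }
        return pages;
}

int main(void)
{
        /* Made-up layout: all memory vs. what is still free below the clamp. */
        struct range mem[]   = { { 0x10, 0x9f }, { 0x100, 0x8000 } };
        struct range avail[] = { { 0x20, 0x9f }, { 0x400, 0x8000 } };

        unsigned long long nr_pages = pages_below(mem, 2);
        unsigned long long nr_free  = pages_below(avail, 2);

        printf("dma reserve = %llu pages\n", nr_pages - nr_free);
        return 0;
}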
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index cd28a350f7f9..9b9f18b49918 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -240,14 +240,14 @@ static int __init setup_early_printk(char *buf)
240 if (!strncmp(buf, "xen", 3)) 240 if (!strncmp(buf, "xen", 3))
241 early_console_register(&xenboot_console, keep); 241 early_console_register(&xenboot_console, keep);
242#endif 242#endif
243#ifdef CONFIG_EARLY_PRINTK_MRST 243#ifdef CONFIG_EARLY_PRINTK_INTEL_MID
244 if (!strncmp(buf, "mrst", 4)) { 244 if (!strncmp(buf, "mrst", 4)) {
245 mrst_early_console_init(); 245 mrst_early_console_init();
246 early_console_register(&early_mrst_console, keep); 246 early_console_register(&early_mrst_console, keep);
247 } 247 }
248 248
249 if (!strncmp(buf, "hsu", 3)) { 249 if (!strncmp(buf, "hsu", 3)) {
250 hsu_early_console_init(); 250 hsu_early_console_init(buf + 3);
251 early_console_register(&early_hsu_console, keep); 251 early_console_register(&early_hsu_console, keep);
252 } 252 }
253#endif 253#endif
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index f3f6f5344001..79d97e68f042 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -42,6 +42,7 @@
42 */ 42 */
43 43
44#include <linux/linkage.h> 44#include <linux/linkage.h>
45#include <linux/err.h>
45#include <asm/thread_info.h> 46#include <asm/thread_info.h>
46#include <asm/irqflags.h> 47#include <asm/irqflags.h>
47#include <asm/errno.h> 48#include <asm/errno.h>
@@ -81,8 +82,6 @@
81 * enough to patch inline, increasing performance. 82 * enough to patch inline, increasing performance.
82 */ 83 */
83 84
84#define nr_syscalls ((syscall_table_size)/4)
85
86#ifdef CONFIG_PREEMPT 85#ifdef CONFIG_PREEMPT
87#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF 86#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
88#else 87#else
@@ -423,7 +422,7 @@ sysenter_past_esp:
423 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) 422 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
424 jnz sysenter_audit 423 jnz sysenter_audit
425sysenter_do_call: 424sysenter_do_call:
426 cmpl $(nr_syscalls), %eax 425 cmpl $(NR_syscalls), %eax
427 jae syscall_badsys 426 jae syscall_badsys
428 call *sys_call_table(,%eax,4) 427 call *sys_call_table(,%eax,4)
429 movl %eax,PT_EAX(%esp) 428 movl %eax,PT_EAX(%esp)
@@ -455,7 +454,7 @@ sysenter_audit:
455 movl %ebx,%ecx /* 3rd arg: 1st syscall arg */ 454 movl %ebx,%ecx /* 3rd arg: 1st syscall arg */
456 movl %eax,%edx /* 2nd arg: syscall number */ 455 movl %eax,%edx /* 2nd arg: syscall number */
457 movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ 456 movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
458 call audit_syscall_entry 457 call __audit_syscall_entry
459 pushl_cfi %ebx 458 pushl_cfi %ebx
460 movl PT_EAX(%esp),%eax /* reload syscall number */ 459 movl PT_EAX(%esp),%eax /* reload syscall number */
461 jmp sysenter_do_call 460 jmp sysenter_do_call
@@ -466,11 +465,10 @@ sysexit_audit:
466 TRACE_IRQS_ON 465 TRACE_IRQS_ON
467 ENABLE_INTERRUPTS(CLBR_ANY) 466 ENABLE_INTERRUPTS(CLBR_ANY)
468 movl %eax,%edx /* second arg, syscall return value */ 467 movl %eax,%edx /* second arg, syscall return value */
469 cmpl $0,%eax /* is it < 0? */ 468 cmpl $-MAX_ERRNO,%eax /* is it an error ? */
470 setl %al /* 1 if so, 0 if not */ 469 setbe %al /* 1 if so, 0 if not */
471 movzbl %al,%eax /* zero-extend that */ 470 movzbl %al,%eax /* zero-extend that */
472 inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ 471 call __audit_syscall_exit
473 call audit_syscall_exit
474 DISABLE_INTERRUPTS(CLBR_ANY) 472 DISABLE_INTERRUPTS(CLBR_ANY)
475 TRACE_IRQS_OFF 473 TRACE_IRQS_OFF
476 movl TI_flags(%ebp), %ecx 474 movl TI_flags(%ebp), %ecx
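The audit exit paths above stop treating every negative-looking return as a failure: a syscall result is an error only if it falls in the range [-MAX_ERRNO, -1], which is exactly what the cmpl $-MAX_ERRNO / setbe pair computes (mmap(), for instance, may legitimately return values with the sign bit set). The same test written in C; MAX_ERRNO is 4095 in the kernel headers:

#include <stdio.h>

#define MAX_ERRNO 4095UL

/* An error return occupies the top MAX_ERRNO values of the address space. */
static int is_error_return(unsigned long rax)
{
        return rax >= (unsigned long)-MAX_ERRNO;
}

int main(void)
{
        /* Assumes a 64-bit build so the large sample value fits. */
        unsigned long samples[] = {
                0,                              /* plain success           */
                (unsigned long)-22,             /* -EINVAL: a real error   */
                0xffff800000000000UL,           /* sign bit set, not error */
        };

        for (int i = 0; i < 3; i++)
                printf("%#lx -> %s\n", samples[i],
                       is_error_return(samples[i]) ? "error" : "success");
        return 0;
}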
@@ -504,7 +502,7 @@ ENTRY(system_call)
504 # system call tracing in operation / emulation 502 # system call tracing in operation / emulation
505 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) 503 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
506 jnz syscall_trace_entry 504 jnz syscall_trace_entry
507 cmpl $(nr_syscalls), %eax 505 cmpl $(NR_syscalls), %eax
508 jae syscall_badsys 506 jae syscall_badsys
509syscall_call: 507syscall_call:
510 call *sys_call_table(,%eax,4) 508 call *sys_call_table(,%eax,4)
@@ -625,6 +623,8 @@ work_notifysig: # deal with pending signals and
625 movl %esp, %eax 623 movl %esp, %eax
626 jne work_notifysig_v86 # returning to kernel-space or 624 jne work_notifysig_v86 # returning to kernel-space or
627 # vm86-space 625 # vm86-space
626 TRACE_IRQS_ON
627 ENABLE_INTERRUPTS(CLBR_NONE)
628 xorl %edx, %edx 628 xorl %edx, %edx
629 call do_notify_resume 629 call do_notify_resume
630 jmp resume_userspace_sig 630 jmp resume_userspace_sig
@@ -638,6 +638,8 @@ work_notifysig_v86:
638#else 638#else
639 movl %esp, %eax 639 movl %esp, %eax
640#endif 640#endif
641 TRACE_IRQS_ON
642 ENABLE_INTERRUPTS(CLBR_NONE)
641 xorl %edx, %edx 643 xorl %edx, %edx
642 call do_notify_resume 644 call do_notify_resume
643 jmp resume_userspace_sig 645 jmp resume_userspace_sig
@@ -650,7 +652,7 @@ syscall_trace_entry:
650 movl %esp, %eax 652 movl %esp, %eax
651 call syscall_trace_enter 653 call syscall_trace_enter
652 /* What it returned is what we'll actually use. */ 654 /* What it returned is what we'll actually use. */
653 cmpl $(nr_syscalls), %eax 655 cmpl $(NR_syscalls), %eax
654 jnae syscall_call 656 jnae syscall_call
655 jmp syscall_exit 657 jmp syscall_exit
656END(syscall_trace_entry) 658END(syscall_trace_entry)
@@ -690,29 +692,28 @@ END(syscall_badsys)
690 * System calls that need a pt_regs pointer. 692 * System calls that need a pt_regs pointer.
691 */ 693 */
692#define PTREGSCALL0(name) \ 694#define PTREGSCALL0(name) \
693 ALIGN; \ 695ENTRY(ptregs_##name) ; \
694ptregs_##name: \
695 leal 4(%esp),%eax; \ 696 leal 4(%esp),%eax; \
696 jmp sys_##name; 697 jmp sys_##name; \
698ENDPROC(ptregs_##name)
697 699
698#define PTREGSCALL1(name) \ 700#define PTREGSCALL1(name) \
699 ALIGN; \ 701ENTRY(ptregs_##name) ; \
700ptregs_##name: \
701 leal 4(%esp),%edx; \ 702 leal 4(%esp),%edx; \
702 movl (PT_EBX+4)(%esp),%eax; \ 703 movl (PT_EBX+4)(%esp),%eax; \
703 jmp sys_##name; 704 jmp sys_##name; \
705ENDPROC(ptregs_##name)
704 706
705#define PTREGSCALL2(name) \ 707#define PTREGSCALL2(name) \
706 ALIGN; \ 708ENTRY(ptregs_##name) ; \
707ptregs_##name: \
708 leal 4(%esp),%ecx; \ 709 leal 4(%esp),%ecx; \
709 movl (PT_ECX+4)(%esp),%edx; \ 710 movl (PT_ECX+4)(%esp),%edx; \
710 movl (PT_EBX+4)(%esp),%eax; \ 711 movl (PT_EBX+4)(%esp),%eax; \
711 jmp sys_##name; 712 jmp sys_##name; \
713ENDPROC(ptregs_##name)
712 714
713#define PTREGSCALL3(name) \ 715#define PTREGSCALL3(name) \
714 ALIGN; \ 716ENTRY(ptregs_##name) ; \
715ptregs_##name: \
716 CFI_STARTPROC; \ 717 CFI_STARTPROC; \
717 leal 4(%esp),%eax; \ 718 leal 4(%esp),%eax; \
718 pushl_cfi %eax; \ 719 pushl_cfi %eax; \
@@ -737,8 +738,7 @@ PTREGSCALL2(vm86)
737PTREGSCALL1(vm86old) 738PTREGSCALL1(vm86old)
738 739
739/* Clone is an oddball. The 4th arg is in %edi */ 740/* Clone is an oddball. The 4th arg is in %edi */
740 ALIGN; 741ENTRY(ptregs_clone)
741ptregs_clone:
742 CFI_STARTPROC 742 CFI_STARTPROC
743 leal 4(%esp),%eax 743 leal 4(%esp),%eax
744 pushl_cfi %eax 744 pushl_cfi %eax
@@ -1209,11 +1209,6 @@ return_to_handler:
1209 jmp *%ecx 1209 jmp *%ecx
1210#endif 1210#endif
1211 1211
1212.section .rodata,"a"
1213#include "syscall_table_32.S"
1214
1215syscall_table_size=(.-sys_call_table)
1216
1217/* 1212/*
1218 * Some functions should be protected against kprobes 1213 * Some functions should be protected against kprobes
1219 */ 1214 */
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index faf8d5e74b0b..1333d9851778 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -55,6 +55,7 @@
55#include <asm/paravirt.h> 55#include <asm/paravirt.h>
56#include <asm/ftrace.h> 56#include <asm/ftrace.h>
57#include <asm/percpu.h> 57#include <asm/percpu.h>
58#include <linux/err.h>
58 59
59/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ 60/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
60#include <linux/elf-em.h> 61#include <linux/elf-em.h>
@@ -221,7 +222,7 @@ ENDPROC(native_usergs_sysret64)
221 /*CFI_REL_OFFSET ss,0*/ 222 /*CFI_REL_OFFSET ss,0*/
222 pushq_cfi %rax /* rsp */ 223 pushq_cfi %rax /* rsp */
223 CFI_REL_OFFSET rsp,0 224 CFI_REL_OFFSET rsp,0
224 pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */ 225 pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_BIT1) /* eflags - interrupts on */
225 /*CFI_REL_OFFSET rflags,0*/ 226 /*CFI_REL_OFFSET rflags,0*/
226 pushq_cfi $__KERNEL_CS /* cs */ 227 pushq_cfi $__KERNEL_CS /* cs */
227 /*CFI_REL_OFFSET cs,0*/ 228 /*CFI_REL_OFFSET cs,0*/
@@ -411,7 +412,7 @@ ENTRY(ret_from_fork)
411 RESTORE_REST 412 RESTORE_REST
412 413
413 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? 414 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
414 je int_ret_from_sys_call 415 jz retint_restore_args
415 416
416 testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET 417 testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET
417 jnz int_ret_from_sys_call 418 jnz int_ret_from_sys_call
@@ -465,7 +466,7 @@ ENTRY(system_call)
465 * after the swapgs, so that it can do the swapgs 466 * after the swapgs, so that it can do the swapgs
466 * for the guest and jump here on syscall. 467 * for the guest and jump here on syscall.
467 */ 468 */
468ENTRY(system_call_after_swapgs) 469GLOBAL(system_call_after_swapgs)
469 470
470 movq %rsp,PER_CPU_VAR(old_rsp) 471 movq %rsp,PER_CPU_VAR(old_rsp)
471 movq PER_CPU_VAR(kernel_stack),%rsp 472 movq PER_CPU_VAR(kernel_stack),%rsp
@@ -478,8 +479,7 @@ ENTRY(system_call_after_swapgs)
478 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) 479 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
479 movq %rcx,RIP-ARGOFFSET(%rsp) 480 movq %rcx,RIP-ARGOFFSET(%rsp)
480 CFI_REL_OFFSET rip,RIP-ARGOFFSET 481 CFI_REL_OFFSET rip,RIP-ARGOFFSET
481 GET_THREAD_INFO(%rcx) 482 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
482 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
483 jnz tracesys 483 jnz tracesys
484system_call_fastpath: 484system_call_fastpath:
485 cmpq $__NR_syscall_max,%rax 485 cmpq $__NR_syscall_max,%rax
@@ -496,10 +496,9 @@ ret_from_sys_call:
496 /* edi: flagmask */ 496 /* edi: flagmask */
497sysret_check: 497sysret_check:
498 LOCKDEP_SYS_EXIT 498 LOCKDEP_SYS_EXIT
499 GET_THREAD_INFO(%rcx)
500 DISABLE_INTERRUPTS(CLBR_NONE) 499 DISABLE_INTERRUPTS(CLBR_NONE)
501 TRACE_IRQS_OFF 500 TRACE_IRQS_OFF
502 movl TI_flags(%rcx),%edx 501 movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
503 andl %edi,%edx 502 andl %edi,%edx
504 jnz sysret_careful 503 jnz sysret_careful
505 CFI_REMEMBER_STATE 504 CFI_REMEMBER_STATE
@@ -550,7 +549,7 @@ badsys:
550#ifdef CONFIG_AUDITSYSCALL 549#ifdef CONFIG_AUDITSYSCALL
551 /* 550 /*
552 * Fast path for syscall audit without full syscall trace. 551 * Fast path for syscall audit without full syscall trace.
553 * We just call audit_syscall_entry() directly, and then 552 * We just call __audit_syscall_entry() directly, and then
554 * jump back to the normal fast path. 553 * jump back to the normal fast path.
555 */ 554 */
556auditsys: 555auditsys:
@@ -560,22 +559,21 @@ auditsys:
560 movq %rdi,%rdx /* 3rd arg: 1st syscall arg */ 559 movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
561 movq %rax,%rsi /* 2nd arg: syscall number */ 560 movq %rax,%rsi /* 2nd arg: syscall number */
562 movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */ 561 movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
563 call audit_syscall_entry 562 call __audit_syscall_entry
564 LOAD_ARGS 0 /* reload call-clobbered registers */ 563 LOAD_ARGS 0 /* reload call-clobbered registers */
565 jmp system_call_fastpath 564 jmp system_call_fastpath
566 565
567 /* 566 /*
568 * Return fast path for syscall audit. Call audit_syscall_exit() 567 * Return fast path for syscall audit. Call __audit_syscall_exit()
569 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT 568 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
570 * masked off. 569 * masked off.
571 */ 570 */
572sysret_audit: 571sysret_audit:
573 movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */ 572 movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */
574 cmpq $0,%rsi /* is it < 0? */ 573 cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */
575 setl %al /* 1 if so, 0 if not */ 574 setbe %al /* 1 if so, 0 if not */
576 movzbl %al,%edi /* zero-extend that into %edi */ 575 movzbl %al,%edi /* zero-extend that into %edi */
577 inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ 576 call __audit_syscall_exit
578 call audit_syscall_exit
579 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi 577 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
580 jmp sysret_check 578 jmp sysret_check
581#endif /* CONFIG_AUDITSYSCALL */ 579#endif /* CONFIG_AUDITSYSCALL */
@@ -583,7 +581,7 @@ sysret_audit:
583 /* Do syscall tracing */ 581 /* Do syscall tracing */
584tracesys: 582tracesys:
585#ifdef CONFIG_AUDITSYSCALL 583#ifdef CONFIG_AUDITSYSCALL
586 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx) 584 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
587 jz auditsys 585 jz auditsys
588#endif 586#endif
589 SAVE_REST 587 SAVE_REST
@@ -612,8 +610,6 @@ tracesys:
612GLOBAL(int_ret_from_sys_call) 610GLOBAL(int_ret_from_sys_call)
613 DISABLE_INTERRUPTS(CLBR_NONE) 611 DISABLE_INTERRUPTS(CLBR_NONE)
614 TRACE_IRQS_OFF 612 TRACE_IRQS_OFF
615 testl $3,CS-ARGOFFSET(%rsp)
616 je retint_restore_args
617 movl $_TIF_ALLWORK_MASK,%edi 613 movl $_TIF_ALLWORK_MASK,%edi
618 /* edi: mask to check */ 614 /* edi: mask to check */
619GLOBAL(int_with_check) 615GLOBAL(int_with_check)
@@ -953,6 +949,7 @@ END(common_interrupt)
953ENTRY(\sym) 949ENTRY(\sym)
954 INTR_FRAME 950 INTR_FRAME
955 pushq_cfi $~(\num) 951 pushq_cfi $~(\num)
952.Lcommon_\sym:
956 interrupt \do_sym 953 interrupt \do_sym
957 jmp ret_from_intr 954 jmp ret_from_intr
958 CFI_ENDPROC 955 CFI_ENDPROC
@@ -976,13 +973,21 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \
976 x86_platform_ipi smp_x86_platform_ipi 973 x86_platform_ipi smp_x86_platform_ipi
977 974
978#ifdef CONFIG_SMP 975#ifdef CONFIG_SMP
979.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ 976 ALIGN
977 INTR_FRAME
978.irp idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
980 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 979 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
981.if NUM_INVALIDATE_TLB_VECTORS > \idx 980.if NUM_INVALIDATE_TLB_VECTORS > \idx
982apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \ 981ENTRY(invalidate_interrupt\idx)
983 invalidate_interrupt\idx smp_invalidate_interrupt 982 pushq_cfi $~(INVALIDATE_TLB_VECTOR_START+\idx)
983 jmp .Lcommon_invalidate_interrupt0
984 CFI_ADJUST_CFA_OFFSET -8
985END(invalidate_interrupt\idx)
984.endif 986.endif
985.endr 987.endr
988 CFI_ENDPROC
989apicinterrupt INVALIDATE_TLB_VECTOR_START, \
990 invalidate_interrupt0, smp_invalidate_interrupt
986#endif 991#endif
987 992
988apicinterrupt THRESHOLD_APIC_VECTOR \ 993apicinterrupt THRESHOLD_APIC_VECTOR \
@@ -1475,62 +1480,221 @@ ENTRY(error_exit)
1475 CFI_ENDPROC 1480 CFI_ENDPROC
1476END(error_exit) 1481END(error_exit)
1477 1482
1483/*
1484 * Test if a given stack is an NMI stack or not.
1485 */
1486 .macro test_in_nmi reg stack nmi_ret normal_ret
1487 cmpq %\reg, \stack
1488 ja \normal_ret
1489 subq $EXCEPTION_STKSZ, %\reg
1490 cmpq %\reg, \stack
1491 jb \normal_ret
1492 jmp \nmi_ret
1493 .endm
1478 1494
1479 /* runs on exception stack */ 1495 /* runs on exception stack */
1480ENTRY(nmi) 1496ENTRY(nmi)
1481 INTR_FRAME 1497 INTR_FRAME
1482 PARAVIRT_ADJUST_EXCEPTION_FRAME 1498 PARAVIRT_ADJUST_EXCEPTION_FRAME
1483 pushq_cfi $-1 1499 /*
1500 * We allow breakpoints in NMIs. If a breakpoint occurs, then
1501 * the iretq it performs will take us out of NMI context.
1502 * This means that we can have nested NMIs where the next
1503 * NMI is using the top of the stack of the previous NMI. We
1504 * can't let it execute because the nested NMI will corrupt the
1505 * stack of the previous NMI. NMI handlers are not re-entrant
1506 * anyway.
1507 *
1508 * To handle this case we do the following:
1509 * Check the a special location on the stack that contains
1510 * a variable that is set when NMIs are executing.
1511 * The interrupted task's stack is also checked to see if it
1512 * is an NMI stack.
1513 * If the variable is not set and the stack is not the NMI
1514 * stack then:
1515 * o Set the special variable on the stack
1516 * o Copy the interrupt frame into a "saved" location on the stack
1517 * o Copy the interrupt frame into a "copy" location on the stack
1518 * o Continue processing the NMI
1519 * If the variable is set or the previous stack is the NMI stack:
 1520 * o Modify the "copy" location to jump to repeat_nmi
1521 * o return back to the first NMI
1522 *
 1523 * Now on exit of the first NMI, we first clear the stack variable.
1524 * The NMI stack will tell any nested NMIs at that point that it is
1525 * nested. Then we pop the stack normally with iret, and if there was
1526 * a nested NMI that updated the copy interrupt stack frame, a
1527 * jump will be made to the repeat_nmi code that will handle the second
1528 * NMI.
1529 */
1530
 1531 /* Use %rdx as our temp variable throughout */
1532 pushq_cfi %rdx
1533
1534 /*
1535 * If %cs was not the kernel segment, then the NMI triggered in user
1536 * space, which means it is definitely not nested.
1537 */
1538 cmpl $__KERNEL_CS, 16(%rsp)
1539 jne first_nmi
1540
1541 /*
1542 * Check the special variable on the stack to see if NMIs are
1543 * executing.
1544 */
1545 cmpl $1, -8(%rsp)
1546 je nested_nmi
1547
1548 /*
1549 * Now test if the previous stack was an NMI stack.
1550 * We need the double check. We check the NMI stack to satisfy the
1551 * race when the first NMI clears the variable before returning.
1552 * We check the variable because the first NMI could be in a
1553 * breakpoint routine using a breakpoint stack.
1554 */
1555 lea 6*8(%rsp), %rdx
1556 test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
1557
1558nested_nmi:
1559 /*
1560 * Do nothing if we interrupted the fixup in repeat_nmi.
1561 * It's about to repeat the NMI handler, so we are fine
1562 * with ignoring this one.
1563 */
1564 movq $repeat_nmi, %rdx
1565 cmpq 8(%rsp), %rdx
1566 ja 1f
1567 movq $end_repeat_nmi, %rdx
1568 cmpq 8(%rsp), %rdx
1569 ja nested_nmi_out
1570
15711:
1572 /* Set up the interrupted NMIs stack to jump to repeat_nmi */
1573 leaq -6*8(%rsp), %rdx
1574 movq %rdx, %rsp
1575 CFI_ADJUST_CFA_OFFSET 6*8
1576 pushq_cfi $__KERNEL_DS
1577 pushq_cfi %rdx
1578 pushfq_cfi
1579 pushq_cfi $__KERNEL_CS
1580 pushq_cfi $repeat_nmi
1581
1582 /* Put stack back */
1583 addq $(11*8), %rsp
1584 CFI_ADJUST_CFA_OFFSET -11*8
1585
1586nested_nmi_out:
1587 popq_cfi %rdx
1588
1589 /* No need to check faults here */
1590 INTERRUPT_RETURN
1591
1592first_nmi:
1593 /*
1594 * Because nested NMIs will use the pushed location that we
1595 * stored in rdx, we must keep that space available.
1596 * Here's what our stack frame will look like:
1597 * +-------------------------+
1598 * | original SS |
1599 * | original Return RSP |
1600 * | original RFLAGS |
1601 * | original CS |
1602 * | original RIP |
1603 * +-------------------------+
1604 * | temp storage for rdx |
1605 * +-------------------------+
1606 * | NMI executing variable |
1607 * +-------------------------+
1608 * | Saved SS |
1609 * | Saved Return RSP |
1610 * | Saved RFLAGS |
1611 * | Saved CS |
1612 * | Saved RIP |
1613 * +-------------------------+
1614 * | copied SS |
1615 * | copied Return RSP |
1616 * | copied RFLAGS |
1617 * | copied CS |
1618 * | copied RIP |
1619 * +-------------------------+
1620 * | pt_regs |
1621 * +-------------------------+
1622 *
1623 * The saved RIP is used to fix up the copied RIP that a nested
1624 * NMI may zero out. The original stack frame and the temp storage
1625 * is also used by nested NMIs and can not be trusted on exit.
1626 */
1627 /* Set the NMI executing variable on the stack. */
1628 pushq_cfi $1
1629
1630 /* Copy the stack frame to the Saved frame */
1631 .rept 5
1632 pushq_cfi 6*8(%rsp)
1633 .endr
1634
1635 /* Make another copy, this one may be modified by nested NMIs */
1636 .rept 5
1637 pushq_cfi 4*8(%rsp)
1638 .endr
1639
1640 /* Do not pop rdx, nested NMIs will corrupt it */
1641 movq 11*8(%rsp), %rdx
1642
1643 /*
1644 * Everything below this point can be preempted by a nested
1645 * NMI if the first NMI took an exception. Repeated NMIs
1646 * caused by an exception and nested NMI will start here, and
1647 * can still be preempted by another NMI.
1648 */
1649restart_nmi:
1650 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1484 subq $ORIG_RAX-R15, %rsp 1651 subq $ORIG_RAX-R15, %rsp
1485 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 1652 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1653 /*
1654 * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
1655 * as we should not be calling schedule in NMI context.
1656 * Even with normal interrupts enabled. An NMI should not be
1657 * setting NEED_RESCHED or anything that normal interrupts and
1658 * exceptions might do.
1659 */
1486 call save_paranoid 1660 call save_paranoid
1487 DEFAULT_FRAME 0 1661 DEFAULT_FRAME 0
1488 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ 1662 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
1489 movq %rsp,%rdi 1663 movq %rsp,%rdi
1490 movq $-1,%rsi 1664 movq $-1,%rsi
1491 call do_nmi 1665 call do_nmi
1492#ifdef CONFIG_TRACE_IRQFLAGS
1493 /* paranoidexit; without TRACE_IRQS_OFF */
1494 /* ebx: no swapgs flag */
1495 DISABLE_INTERRUPTS(CLBR_NONE)
1496 testl %ebx,%ebx /* swapgs needed? */ 1666 testl %ebx,%ebx /* swapgs needed? */
1497 jnz nmi_restore 1667 jnz nmi_restore
1498 testl $3,CS(%rsp)
1499 jnz nmi_userspace
1500nmi_swapgs: 1668nmi_swapgs:
1501 SWAPGS_UNSAFE_STACK 1669 SWAPGS_UNSAFE_STACK
1502nmi_restore: 1670nmi_restore:
1503 RESTORE_ALL 8 1671 RESTORE_ALL 8
1672 /* Clear the NMI executing stack variable */
1673 movq $0, 10*8(%rsp)
1504 jmp irq_return 1674 jmp irq_return
1505nmi_userspace:
1506 GET_THREAD_INFO(%rcx)
1507 movl TI_flags(%rcx),%ebx
1508 andl $_TIF_WORK_MASK,%ebx
1509 jz nmi_swapgs
1510 movq %rsp,%rdi /* &pt_regs */
1511 call sync_regs
1512 movq %rax,%rsp /* switch stack for scheduling */
1513 testl $_TIF_NEED_RESCHED,%ebx
1514 jnz nmi_schedule
1515 movl %ebx,%edx /* arg3: thread flags */
1516 ENABLE_INTERRUPTS(CLBR_NONE)
1517 xorl %esi,%esi /* arg2: oldset */
1518 movq %rsp,%rdi /* arg1: &pt_regs */
1519 call do_notify_resume
1520 DISABLE_INTERRUPTS(CLBR_NONE)
1521 jmp nmi_userspace
1522nmi_schedule:
1523 ENABLE_INTERRUPTS(CLBR_ANY)
1524 call schedule
1525 DISABLE_INTERRUPTS(CLBR_ANY)
1526 jmp nmi_userspace
1527 CFI_ENDPROC 1675 CFI_ENDPROC
1528#else
1529 jmp paranoid_exit
1530 CFI_ENDPROC
1531#endif
1532END(nmi) 1676END(nmi)
1533 1677
1678 /*
1679 * If an NMI hit an iret because of an exception or breakpoint,
1680 * it can lose its NMI context, and a nested NMI may come in.
1681 * In that case, the nested NMI will change the preempted NMI's
1682 * stack to jump to here when it does the final iret.
1683 */
1684repeat_nmi:
1685 INTR_FRAME
1686 /* Update the stack variable to say we are still in NMI */
1687 movq $1, 5*8(%rsp)
1688
 1689 /* copy the saved stack back to the copy stack */
1690 .rept 5
1691 pushq_cfi 4*8(%rsp)
1692 .endr
1693
1694 jmp restart_nmi
1695 CFI_ENDPROC
1696end_repeat_nmi:
1697
1534ENTRY(ignore_sysret) 1698ENTRY(ignore_sysret)
1535 CFI_STARTPROC 1699 CFI_STARTPROC
1536 mov $-ENOSYS,%eax 1700 mov $-ENOSYS,%eax
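The long comment in the nmi entry above describes a small state machine: an "NMI executing" word on the NMI stack plus a check of the interrupted stack decide whether this is a first NMI (save and copy the iret frame, then run the handler) or a nested one (repoint the copied frame at repeat_nmi and return immediately). A rough C model of just that decision; the frame copying and the assembly details are intentionally left out:

#include <stdbool.h>
#include <stdio.h>

enum nmi_action { FIRST_NMI, NESTED_NMI };

struct nmi_state {
        bool from_userspace;    /* %cs on the stack was not __KERNEL_CS */
        bool executing_flag;    /* the "NMI executing" stack variable is 1 */
        bool on_nmi_stack;      /* interrupted %rsp points into the NMI stack */
};

static enum nmi_action classify(const struct nmi_state *s)
{
        /* User space cannot be in the middle of an NMI handler. */
        if (s->from_userspace)
                return FIRST_NMI;

        /* Either signal means we interrupted an NMI that is still running. */
        if (s->executing_flag || s->on_nmi_stack)
                return NESTED_NMI;

        return FIRST_NMI;
}

int main(void)
{
        struct nmi_state in_handler = { false, true, false };
        struct nmi_state normal     = { false, false, false };

        /* FIRST_NMI: set the flag, save + copy the iret frame, run handler.
         * NESTED_NMI: rewrite the copied frame to return to repeat_nmi. */
        printf("interrupted an NMI handler: %s\n",
               classify(&in_handler) == NESTED_NMI ? "nested" : "first");
        printf("normal kernel context:      %s\n",
               classify(&normal) == NESTED_NMI ? "nested" : "first");
        return 0;
}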
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
index af0699ba48cf..48d9d4ea1020 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/head.c
@@ -52,5 +52,5 @@ void __init reserve_ebda_region(void)
52 lowmem = 0x9f000; 52 lowmem = 0x9f000;
53 53
54 /* reserve all memory between lowmem and the 1MB mark */ 54 /* reserve all memory between lowmem and the 1MB mark */
55 memblock_x86_reserve_range(lowmem, 0x100000, "* BIOS reserved"); 55 memblock_reserve(lowmem, 0x100000 - lowmem);
56} 56}
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 3bb08509a7a1..51ff18616d50 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -31,9 +31,8 @@ static void __init i386_default_early_setup(void)
31 31
32void __init i386_start_kernel(void) 32void __init i386_start_kernel(void)
33{ 33{
34 memblock_init(); 34 memblock_reserve(__pa_symbol(&_text),
35 35 __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
36 memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
37 36
38#ifdef CONFIG_BLK_DEV_INITRD 37#ifdef CONFIG_BLK_DEV_INITRD
39 /* Reserve INITRD */ 38 /* Reserve INITRD */
@@ -42,7 +41,7 @@ void __init i386_start_kernel(void)
42 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 41 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
43 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 42 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
44 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); 43 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
45 memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK"); 44 memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
46 } 45 }
47#endif 46#endif
48 47
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 5655c2272adb..3a3b779f41d3 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -98,9 +98,8 @@ void __init x86_64_start_reservations(char *real_mode_data)
98{ 98{
99 copy_bootdata(__va(real_mode_data)); 99 copy_bootdata(__va(real_mode_data));
100 100
101 memblock_init(); 101 memblock_reserve(__pa_symbol(&_text),
102 102 __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
103 memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
104 103
105#ifdef CONFIG_BLK_DEV_INITRD 104#ifdef CONFIG_BLK_DEV_INITRD
106 /* Reserve INITRD */ 105 /* Reserve INITRD */
@@ -109,7 +108,7 @@ void __init x86_64_start_reservations(char *real_mode_data)
109 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; 108 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
110 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; 109 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
111 unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); 110 unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
112 memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK"); 111 memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
113 } 112 }
114#endif 113#endif
115 114
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index e11e39478a49..40f4eb3766d1 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -417,6 +417,10 @@ ENTRY(phys_base)
417ENTRY(idt_table) 417ENTRY(idt_table)
418 .skip IDT_ENTRIES * 16 418 .skip IDT_ENTRIES * 16
419 419
420 .align L1_CACHE_BYTES
421ENTRY(nmi_idt_table)
422 .skip IDT_ENTRIES * 16
423
420 __PAGE_ALIGNED_BSS 424 __PAGE_ALIGNED_BSS
421 .align PAGE_SIZE 425 .align PAGE_SIZE
422ENTRY(empty_zero_page) 426ENTRY(empty_zero_page)
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index b946a9eac7d9..ad0de0c2714e 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -2,7 +2,6 @@
2#include <linux/clockchips.h> 2#include <linux/clockchips.h>
3#include <linux/interrupt.h> 3#include <linux/interrupt.h>
4#include <linux/export.h> 4#include <linux/export.h>
5#include <linux/sysdev.h>
6#include <linux/delay.h> 5#include <linux/delay.h>
7#include <linux/errno.h> 6#include <linux/errno.h>
8#include <linux/i8253.h> 7#include <linux/i8253.h>
@@ -32,8 +31,6 @@
32#define HPET_MIN_CYCLES 128 31#define HPET_MIN_CYCLES 128
33#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1)) 32#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1))
34 33
35#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt)
36
37/* 34/*
38 * HPET address is set in acpi/boot.c, when an ACPI entry exists 35 * HPET address is set in acpi/boot.c, when an ACPI entry exists
39 */ 36 */
@@ -55,6 +52,11 @@ struct hpet_dev {
55 char name[10]; 52 char name[10];
56}; 53};
57 54
55inline struct hpet_dev *EVT_TO_HPET_DEV(struct clock_event_device *evtdev)
56{
57 return container_of(evtdev, struct hpet_dev, evt);
58}
59
58inline unsigned int hpet_readl(unsigned int a) 60inline unsigned int hpet_readl(unsigned int a)
59{ 61{
60 return readl(hpet_virt_address + a); 62 return readl(hpet_virt_address + a);
@@ -1049,6 +1051,14 @@ int hpet_rtc_timer_init(void)
1049} 1051}
1050EXPORT_SYMBOL_GPL(hpet_rtc_timer_init); 1052EXPORT_SYMBOL_GPL(hpet_rtc_timer_init);
1051 1053
1054static void hpet_disable_rtc_channel(void)
1055{
1056 unsigned long cfg;
1057 cfg = hpet_readl(HPET_T1_CFG);
1058 cfg &= ~HPET_TN_ENABLE;
1059 hpet_writel(cfg, HPET_T1_CFG);
1060}
1061
1052/* 1062/*
1053 * The functions below are called from rtc driver. 1063 * The functions below are called from rtc driver.
1054 * Return 0 if HPET is not being used. 1064 * Return 0 if HPET is not being used.
@@ -1060,6 +1070,9 @@ int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
1060 return 0; 1070 return 0;
1061 1071
1062 hpet_rtc_flags &= ~bit_mask; 1072 hpet_rtc_flags &= ~bit_mask;
1073 if (unlikely(!hpet_rtc_flags))
1074 hpet_disable_rtc_channel();
1075
1063 return 1; 1076 return 1;
1064} 1077}
1065EXPORT_SYMBOL_GPL(hpet_mask_rtc_irq_bit); 1078EXPORT_SYMBOL_GPL(hpet_mask_rtc_irq_bit);
@@ -1125,15 +1138,11 @@ EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq);
1125 1138
1126static void hpet_rtc_timer_reinit(void) 1139static void hpet_rtc_timer_reinit(void)
1127{ 1140{
1128 unsigned int cfg, delta; 1141 unsigned int delta;
1129 int lost_ints = -1; 1142 int lost_ints = -1;
1130 1143
1131 if (unlikely(!hpet_rtc_flags)) { 1144 if (unlikely(!hpet_rtc_flags))
1132 cfg = hpet_readl(HPET_T1_CFG); 1145 hpet_disable_rtc_channel();
1133 cfg &= ~HPET_TN_ENABLE;
1134 hpet_writel(cfg, HPET_T1_CFG);
1135 return;
1136 }
1137 1146
1138 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) 1147 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
1139 delta = hpet_default_delta; 1148 delta = hpet_default_delta;
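
The hpet.c change factors the "mask timer 1" sequence out into hpet_disable_rtc_channel() and calls it both when the last RTC flag is cleared in hpet_mask_rtc_irq_bit() and from hpet_rtc_timer_reinit(). A self-contained sketch of that read-modify-write helper against a fake register file; the offsets and the enable bit are modeled on the kernel's HPET definitions, and the backing array is purely illustrative:

#include <stdio.h>
#include <stdint.h>

#define HPET_T1_CFG    0x120
#define HPET_TN_ENABLE 0x004

static uint32_t fake_mmio[0x400 / 4];    /* stands in for the MMIO window */

static uint32_t hpet_readl(unsigned int a)          { return fake_mmio[a / 4]; }
static void hpet_writel(uint32_t v, unsigned int a) { fake_mmio[a / 4] = v; }

/* Same shape as the new helper: clear timer 1's enable bit. */
static void hpet_disable_rtc_channel(void)
{
        uint32_t cfg = hpet_readl(HPET_T1_CFG);

        cfg &= ~HPET_TN_ENABLE;
        hpet_writel(cfg, HPET_T1_CFG);
}

int main(void)
{
        hpet_writel(HPET_TN_ENABLE | 0x100, HPET_T1_CFG);
        hpet_disable_rtc_channel();
        printf("T1_CFG = %#x\n", hpet_readl(HPET_T1_CFG));  /* enable bit gone */
        return 0;
}
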
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 429e0c92924e..7943e0c21bde 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -74,6 +74,10 @@ int arch_show_interrupts(struct seq_file *p, int prec)
74 for_each_online_cpu(j) 74 for_each_online_cpu(j)
75 seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs); 75 seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs);
76 seq_printf(p, " IRQ work interrupts\n"); 76 seq_printf(p, " IRQ work interrupts\n");
77 seq_printf(p, "%*s: ", prec, "RTR");
78 for_each_online_cpu(j)
79 seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count);
80 seq_printf(p, " APIC ICR read retries\n");
77#endif 81#endif
78 if (x86_platform_ipi_callback) { 82 if (x86_platform_ipi_callback) {
79 seq_printf(p, "%*s: ", prec, "PLT"); 83 seq_printf(p, "%*s: ", prec, "PLT");
@@ -136,6 +140,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
136 sum += irq_stats(cpu)->irq_spurious_count; 140 sum += irq_stats(cpu)->irq_spurious_count;
137 sum += irq_stats(cpu)->apic_perf_irqs; 141 sum += irq_stats(cpu)->apic_perf_irqs;
138 sum += irq_stats(cpu)->apic_irq_work_irqs; 142 sum += irq_stats(cpu)->apic_irq_work_irqs;
143 sum += irq_stats(cpu)->icr_read_retry_count;
139#endif 144#endif
140 if (x86_platform_ipi_callback) 145 if (x86_platform_ipi_callback)
141 sum += irq_stats(cpu)->x86_platform_ipis; 146 sum += irq_stats(cpu)->x86_platform_ipis;
@@ -181,8 +186,8 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
181 unsigned vector = ~regs->orig_ax; 186 unsigned vector = ~regs->orig_ax;
182 unsigned irq; 187 unsigned irq;
183 188
184 exit_idle();
185 irq_enter(); 189 irq_enter();
190 exit_idle();
186 191
187 irq = __this_cpu_read(vector_irq[vector]); 192 irq = __this_cpu_read(vector_irq[vector]);
188 193
@@ -209,10 +214,10 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
209 214
210 ack_APIC_irq(); 215 ack_APIC_irq();
211 216
212 exit_idle();
213
214 irq_enter(); 217 irq_enter();
215 218
219 exit_idle();
220
216 inc_irq_stat(x86_platform_ipis); 221 inc_irq_stat(x86_platform_ipis);
217 222
218 if (x86_platform_ipi_callback) 223 if (x86_platform_ipi_callback)
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 72090705a656..40fc86161d92 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -28,6 +28,9 @@ DEFINE_PER_CPU(struct pt_regs *, irq_regs);
28EXPORT_PER_CPU_SYMBOL(irq_regs); 28EXPORT_PER_CPU_SYMBOL(irq_regs);
29 29
30#ifdef CONFIG_DEBUG_STACKOVERFLOW 30#ifdef CONFIG_DEBUG_STACKOVERFLOW
31
32int sysctl_panic_on_stackoverflow __read_mostly;
33
31/* Debugging check for stack overflow: is there less than 1KB free? */ 34/* Debugging check for stack overflow: is there less than 1KB free? */
32static int check_stack_overflow(void) 35static int check_stack_overflow(void)
33{ 36{
@@ -43,6 +46,8 @@ static void print_stack_overflow(void)
43{ 46{
44 printk(KERN_WARNING "low stack detected by irq handler\n"); 47 printk(KERN_WARNING "low stack detected by irq handler\n");
45 dump_stack(); 48 dump_stack();
49 if (sysctl_panic_on_stackoverflow)
50 panic("low stack detected by irq handler - check messages\n");
46} 51}
47 52
48#else 53#else
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index acf8fbf8fbda..d04d3ecded62 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -26,6 +26,8 @@ EXPORT_PER_CPU_SYMBOL(irq_stat);
26DEFINE_PER_CPU(struct pt_regs *, irq_regs); 26DEFINE_PER_CPU(struct pt_regs *, irq_regs);
27EXPORT_PER_CPU_SYMBOL(irq_regs); 27EXPORT_PER_CPU_SYMBOL(irq_regs);
28 28
29int sysctl_panic_on_stackoverflow;
30
29/* 31/*
30 * Probabilistic stack overflow check: 32 * Probabilistic stack overflow check:
31 * 33 *
@@ -36,15 +38,39 @@ EXPORT_PER_CPU_SYMBOL(irq_regs);
36static inline void stack_overflow_check(struct pt_regs *regs) 38static inline void stack_overflow_check(struct pt_regs *regs)
37{ 39{
38#ifdef CONFIG_DEBUG_STACKOVERFLOW 40#ifdef CONFIG_DEBUG_STACKOVERFLOW
41#define STACK_TOP_MARGIN 128
42 struct orig_ist *oist;
43 u64 irq_stack_top, irq_stack_bottom;
44 u64 estack_top, estack_bottom;
39 u64 curbase = (u64)task_stack_page(current); 45 u64 curbase = (u64)task_stack_page(current);
40 46
41 WARN_ONCE(regs->sp >= curbase && 47 if (user_mode_vm(regs))
42 regs->sp <= curbase + THREAD_SIZE && 48 return;
43 regs->sp < curbase + sizeof(struct thread_info) + 49
44 sizeof(struct pt_regs) + 128, 50 if (regs->sp >= curbase + sizeof(struct thread_info) +
51 sizeof(struct pt_regs) + STACK_TOP_MARGIN &&
52 regs->sp <= curbase + THREAD_SIZE)
53 return;
54
55 irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack) +
56 STACK_TOP_MARGIN;
57 irq_stack_bottom = (u64)__get_cpu_var(irq_stack_ptr);
58 if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom)
59 return;
60
61 oist = &__get_cpu_var(orig_ist);
62 estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN;
63 estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1];
64 if (regs->sp >= estack_top && regs->sp <= estack_bottom)
65 return;
66
67 WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n",
68 current->comm, curbase, regs->sp,
69 irq_stack_top, irq_stack_bottom,
70 estack_top, estack_bottom);
45 71
46 "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", 72 if (sysctl_panic_on_stackoverflow)
47 current->comm, curbase, regs->sp); 73 panic("low stack detected by irq handler - check messages\n");
48#endif 74#endif
49} 75}
50 76
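
The reworked stack_overflow_check() returns early when regs->sp lies inside one of three known-good ranges, the task stack, the per-CPU IRQ stack, and the exception stacks, each shrunk by a 128-byte top margin, and only warns or panics otherwise. A minimal sketch of that range test, using made-up addresses purely for illustration:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define STACK_TOP_MARGIN 128

/* Each candidate stack is described by its lowest usable address (top of the
 * downward-growing stack, plus the safety margin) and its highest address. */
struct stack_region {
        uint64_t top;
        uint64_t bottom;
        const char *name;
};

static bool sp_in_valid_region(uint64_t sp, const struct stack_region *r, int n)
{
        for (int i = 0; i < n; i++)
                if (sp >= r[i].top && sp <= r[i].bottom)
                        return true;
        return false;
}

int main(void)
{
        /* Hypothetical addresses, not real kernel layout. */
        struct stack_region regions[] = {
                { 0x100000 + STACK_TOP_MARGIN, 0x102000, "task stack"      },
                { 0x200000 + STACK_TOP_MARGIN, 0x204000, "IRQ stack"       },
                { 0x300000 + STACK_TOP_MARGIN, 0x303000, "exception stack" },
        };
        uint64_t sp = 0x200010;    /* inside the IRQ stack's 128-byte margin */

        if (!sp_in_valid_region(sp, regions, 3))
                printf("sp %#llx looks like a stack overflow\n",
                       (unsigned long long)sp);
        return 0;
}

Because the sample sp sits within the top margin of the IRQ stack, the check treats it as an overflow, which is exactly the near-miss case the margin is meant to catch.
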
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index b3300e6bacef..313fb5cddbce 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -9,7 +9,7 @@
9#include <linux/kprobes.h> 9#include <linux/kprobes.h>
10#include <linux/init.h> 10#include <linux/init.h>
11#include <linux/kernel_stat.h> 11#include <linux/kernel_stat.h>
12#include <linux/sysdev.h> 12#include <linux/device.h>
13#include <linux/bitops.h> 13#include <linux/bitops.h>
14#include <linux/acpi.h> 14#include <linux/acpi.h>
15#include <linux/io.h> 15#include <linux/io.h>
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index ea9d5f2f13ef..2889b3d43882 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -50,7 +50,7 @@ void arch_jump_label_transform(struct jump_entry *entry,
50 put_online_cpus(); 50 put_online_cpus();
51} 51}
52 52
53void arch_jump_label_transform_static(struct jump_entry *entry, 53__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry,
54 enum jump_label_type type) 54 enum jump_label_type type)
55{ 55{
56 __jump_label_transform(entry, type, text_poke_early); 56 __jump_label_transform(entry, type, text_poke_early);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index a9c2116001d6..f0c6fd6f176b 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -39,8 +39,6 @@
39#include <asm/desc.h> 39#include <asm/desc.h>
40#include <asm/tlbflush.h> 40#include <asm/tlbflush.h>
41 41
42#define MMU_QUEUE_SIZE 1024
43
44static int kvmapf = 1; 42static int kvmapf = 1;
45 43
46static int parse_no_kvmapf(char *arg) 44static int parse_no_kvmapf(char *arg)
@@ -60,21 +58,10 @@ static int parse_no_stealacc(char *arg)
60 58
61early_param("no-steal-acc", parse_no_stealacc); 59early_param("no-steal-acc", parse_no_stealacc);
62 60
63struct kvm_para_state {
64 u8 mmu_queue[MMU_QUEUE_SIZE];
65 int mmu_queue_len;
66};
67
68static DEFINE_PER_CPU(struct kvm_para_state, para_state);
69static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); 61static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
70static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); 62static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
71static int has_steal_clock = 0; 63static int has_steal_clock = 0;
72 64
73static struct kvm_para_state *kvm_para_state(void)
74{
75 return &per_cpu(para_state, raw_smp_processor_id());
76}
77
78/* 65/*
79 * No need for any "IO delay" on KVM 66 * No need for any "IO delay" on KVM
80 */ 67 */
@@ -271,151 +258,6 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
271 } 258 }
272} 259}
273 260
274static void kvm_mmu_op(void *buffer, unsigned len)
275{
276 int r;
277 unsigned long a1, a2;
278
279 do {
280 a1 = __pa(buffer);
281 a2 = 0; /* on i386 __pa() always returns <4G */
282 r = kvm_hypercall3(KVM_HC_MMU_OP, len, a1, a2);
283 buffer += r;
284 len -= r;
285 } while (len);
286}
287
288static void mmu_queue_flush(struct kvm_para_state *state)
289{
290 if (state->mmu_queue_len) {
291 kvm_mmu_op(state->mmu_queue, state->mmu_queue_len);
292 state->mmu_queue_len = 0;
293 }
294}
295
296static void kvm_deferred_mmu_op(void *buffer, int len)
297{
298 struct kvm_para_state *state = kvm_para_state();
299
300 if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) {
301 kvm_mmu_op(buffer, len);
302 return;
303 }
304 if (state->mmu_queue_len + len > sizeof state->mmu_queue)
305 mmu_queue_flush(state);
306 memcpy(state->mmu_queue + state->mmu_queue_len, buffer, len);
307 state->mmu_queue_len += len;
308}
309
310static void kvm_mmu_write(void *dest, u64 val)
311{
312 __u64 pte_phys;
313 struct kvm_mmu_op_write_pte wpte;
314
315#ifdef CONFIG_HIGHPTE
316 struct page *page;
317 unsigned long dst = (unsigned long) dest;
318
319 page = kmap_atomic_to_page(dest);
320 pte_phys = page_to_pfn(page);
321 pte_phys <<= PAGE_SHIFT;
322 pte_phys += (dst & ~(PAGE_MASK));
323#else
324 pte_phys = (unsigned long)__pa(dest);
325#endif
326 wpte.header.op = KVM_MMU_OP_WRITE_PTE;
327 wpte.pte_val = val;
328 wpte.pte_phys = pte_phys;
329
330 kvm_deferred_mmu_op(&wpte, sizeof wpte);
331}
332
333/*
334 * We only need to hook operations that are MMU writes. We hook these so that
335 * we can use lazy MMU mode to batch these operations. We could probably
336 * improve the performance of the host code if we used some of the information
337 * here to simplify processing of batched writes.
338 */
339static void kvm_set_pte(pte_t *ptep, pte_t pte)
340{
341 kvm_mmu_write(ptep, pte_val(pte));
342}
343
344static void kvm_set_pte_at(struct mm_struct *mm, unsigned long addr,
345 pte_t *ptep, pte_t pte)
346{
347 kvm_mmu_write(ptep, pte_val(pte));
348}
349
350static void kvm_set_pmd(pmd_t *pmdp, pmd_t pmd)
351{
352 kvm_mmu_write(pmdp, pmd_val(pmd));
353}
354
355#if PAGETABLE_LEVELS >= 3
356#ifdef CONFIG_X86_PAE
357static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte)
358{
359 kvm_mmu_write(ptep, pte_val(pte));
360}
361
362static void kvm_pte_clear(struct mm_struct *mm,
363 unsigned long addr, pte_t *ptep)
364{
365 kvm_mmu_write(ptep, 0);
366}
367
368static void kvm_pmd_clear(pmd_t *pmdp)
369{
370 kvm_mmu_write(pmdp, 0);
371}
372#endif
373
374static void kvm_set_pud(pud_t *pudp, pud_t pud)
375{
376 kvm_mmu_write(pudp, pud_val(pud));
377}
378
379#if PAGETABLE_LEVELS == 4
380static void kvm_set_pgd(pgd_t *pgdp, pgd_t pgd)
381{
382 kvm_mmu_write(pgdp, pgd_val(pgd));
383}
384#endif
385#endif /* PAGETABLE_LEVELS >= 3 */
386
387static void kvm_flush_tlb(void)
388{
389 struct kvm_mmu_op_flush_tlb ftlb = {
390 .header.op = KVM_MMU_OP_FLUSH_TLB,
391 };
392
393 kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
394}
395
396static void kvm_release_pt(unsigned long pfn)
397{
398 struct kvm_mmu_op_release_pt rpt = {
399 .header.op = KVM_MMU_OP_RELEASE_PT,
400 .pt_phys = (u64)pfn << PAGE_SHIFT,
401 };
402
403 kvm_mmu_op(&rpt, sizeof rpt);
404}
405
406static void kvm_enter_lazy_mmu(void)
407{
408 paravirt_enter_lazy_mmu();
409}
410
411static void kvm_leave_lazy_mmu(void)
412{
413 struct kvm_para_state *state = kvm_para_state();
414
415 mmu_queue_flush(state);
416 paravirt_leave_lazy_mmu();
417}
418
419static void __init paravirt_ops_setup(void) 261static void __init paravirt_ops_setup(void)
420{ 262{
421 pv_info.name = "KVM"; 263 pv_info.name = "KVM";
@@ -424,29 +266,6 @@ static void __init paravirt_ops_setup(void)
424 if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) 266 if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
425 pv_cpu_ops.io_delay = kvm_io_delay; 267 pv_cpu_ops.io_delay = kvm_io_delay;
426 268
427 if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) {
428 pv_mmu_ops.set_pte = kvm_set_pte;
429 pv_mmu_ops.set_pte_at = kvm_set_pte_at;
430 pv_mmu_ops.set_pmd = kvm_set_pmd;
431#if PAGETABLE_LEVELS >= 3
432#ifdef CONFIG_X86_PAE
433 pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic;
434 pv_mmu_ops.pte_clear = kvm_pte_clear;
435 pv_mmu_ops.pmd_clear = kvm_pmd_clear;
436#endif
437 pv_mmu_ops.set_pud = kvm_set_pud;
438#if PAGETABLE_LEVELS == 4
439 pv_mmu_ops.set_pgd = kvm_set_pgd;
440#endif
441#endif
442 pv_mmu_ops.flush_tlb_user = kvm_flush_tlb;
443 pv_mmu_ops.release_pte = kvm_release_pt;
444 pv_mmu_ops.release_pmd = kvm_release_pt;
445 pv_mmu_ops.release_pud = kvm_release_pt;
446
447 pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
448 pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
449 }
450#ifdef CONFIG_X86_IO_APIC 269#ifdef CONFIG_X86_IO_APIC
451 no_timer_check = 1; 270 no_timer_check = 1;
452#endif 271#endif
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index d494799aafcd..73465aab28f8 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -1,14 +1,18 @@
1/* 1/*
2 * AMD CPU Microcode Update Driver for Linux 2 * AMD CPU Microcode Update Driver for Linux
3 * Copyright (C) 2008 Advanced Micro Devices Inc. 3 * Copyright (C) 2008-2011 Advanced Micro Devices Inc.
4 * 4 *
5 * Author: Peter Oruba <peter.oruba@amd.com> 5 * Author: Peter Oruba <peter.oruba@amd.com>
6 * 6 *
7 * Based on work by: 7 * Based on work by:
8 * Tigran Aivazian <tigran@aivazian.fsnet.co.uk> 8 * Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
9 * 9 *
10 * This driver allows to upgrade microcode on AMD 10 * Maintainers:
11 * family 0x10 and 0x11 processors. 11 * Andreas Herrmann <andreas.herrmann3@amd.com>
12 * Borislav Petkov <borislav.petkov@amd.com>
13 *
 14 * This driver allows upgrading microcode on F10h AMD
15 * CPUs and later.
12 * 16 *
13 * Licensed under the terms of the GNU General Public 17 * Licensed under the terms of the GNU General Public
14 * License version 2. See file COPYING for details. 18 * License version 2. See file COPYING for details.
@@ -71,6 +75,9 @@ struct microcode_amd {
71 75
72static struct equiv_cpu_entry *equiv_cpu_table; 76static struct equiv_cpu_entry *equiv_cpu_table;
73 77
78/* page-sized ucode patch buffer */
79void *patch;
80
74static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) 81static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
75{ 82{
76 struct cpuinfo_x86 *c = &cpu_data(cpu); 83 struct cpuinfo_x86 *c = &cpu_data(cpu);
@@ -86,27 +93,76 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
86 return 0; 93 return 0;
87} 94}
88 95
89static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr, 96static unsigned int verify_ucode_size(int cpu, u32 patch_size,
90 int rev) 97 unsigned int size)
91{ 98{
92 unsigned int current_cpu_id; 99 struct cpuinfo_x86 *c = &cpu_data(cpu);
93 u16 equiv_cpu_id = 0; 100 u32 max_size;
94 unsigned int i = 0; 101
102#define F1XH_MPB_MAX_SIZE 2048
103#define F14H_MPB_MAX_SIZE 1824
104#define F15H_MPB_MAX_SIZE 4096
105
106 switch (c->x86) {
107 case 0x14:
108 max_size = F14H_MPB_MAX_SIZE;
109 break;
110 case 0x15:
111 max_size = F15H_MPB_MAX_SIZE;
112 break;
113 default:
114 max_size = F1XH_MPB_MAX_SIZE;
115 break;
116 }
117
118 if (patch_size > min_t(u32, size, max_size)) {
119 pr_err("patch size mismatch\n");
120 return 0;
121 }
122
123 return patch_size;
124}
125
126static u16 find_equiv_id(void)
127{
128 unsigned int current_cpu_id, i = 0;
95 129
96 BUG_ON(equiv_cpu_table == NULL); 130 BUG_ON(equiv_cpu_table == NULL);
131
97 current_cpu_id = cpuid_eax(0x00000001); 132 current_cpu_id = cpuid_eax(0x00000001);
98 133
99 while (equiv_cpu_table[i].installed_cpu != 0) { 134 while (equiv_cpu_table[i].installed_cpu != 0) {
100 if (current_cpu_id == equiv_cpu_table[i].installed_cpu) { 135 if (current_cpu_id == equiv_cpu_table[i].installed_cpu)
101 equiv_cpu_id = equiv_cpu_table[i].equiv_cpu; 136 return equiv_cpu_table[i].equiv_cpu;
102 break; 137
103 }
104 i++; 138 i++;
105 } 139 }
140 return 0;
141}
142
143/*
144 * we signal a good patch is found by returning its size > 0
145 */
146static int get_matching_microcode(int cpu, const u8 *ucode_ptr,
147 unsigned int leftover_size, int rev,
148 unsigned int *current_size)
149{
150 struct microcode_header_amd *mc_hdr;
151 unsigned int actual_size;
152 u16 equiv_cpu_id;
153
154 /* size of the current patch we're staring at */
155 *current_size = *(u32 *)(ucode_ptr + 4) + SECTION_HDR_SIZE;
106 156
157 equiv_cpu_id = find_equiv_id();
107 if (!equiv_cpu_id) 158 if (!equiv_cpu_id)
108 return 0; 159 return 0;
109 160
161 /*
162 * let's look at the patch header itself now
163 */
164 mc_hdr = (struct microcode_header_amd *)(ucode_ptr + SECTION_HDR_SIZE);
165
110 if (mc_hdr->processor_rev_id != equiv_cpu_id) 166 if (mc_hdr->processor_rev_id != equiv_cpu_id)
111 return 0; 167 return 0;
112 168
@@ -120,7 +176,20 @@ static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr,
120 if (mc_hdr->patch_id <= rev) 176 if (mc_hdr->patch_id <= rev)
121 return 0; 177 return 0;
122 178
123 return 1; 179 /*
180 * now that the header looks sane, verify its size
181 */
182 actual_size = verify_ucode_size(cpu, *current_size, leftover_size);
183 if (!actual_size)
184 return 0;
185
186 /* clear the patch buffer */
187 memset(patch, 0, PAGE_SIZE);
188
189 /* all looks ok, get the binary patch */
190 get_ucode_data(patch, ucode_ptr + SECTION_HDR_SIZE, actual_size);
191
192 return actual_size;
124} 193}
125 194
126static int apply_microcode_amd(int cpu) 195static int apply_microcode_amd(int cpu)
@@ -155,63 +224,6 @@ static int apply_microcode_amd(int cpu)
155 return 0; 224 return 0;
156} 225}
157 226
158static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
159{
160 struct cpuinfo_x86 *c = &cpu_data(cpu);
161 u32 max_size, actual_size;
162
163#define F1XH_MPB_MAX_SIZE 2048
164#define F14H_MPB_MAX_SIZE 1824
165#define F15H_MPB_MAX_SIZE 4096
166
167 switch (c->x86) {
168 case 0x14:
169 max_size = F14H_MPB_MAX_SIZE;
170 break;
171 case 0x15:
172 max_size = F15H_MPB_MAX_SIZE;
173 break;
174 default:
175 max_size = F1XH_MPB_MAX_SIZE;
176 break;
177 }
178
179 actual_size = *(u32 *)(buf + 4);
180
181 if (actual_size + SECTION_HDR_SIZE > size || actual_size > max_size) {
182 pr_err("section size mismatch\n");
183 return 0;
184 }
185
186 return actual_size;
187}
188
189static struct microcode_header_amd *
190get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size)
191{
192 struct microcode_header_amd *mc = NULL;
193 unsigned int actual_size = 0;
194
195 if (*(u32 *)buf != UCODE_UCODE_TYPE) {
196 pr_err("invalid type field in container file section header\n");
197 goto out;
198 }
199
200 actual_size = verify_ucode_size(cpu, buf, size);
201 if (!actual_size)
202 goto out;
203
204 mc = vzalloc(actual_size);
205 if (!mc)
206 goto out;
207
208 get_ucode_data(mc, buf + SECTION_HDR_SIZE, actual_size);
209 *mc_size = actual_size + SECTION_HDR_SIZE;
210
211out:
212 return mc;
213}
214
215static int install_equiv_cpu_table(const u8 *buf) 227static int install_equiv_cpu_table(const u8 *buf)
216{ 228{
217 unsigned int *ibuf = (unsigned int *)buf; 229 unsigned int *ibuf = (unsigned int *)buf;
@@ -247,36 +259,38 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
247{ 259{
248 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 260 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
249 struct microcode_header_amd *mc_hdr = NULL; 261 struct microcode_header_amd *mc_hdr = NULL;
250 unsigned int mc_size, leftover; 262 unsigned int mc_size, leftover, current_size = 0;
251 int offset; 263 int offset;
252 const u8 *ucode_ptr = data; 264 const u8 *ucode_ptr = data;
253 void *new_mc = NULL; 265 void *new_mc = NULL;
254 unsigned int new_rev = uci->cpu_sig.rev; 266 unsigned int new_rev = uci->cpu_sig.rev;
255 enum ucode_state state = UCODE_OK; 267 enum ucode_state state = UCODE_ERROR;
256 268
257 offset = install_equiv_cpu_table(ucode_ptr); 269 offset = install_equiv_cpu_table(ucode_ptr);
258 if (offset < 0) { 270 if (offset < 0) {
259 pr_err("failed to create equivalent cpu table\n"); 271 pr_err("failed to create equivalent cpu table\n");
260 return UCODE_ERROR; 272 goto out;
261 } 273 }
262
263 ucode_ptr += offset; 274 ucode_ptr += offset;
264 leftover = size - offset; 275 leftover = size - offset;
265 276
266 while (leftover) { 277 if (*(u32 *)ucode_ptr != UCODE_UCODE_TYPE) {
267 mc_hdr = get_next_ucode(cpu, ucode_ptr, leftover, &mc_size); 278 pr_err("invalid type field in container file section header\n");
268 if (!mc_hdr) 279 goto free_table;
269 break; 280 }
270 281
271 if (get_matching_microcode(cpu, mc_hdr, new_rev)) { 282 while (leftover) {
272 vfree(new_mc); 283 mc_size = get_matching_microcode(cpu, ucode_ptr, leftover,
284 new_rev, &current_size);
285 if (mc_size) {
286 mc_hdr = patch;
287 new_mc = patch;
273 new_rev = mc_hdr->patch_id; 288 new_rev = mc_hdr->patch_id;
274 new_mc = mc_hdr; 289 goto out_ok;
275 } else 290 }
276 vfree(mc_hdr);
277 291
278 ucode_ptr += mc_size; 292 ucode_ptr += current_size;
279 leftover -= mc_size; 293 leftover -= current_size;
280 } 294 }
281 295
282 if (!new_mc) { 296 if (!new_mc) {
@@ -284,29 +298,46 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
284 goto free_table; 298 goto free_table;
285 } 299 }
286 300
287 if (!leftover) { 301out_ok:
288 vfree(uci->mc); 302 uci->mc = new_mc;
289 uci->mc = new_mc; 303 state = UCODE_OK;
290 pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n", 304 pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n",
291 cpu, uci->cpu_sig.rev, new_rev); 305 cpu, uci->cpu_sig.rev, new_rev);
292 } else {
293 vfree(new_mc);
294 state = UCODE_ERROR;
295 }
296 306
297free_table: 307free_table:
298 free_equiv_cpu_table(); 308 free_equiv_cpu_table();
299 309
310out:
300 return state; 311 return state;
301} 312}
302 313
314/*
315 * AMD microcode firmware naming convention, up to family 15h they are in
316 * the legacy file:
317 *
318 * amd-ucode/microcode_amd.bin
319 *
320 * This legacy file is always smaller than 2K in size.
321 *
322 * Starting at family 15h they are in family specific firmware files:
323 *
324 * amd-ucode/microcode_amd_fam15h.bin
325 * amd-ucode/microcode_amd_fam16h.bin
326 * ...
327 *
328 * These might be larger than 2K.
329 */
303static enum ucode_state request_microcode_amd(int cpu, struct device *device) 330static enum ucode_state request_microcode_amd(int cpu, struct device *device)
304{ 331{
305 const char *fw_name = "amd-ucode/microcode_amd.bin"; 332 char fw_name[36] = "amd-ucode/microcode_amd.bin";
306 const struct firmware *fw; 333 const struct firmware *fw;
307 enum ucode_state ret = UCODE_NFOUND; 334 enum ucode_state ret = UCODE_NFOUND;
335 struct cpuinfo_x86 *c = &cpu_data(cpu);
336
337 if (c->x86 >= 0x15)
338 snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
308 339
309 if (request_firmware(&fw, fw_name, device)) { 340 if (request_firmware(&fw, (const char *)fw_name, device)) {
310 pr_err("failed to load file %s\n", fw_name); 341 pr_err("failed to load file %s\n", fw_name);
311 goto out; 342 goto out;
312 } 343 }
@@ -329,7 +360,6 @@ out:
329static enum ucode_state 360static enum ucode_state
330request_microcode_user(int cpu, const void __user *buf, size_t size) 361request_microcode_user(int cpu, const void __user *buf, size_t size)
331{ 362{
332 pr_info("AMD microcode update via /dev/cpu/microcode not supported\n");
333 return UCODE_ERROR; 363 return UCODE_ERROR;
334} 364}
335 365
@@ -337,7 +367,6 @@ static void microcode_fini_cpu_amd(int cpu)
337{ 367{
338 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 368 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
339 369
340 vfree(uci->mc);
341 uci->mc = NULL; 370 uci->mc = NULL;
342} 371}
343 372
@@ -351,5 +380,14 @@ static struct microcode_ops microcode_amd_ops = {
351 380
352struct microcode_ops * __init init_amd_microcode(void) 381struct microcode_ops * __init init_amd_microcode(void)
353{ 382{
383 patch = (void *)get_zeroed_page(GFP_KERNEL);
384 if (!patch)
385 return NULL;
386
354 return &microcode_amd_ops; 387 return &microcode_amd_ops;
355} 388}
389
390void __exit exit_amd_microcode(void)
391{
392 free_page((unsigned long)patch);
393}
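
The new header comment documents the firmware naming scheme, and request_microcode_amd() now builds a family-specific file name for family 0x15 and later, falling back to the legacy container otherwise. A small userspace sketch of just that selection; amd_ucode_fw_name() is a hypothetical helper introduced here for illustration:

#include <stdio.h>
#include <stddef.h>

/* Pick the firmware file name the way request_microcode_amd() now does:
 * the legacy container below family 15h, a per-family file from 15h on. */
static void amd_ucode_fw_name(unsigned int family, char *buf, size_t len)
{
        snprintf(buf, len, "amd-ucode/microcode_amd.bin");
        if (family >= 0x15)
                snprintf(buf, len, "amd-ucode/microcode_amd_fam%.2xh.bin",
                         family);
}

int main(void)
{
        char name[36];

        amd_ucode_fw_name(0x10, name, sizeof(name));
        printf("family 10h -> %s\n", name);    /* legacy file */

        amd_ucode_fw_name(0x15, name, sizeof(name));
        printf("family 15h -> %s\n", name);    /* microcode_amd_fam15h.bin */
        return 0;
}
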
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index f2d2a664e797..fda91c307104 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -256,7 +256,7 @@ static int __init microcode_dev_init(void)
256 return 0; 256 return 0;
257} 257}
258 258
259static void microcode_dev_exit(void) 259static void __exit microcode_dev_exit(void)
260{ 260{
261 misc_deregister(&microcode_dev); 261 misc_deregister(&microcode_dev);
262} 262}
@@ -292,8 +292,8 @@ static int reload_for_cpu(int cpu)
292 return err; 292 return err;
293} 293}
294 294
295static ssize_t reload_store(struct sys_device *dev, 295static ssize_t reload_store(struct device *dev,
296 struct sysdev_attribute *attr, 296 struct device_attribute *attr,
297 const char *buf, size_t size) 297 const char *buf, size_t size)
298{ 298{
299 unsigned long val; 299 unsigned long val;
@@ -318,30 +318,30 @@ static ssize_t reload_store(struct sys_device *dev,
318 return ret; 318 return ret;
319} 319}
320 320
321static ssize_t version_show(struct sys_device *dev, 321static ssize_t version_show(struct device *dev,
322 struct sysdev_attribute *attr, char *buf) 322 struct device_attribute *attr, char *buf)
323{ 323{
324 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; 324 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
325 325
326 return sprintf(buf, "0x%x\n", uci->cpu_sig.rev); 326 return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
327} 327}
328 328
329static ssize_t pf_show(struct sys_device *dev, 329static ssize_t pf_show(struct device *dev,
330 struct sysdev_attribute *attr, char *buf) 330 struct device_attribute *attr, char *buf)
331{ 331{
332 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; 332 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
333 333
334 return sprintf(buf, "0x%x\n", uci->cpu_sig.pf); 334 return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
335} 335}
336 336
337static SYSDEV_ATTR(reload, 0200, NULL, reload_store); 337static DEVICE_ATTR(reload, 0200, NULL, reload_store);
338static SYSDEV_ATTR(version, 0400, version_show, NULL); 338static DEVICE_ATTR(version, 0400, version_show, NULL);
339static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL); 339static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL);
340 340
341static struct attribute *mc_default_attrs[] = { 341static struct attribute *mc_default_attrs[] = {
342 &attr_reload.attr, 342 &dev_attr_reload.attr,
343 &attr_version.attr, 343 &dev_attr_version.attr,
344 &attr_processor_flags.attr, 344 &dev_attr_processor_flags.attr,
345 NULL 345 NULL
346}; 346};
347 347
@@ -405,43 +405,45 @@ static enum ucode_state microcode_update_cpu(int cpu)
405 return ustate; 405 return ustate;
406} 406}
407 407
408static int mc_sysdev_add(struct sys_device *sys_dev) 408static int mc_device_add(struct device *dev, struct subsys_interface *sif)
409{ 409{
410 int err, cpu = sys_dev->id; 410 int err, cpu = dev->id;
411 411
412 if (!cpu_online(cpu)) 412 if (!cpu_online(cpu))
413 return 0; 413 return 0;
414 414
415 pr_debug("CPU%d added\n", cpu); 415 pr_debug("CPU%d added\n", cpu);
416 416
417 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); 417 err = sysfs_create_group(&dev->kobj, &mc_attr_group);
418 if (err) 418 if (err)
419 return err; 419 return err;
420 420
421 if (microcode_init_cpu(cpu) == UCODE_ERROR) { 421 if (microcode_init_cpu(cpu) == UCODE_ERROR) {
422 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); 422 sysfs_remove_group(&dev->kobj, &mc_attr_group);
423 return -EINVAL; 423 return -EINVAL;
424 } 424 }
425 425
426 return err; 426 return err;
427} 427}
428 428
429static int mc_sysdev_remove(struct sys_device *sys_dev) 429static int mc_device_remove(struct device *dev, struct subsys_interface *sif)
430{ 430{
431 int cpu = sys_dev->id; 431 int cpu = dev->id;
432 432
433 if (!cpu_online(cpu)) 433 if (!cpu_online(cpu))
434 return 0; 434 return 0;
435 435
436 pr_debug("CPU%d removed\n", cpu); 436 pr_debug("CPU%d removed\n", cpu);
437 microcode_fini_cpu(cpu); 437 microcode_fini_cpu(cpu);
438 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); 438 sysfs_remove_group(&dev->kobj, &mc_attr_group);
439 return 0; 439 return 0;
440} 440}
441 441
442static struct sysdev_driver mc_sysdev_driver = { 442static struct subsys_interface mc_cpu_interface = {
443 .add = mc_sysdev_add, 443 .name = "microcode",
444 .remove = mc_sysdev_remove, 444 .subsys = &cpu_subsys,
445 .add_dev = mc_device_add,
446 .remove_dev = mc_device_remove,
445}; 447};
446 448
447/** 449/**
@@ -464,9 +466,9 @@ static __cpuinit int
464mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) 466mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
465{ 467{
466 unsigned int cpu = (unsigned long)hcpu; 468 unsigned int cpu = (unsigned long)hcpu;
467 struct sys_device *sys_dev; 469 struct device *dev;
468 470
469 sys_dev = get_cpu_sysdev(cpu); 471 dev = get_cpu_device(cpu);
470 switch (action) { 472 switch (action) {
471 case CPU_ONLINE: 473 case CPU_ONLINE:
472 case CPU_ONLINE_FROZEN: 474 case CPU_ONLINE_FROZEN:
@@ -474,13 +476,13 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
474 case CPU_DOWN_FAILED: 476 case CPU_DOWN_FAILED:
475 case CPU_DOWN_FAILED_FROZEN: 477 case CPU_DOWN_FAILED_FROZEN:
476 pr_debug("CPU%d added\n", cpu); 478 pr_debug("CPU%d added\n", cpu);
477 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) 479 if (sysfs_create_group(&dev->kobj, &mc_attr_group))
478 pr_err("Failed to create group for CPU%d\n", cpu); 480 pr_err("Failed to create group for CPU%d\n", cpu);
479 break; 481 break;
480 case CPU_DOWN_PREPARE: 482 case CPU_DOWN_PREPARE:
481 case CPU_DOWN_PREPARE_FROZEN: 483 case CPU_DOWN_PREPARE_FROZEN:
482 /* Suspend is in progress, only remove the interface */ 484 /* Suspend is in progress, only remove the interface */
483 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); 485 sysfs_remove_group(&dev->kobj, &mc_attr_group);
484 pr_debug("CPU%d removed\n", cpu); 486 pr_debug("CPU%d removed\n", cpu);
485 break; 487 break;
486 488
@@ -519,27 +521,23 @@ static int __init microcode_init(void)
519 521
520 microcode_pdev = platform_device_register_simple("microcode", -1, 522 microcode_pdev = platform_device_register_simple("microcode", -1,
521 NULL, 0); 523 NULL, 0);
522 if (IS_ERR(microcode_pdev)) { 524 if (IS_ERR(microcode_pdev))
523 microcode_dev_exit();
524 return PTR_ERR(microcode_pdev); 525 return PTR_ERR(microcode_pdev);
525 }
526 526
527 get_online_cpus(); 527 get_online_cpus();
528 mutex_lock(&microcode_mutex); 528 mutex_lock(&microcode_mutex);
529 529
530 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); 530 error = subsys_interface_register(&mc_cpu_interface);
531 531
532 mutex_unlock(&microcode_mutex); 532 mutex_unlock(&microcode_mutex);
533 put_online_cpus(); 533 put_online_cpus();
534 534
535 if (error) { 535 if (error)
536 platform_device_unregister(microcode_pdev); 536 goto out_pdev;
537 return error;
538 }
539 537
540 error = microcode_dev_init(); 538 error = microcode_dev_init();
541 if (error) 539 if (error)
542 return error; 540 goto out_driver;
543 541
544 register_syscore_ops(&mc_syscore_ops); 542 register_syscore_ops(&mc_syscore_ops);
545 register_hotcpu_notifier(&mc_cpu_notifier); 543 register_hotcpu_notifier(&mc_cpu_notifier);
@@ -548,11 +546,27 @@ static int __init microcode_init(void)
548 " <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n"); 546 " <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n");
549 547
550 return 0; 548 return 0;
549
550out_driver:
551 get_online_cpus();
552 mutex_lock(&microcode_mutex);
553
554 subsys_interface_unregister(&mc_cpu_interface);
555
556 mutex_unlock(&microcode_mutex);
557 put_online_cpus();
558
559out_pdev:
560 platform_device_unregister(microcode_pdev);
561 return error;
562
551} 563}
552module_init(microcode_init); 564module_init(microcode_init);
553 565
554static void __exit microcode_exit(void) 566static void __exit microcode_exit(void)
555{ 567{
568 struct cpuinfo_x86 *c = &cpu_data(0);
569
556 microcode_dev_exit(); 570 microcode_dev_exit();
557 571
558 unregister_hotcpu_notifier(&mc_cpu_notifier); 572 unregister_hotcpu_notifier(&mc_cpu_notifier);
@@ -561,7 +575,7 @@ static void __exit microcode_exit(void)
561 get_online_cpus(); 575 get_online_cpus();
562 mutex_lock(&microcode_mutex); 576 mutex_lock(&microcode_mutex);
563 577
564 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); 578 subsys_interface_unregister(&mc_cpu_interface);
565 579
566 mutex_unlock(&microcode_mutex); 580 mutex_unlock(&microcode_mutex);
567 put_online_cpus(); 581 put_online_cpus();
@@ -570,6 +584,9 @@ static void __exit microcode_exit(void)
570 584
571 microcode_ops = NULL; 585 microcode_ops = NULL;
572 586
587 if (c->x86_vendor == X86_VENDOR_AMD)
588 exit_amd_microcode();
589
573 pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); 590 pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
574} 591}
575module_exit(microcode_exit); 592module_exit(microcode_exit);
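
The microcode_core.c hunks are part of the tree-wide removal of struct sys_device: SYSDEV_ATTR attributes become DEVICE_ATTR device attributes, and the per-CPU add/remove hooks move into a struct subsys_interface registered against cpu_subsys. A toy model of that callback interface, with the structures trimmed down for illustration rather than copied from the driver core:

#include <stdio.h>

struct device { int id; };

/* Simplified: the real hooks also receive the subsys_interface pointer. */
struct subsys_interface {
        const char *name;
        int  (*add_dev)(struct device *dev);
        void (*remove_dev)(struct device *dev);
};

static int mc_device_add(struct device *dev)
{
        printf("CPU%d: create sysfs group\n", dev->id);
        return 0;
}

static void mc_device_remove(struct device *dev)
{
        printf("CPU%d: remove sysfs group\n", dev->id);
}

static struct subsys_interface mc_cpu_interface = {
        .name       = "microcode",
        .add_dev    = mc_device_add,
        .remove_dev = mc_device_remove,
};

int main(void)
{
        struct device cpus[] = { {0}, {1} };

        for (int i = 0; i < 2; i++)    /* subsys_interface_register() walks devices */
                mc_cpu_interface.add_dev(&cpus[i]);
        for (int i = 0; i < 2; i++)    /* subsys_interface_unregister() undoes it */
                mc_cpu_interface.remove_dev(&cpus[i]);
        return 0;
}
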
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 9103b89c145a..ca470e4c92dc 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -95,8 +95,8 @@ static void __init MP_bus_info(struct mpc_bus *m)
95 } 95 }
96#endif 96#endif
97 97
98 set_bit(m->busid, mp_bus_not_pci);
98 if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) { 99 if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
99 set_bit(m->busid, mp_bus_not_pci);
100#if defined(CONFIG_EISA) || defined(CONFIG_MCA) 100#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
101 mp_bus_id_to_type[m->busid] = MP_BUS_ISA; 101 mp_bus_id_to_type[m->busid] = MP_BUS_ISA;
102#endif 102#endif
@@ -564,9 +564,7 @@ void __init default_get_smp_config(unsigned int early)
564 564
565static void __init smp_reserve_memory(struct mpf_intel *mpf) 565static void __init smp_reserve_memory(struct mpf_intel *mpf)
566{ 566{
567 unsigned long size = get_mpc_size(mpf->physptr); 567 memblock_reserve(mpf->physptr, get_mpc_size(mpf->physptr));
568
569 memblock_x86_reserve_range(mpf->physptr, mpf->physptr+size, "* MP-table mpc");
570} 568}
571 569
572static int __init smp_scan_config(unsigned long base, unsigned long length) 570static int __init smp_scan_config(unsigned long base, unsigned long length)
@@ -595,7 +593,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
595 mpf, (u64)virt_to_phys(mpf)); 593 mpf, (u64)virt_to_phys(mpf));
596 594
597 mem = virt_to_phys(mpf); 595 mem = virt_to_phys(mpf);
598 memblock_x86_reserve_range(mem, mem + sizeof(*mpf), "* MP-table mpf"); 596 memblock_reserve(mem, sizeof(*mpf));
599 if (mpf->physptr) 597 if (mpf->physptr)
600 smp_reserve_memory(mpf); 598 smp_reserve_memory(mpf);
601 599
@@ -836,10 +834,8 @@ early_param("alloc_mptable", parse_alloc_mptable_opt);
836 834
837void __init early_reserve_e820_mpc_new(void) 835void __init early_reserve_e820_mpc_new(void)
838{ 836{
839 if (enable_update_mptable && alloc_mptable) { 837 if (enable_update_mptable && alloc_mptable)
840 u64 startt = 0; 838 mpc_new_phys = early_reserve_e820(mpc_new_length, 4);
841 mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
842 }
843} 839}
844 840
845static int __init update_mp_table(void) 841static int __init update_mp_table(void)
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 12fcbe2c143e..96356762a51d 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -236,7 +236,7 @@ static struct notifier_block __refdata msr_class_cpu_notifier = {
236 .notifier_call = msr_class_cpu_callback, 236 .notifier_call = msr_class_cpu_callback,
237}; 237};
238 238
239static char *msr_devnode(struct device *dev, mode_t *mode) 239static char *msr_devnode(struct device *dev, umode_t *mode)
240{ 240{
241 return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); 241 return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
242} 242}
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index e88f37b58ddd..47acaf319165 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -405,9 +405,108 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
405 unknown_nmi_error(reason, regs); 405 unknown_nmi_error(reason, regs);
406} 406}
407 407
408/*
 409 * An NMI can hit a breakpoint, which will cause it to lose its
410 * NMI context with the CPU when the breakpoint does an iret.
411 */
412#ifdef CONFIG_X86_32
413/*
414 * For i386, NMIs use the same stack as the kernel, and we can
415 * add a workaround to the iret problem in C. Simply have 3 states
416 * the NMI can be in.
417 *
418 * 1) not running
419 * 2) executing
420 * 3) latched
421 *
422 * When no NMI is in progress, it is in the "not running" state.
423 * When an NMI comes in, it goes into the "executing" state.
424 * Normally, if another NMI is triggered, it does not interrupt
425 * the running NMI and the HW will simply latch it so that when
426 * the first NMI finishes, it will restart the second NMI.
427 * (Note, the latch is binary, thus multiple NMIs triggering,
428 * when one is running, are ignored. Only one NMI is restarted.)
429 *
430 * If an NMI hits a breakpoint that executes an iret, another
431 * NMI can preempt it. We do not want to allow this new NMI
432 * to run, but we want to execute it when the first one finishes.
433 * We set the state to "latched", and the first NMI will perform
 434 * a cmpxchg on the state, and if it doesn't successfully
435 * reset the state to "not running" it will restart the next
436 * NMI.
437 */
438enum nmi_states {
439 NMI_NOT_RUNNING,
440 NMI_EXECUTING,
441 NMI_LATCHED,
442};
443static DEFINE_PER_CPU(enum nmi_states, nmi_state);
444
445#define nmi_nesting_preprocess(regs) \
446 do { \
447 if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) { \
448 __get_cpu_var(nmi_state) = NMI_LATCHED; \
449 return; \
450 } \
451 nmi_restart: \
452 __get_cpu_var(nmi_state) = NMI_EXECUTING; \
453 } while (0)
454
455#define nmi_nesting_postprocess() \
456 do { \
457 if (cmpxchg(&__get_cpu_var(nmi_state), \
458 NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING) \
459 goto nmi_restart; \
460 } while (0)
461#else /* x86_64 */
462/*
463 * In x86_64 things are a bit more difficult. This has the same problem
464 * where an NMI hitting a breakpoint that calls iret will remove the
465 * NMI context, allowing a nested NMI to enter. What makes this more
466 * difficult is that both NMIs and breakpoints have their own stack.
467 * When a new NMI or breakpoint is executed, the stack is set to a fixed
468 * point. If an NMI is nested, it will have its stack set at that same
469 * fixed address that the first NMI had, and will start corrupting the
470 * stack. This is handled in entry_64.S, but the same problem exists with
471 * the breakpoint stack.
472 *
473 * If a breakpoint is being processed, and the debug stack is being used,
474 * if an NMI comes in and also hits a breakpoint, the stack pointer
475 * will be set to the same fixed address as the breakpoint that was
476 * interrupted, causing that stack to be corrupted. To handle this case,
477 * check if the stack that was interrupted is the debug stack, and if
478 * so, change the IDT so that new breakpoints will use the current stack
479 * and not switch to the fixed address. On return of the NMI, switch back
480 * to the original IDT.
481 */
482static DEFINE_PER_CPU(int, update_debug_stack);
483
484static inline void nmi_nesting_preprocess(struct pt_regs *regs)
485{
486 /*
487 * If we interrupted a breakpoint, it is possible that
488 * the nmi handler will have breakpoints too. We need to
489 * change the IDT such that breakpoints that happen here
490 * continue to use the NMI stack.
491 */
492 if (unlikely(is_debug_stack(regs->sp))) {
493 debug_stack_set_zero();
494 __get_cpu_var(update_debug_stack) = 1;
495 }
496}
497
498static inline void nmi_nesting_postprocess(void)
499{
500 if (unlikely(__get_cpu_var(update_debug_stack)))
501 debug_stack_reset();
502}
503#endif
504
408dotraplinkage notrace __kprobes void 505dotraplinkage notrace __kprobes void
409do_nmi(struct pt_regs *regs, long error_code) 506do_nmi(struct pt_regs *regs, long error_code)
410{ 507{
508 nmi_nesting_preprocess(regs);
509
411 nmi_enter(); 510 nmi_enter();
412 511
413 inc_irq_stat(__nmi_count); 512 inc_irq_stat(__nmi_count);
@@ -416,6 +515,9 @@ do_nmi(struct pt_regs *regs, long error_code)
416 default_do_nmi(regs); 515 default_do_nmi(regs);
417 516
418 nmi_exit(); 517 nmi_exit();
518
519 /* On i386, may loop back to preprocess */
520 nmi_nesting_postprocess();
419} 521}
420 522
421void stop_nmi(void) 523void stop_nmi(void)
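
On the i386 side the new code tolerates NMIs nested via a breakpoint's iret by tracking a three-state per-CPU variable: a second NMI that finds the state busy only latches itself, and the first NMI's exit cmpxchg notices the latch and loops back. A compact single-threaded userspace model of that loop; the nested NMI is simulated inline, whereas the real code is the macro pair shown above:

#include <stdio.h>
#include <stdatomic.h>

enum nmi_states { NMI_NOT_RUNNING, NMI_EXECUTING, NMI_LATCHED };

static _Atomic int nmi_state = NMI_NOT_RUNNING;

/* A second NMI arriving while the first is still executing cannot run
 * now, so it only marks itself as latched (fires once in this model). */
static void simulate_nested_nmi(void)
{
        static int fired;

        if (!fired++ && atomic_load(&nmi_state) != NMI_NOT_RUNNING)
                atomic_store(&nmi_state, NMI_LATCHED);
}

static void do_nmi_model(void)
{
        int expected;

nmi_restart:
        atomic_store(&nmi_state, NMI_EXECUTING);

        printf("handling NMI\n");
        simulate_nested_nmi();

        /* Leave only if nothing was latched meanwhile; otherwise run again. */
        expected = NMI_EXECUTING;
        if (!atomic_compare_exchange_strong(&nmi_state, &expected,
                                            NMI_NOT_RUNNING))
                goto nmi_restart;
}

int main(void)
{
        do_nmi_model();    /* prints "handling NMI" twice */
        return 0;
}
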
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
new file mode 100644
index 000000000000..0d01a8ea4e11
--- /dev/null
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -0,0 +1,180 @@
1/*
 2 * arch/x86/kernel/nmi_selftest.c
3 *
4 * Testsuite for NMI: IPIs
5 *
6 * Started by Don Zickus:
7 * (using lib/locking-selftest.c as a guide)
8 *
9 * Copyright (C) 2011 Red Hat, Inc., Don Zickus <dzickus@redhat.com>
10 */
11
12#include <linux/smp.h>
13#include <linux/cpumask.h>
14#include <linux/delay.h>
15
16#include <asm/apic.h>
17#include <asm/nmi.h>
18
19#define SUCCESS 0
20#define FAILURE 1
21#define TIMEOUT 2
22
23static int nmi_fail;
24
25/* check to see if NMI IPIs work on this machine */
26static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __read_mostly;
27
28static int testcase_total;
29static int testcase_successes;
30static int expected_testcase_failures;
31static int unexpected_testcase_failures;
32static int unexpected_testcase_unknowns;
33
34static int nmi_unk_cb(unsigned int val, struct pt_regs *regs)
35{
36 unexpected_testcase_unknowns++;
37 return NMI_HANDLED;
38}
39
40static void init_nmi_testsuite(void)
41{
42 /* trap all the unknown NMIs we may generate */
43 register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk");
44}
45
46static void cleanup_nmi_testsuite(void)
47{
48 unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk");
49}
50
51static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs)
52{
53 int cpu = raw_smp_processor_id();
54
55 if (cpumask_test_and_clear_cpu(cpu, to_cpumask(nmi_ipi_mask)))
56 return NMI_HANDLED;
57
58 return NMI_DONE;
59}
60
61static void test_nmi_ipi(struct cpumask *mask)
62{
63 unsigned long timeout;
64
65 if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback,
66 NMI_FLAG_FIRST, "nmi_selftest")) {
67 nmi_fail = FAILURE;
68 return;
69 }
70
71 /* sync above data before sending NMI */
72 wmb();
73
74 apic->send_IPI_mask(mask, NMI_VECTOR);
75
76 /* Don't wait longer than a second */
77 timeout = USEC_PER_SEC;
78 while (!cpumask_empty(mask) && timeout--)
79 udelay(1);
80
81 /* What happens if we timeout, do we still unregister?? */
82 unregister_nmi_handler(NMI_LOCAL, "nmi_selftest");
83
84 if (!timeout)
85 nmi_fail = TIMEOUT;
86 return;
87}
88
89static void remote_ipi(void)
90{
91 cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask);
92 cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
93 if (!cpumask_empty(to_cpumask(nmi_ipi_mask)))
94 test_nmi_ipi(to_cpumask(nmi_ipi_mask));
95}
96
97static void local_ipi(void)
98{
99 cpumask_clear(to_cpumask(nmi_ipi_mask));
100 cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
101 test_nmi_ipi(to_cpumask(nmi_ipi_mask));
102}
103
104static void reset_nmi(void)
105{
106 nmi_fail = 0;
107}
108
109static void dotest(void (*testcase_fn)(void), int expected)
110{
111 testcase_fn();
112 /*
113 * Filter out expected failures:
114 */
115 if (nmi_fail != expected) {
116 unexpected_testcase_failures++;
117
118 if (nmi_fail == FAILURE)
119 printk("FAILED |");
120 else if (nmi_fail == TIMEOUT)
121 printk("TIMEOUT|");
122 else
123 printk("ERROR |");
124 dump_stack();
125 } else {
126 testcase_successes++;
127 printk(" ok |");
128 }
129 testcase_total++;
130
131 reset_nmi();
132}
133
134static inline void print_testname(const char *testname)
135{
136 printk("%12s:", testname);
137}
138
139void nmi_selftest(void)
140{
141 init_nmi_testsuite();
142
143 /*
144 * Run the testsuite:
145 */
146 printk("----------------\n");
147 printk("| NMI testsuite:\n");
148 printk("--------------------\n");
149
150 print_testname("remote IPI");
151 dotest(remote_ipi, SUCCESS);
152 printk("\n");
153 print_testname("local IPI");
154 dotest(local_ipi, SUCCESS);
155 printk("\n");
156
157 cleanup_nmi_testsuite();
158
159 if (unexpected_testcase_failures) {
160 printk("--------------------\n");
161 printk("BUG: %3d unexpected failures (out of %3d) - debugging disabled! |\n",
162 unexpected_testcase_failures, testcase_total);
163 printk("-----------------------------------------------------------------\n");
164 } else if (expected_testcase_failures && testcase_successes) {
165 printk("--------------------\n");
166 printk("%3d out of %3d testcases failed, as expected. |\n",
167 expected_testcase_failures, testcase_total);
168 printk("----------------------------------------------------\n");
169 } else if (expected_testcase_failures && !testcase_successes) {
170 printk("--------------------\n");
171 printk("All %3d testcases failed, as expected. |\n",
172 expected_testcase_failures);
173 printk("----------------------------------------\n");
174 } else {
175 printk("--------------------\n");
176 printk("Good, all %3d testcases passed! |\n",
177 testcase_successes);
178 printk("---------------------------------\n");
179 }
180}
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 80dc793b3f63..1c4d769e21ea 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -45,6 +45,15 @@ int iommu_detected __read_mostly = 0;
45 */ 45 */
46int iommu_pass_through __read_mostly; 46int iommu_pass_through __read_mostly;
47 47
48/*
49 * Group multi-function PCI devices into a single device-group for the
50 * iommu_device_group interface. This tells the iommu driver to pretend
51 * it cannot distinguish between functions of a device, exposing only one
52 * group for the device. Useful for disallowing use of individual PCI
53 * functions from userspace drivers.
54 */
55int iommu_group_mf __read_mostly;
56
48extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; 57extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
49 58
50/* Dummy device used for NULL arguments (normally ISA). */ 59/* Dummy device used for NULL arguments (normally ISA). */
@@ -169,6 +178,8 @@ static __init int iommu_setup(char *p)
169#endif 178#endif
170 if (!strncmp(p, "pt", 2)) 179 if (!strncmp(p, "pt", 2))
171 iommu_pass_through = 1; 180 iommu_pass_through = 1;
181 if (!strncmp(p, "group_mf", 8))
182 iommu_group_mf = 1;
172 183
173 gart_parse_options(p); 184 gart_parse_options(p);
174 185
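
pci-dma.c gains an iommu_group_mf flag plus a "group_mf" token in the iommu= boot option, telling IOMMU drivers to expose all functions of a multi-function PCI device as a single device group. A minimal model of how such a comma-separated option string is scanned; the real iommu_setup() handles many more tokens than shown here:

#include <stdio.h>
#include <string.h>

static int iommu_pass_through;
static int iommu_group_mf;

/* Each comma-separated token is matched by prefix, so
 * "iommu=pt,group_mf" sets both flags. */
static void iommu_setup(const char *p)
{
        while (*p) {
                if (!strncmp(p, "pt", 2))
                        iommu_pass_through = 1;
                if (!strncmp(p, "group_mf", 8))
                        iommu_group_mf = 1;

                p = strchr(p, ',');
                if (!p)
                        break;
                p++;
        }
}

int main(void)
{
        iommu_setup("pt,group_mf");
        printf("pass_through=%d group_mf=%d\n",
               iommu_pass_through, iommu_group_mf);
        return 0;
}
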
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index b9b3b1a51643..15763af7bfe3 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -293,7 +293,7 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
293 regs.orig_ax = -1; 293 regs.orig_ax = -1;
294 regs.ip = (unsigned long) kernel_thread_helper; 294 regs.ip = (unsigned long) kernel_thread_helper;
295 regs.cs = __KERNEL_CS | get_kernel_rpl(); 295 regs.cs = __KERNEL_CS | get_kernel_rpl();
296 regs.flags = X86_EFLAGS_IF | 0x2; 296 regs.flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;
297 297
298 /* Ok, create the new process.. */ 298 /* Ok, create the new process.. */
299 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL); 299 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
@@ -403,6 +403,14 @@ void default_idle(void)
403EXPORT_SYMBOL(default_idle); 403EXPORT_SYMBOL(default_idle);
404#endif 404#endif
405 405
406bool set_pm_idle_to_default(void)
407{
408 bool ret = !!pm_idle;
409
410 pm_idle = default_idle;
411
412 return ret;
413}
406void stop_this_cpu(void *dummy) 414void stop_this_cpu(void *dummy)
407{ 415{
408 local_irq_disable(); 416 local_irq_disable();
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 795b79f984c2..c08d1ff12b7c 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -99,7 +99,8 @@ void cpu_idle(void)
99 99
100 /* endless idle loop with no priority at all */ 100 /* endless idle loop with no priority at all */
101 while (1) { 101 while (1) {
102 tick_nohz_stop_sched_tick(1); 102 tick_nohz_idle_enter();
103 rcu_idle_enter();
103 while (!need_resched()) { 104 while (!need_resched()) {
104 105
105 check_pgt_cache(); 106 check_pgt_cache();
@@ -116,7 +117,8 @@ void cpu_idle(void)
116 pm_idle(); 117 pm_idle();
117 start_critical_timings(); 118 start_critical_timings();
118 } 119 }
119 tick_nohz_restart_sched_tick(); 120 rcu_idle_exit();
121 tick_nohz_idle_exit();
120 preempt_enable_no_resched(); 122 preempt_enable_no_resched();
121 schedule(); 123 schedule();
122 preempt_disable(); 124 preempt_disable();
@@ -212,6 +214,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
212 214
213 task_user_gs(p) = get_user_gs(regs); 215 task_user_gs(p) = get_user_gs(regs);
214 216
217 p->fpu_counter = 0;
215 p->thread.io_bitmap_ptr = NULL; 218 p->thread.io_bitmap_ptr = NULL;
216 tsk = current; 219 tsk = current;
217 err = -ENOMEM; 220 err = -ENOMEM;
@@ -297,22 +300,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
297 *next = &next_p->thread; 300 *next = &next_p->thread;
298 int cpu = smp_processor_id(); 301 int cpu = smp_processor_id();
299 struct tss_struct *tss = &per_cpu(init_tss, cpu); 302 struct tss_struct *tss = &per_cpu(init_tss, cpu);
300 bool preload_fpu; 303 fpu_switch_t fpu;
301 304
302 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ 305 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
303 306
304 /* 307 fpu = switch_fpu_prepare(prev_p, next_p, cpu);
305 * If the task has used fpu the last 5 timeslices, just do a full
306 * restore of the math state immediately to avoid the trap; the
307 * chances of needing FPU soon are obviously high now
308 */
309 preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
310
311 __unlazy_fpu(prev_p);
312
313 /* we're going to use this soon, after a few expensive things */
314 if (preload_fpu)
315 prefetch(next->fpu.state);
316 308
317 /* 309 /*
318 * Reload esp0. 310 * Reload esp0.
@@ -352,11 +344,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
352 task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) 344 task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
353 __switch_to_xtra(prev_p, next_p, tss); 345 __switch_to_xtra(prev_p, next_p, tss);
354 346
355 /* If we're going to preload the fpu context, make sure clts
356 is run while we're batching the cpu state updates. */
357 if (preload_fpu)
358 clts();
359
360 /* 347 /*
361 * Leave lazy mode, flushing any hypercalls made here. 348 * Leave lazy mode, flushing any hypercalls made here.
362 * This must be done before restoring TLS segments so 349 * This must be done before restoring TLS segments so
@@ -366,15 +353,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
366 */ 353 */
367 arch_end_context_switch(next_p); 354 arch_end_context_switch(next_p);
368 355
369 if (preload_fpu)
370 __math_state_restore();
371
372 /* 356 /*
373 * Restore %gs if needed (which is common) 357 * Restore %gs if needed (which is common)
374 */ 358 */
375 if (prev->gs | next->gs) 359 if (prev->gs | next->gs)
376 lazy_load_gs(next->gs); 360 lazy_load_gs(next->gs);
377 361
362 switch_fpu_finish(next_p, fpu);
363
378 percpu_write(current_task, next_p); 364 percpu_write(current_task, next_p);
379 365
380 return prev_p; 366 return prev_p;
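
Both __switch_to() implementations drop the open-coded "preload the FPU if the task used it in its last five timeslices" logic in favour of switch_fpu_prepare()/switch_fpu_finish(), splitting the decision from the actual restore. A toy model of that split, with types and fields simplified for illustration; the real fpu_switch_t helpers live in the x86 FPU headers:

#include <stdio.h>
#include <stdbool.h>

struct task {
        const char *comm;
        int fpu_counter;
        bool used_math;
};

typedef struct { bool preload; } fpu_switch_t;

/* "Prepare" decides before the switch whether the incoming task's FPU
 * state should be preloaded, mirroring the removed open-coded heuristic. */
static fpu_switch_t switch_fpu_prepare(struct task *prev, struct task *next)
{
        fpu_switch_t fpu;

        (void)prev;    /* the real helper also deals with prev's state */
        fpu.preload = next->used_math && next->fpu_counter > 5;
        return fpu;
}

/* "Finish" completes the restore once the rest of the context is in place. */
static void switch_fpu_finish(struct task *next, fpu_switch_t fpu)
{
        if (fpu.preload)
                printf("restoring FPU state for %s\n", next->comm);
}

int main(void)
{
        struct task prev = { "idle",   0, false };
        struct task next = { "worker", 8, true  };
        fpu_switch_t fpu = switch_fpu_prepare(&prev, &next);

        /* ...esp0, TLS and segment reloads would happen here... */
        switch_fpu_finish(&next, fpu);
        return 0;
}
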
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3bd7e6eebf31..cfa5c90c01db 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -122,7 +122,7 @@ void cpu_idle(void)
122 122
123 /* endless idle loop with no priority at all */ 123 /* endless idle loop with no priority at all */
124 while (1) { 124 while (1) {
125 tick_nohz_stop_sched_tick(1); 125 tick_nohz_idle_enter();
126 while (!need_resched()) { 126 while (!need_resched()) {
127 127
128 rmb(); 128 rmb();
@@ -139,8 +139,14 @@ void cpu_idle(void)
139 enter_idle(); 139 enter_idle();
140 /* Don't trace irqs off for idle */ 140 /* Don't trace irqs off for idle */
141 stop_critical_timings(); 141 stop_critical_timings();
142
143 /* enter_idle() needs rcu for notifiers */
144 rcu_idle_enter();
145
142 if (cpuidle_idle_call()) 146 if (cpuidle_idle_call())
143 pm_idle(); 147 pm_idle();
148
149 rcu_idle_exit();
144 start_critical_timings(); 150 start_critical_timings();
145 151
146 /* In many cases the interrupt that ended idle 152 /* In many cases the interrupt that ended idle
@@ -149,7 +155,7 @@ void cpu_idle(void)
149 __exit_idle(); 155 __exit_idle();
150 } 156 }
151 157
152 tick_nohz_restart_sched_tick(); 158 tick_nohz_idle_exit();
153 preempt_enable_no_resched(); 159 preempt_enable_no_resched();
154 schedule(); 160 schedule();
155 preempt_disable(); 161 preempt_disable();
@@ -280,6 +286,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
280 286
281 set_tsk_thread_flag(p, TIF_FORK); 287 set_tsk_thread_flag(p, TIF_FORK);
282 288
289 p->fpu_counter = 0;
283 p->thread.io_bitmap_ptr = NULL; 290 p->thread.io_bitmap_ptr = NULL;
284 291
285 savesegment(gs, p->thread.gsindex); 292 savesegment(gs, p->thread.gsindex);
@@ -293,13 +300,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
293 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); 300 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
294 301
295 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { 302 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
296 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); 303 p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
304 IO_BITMAP_BYTES, GFP_KERNEL);
297 if (!p->thread.io_bitmap_ptr) { 305 if (!p->thread.io_bitmap_ptr) {
298 p->thread.io_bitmap_max = 0; 306 p->thread.io_bitmap_max = 0;
299 return -ENOMEM; 307 return -ENOMEM;
300 } 308 }
301 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
302 IO_BITMAP_BYTES);
303 set_tsk_thread_flag(p, TIF_IO_BITMAP); 309 set_tsk_thread_flag(p, TIF_IO_BITMAP);
304 } 310 }
305 311
@@ -381,18 +387,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
381 int cpu = smp_processor_id(); 387 int cpu = smp_processor_id();
382 struct tss_struct *tss = &per_cpu(init_tss, cpu); 388 struct tss_struct *tss = &per_cpu(init_tss, cpu);
383 unsigned fsindex, gsindex; 389 unsigned fsindex, gsindex;
384 bool preload_fpu; 390 fpu_switch_t fpu;
385
386 /*
387 * If the task has used fpu the last 5 timeslices, just do a full
388 * restore of the math state immediately to avoid the trap; the
389 * chances of needing FPU soon are obviously high now
390 */
391 preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
392 391
393 /* we're going to use this soon, after a few expensive things */ 392 fpu = switch_fpu_prepare(prev_p, next_p, cpu);
394 if (preload_fpu)
395 prefetch(next->fpu.state);
396 393
397 /* 394 /*
398 * Reload esp0, LDT and the page table pointer: 395 * Reload esp0, LDT and the page table pointer:
@@ -422,13 +419,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
422 419
423 load_TLS(next, cpu); 420 load_TLS(next, cpu);
424 421
425 /* Must be after DS reload */
426 __unlazy_fpu(prev_p);
427
428 /* Make sure cpu is ready for new context */
429 if (preload_fpu)
430 clts();
431
432 /* 422 /*
433 * Leave lazy mode, flushing any hypercalls made here. 423 * Leave lazy mode, flushing any hypercalls made here.
434 * This must be done before restoring TLS segments so 424 * This must be done before restoring TLS segments so
@@ -469,6 +459,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
469 wrmsrl(MSR_KERNEL_GS_BASE, next->gs); 459 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
470 prev->gsindex = gsindex; 460 prev->gsindex = gsindex;
471 461
462 switch_fpu_finish(next_p, fpu);
463
472 /* 464 /*
473 * Switch the PDA and FPU contexts. 465 * Switch the PDA and FPU contexts.
474 */ 466 */
@@ -487,13 +479,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
487 task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) 479 task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
488 __switch_to_xtra(prev_p, next_p, tss); 480 __switch_to_xtra(prev_p, next_p, tss);
489 481
490 /*
491 * Preload the FPU context, now that we've determined that the
492 * task is likely to be using it.
493 */
494 if (preload_fpu)
495 __math_state_restore();
496
497 return prev_p; 482 return prev_p;
498} 483}
499 484
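
Both __switch_to() hunks above (process_32.c and process_64.c) replace the open-coded FPU preload logic with switch_fpu_prepare()/switch_fpu_finish(). The heuristic the removed lines implemented is worth spelling out; below is a minimal user-space sketch of it, using made-up task fields rather than the kernel's struct task_struct:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative per-task FPU bookkeeping; not the kernel's struct task_struct. */
struct task {
	bool used_math;			/* task has touched the FPU at least once */
	unsigned int fpu_counter;	/* recent timeslices that used the FPU */
};

/*
 * Mirror of the removed preload_fpu test: at context switch, eagerly
 * restore the incoming task's FPU state (instead of waiting for the #NM
 * trap) when it has used the FPU in more than five recent timeslices.
 */
static bool should_preload_fpu(const struct task *next)
{
	return next->used_math && next->fpu_counter > 5;
}

int main(void)
{
	struct task hot  = { .used_math = true, .fpu_counter = 9 };
	struct task cold = { .used_math = true, .fpu_counter = 1 };

	printf("hot task preloads:  %d\n", should_preload_fpu(&hot));
	printf("cold task preloads: %d\n", should_preload_fpu(&cold));
	return 0;
}

The switch_fpu_prepare()/switch_fpu_finish() pair keeps an equivalent counter-driven decision, just behind one interface shared by the 32-bit and 64-bit paths.
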
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 82528799c5de..50267386b766 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -749,7 +749,8 @@ put:
749/* 749/*
750 * Handle PTRACE_POKEUSR calls for the debug register area. 750 * Handle PTRACE_POKEUSR calls for the debug register area.
751 */ 751 */
752int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) 752static int ptrace_set_debugreg(struct task_struct *tsk, int n,
753 unsigned long val)
753{ 754{
754 struct thread_struct *thread = &(tsk->thread); 755 struct thread_struct *thread = &(tsk->thread);
755 int rc = 0; 756 int rc = 0;
@@ -1391,20 +1392,18 @@ long syscall_trace_enter(struct pt_regs *regs)
1391 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) 1392 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
1392 trace_sys_enter(regs, regs->orig_ax); 1393 trace_sys_enter(regs, regs->orig_ax);
1393 1394
1394 if (unlikely(current->audit_context)) { 1395 if (IS_IA32)
1395 if (IS_IA32) 1396 audit_syscall_entry(AUDIT_ARCH_I386,
1396 audit_syscall_entry(AUDIT_ARCH_I386, 1397 regs->orig_ax,
1397 regs->orig_ax, 1398 regs->bx, regs->cx,
1398 regs->bx, regs->cx, 1399 regs->dx, regs->si);
1399 regs->dx, regs->si);
1400#ifdef CONFIG_X86_64 1400#ifdef CONFIG_X86_64
1401 else 1401 else
1402 audit_syscall_entry(AUDIT_ARCH_X86_64, 1402 audit_syscall_entry(AUDIT_ARCH_X86_64,
1403 regs->orig_ax, 1403 regs->orig_ax,
1404 regs->di, regs->si, 1404 regs->di, regs->si,
1405 regs->dx, regs->r10); 1405 regs->dx, regs->r10);
1406#endif 1406#endif
1407 }
1408 1407
1409 return ret ?: regs->orig_ax; 1408 return ret ?: regs->orig_ax;
1410} 1409}
@@ -1413,8 +1412,7 @@ void syscall_trace_leave(struct pt_regs *regs)
1413{ 1412{
1414 bool step; 1413 bool step;
1415 1414
1416 if (unlikely(current->audit_context)) 1415 audit_syscall_exit(regs);
1417 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1418 1416
1419 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) 1417 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
1420 trace_sys_exit(regs, regs->ax); 1418 trace_sys_exit(regs, regs->ax);
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index b78643d0f9a5..03920a15a632 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -553,4 +553,17 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC,
553 quirk_amd_nb_node); 553 quirk_amd_nb_node);
554DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK, 554DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK,
555 quirk_amd_nb_node); 555 quirk_amd_nb_node);
556DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F0,
557 quirk_amd_nb_node);
558DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F1,
559 quirk_amd_nb_node);
560DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F2,
561 quirk_amd_nb_node);
562DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3,
563 quirk_amd_nb_node);
564DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4,
565 quirk_amd_nb_node);
566DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F5,
567 quirk_amd_nb_node);
568
556#endif 569#endif
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index e334be1182b9..d840e69a853c 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -39,6 +39,14 @@ static int reboot_mode;
39enum reboot_type reboot_type = BOOT_ACPI; 39enum reboot_type reboot_type = BOOT_ACPI;
40int reboot_force; 40int reboot_force;
41 41
42/* This variable is used privately to keep track of whether or not
43 * reboot_type is still set to its default value (i.e., reboot= hasn't
44 * been set on the command line). This is needed so that we can
45 * suppress DMI scanning for reboot quirks. Without it, it's
46 * impossible to override a faulty reboot quirk without recompiling.
47 */
48static int reboot_default = 1;
49
42#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) 50#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
43static int reboot_cpu = -1; 51static int reboot_cpu = -1;
44#endif 52#endif
@@ -67,6 +75,12 @@ bool port_cf9_safe = false;
67static int __init reboot_setup(char *str) 75static int __init reboot_setup(char *str)
68{ 76{
69 for (;;) { 77 for (;;) {
78 /* Having anything passed on the command line via
79 * reboot= will cause us to disable DMI checking
80 * below.
81 */
82 reboot_default = 0;
83
70 switch (*str) { 84 switch (*str) {
71 case 'w': 85 case 'w':
72 reboot_mode = 0x1234; 86 reboot_mode = 0x1234;
@@ -124,7 +138,7 @@ __setup("reboot=", reboot_setup);
124 */ 138 */
125 139
126/* 140/*
127 * Some machines require the "reboot=b" commandline option, 141 * Some machines require the "reboot=b" or "reboot=k" commandline options,
128 * this quirk makes that automatic. 142 * this quirk makes that automatic.
129 */ 143 */
130static int __init set_bios_reboot(const struct dmi_system_id *d) 144static int __init set_bios_reboot(const struct dmi_system_id *d)
@@ -136,6 +150,15 @@ static int __init set_bios_reboot(const struct dmi_system_id *d)
136 return 0; 150 return 0;
137} 151}
138 152
153static int __init set_kbd_reboot(const struct dmi_system_id *d)
154{
155 if (reboot_type != BOOT_KBD) {
156 reboot_type = BOOT_KBD;
157 printk(KERN_INFO "%s series board detected. Selecting KBD-method for reboot.\n", d->ident);
158 }
159 return 0;
160}
161
139static struct dmi_system_id __initdata reboot_dmi_table[] = { 162static struct dmi_system_id __initdata reboot_dmi_table[] = {
140 { /* Handle problems with rebooting on Dell E520's */ 163 { /* Handle problems with rebooting on Dell E520's */
141 .callback = set_bios_reboot, 164 .callback = set_bios_reboot,
@@ -286,16 +309,8 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
286 DMI_MATCH(DMI_BOARD_NAME, "P4S800"), 309 DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
287 }, 310 },
288 }, 311 },
289 { /* Handle problems with rebooting on VersaLogic Menlow boards */
290 .callback = set_bios_reboot,
291 .ident = "VersaLogic Menlow based board",
292 .matches = {
293 DMI_MATCH(DMI_BOARD_VENDOR, "VersaLogic Corporation"),
294 DMI_MATCH(DMI_BOARD_NAME, "VersaLogic Menlow board"),
295 },
296 },
297 { /* Handle reboot issue on Acer Aspire one */ 312 { /* Handle reboot issue on Acer Aspire one */
298 .callback = set_bios_reboot, 313 .callback = set_kbd_reboot,
299 .ident = "Acer Aspire One A110", 314 .ident = "Acer Aspire One A110",
300 .matches = { 315 .matches = {
301 DMI_MATCH(DMI_SYS_VENDOR, "Acer"), 316 DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
@@ -307,7 +322,12 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
307 322
308static int __init reboot_init(void) 323static int __init reboot_init(void)
309{ 324{
310 dmi_check_system(reboot_dmi_table); 325 /* Only do the DMI check if reboot_type hasn't been overridden
326 * on the command line
327 */
328 if (reboot_default) {
329 dmi_check_system(reboot_dmi_table);
330 }
311 return 0; 331 return 0;
312} 332}
313core_initcall(reboot_init); 333core_initcall(reboot_init);
@@ -443,12 +463,25 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
443 DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"), 463 DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"),
444 }, 464 },
445 }, 465 },
466 { /* Handle problems with rebooting on the OptiPlex 990. */
467 .callback = set_pci_reboot,
468 .ident = "Dell OptiPlex 990",
469 .matches = {
470 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
471 DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"),
472 },
473 },
446 { } 474 { }
447}; 475};
448 476
449static int __init pci_reboot_init(void) 477static int __init pci_reboot_init(void)
450{ 478{
451 dmi_check_system(pci_reboot_dmi_table); 479 /* Only do the DMI check if reboot_type hasn't been overridden
480 * on the command line
481 */
482 if (reboot_default) {
483 dmi_check_system(pci_reboot_dmi_table);
484 }
452 return 0; 485 return 0;
453} 486}
454core_initcall(pci_reboot_init); 487core_initcall(pci_reboot_init);
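
The reboot.c changes above hinge on one idea, stated in the new comment block: an explicit reboot= on the command line should win over the DMI quirk tables, so a faulty quirk can be overridden without recompiling. A small user-space sketch of that pattern follows; the parser, the quirk function, and the fake board match are all illustrative:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

enum reboot_type { BOOT_ACPI, BOOT_BIOS, BOOT_KBD };

static enum reboot_type reboot_type = BOOT_ACPI;
static bool reboot_default = true;	/* no explicit override seen yet */

/* Stand-in for the reboot= parser: any explicit setting clears
 * reboot_default so the quirk table below is never consulted. */
static void parse_reboot_option(const char *arg)
{
	reboot_default = false;
	if (strcmp(arg, "b") == 0)
		reboot_type = BOOT_BIOS;
	else if (strcmp(arg, "k") == 0)
		reboot_type = BOOT_KBD;
}

/* Stand-in for dmi_check_system(reboot_dmi_table): pretend the board
 * matched a quirk that wants the keyboard-controller reboot method. */
static void apply_board_quirks(void)
{
	reboot_type = BOOT_KBD;
}

static void reboot_init(const char *cmdline_arg)
{
	if (cmdline_arg)
		parse_reboot_option(cmdline_arg);
	if (reboot_default)	/* only trust the quirk table if the user said nothing */
		apply_board_quirks();
}

int main(void)
{
	reboot_init("b");	/* explicit reboot=b: quirk table is skipped */
	printf("reboot_type = %d (BOOT_BIOS expected)\n", reboot_type);
	return 0;
}
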
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 348ce016a835..af6db6ec5b2a 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -12,6 +12,7 @@
12#include <asm/vsyscall.h> 12#include <asm/vsyscall.h>
13#include <asm/x86_init.h> 13#include <asm/x86_init.h>
14#include <asm/time.h> 14#include <asm/time.h>
15#include <asm/mrst.h>
15 16
16#ifdef CONFIG_X86_32 17#ifdef CONFIG_X86_32
17/* 18/*
@@ -242,6 +243,10 @@ static __init int add_rtc_cmos(void)
242 if (of_have_populated_dt()) 243 if (of_have_populated_dt())
243 return 0; 244 return 0;
244 245
246 /* Intel MID platforms don't have ioport rtc */
247 if (mrst_identify_cpu())
248 return -ENODEV;
249
245 platform_device_register(&rtc_device); 250 platform_device_register(&rtc_device);
246 dev_info(&rtc_device.dev, 251 dev_info(&rtc_device.dev,
247 "registered platform RTC device (no PNP device found)\n"); 252 "registered platform RTC device (no PNP device found)\n");
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index cf0ef986cb6d..d7d5099fe874 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -306,7 +306,8 @@ static void __init cleanup_highmap(void)
306static void __init reserve_brk(void) 306static void __init reserve_brk(void)
307{ 307{
308 if (_brk_end > _brk_start) 308 if (_brk_end > _brk_start)
309 memblock_x86_reserve_range(__pa(_brk_start), __pa(_brk_end), "BRK"); 309 memblock_reserve(__pa(_brk_start),
310 __pa(_brk_end) - __pa(_brk_start));
310 311
311 /* Mark brk area as locked down and no longer taking any 312 /* Mark brk area as locked down and no longer taking any
312 new allocations */ 313 new allocations */
@@ -331,13 +332,13 @@ static void __init relocate_initrd(void)
331 ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size, 332 ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size,
332 PAGE_SIZE); 333 PAGE_SIZE);
333 334
334 if (ramdisk_here == MEMBLOCK_ERROR) 335 if (!ramdisk_here)
335 panic("Cannot find place for new RAMDISK of size %lld\n", 336 panic("Cannot find place for new RAMDISK of size %lld\n",
336 ramdisk_size); 337 ramdisk_size);
337 338
338 /* Note: this includes all the lowmem currently occupied by 339 /* Note: this includes all the lowmem currently occupied by
339 the initrd, we rely on that fact to keep the data intact. */ 340 the initrd, we rely on that fact to keep the data intact. */
340 memblock_x86_reserve_range(ramdisk_here, ramdisk_here + area_size, "NEW RAMDISK"); 341 memblock_reserve(ramdisk_here, area_size);
341 initrd_start = ramdisk_here + PAGE_OFFSET; 342 initrd_start = ramdisk_here + PAGE_OFFSET;
342 initrd_end = initrd_start + ramdisk_size; 343 initrd_end = initrd_start + ramdisk_size;
343 printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n", 344 printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
@@ -393,7 +394,7 @@ static void __init reserve_initrd(void)
393 initrd_start = 0; 394 initrd_start = 0;
394 395
395 if (ramdisk_size >= (end_of_lowmem>>1)) { 396 if (ramdisk_size >= (end_of_lowmem>>1)) {
396 memblock_x86_free_range(ramdisk_image, ramdisk_end); 397 memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
397 printk(KERN_ERR "initrd too large to handle, " 398 printk(KERN_ERR "initrd too large to handle, "
398 "disabling initrd\n"); 399 "disabling initrd\n");
399 return; 400 return;
@@ -416,7 +417,7 @@ static void __init reserve_initrd(void)
416 417
417 relocate_initrd(); 418 relocate_initrd();
418 419
419 memblock_x86_free_range(ramdisk_image, ramdisk_end); 420 memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
420} 421}
421#else 422#else
422static void __init reserve_initrd(void) 423static void __init reserve_initrd(void)
@@ -490,15 +491,13 @@ static void __init memblock_x86_reserve_range_setup_data(void)
490{ 491{
491 struct setup_data *data; 492 struct setup_data *data;
492 u64 pa_data; 493 u64 pa_data;
493 char buf[32];
494 494
495 if (boot_params.hdr.version < 0x0209) 495 if (boot_params.hdr.version < 0x0209)
496 return; 496 return;
497 pa_data = boot_params.hdr.setup_data; 497 pa_data = boot_params.hdr.setup_data;
498 while (pa_data) { 498 while (pa_data) {
499 data = early_memremap(pa_data, sizeof(*data)); 499 data = early_memremap(pa_data, sizeof(*data));
500 sprintf(buf, "setup data %x", data->type); 500 memblock_reserve(pa_data, sizeof(*data) + data->len);
501 memblock_x86_reserve_range(pa_data, pa_data+sizeof(*data)+data->len, buf);
502 pa_data = data->next; 501 pa_data = data->next;
503 early_iounmap(data, sizeof(*data)); 502 early_iounmap(data, sizeof(*data));
504 } 503 }
@@ -554,7 +553,7 @@ static void __init reserve_crashkernel(void)
554 crash_base = memblock_find_in_range(alignment, 553 crash_base = memblock_find_in_range(alignment,
555 CRASH_KERNEL_ADDR_MAX, crash_size, alignment); 554 CRASH_KERNEL_ADDR_MAX, crash_size, alignment);
556 555
557 if (crash_base == MEMBLOCK_ERROR) { 556 if (!crash_base) {
558 pr_info("crashkernel reservation failed - No suitable area found.\n"); 557 pr_info("crashkernel reservation failed - No suitable area found.\n");
559 return; 558 return;
560 } 559 }
@@ -568,7 +567,7 @@ static void __init reserve_crashkernel(void)
568 return; 567 return;
569 } 568 }
570 } 569 }
571 memblock_x86_reserve_range(crash_base, crash_base + crash_size, "CRASH KERNEL"); 570 memblock_reserve(crash_base, crash_size);
572 571
573 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " 572 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
574 "for crashkernel (System RAM: %ldMB)\n", 573 "for crashkernel (System RAM: %ldMB)\n",
@@ -626,7 +625,7 @@ static __init void reserve_ibft_region(void)
626 addr = find_ibft_region(&size); 625 addr = find_ibft_region(&size);
627 626
628 if (size) 627 if (size)
629 memblock_x86_reserve_range(addr, addr + size, "* ibft"); 628 memblock_reserve(addr, size);
630} 629}
631 630
632static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; 631static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
@@ -750,12 +749,7 @@ void __init setup_arch(char **cmdline_p)
750#endif 749#endif
751#ifdef CONFIG_EFI 750#ifdef CONFIG_EFI
752 if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, 751 if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
753#ifdef CONFIG_X86_32 752 EFI_LOADER_SIGNATURE, 4)) {
754 "EL32",
755#else
756 "EL64",
757#endif
758 4)) {
759 efi_enabled = 1; 753 efi_enabled = 1;
760 efi_memblock_x86_reserve_range(); 754 efi_memblock_x86_reserve_range();
761 } 755 }
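
Every memblock conversion in setup.c above follows the same mechanical rule: the old memblock_x86_reserve_range()/memblock_x86_free_range() calls took an exclusive end address (plus a debug string), while memblock_reserve()/memblock_free() take a length, so each call site now computes end - start. A trivial sketch of the translation, using a stand-in reserve function rather than the real memblock API:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t phys_addr_t;

/* Hypothetical stand-in for memblock_reserve(base, size). */
static void reserve(phys_addr_t base, phys_addr_t size)
{
	printf("reserve [%#llx-%#llx) (%llu bytes)\n",
	       (unsigned long long)base,
	       (unsigned long long)(base + size),
	       (unsigned long long)size);
}

/* Old-style call sites passed an exclusive end address; the new API
 * wants a length, so every conversion is simply end - start. */
static void reserve_range(phys_addr_t start, phys_addr_t end)
{
	reserve(start, end - start);
}

int main(void)
{
	reserve_range(0x100000, 0x180000);	/* e.g. a relocated ramdisk */
	return 0;
}
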
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 54ddaeb221c1..46a01bdc27e2 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -682,7 +682,6 @@ static int
682handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, 682handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
683 struct pt_regs *regs) 683 struct pt_regs *regs)
684{ 684{
685 sigset_t blocked;
686 int ret; 685 int ret;
687 686
688 /* Are we from a system call? */ 687 /* Are we from a system call? */
@@ -733,10 +732,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
733 */ 732 */
734 regs->flags &= ~X86_EFLAGS_TF; 733 regs->flags &= ~X86_EFLAGS_TF;
735 734
736 sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask); 735 block_sigmask(ka, sig);
737 if (!(ka->sa.sa_flags & SA_NODEFER))
738 sigaddset(&blocked, sig);
739 set_current_blocked(&blocked);
740 736
741 tracehook_signal_handler(sig, info, ka, regs, 737 tracehook_signal_handler(sig, info, ka, regs,
742 test_thread_flag(TIF_SINGLESTEP)); 738 test_thread_flag(TIF_SINGLESTEP));
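
handle_signal() above now calls block_sigmask() instead of open-coding the mask update. The computation being factored out is small; here is a user-space model of it built on the POSIX/glibc sigset API (sigorset() is a GNU extension), not the kernel's helper itself:

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>

/* Minimal model of what the removed open-coded sequence (and the
 * block_sigmask() helper that replaces it) computes: block everything in
 * the handler's sa_mask, plus the delivered signal itself unless the
 * handler was installed with SA_NODEFER. */
static void compute_blocked(sigset_t *blocked, const sigset_t *cur_blocked,
			    const struct sigaction *ka, int sig)
{
	sigorset(blocked, cur_blocked, &ka->sa_mask);	/* blocked = cur | sa_mask */
	if (!(ka->sa_flags & SA_NODEFER))
		sigaddset(blocked, sig);		/* also defer the signal itself */
}

int main(void)
{
	struct sigaction ka = { 0 };
	sigset_t cur, blocked;

	sigemptyset(&cur);
	sigemptyset(&ka.sa_mask);
	sigaddset(&ka.sa_mask, SIGUSR1);

	compute_blocked(&blocked, &cur, &ka, SIGINT);
	printf("SIGINT blocked:  %d\n", sigismember(&blocked, SIGINT));
	printf("SIGUSR1 blocked: %d\n", sigismember(&blocked, SIGUSR1));
	return 0;
}
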
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 16204dc15484..66c74f481cab 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -29,6 +29,7 @@
29#include <asm/mmu_context.h> 29#include <asm/mmu_context.h>
30#include <asm/proto.h> 30#include <asm/proto.h>
31#include <asm/apic.h> 31#include <asm/apic.h>
32#include <asm/nmi.h>
32/* 33/*
33 * Some notes on x86 processor bugs affecting SMP operation: 34 * Some notes on x86 processor bugs affecting SMP operation:
34 * 35 *
@@ -148,6 +149,60 @@ void native_send_call_func_ipi(const struct cpumask *mask)
148 free_cpumask_var(allbutself); 149 free_cpumask_var(allbutself);
149} 150}
150 151
152static atomic_t stopping_cpu = ATOMIC_INIT(-1);
153
154static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
155{
156 /* We are registered on stopping cpu too, avoid spurious NMI */
157 if (raw_smp_processor_id() == atomic_read(&stopping_cpu))
158 return NMI_HANDLED;
159
160 stop_this_cpu(NULL);
161
162 return NMI_HANDLED;
163}
164
165static void native_nmi_stop_other_cpus(int wait)
166{
167 unsigned long flags;
168 unsigned long timeout;
169
170 if (reboot_force)
171 return;
172
173 /*
 174 * Use a dedicated vector here because smp_call_function()
 175 * does lots of things that are not suitable in a panic situation.
176 */
177 if (num_online_cpus() > 1) {
178 /* did someone beat us here? */
179 if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1)
180 return;
181
182 if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
183 NMI_FLAG_FIRST, "smp_stop"))
184 /* Note: we ignore failures here */
185 return;
186
187 /* sync above data before sending NMI */
188 wmb();
189
190 apic->send_IPI_allbutself(NMI_VECTOR);
191
192 /*
193 * Don't wait longer than a second if the caller
194 * didn't ask us to wait.
195 */
196 timeout = USEC_PER_SEC;
197 while (num_online_cpus() > 1 && (wait || timeout--))
198 udelay(1);
199 }
200
201 local_irq_save(flags);
202 disable_local_APIC();
203 local_irq_restore(flags);
204}
205
151/* 206/*
152 * this function calls the 'stop' function on all other CPUs in the system. 207 * this function calls the 'stop' function on all other CPUs in the system.
153 */ 208 */
@@ -160,7 +215,7 @@ asmlinkage void smp_reboot_interrupt(void)
160 irq_exit(); 215 irq_exit();
161} 216}
162 217
163static void native_stop_other_cpus(int wait) 218static void native_irq_stop_other_cpus(int wait)
164{ 219{
165 unsigned long flags; 220 unsigned long flags;
166 unsigned long timeout; 221 unsigned long timeout;
@@ -194,6 +249,11 @@ static void native_stop_other_cpus(int wait)
194 local_irq_restore(flags); 249 local_irq_restore(flags);
195} 250}
196 251
252static void native_smp_disable_nmi_ipi(void)
253{
254 smp_ops.stop_other_cpus = native_irq_stop_other_cpus;
255}
256
197/* 257/*
198 * Reschedule call back. 258 * Reschedule call back.
199 */ 259 */
@@ -225,12 +285,20 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
225 irq_exit(); 285 irq_exit();
226} 286}
227 287
288static int __init nonmi_ipi_setup(char *str)
289{
290 native_smp_disable_nmi_ipi();
291 return 1;
292}
293
294__setup("nonmi_ipi", nonmi_ipi_setup);
295
228struct smp_ops smp_ops = { 296struct smp_ops smp_ops = {
229 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, 297 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
230 .smp_prepare_cpus = native_smp_prepare_cpus, 298 .smp_prepare_cpus = native_smp_prepare_cpus,
231 .smp_cpus_done = native_smp_cpus_done, 299 .smp_cpus_done = native_smp_cpus_done,
232 300
233 .stop_other_cpus = native_stop_other_cpus, 301 .stop_other_cpus = native_nmi_stop_other_cpus,
234 .smp_send_reschedule = native_smp_send_reschedule, 302 .smp_send_reschedule = native_smp_send_reschedule,
235 303
236 .cpu_up = native_cpu_up, 304 .cpu_up = native_cpu_up,
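
The new native_nmi_stop_other_cpus() elects exactly one "stopping" CPU with atomic_cmpxchg(&stopping_cpu, -1, ...): the first caller wins, registers the NMI handler and sends the IPI, while later callers back off. A minimal user-space sketch of that first-caller-wins election using C11 atomics (not the kernel's atomic_t API):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int stopping_cpu = -1;	/* -1: nobody has claimed the stop yet */

/* Returns 1 if this CPU won the race and should drive the shutdown,
 * 0 if another CPU already claimed it (mirrors the atomic_cmpxchg test
 * in native_nmi_stop_other_cpus()). */
static int try_claim_stop(int this_cpu)
{
	int expected = -1;

	return atomic_compare_exchange_strong(&stopping_cpu, &expected, this_cpu);
}

int main(void)
{
	printf("cpu 2 claims: %d\n", try_claim_stop(2));	/* wins  */
	printf("cpu 5 claims: %d\n", try_claim_stop(5));	/* loses */
	printf("stopping_cpu = %d\n", atomic_load(&stopping_cpu));
	return 0;
}
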
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9f548cb4a958..66d250c00d11 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -207,23 +207,29 @@ static void __cpuinit smp_callin(void)
207 * Need to setup vector mappings before we enable interrupts. 207 * Need to setup vector mappings before we enable interrupts.
208 */ 208 */
209 setup_vector_irq(smp_processor_id()); 209 setup_vector_irq(smp_processor_id());
210
211 /*
212 * Save our processor parameters. Note: this information
213 * is needed for clock calibration.
214 */
215 smp_store_cpu_info(cpuid);
216
210 /* 217 /*
211 * Get our bogomips. 218 * Get our bogomips.
219 * Update loops_per_jiffy in cpu_data. Previous call to
220 * smp_store_cpu_info() stored a value that is close but not as
221 * accurate as the value just calculated.
212 * 222 *
213 * Need to enable IRQs because it can take longer and then 223 * Need to enable IRQs because it can take longer and then
214 * the NMI watchdog might kill us. 224 * the NMI watchdog might kill us.
215 */ 225 */
216 local_irq_enable(); 226 local_irq_enable();
217 calibrate_delay(); 227 calibrate_delay();
228 cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy;
218 local_irq_disable(); 229 local_irq_disable();
219 pr_debug("Stack at about %p\n", &cpuid); 230 pr_debug("Stack at about %p\n", &cpuid);
220 231
221 /* 232 /*
222 * Save our processor parameters
223 */
224 smp_store_cpu_info(cpuid);
225
226 /*
227 * This must be done before setting cpu_online_mask 233 * This must be done before setting cpu_online_mask
228 * or calling notify_cpu_starting. 234 * or calling notify_cpu_starting.
229 */ 235 */
@@ -840,7 +846,8 @@ int __cpuinit native_cpu_up(unsigned int cpu)
840 pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); 846 pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu);
841 847
842 if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || 848 if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||
843 !physid_isset(apicid, phys_cpu_present_map)) { 849 !physid_isset(apicid, phys_cpu_present_map) ||
850 (!x2apic_mode && apicid >= 255)) {
844 printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu); 851 printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu);
845 return -EINVAL; 852 return -EINVAL;
846 } 853 }
@@ -1142,6 +1149,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1142{ 1149{
1143 pr_debug("Boot done.\n"); 1150 pr_debug("Boot done.\n");
1144 1151
1152 nmi_selftest();
1145 impress_friends(); 1153 impress_friends();
1146#ifdef CONFIG_X86_IO_APIC 1154#ifdef CONFIG_X86_IO_APIC
1147 setup_ioapic_dest(); 1155 setup_ioapic_dest();
diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c
new file mode 100644
index 000000000000..147fcd4941c4
--- /dev/null
+++ b/arch/x86/kernel/syscall_32.c
@@ -0,0 +1,25 @@
1/* System call table for i386. */
2
3#include <linux/linkage.h>
4#include <linux/sys.h>
5#include <linux/cache.h>
6#include <asm/asm-offsets.h>
7
8#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ;
9#include <asm/syscalls_32.h>
10#undef __SYSCALL_I386
11
12#define __SYSCALL_I386(nr, sym, compat) [nr] = sym,
13
14typedef asmlinkage void (*sys_call_ptr_t)(void);
15
16extern asmlinkage void sys_ni_syscall(void);
17
18const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
19 /*
20 * Smells like a compiler bug -- it doesn't work
21 * when the & below is removed.
22 */
23 [0 ... __NR_syscall_max] = &sys_ni_syscall,
24#include <asm/syscalls_32.h>
25};
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
index de87d6008295..7ac7943be02c 100644
--- a/arch/x86/kernel/syscall_64.c
+++ b/arch/x86/kernel/syscall_64.c
@@ -5,15 +5,11 @@
5#include <linux/cache.h> 5#include <linux/cache.h>
6#include <asm/asm-offsets.h> 6#include <asm/asm-offsets.h>
7 7
8#define __NO_STUBS 8#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ;
9#include <asm/syscalls_64.h>
10#undef __SYSCALL_64
9 11
10#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; 12#define __SYSCALL_64(nr, sym, compat) [nr] = sym,
11#undef _ASM_X86_UNISTD_64_H
12#include <asm/unistd_64.h>
13
14#undef __SYSCALL
15#define __SYSCALL(nr, sym) [nr] = sym,
16#undef _ASM_X86_UNISTD_64_H
17 13
18typedef void (*sys_call_ptr_t)(void); 14typedef void (*sys_call_ptr_t)(void);
19 15
@@ -21,9 +17,9 @@ extern void sys_ni_syscall(void);
21 17
22const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { 18const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
23 /* 19 /*
24 *Smells like a like a compiler bug -- it doesn't work 20 * Smells like a compiler bug -- it doesn't work
25 *when the & below is removed. 21 * when the & below is removed.
26 */ 22 */
27 [0 ... __NR_syscall_max] = &sys_ni_syscall, 23 [0 ... __NR_syscall_max] = &sys_ni_syscall,
28#include <asm/unistd_64.h> 24#include <asm/syscalls_64.h>
29}; 25};
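
Both syscall table files now rely on the same two tricks: the generated asm/syscalls_*.h header is included twice under different __SYSCALL_* definitions (first to declare the handlers, then to emit [nr] = sym designated initializers), and a GCC range designator fills every slot with sys_ni_syscall by default. A self-contained sketch of the resulting table shape, with made-up syscall stubs standing in for the generated header:

#include <stdio.h>

typedef void (*sys_call_ptr_t)(void);

static void sys_ni_syscall(void) { puts("ENOSYS"); }
static void sys_read(void)       { puts("read");   }
static void sys_write(void)      { puts("write");  }

#define NR_SYSCALL_MAX 7

/* Same shape as arch/x86/kernel/syscall_*.c: fill every slot with the
 * "not implemented" stub via the GCC range designator, then let explicit
 * [nr] = sym entries override the slots that are actually wired up. */
static const sys_call_ptr_t sys_call_table[NR_SYSCALL_MAX + 1] = {
	[0 ... NR_SYSCALL_MAX] = &sys_ni_syscall,
	[3] = sys_read,
	[4] = sys_write,
};

int main(void)
{
	sys_call_table[3]();	/* read   */
	sys_call_table[6]();	/* ENOSYS */
	return 0;
}
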
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
deleted file mode 100644
index 9a0e31293920..000000000000
--- a/arch/x86/kernel/syscall_table_32.S
+++ /dev/null
@@ -1,350 +0,0 @@
1ENTRY(sys_call_table)
2 .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */
3 .long sys_exit
4 .long ptregs_fork
5 .long sys_read
6 .long sys_write
7 .long sys_open /* 5 */
8 .long sys_close
9 .long sys_waitpid
10 .long sys_creat
11 .long sys_link
12 .long sys_unlink /* 10 */
13 .long ptregs_execve
14 .long sys_chdir
15 .long sys_time
16 .long sys_mknod
17 .long sys_chmod /* 15 */
18 .long sys_lchown16
19 .long sys_ni_syscall /* old break syscall holder */
20 .long sys_stat
21 .long sys_lseek
22 .long sys_getpid /* 20 */
23 .long sys_mount
24 .long sys_oldumount
25 .long sys_setuid16
26 .long sys_getuid16
27 .long sys_stime /* 25 */
28 .long sys_ptrace
29 .long sys_alarm
30 .long sys_fstat
31 .long sys_pause
32 .long sys_utime /* 30 */
33 .long sys_ni_syscall /* old stty syscall holder */
34 .long sys_ni_syscall /* old gtty syscall holder */
35 .long sys_access
36 .long sys_nice
37 .long sys_ni_syscall /* 35 - old ftime syscall holder */
38 .long sys_sync
39 .long sys_kill
40 .long sys_rename
41 .long sys_mkdir
42 .long sys_rmdir /* 40 */
43 .long sys_dup
44 .long sys_pipe
45 .long sys_times
46 .long sys_ni_syscall /* old prof syscall holder */
47 .long sys_brk /* 45 */
48 .long sys_setgid16
49 .long sys_getgid16
50 .long sys_signal
51 .long sys_geteuid16
52 .long sys_getegid16 /* 50 */
53 .long sys_acct
54 .long sys_umount /* recycled never used phys() */
55 .long sys_ni_syscall /* old lock syscall holder */
56 .long sys_ioctl
57 .long sys_fcntl /* 55 */
58 .long sys_ni_syscall /* old mpx syscall holder */
59 .long sys_setpgid
60 .long sys_ni_syscall /* old ulimit syscall holder */
61 .long sys_olduname
62 .long sys_umask /* 60 */
63 .long sys_chroot
64 .long sys_ustat
65 .long sys_dup2
66 .long sys_getppid
67 .long sys_getpgrp /* 65 */
68 .long sys_setsid
69 .long sys_sigaction
70 .long sys_sgetmask
71 .long sys_ssetmask
72 .long sys_setreuid16 /* 70 */
73 .long sys_setregid16
74 .long sys_sigsuspend
75 .long sys_sigpending
76 .long sys_sethostname
77 .long sys_setrlimit /* 75 */
78 .long sys_old_getrlimit
79 .long sys_getrusage
80 .long sys_gettimeofday
81 .long sys_settimeofday
82 .long sys_getgroups16 /* 80 */
83 .long sys_setgroups16
84 .long sys_old_select
85 .long sys_symlink
86 .long sys_lstat
87 .long sys_readlink /* 85 */
88 .long sys_uselib
89 .long sys_swapon
90 .long sys_reboot
91 .long sys_old_readdir
92 .long sys_old_mmap /* 90 */
93 .long sys_munmap
94 .long sys_truncate
95 .long sys_ftruncate
96 .long sys_fchmod
97 .long sys_fchown16 /* 95 */
98 .long sys_getpriority
99 .long sys_setpriority
100 .long sys_ni_syscall /* old profil syscall holder */
101 .long sys_statfs
102 .long sys_fstatfs /* 100 */
103 .long sys_ioperm
104 .long sys_socketcall
105 .long sys_syslog
106 .long sys_setitimer
107 .long sys_getitimer /* 105 */
108 .long sys_newstat
109 .long sys_newlstat
110 .long sys_newfstat
111 .long sys_uname
112 .long ptregs_iopl /* 110 */
113 .long sys_vhangup
114 .long sys_ni_syscall /* old "idle" system call */
115 .long ptregs_vm86old
116 .long sys_wait4
117 .long sys_swapoff /* 115 */
118 .long sys_sysinfo
119 .long sys_ipc
120 .long sys_fsync
121 .long ptregs_sigreturn
122 .long ptregs_clone /* 120 */
123 .long sys_setdomainname
124 .long sys_newuname
125 .long sys_modify_ldt
126 .long sys_adjtimex
127 .long sys_mprotect /* 125 */
128 .long sys_sigprocmask
129 .long sys_ni_syscall /* old "create_module" */
130 .long sys_init_module
131 .long sys_delete_module
132 .long sys_ni_syscall /* 130: old "get_kernel_syms" */
133 .long sys_quotactl
134 .long sys_getpgid
135 .long sys_fchdir
136 .long sys_bdflush
137 .long sys_sysfs /* 135 */
138 .long sys_personality
139 .long sys_ni_syscall /* reserved for afs_syscall */
140 .long sys_setfsuid16
141 .long sys_setfsgid16
142 .long sys_llseek /* 140 */
143 .long sys_getdents
144 .long sys_select
145 .long sys_flock
146 .long sys_msync
147 .long sys_readv /* 145 */
148 .long sys_writev
149 .long sys_getsid
150 .long sys_fdatasync
151 .long sys_sysctl
152 .long sys_mlock /* 150 */
153 .long sys_munlock
154 .long sys_mlockall
155 .long sys_munlockall
156 .long sys_sched_setparam
157 .long sys_sched_getparam /* 155 */
158 .long sys_sched_setscheduler
159 .long sys_sched_getscheduler
160 .long sys_sched_yield
161 .long sys_sched_get_priority_max
162 .long sys_sched_get_priority_min /* 160 */
163 .long sys_sched_rr_get_interval
164 .long sys_nanosleep
165 .long sys_mremap
166 .long sys_setresuid16
167 .long sys_getresuid16 /* 165 */
168 .long ptregs_vm86
169 .long sys_ni_syscall /* Old sys_query_module */
170 .long sys_poll
171 .long sys_ni_syscall /* Old nfsservctl */
172 .long sys_setresgid16 /* 170 */
173 .long sys_getresgid16
174 .long sys_prctl
175 .long ptregs_rt_sigreturn
176 .long sys_rt_sigaction
177 .long sys_rt_sigprocmask /* 175 */
178 .long sys_rt_sigpending
179 .long sys_rt_sigtimedwait
180 .long sys_rt_sigqueueinfo
181 .long sys_rt_sigsuspend
182 .long sys_pread64 /* 180 */
183 .long sys_pwrite64
184 .long sys_chown16
185 .long sys_getcwd
186 .long sys_capget
187 .long sys_capset /* 185 */
188 .long ptregs_sigaltstack
189 .long sys_sendfile
190 .long sys_ni_syscall /* reserved for streams1 */
191 .long sys_ni_syscall /* reserved for streams2 */
192 .long ptregs_vfork /* 190 */
193 .long sys_getrlimit
194 .long sys_mmap_pgoff
195 .long sys_truncate64
196 .long sys_ftruncate64
197 .long sys_stat64 /* 195 */
198 .long sys_lstat64
199 .long sys_fstat64
200 .long sys_lchown
201 .long sys_getuid
202 .long sys_getgid /* 200 */
203 .long sys_geteuid
204 .long sys_getegid
205 .long sys_setreuid
206 .long sys_setregid
207 .long sys_getgroups /* 205 */
208 .long sys_setgroups
209 .long sys_fchown
210 .long sys_setresuid
211 .long sys_getresuid
212 .long sys_setresgid /* 210 */
213 .long sys_getresgid
214 .long sys_chown
215 .long sys_setuid
216 .long sys_setgid
217 .long sys_setfsuid /* 215 */
218 .long sys_setfsgid
219 .long sys_pivot_root
220 .long sys_mincore
221 .long sys_madvise
222 .long sys_getdents64 /* 220 */
223 .long sys_fcntl64
224 .long sys_ni_syscall /* reserved for TUX */
225 .long sys_ni_syscall
226 .long sys_gettid
227 .long sys_readahead /* 225 */
228 .long sys_setxattr
229 .long sys_lsetxattr
230 .long sys_fsetxattr
231 .long sys_getxattr
232 .long sys_lgetxattr /* 230 */
233 .long sys_fgetxattr
234 .long sys_listxattr
235 .long sys_llistxattr
236 .long sys_flistxattr
237 .long sys_removexattr /* 235 */
238 .long sys_lremovexattr
239 .long sys_fremovexattr
240 .long sys_tkill
241 .long sys_sendfile64
242 .long sys_futex /* 240 */
243 .long sys_sched_setaffinity
244 .long sys_sched_getaffinity
245 .long sys_set_thread_area
246 .long sys_get_thread_area
247 .long sys_io_setup /* 245 */
248 .long sys_io_destroy
249 .long sys_io_getevents
250 .long sys_io_submit
251 .long sys_io_cancel
252 .long sys_fadvise64 /* 250 */
253 .long sys_ni_syscall
254 .long sys_exit_group
255 .long sys_lookup_dcookie
256 .long sys_epoll_create
257 .long sys_epoll_ctl /* 255 */
258 .long sys_epoll_wait
259 .long sys_remap_file_pages
260 .long sys_set_tid_address
261 .long sys_timer_create
262 .long sys_timer_settime /* 260 */
263 .long sys_timer_gettime
264 .long sys_timer_getoverrun
265 .long sys_timer_delete
266 .long sys_clock_settime
267 .long sys_clock_gettime /* 265 */
268 .long sys_clock_getres
269 .long sys_clock_nanosleep
270 .long sys_statfs64
271 .long sys_fstatfs64
272 .long sys_tgkill /* 270 */
273 .long sys_utimes
274 .long sys_fadvise64_64
275 .long sys_ni_syscall /* sys_vserver */
276 .long sys_mbind
277 .long sys_get_mempolicy
278 .long sys_set_mempolicy
279 .long sys_mq_open
280 .long sys_mq_unlink
281 .long sys_mq_timedsend
282 .long sys_mq_timedreceive /* 280 */
283 .long sys_mq_notify
284 .long sys_mq_getsetattr
285 .long sys_kexec_load
286 .long sys_waitid
287 .long sys_ni_syscall /* 285 */ /* available */
288 .long sys_add_key
289 .long sys_request_key
290 .long sys_keyctl
291 .long sys_ioprio_set
292 .long sys_ioprio_get /* 290 */
293 .long sys_inotify_init
294 .long sys_inotify_add_watch
295 .long sys_inotify_rm_watch
296 .long sys_migrate_pages
297 .long sys_openat /* 295 */
298 .long sys_mkdirat
299 .long sys_mknodat
300 .long sys_fchownat
301 .long sys_futimesat
302 .long sys_fstatat64 /* 300 */
303 .long sys_unlinkat
304 .long sys_renameat
305 .long sys_linkat
306 .long sys_symlinkat
307 .long sys_readlinkat /* 305 */
308 .long sys_fchmodat
309 .long sys_faccessat
310 .long sys_pselect6
311 .long sys_ppoll
312 .long sys_unshare /* 310 */
313 .long sys_set_robust_list
314 .long sys_get_robust_list
315 .long sys_splice
316 .long sys_sync_file_range
317 .long sys_tee /* 315 */
318 .long sys_vmsplice
319 .long sys_move_pages
320 .long sys_getcpu
321 .long sys_epoll_pwait
322 .long sys_utimensat /* 320 */
323 .long sys_signalfd
324 .long sys_timerfd_create
325 .long sys_eventfd
326 .long sys_fallocate
327 .long sys_timerfd_settime /* 325 */
328 .long sys_timerfd_gettime
329 .long sys_signalfd4
330 .long sys_eventfd2
331 .long sys_epoll_create1
332 .long sys_dup3 /* 330 */
333 .long sys_pipe2
334 .long sys_inotify_init1
335 .long sys_preadv
336 .long sys_pwritev
337 .long sys_rt_tgsigqueueinfo /* 335 */
338 .long sys_perf_event_open
339 .long sys_recvmmsg
340 .long sys_fanotify_init
341 .long sys_fanotify_mark
342 .long sys_prlimit64 /* 340 */
343 .long sys_name_to_handle_at
344 .long sys_open_by_handle_at
345 .long sys_clock_adjtime
346 .long sys_syncfs
347 .long sys_sendmmsg /* 345 */
348 .long sys_setns
349 .long sys_process_vm_readv
350 .long sys_process_vm_writev
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index a91ae7709b49..a73b61055ad6 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -14,11 +14,11 @@ void __init setup_trampolines(void)
14 14
15 /* Has to be in very low memory so we can execute real-mode AP code. */ 15 /* Has to be in very low memory so we can execute real-mode AP code. */
16 mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); 16 mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
17 if (mem == MEMBLOCK_ERROR) 17 if (!mem)
18 panic("Cannot allocate trampoline\n"); 18 panic("Cannot allocate trampoline\n");
19 19
20 x86_trampoline_base = __va(mem); 20 x86_trampoline_base = __va(mem);
21 memblock_x86_reserve_range(mem, mem + size, "TRAMPOLINE"); 21 memblock_reserve(mem, size);
22 22
23 printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", 23 printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
24 x86_trampoline_base, (unsigned long long)mem, size); 24 x86_trampoline_base, (unsigned long long)mem, size);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a8e3eb83466c..4bbe04d96744 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -306,19 +306,20 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
306 == NOTIFY_STOP) 306 == NOTIFY_STOP)
307 return; 307 return;
308#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ 308#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
309#ifdef CONFIG_KPROBES 309
310 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) 310 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
311 == NOTIFY_STOP) 311 == NOTIFY_STOP)
312 return; 312 return;
313#else
314 if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP)
315 == NOTIFY_STOP)
316 return;
317#endif
318 313
314 /*
315 * Let others (NMI) know that the debug stack is in use
316 * as we may switch to the interrupt stack.
317 */
318 debug_stack_usage_inc();
319 preempt_conditional_sti(regs); 319 preempt_conditional_sti(regs);
320 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); 320 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
321 preempt_conditional_cli(regs); 321 preempt_conditional_cli(regs);
322 debug_stack_usage_dec();
322} 323}
323 324
324#ifdef CONFIG_X86_64 325#ifdef CONFIG_X86_64
@@ -411,6 +412,12 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
411 SIGTRAP) == NOTIFY_STOP) 412 SIGTRAP) == NOTIFY_STOP)
412 return; 413 return;
413 414
415 /*
416 * Let others (NMI) know that the debug stack is in use
417 * as we may switch to the interrupt stack.
418 */
419 debug_stack_usage_inc();
420
414 /* It's safe to allow irq's after DR6 has been saved */ 421 /* It's safe to allow irq's after DR6 has been saved */
415 preempt_conditional_sti(regs); 422 preempt_conditional_sti(regs);
416 423
@@ -418,6 +425,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
418 handle_vm86_trap((struct kernel_vm86_regs *) regs, 425 handle_vm86_trap((struct kernel_vm86_regs *) regs,
419 error_code, 1); 426 error_code, 1);
420 preempt_conditional_cli(regs); 427 preempt_conditional_cli(regs);
428 debug_stack_usage_dec();
421 return; 429 return;
422 } 430 }
423 431
@@ -437,6 +445,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
437 if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) 445 if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
438 send_sigtrap(tsk, regs, error_code, si_code); 446 send_sigtrap(tsk, regs, error_code, si_code);
439 preempt_conditional_cli(regs); 447 preempt_conditional_cli(regs);
448 debug_stack_usage_dec();
440 449
441 return; 450 return;
442} 451}
@@ -562,41 +571,18 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
562} 571}
563 572
564/* 573/*
565 * __math_state_restore assumes that cr0.TS is already clear and the
566 * fpu state is all ready for use. Used during context switch.
567 */
568void __math_state_restore(void)
569{
570 struct thread_info *thread = current_thread_info();
571 struct task_struct *tsk = thread->task;
572
573 /*
574 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
575 */
576 if (unlikely(restore_fpu_checking(tsk))) {
577 stts();
578 force_sig(SIGSEGV, tsk);
579 return;
580 }
581
582 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
583 tsk->fpu_counter++;
584}
585
586/*
587 * 'math_state_restore()' saves the current math information in the 574 * 'math_state_restore()' saves the current math information in the
588 * old math state array, and gets the new ones from the current task 575 * old math state array, and gets the new ones from the current task
589 * 576 *
590 * Careful.. There are problems with IBM-designed IRQ13 behaviour. 577 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
591 * Don't touch unless you *really* know how it works. 578 * Don't touch unless you *really* know how it works.
592 * 579 *
 593 * Must be called with kernel preemption disabled (in this case, 580 * Must be called with kernel preemption disabled (eg with local
 594 * local interrupts are disabled at the call-site in entry.S). 581 * interrupts disabled, as in the case of do_device_not_available).
595 */ 582 */
596asmlinkage void math_state_restore(void) 583void math_state_restore(void)
597{ 584{
598 struct thread_info *thread = current_thread_info(); 585 struct task_struct *tsk = current;
599 struct task_struct *tsk = thread->task;
600 586
601 if (!tsk_used_math(tsk)) { 587 if (!tsk_used_math(tsk)) {
602 local_irq_enable(); 588 local_irq_enable();
@@ -613,9 +599,17 @@ asmlinkage void math_state_restore(void)
613 local_irq_disable(); 599 local_irq_disable();
614 } 600 }
615 601
616 clts(); /* Allow maths ops (or we recurse) */ 602 __thread_fpu_begin(tsk);
603 /*
604 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
605 */
606 if (unlikely(restore_fpu_checking(tsk))) {
607 __thread_fpu_end(tsk);
608 force_sig(SIGSEGV, tsk);
609 return;
610 }
617 611
618 __math_state_restore(); 612 tsk->fpu_counter++;
619} 613}
620EXPORT_SYMBOL_GPL(math_state_restore); 614EXPORT_SYMBOL_GPL(math_state_restore);
621 615
@@ -723,4 +717,10 @@ void __init trap_init(void)
723 cpu_init(); 717 cpu_init();
724 718
725 x86_init.irqs.trap_init(); 719 x86_init.irqs.trap_init();
720
721#ifdef CONFIG_X86_64
722 memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16);
723 set_nmi_gate(1, &debug);
724 set_nmi_gate(3, &int3);
725#endif
726} 726}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index db483369f10b..a62c201c97ec 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -35,7 +35,7 @@ static int __read_mostly tsc_unstable;
35 erroneous rdtsc usage on !cpu_has_tsc processors */ 35 erroneous rdtsc usage on !cpu_has_tsc processors */
36static int __read_mostly tsc_disabled = -1; 36static int __read_mostly tsc_disabled = -1;
37 37
38static int tsc_clocksource_reliable; 38int tsc_clocksource_reliable;
39/* 39/*
40 * Scheduler clock - returns current time in nanosec units. 40 * Scheduler clock - returns current time in nanosec units.
41 */ 41 */
@@ -178,11 +178,11 @@ static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
178} 178}
179 179
180#define CAL_MS 10 180#define CAL_MS 10
181#define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS)) 181#define CAL_LATCH (PIT_TICK_RATE / (1000 / CAL_MS))
182#define CAL_PIT_LOOPS 1000 182#define CAL_PIT_LOOPS 1000
183 183
184#define CAL2_MS 50 184#define CAL2_MS 50
185#define CAL2_LATCH (CLOCK_TICK_RATE / (1000 / CAL2_MS)) 185#define CAL2_LATCH (PIT_TICK_RATE / (1000 / CAL2_MS))
186#define CAL2_PIT_LOOPS 5000 186#define CAL2_PIT_LOOPS 5000
187 187
188 188
@@ -290,14 +290,15 @@ static inline int pit_verify_msb(unsigned char val)
290static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap) 290static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
291{ 291{
292 int count; 292 int count;
293 u64 tsc = 0; 293 u64 tsc = 0, prev_tsc = 0;
294 294
295 for (count = 0; count < 50000; count++) { 295 for (count = 0; count < 50000; count++) {
296 if (!pit_verify_msb(val)) 296 if (!pit_verify_msb(val))
297 break; 297 break;
298 prev_tsc = tsc;
298 tsc = get_cycles(); 299 tsc = get_cycles();
299 } 300 }
300 *deltap = get_cycles() - tsc; 301 *deltap = get_cycles() - prev_tsc;
301 *tscp = tsc; 302 *tscp = tsc;
302 303
303 /* 304 /*
@@ -311,9 +312,9 @@ static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *de
311 * How many MSB values do we want to see? We aim for 312 * How many MSB values do we want to see? We aim for
312 * a maximum error rate of 500ppm (in practice the 313 * a maximum error rate of 500ppm (in practice the
313 * real error is much smaller), but refuse to spend 314 * real error is much smaller), but refuse to spend
314 * more than 25ms on it. 315 * more than 50ms on it.
315 */ 316 */
316#define MAX_QUICK_PIT_MS 25 317#define MAX_QUICK_PIT_MS 50
317#define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256) 318#define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
318 319
319static unsigned long quick_pit_calibrate(void) 320static unsigned long quick_pit_calibrate(void)
@@ -383,15 +384,12 @@ success:
383 * 384 *
384 * As a result, we can depend on there not being 385 * As a result, we can depend on there not being
385 * any odd delays anywhere, and the TSC reads are 386 * any odd delays anywhere, and the TSC reads are
386 * reliable (within the error). We also adjust the 387 * reliable (within the error).
387 * delta to the middle of the error bars, just
388 * because it looks nicer.
389 * 388 *
390 * kHz = ticks / time-in-seconds / 1000; 389 * kHz = ticks / time-in-seconds / 1000;
391 * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000 390 * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000
392 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000) 391 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000)
393 */ 392 */
394 delta += (long)(d2 - d1)/2;
395 delta *= PIT_TICK_RATE; 393 delta *= PIT_TICK_RATE;
396 do_div(delta, i*256*1000); 394 do_div(delta, i*256*1000);
397 printk("Fast TSC calibration using PIT\n"); 395 printk("Fast TSC calibration using PIT\n");
@@ -995,3 +993,23 @@ void __init tsc_init(void)
995 check_system_tsc_reliable(); 993 check_system_tsc_reliable();
996} 994}
997 995
996#ifdef CONFIG_SMP
997/*
998 * If we have a constant TSC and are using the TSC for the delay loop,
999 * we can skip clock calibration if another cpu in the same socket has already
1000 * been calibrated. This assumes that CONSTANT_TSC applies to all
1001 * cpus in the socket - this should be a safe assumption.
1002 */
1003unsigned long __cpuinit calibrate_delay_is_known(void)
1004{
1005 int i, cpu = smp_processor_id();
1006
1007 if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC))
1008 return 0;
1009
1010 for_each_online_cpu(i)
1011 if (cpu_data(i).phys_proc_id == cpu_data(cpu).phys_proc_id)
1012 return cpu_data(i).loops_per_jiffy;
1013 return 0;
1014}
1015#endif
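
The quick_pit_calibrate() hunks above drop the error-bar centering adjustment but keep the core scaling spelled out in the comment: kHz = ((t2 - t1) * PIT_TICK_RATE) / (iterations * 256 * 1000). A standalone version of that arithmetic with illustrative numbers (not real measurements):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define PIT_TICK_RATE 1193182ULL	/* i8253 input clock in Hz */

/* kHz = ((t2 - t1) * PIT_TICK_RATE) / (iterations * 256 * 1000):
 * TSC cycles elapsed, divided by the wall time represented by
 * 'iterations' blocks of 256 PIT ticks, converted to kHz. */
static uint64_t tsc_khz_from_pit(uint64_t tsc_delta, uint64_t iterations)
{
	return (tsc_delta * PIT_TICK_RATE) / (iterations * 256 * 1000);
}

int main(void)
{
	/* ~50 ms worth of PIT blocks on a hypothetical 2.4 GHz TSC:
	 * 233 * 256 / 1193182 is roughly 0.05 s. */
	uint64_t iterations = 233;
	uint64_t tsc_delta  = 120000000ULL;	/* cycles observed over that window */

	printf("TSC ~ %" PRIu64 " kHz\n", tsc_khz_from_pit(tsc_delta, iterations));
	return 0;
}
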
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 0aa5fed8b9e6..9eba29b46cb7 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -113,7 +113,7 @@ void __cpuinit check_tsc_sync_source(int cpu)
113 if (unsynchronized_tsc()) 113 if (unsynchronized_tsc())
114 return; 114 return;
115 115
116 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { 116 if (tsc_clocksource_reliable) {
117 if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING) 117 if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING)
118 pr_info( 118 pr_info(
119 "Skipped synchronization checks as TSC is reliable.\n"); 119 "Skipped synchronization checks as TSC is reliable.\n");
@@ -172,7 +172,7 @@ void __cpuinit check_tsc_sync_target(void)
172{ 172{
173 int cpus = 2; 173 int cpus = 2;
174 174
175 if (unsynchronized_tsc() || boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) 175 if (unsynchronized_tsc() || tsc_clocksource_reliable)
176 return; 176 return;
177 177
178 /* 178 /*
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 863f8753ab0a..b466cab5ba15 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -335,9 +335,11 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
335 if (info->flags & VM86_SCREEN_BITMAP) 335 if (info->flags & VM86_SCREEN_BITMAP)
336 mark_screen_rdonly(tsk->mm); 336 mark_screen_rdonly(tsk->mm);
337 337
338 /*call audit_syscall_exit since we do not exit via the normal paths */ 338 /*call __audit_syscall_exit since we do not exit via the normal paths */
339#ifdef CONFIG_AUDITSYSCALL
339 if (unlikely(current->audit_context)) 340 if (unlikely(current->audit_context))
340 audit_syscall_exit(AUDITSC_RESULT(0), 0); 341 __audit_syscall_exit(1, 0);
342#endif
341 343
342 __asm__ __volatile__( 344 __asm__ __volatile__(
343 "movl %0,%%esp\n\t" 345 "movl %0,%%esp\n\t"
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index e4d4a22e8b94..b07ba9393564 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -57,7 +57,7 @@ DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
57 .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), 57 .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
58}; 58};
59 59
60static enum { EMULATE, NATIVE, NONE } vsyscall_mode = NATIVE; 60static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
61 61
62static int __init vsyscall_setup(char *str) 62static int __init vsyscall_setup(char *str)
63{ 63{
@@ -140,11 +140,40 @@ static int addr_to_vsyscall_nr(unsigned long addr)
140 return nr; 140 return nr;
141} 141}
142 142
143static bool write_ok_or_segv(unsigned long ptr, size_t size)
144{
145 /*
146 * XXX: if access_ok, get_user, and put_user handled
147 * sig_on_uaccess_error, this could go away.
148 */
149
150 if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) {
151 siginfo_t info;
152 struct thread_struct *thread = &current->thread;
153
154 thread->error_code = 6; /* user fault, no page, write */
155 thread->cr2 = ptr;
156 thread->trap_no = 14;
157
158 memset(&info, 0, sizeof(info));
159 info.si_signo = SIGSEGV;
160 info.si_errno = 0;
161 info.si_code = SEGV_MAPERR;
162 info.si_addr = (void __user *)ptr;
163
164 force_sig_info(SIGSEGV, &info, current);
165 return false;
166 } else {
167 return true;
168 }
169}
170
143bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) 171bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
144{ 172{
145 struct task_struct *tsk; 173 struct task_struct *tsk;
146 unsigned long caller; 174 unsigned long caller;
147 int vsyscall_nr; 175 int vsyscall_nr;
176 int prev_sig_on_uaccess_error;
148 long ret; 177 long ret;
149 178
150 /* 179 /*
@@ -180,35 +209,65 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
180 if (seccomp_mode(&tsk->seccomp)) 209 if (seccomp_mode(&tsk->seccomp))
181 do_exit(SIGKILL); 210 do_exit(SIGKILL);
182 211
212 /*
213 * With a real vsyscall, page faults cause SIGSEGV. We want to
214 * preserve that behavior to make writing exploits harder.
215 */
216 prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
217 current_thread_info()->sig_on_uaccess_error = 1;
218
219 /*
220 * 0 is a valid user pointer (in the access_ok sense) on 32-bit and
221 * 64-bit, so we don't need to special-case it here. For all the
222 * vsyscalls, 0 means "don't write anything" not "write it at
223 * address 0".
224 */
225 ret = -EFAULT;
183 switch (vsyscall_nr) { 226 switch (vsyscall_nr) {
184 case 0: 227 case 0:
228 if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
229 !write_ok_or_segv(regs->si, sizeof(struct timezone)))
230 break;
231
185 ret = sys_gettimeofday( 232 ret = sys_gettimeofday(
186 (struct timeval __user *)regs->di, 233 (struct timeval __user *)regs->di,
187 (struct timezone __user *)regs->si); 234 (struct timezone __user *)regs->si);
188 break; 235 break;
189 236
190 case 1: 237 case 1:
238 if (!write_ok_or_segv(regs->di, sizeof(time_t)))
239 break;
240
191 ret = sys_time((time_t __user *)regs->di); 241 ret = sys_time((time_t __user *)regs->di);
192 break; 242 break;
193 243
194 case 2: 244 case 2:
245 if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
246 !write_ok_or_segv(regs->si, sizeof(unsigned)))
247 break;
248
195 ret = sys_getcpu((unsigned __user *)regs->di, 249 ret = sys_getcpu((unsigned __user *)regs->di,
196 (unsigned __user *)regs->si, 250 (unsigned __user *)regs->si,
197 0); 251 0);
198 break; 252 break;
199 } 253 }
200 254
255 current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
256
201 if (ret == -EFAULT) { 257 if (ret == -EFAULT) {
202 /* 258 /* Bad news -- userspace fed a bad pointer to a vsyscall. */
203 * Bad news -- userspace fed a bad pointer to a vsyscall.
204 *
205 * With a real vsyscall, that would have caused SIGSEGV.
206 * To make writing reliable exploits using the emulated
207 * vsyscalls harder, generate SIGSEGV here as well.
208 */
209 warn_bad_vsyscall(KERN_INFO, regs, 259 warn_bad_vsyscall(KERN_INFO, regs,
210 "vsyscall fault (exploit attempt?)"); 260 "vsyscall fault (exploit attempt?)");
211 goto sigsegv; 261
262 /*
263 * If we failed to generate a signal for any reason,
264 * generate one here. (This should be impossible.)
265 */
266 if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
267 !sigismember(&tsk->pending.signal, SIGSEGV)))
268 goto sigsegv;
269
270 return true; /* Don't emulate the ret. */
212 } 271 }
213 272
214 regs->ax = ret; 273 regs->ax = ret;
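
emulate_vsyscall() now checks every user pointer with write_ok_or_segv() before running the real syscall, and sets sig_on_uaccess_error so a fault inside the emulated call also raises SIGSEGV, matching what a fault through the real vsyscall page would do. A heavily simplified user-space sketch of that validate-before-emulate shape; the range check merely stands in for access_ok(), and none of these names are kernel APIs:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FAKE_USER_TOP 0x7fffffffffffULL	/* illustrative user-space limit */

/* Stand-in for write_ok_or_segv(): reject ranges that wrap or end beyond
 * the (pretend) user address space. In the kernel, the failure path sends
 * SIGSEGV instead of merely returning an error. */
static bool write_ok(uint64_t ptr, uint64_t size)
{
	return ptr + size >= ptr && ptr + size <= FAKE_USER_TOP;
}

static long emulate_time_vsyscall(uint64_t user_ptr)
{
	if (!write_ok(user_ptr, sizeof(uint64_t)))
		return -1;	/* kernel: raise SIGSEGV, skip the syscall */
	/* ... the real code would call sys_time() with this pointer ... */
	return 0;
}

int main(void)
{
	printf("good pointer: %ld\n", emulate_time_vsyscall(0x1000));
	printf("bad pointer:  %ld\n", emulate_time_vsyscall(0xffffffffffffff00ULL));
	return 0;
}
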
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index c1d6cd549397..947a06ccc673 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -92,6 +92,7 @@ struct x86_init_ops x86_init __initdata = {
92 92
93struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { 93struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
94 .setup_percpu_clockev = setup_secondary_APIC_clock, 94 .setup_percpu_clockev = setup_secondary_APIC_clock,
95 .fixup_cpu_id = x86_default_fixup_cpu_id,
95}; 96};
96 97
97static void default_nmi_init(void) { }; 98static void default_nmi_init(void) { };
@@ -114,4 +115,5 @@ struct x86_msi_ops x86_msi = {
114 .setup_msi_irqs = native_setup_msi_irqs, 115 .setup_msi_irqs = native_setup_msi_irqs,
115 .teardown_msi_irq = native_teardown_msi_irq, 116 .teardown_msi_irq = native_teardown_msi_irq,
116 .teardown_msi_irqs = default_teardown_msi_irqs, 117 .teardown_msi_irqs = default_teardown_msi_irqs,
118 .restore_msi_irqs = default_restore_msi_irqs,
117}; 119};
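
The x86_init.c additions extend the usual x86 ops-structure pattern: a table of function pointers pre-filled with safe defaults (here x86_default_fixup_cpu_id and default_restore_msi_irqs) that platform code may overwrite before the hooks run. A small illustration of that pattern with invented hook signatures, not the real x86_cpuinit_ops layout:

#include <stdio.h>

/* Illustrative ops table in the style of x86_init/x86_cpuinit. */
struct cpuinit_ops {
	void (*setup_percpu_clockev)(void);
	int  (*fixup_cpu_id)(int cpu, int node);
};

static void default_clockev(void)            { puts("default clockevent setup"); }
static int  default_fixup(int cpu, int node) { (void)node; return cpu; }

static struct cpuinit_ops cpuinit = {
	.setup_percpu_clockev = default_clockev,
	.fixup_cpu_id         = default_fixup,
};

/* A platform driver can override one hook and inherit the rest. */
static int numachip_fixup(int cpu, int node) { return cpu + node * 16; }

int main(void)
{
	cpuinit.fixup_cpu_id = numachip_fixup;	/* platform override */
	cpuinit.setup_percpu_clockev();		/* still the default  */
	printf("fixed-up id: %d\n", cpuinit.fixup_cpu_id(3, 1));
	return 0;
}
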
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index a3911343976b..711091114119 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -47,7 +47,7 @@ void __sanitize_i387_state(struct task_struct *tsk)
47 if (!fx) 47 if (!fx)
48 return; 48 return;
49 49
50 BUG_ON(task_thread_info(tsk)->status & TS_USEDFPU); 50 BUG_ON(__thread_has_fpu(tsk));
51 51
52 xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; 52 xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv;
53 53
@@ -168,7 +168,7 @@ int save_i387_xstate(void __user *buf)
168 if (!used_math()) 168 if (!used_math())
169 return 0; 169 return 0;
170 170
171 if (task_thread_info(tsk)->status & TS_USEDFPU) { 171 if (user_has_fpu()) {
172 if (use_xsave()) 172 if (use_xsave())
173 err = xsave_user(buf); 173 err = xsave_user(buf);
174 else 174 else
@@ -176,8 +176,7 @@ int save_i387_xstate(void __user *buf)
176 176
177 if (err) 177 if (err)
178 return err; 178 return err;
179 task_thread_info(tsk)->status &= ~TS_USEDFPU; 179 user_fpu_end();
180 stts();
181 } else { 180 } else {
182 sanitize_i387_state(tsk); 181 sanitize_i387_state(tsk);
183 if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave, 182 if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave,
@@ -292,10 +291,7 @@ int restore_i387_xstate(void __user *buf)
292 return err; 291 return err;
293 } 292 }
294 293
295 if (!(task_thread_info(current)->status & TS_USEDFPU)) { 294 user_fpu_begin();
296 clts();
297 task_thread_info(current)->status |= TS_USEDFPU;
298 }
299 if (use_xsave()) 295 if (use_xsave())
300 err = restore_user_xstate(buf); 296 err = restore_user_xstate(buf);
301 else 297 else