author    H. Peter Anvin <hpa@zytor.com>  2010-04-29 19:53:17 -0400
committer H. Peter Anvin <hpa@zytor.com>  2010-04-29 19:53:17 -0400
commit    d9c5841e22231e4e49fd0a1004164e6fce59b7a6 (patch)
tree      e1f589c46b3ff79bbe7b1b2469f6362f94576da6 /arch/x86/kernel
parent    b701a47ba48b698976fb2fe05fb285b0edc1d26a (diff)
parent    5967ed87ade85a421ef814296c3c7f182b08c225 (diff)
Merge branch 'x86/asm' into x86/atomic
Merge reason: Conflict between LOCK_PREFIX_HERE and relative alternatives pointers

Resolved Conflicts:
        arch/x86/include/asm/alternative.h
        arch/x86/kernel/alternative.c

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
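For context on the conflict: the x86/asm side of this merge converts the .smp_locks section from absolute u8 * pointers to 32-bit PC-relative offsets, and the alternative.c hunks below carry the new decoding. A minimal standalone sketch of how such a relative entry resolves back to the byte it names; this is an illustration only, with a made-up array standing in for the real section:

#include <stdio.h>
#include <stdint.h>

/* Illustration: a .smp_locks-style table stores, in each s32 slot, the
 * distance from that slot to a lock-prefix byte in .text. The array and
 * values here are invented; only the resolution rule matches the patch. */
static uint8_t text[16] = { [5] = 0xf0 };       /* pretend lock prefix */
static int32_t smp_locks[1];

int main(void)
{
        /* Record the offset from the table slot to the target byte. */
        smp_locks[0] = (int32_t)((intptr_t)&text[5] - (intptr_t)&smp_locks[0]);

        /* Resolve it the same way alternatives_smp_lock() does below. */
        uint8_t *ptr = (uint8_t *)&smp_locks[0] + smp_locks[0];
        printf("resolved byte: 0x%02x\n", *ptr);        /* prints 0xf0 */
        return 0;
}

On x86-64 this halves the size of the table versus storing full pointers, which is the point of the conversion.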
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile | 1
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 170
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 2
-rw-r--r--  arch/x86/kernel/alternative.c | 122
-rw-r--r--  arch/x86/kernel/amd_iommu.c | 45
-rw-r--r--  arch/x86/kernel/amd_iommu_init.c | 53
-rw-r--r--  arch/x86/kernel/apb_timer.c | 785
-rw-r--r--  arch/x86/kernel/aperture_64.c | 15
-rw-r--r--  arch/x86/kernel/apic/apic.c | 30
-rw-r--r--  arch/x86/kernel/apic/apic_flat_64.c | 7
-rw-r--r--  arch/x86/kernel/apic/es7000_32.c | 1
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 358
-rw-r--r--  arch/x86/kernel/apic/nmi.c | 15
-rw-r--r--  arch/x86/kernel/apic/numaq_32.c | 3
-rw-r--r--  arch/x86/kernel/apic/probe_32.c | 29
-rw-r--r--  arch/x86/kernel/apic/probe_64.c | 13
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 132
-rw-r--r--  arch/x86/kernel/apm_32.c | 4
-rw-r--r--  arch/x86/kernel/bios_uv.c | 39
-rw-r--r--  arch/x86/kernel/bootflag.c | 1
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 2
-rw-r--r--  arch/x86/kernel/cpu/addon_cpuid_features.c | 4
-rw-r--r--  arch/x86/kernel/cpu/cpu_debug.c | 688
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/Kconfig | 14
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/Makefile | 1
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 1
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/elanfreq.c | 1
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/gx-suspmod.c | 1
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longrun.c | 1
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 1
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 621
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k6.c | 1
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 9
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 1
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 1
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-lib.c | 1
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-smi.c | 1
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 3
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 252
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-inject.c | 1
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 21
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c | 3
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c | 5
-rw-r--r--  arch/x86/kernel/cpu/mtrr/Makefile | 2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/amd.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/centaur.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cleanup.c | 208
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cyrix.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 11
-rw-r--r--  arch/x86/kernel/cpu/mtrr/if.c | 1
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c | 7
-rw-r--r--  arch/x86/kernel/cpu/mtrr/mtrr.h | 6
-rw-r--r--  arch/x86/kernel/cpu/mtrr/state.c | 94
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 1968
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd.c | 422
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 980
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p6.c | 159
-rw-r--r--  arch/x86/kernel/cpu/perfctr-watchdog.c | 13
-rw-r--r--  arch/x86/kernel/cpu/vmware.c | 2
-rw-r--r--  arch/x86/kernel/cpuid.c | 3
-rw-r--r--  arch/x86/kernel/crash.c | 6
-rw-r--r--  arch/x86/kernel/crash_dump_32.c | 1
-rw-r--r--  arch/x86/kernel/dumpstack.c | 14
-rw-r--r--  arch/x86/kernel/dumpstack.h | 24
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 5
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 20
-rw-r--r--  arch/x86/kernel/e820.c | 379
-rw-r--r--  arch/x86/kernel/efi.c | 2
-rw-r--r--  arch/x86/kernel/ftrace.c | 36
-rw-r--r--  arch/x86/kernel/head32.c | 14
-rw-r--r--  arch/x86/kernel/head64.c | 3
-rw-r--r--  arch/x86/kernel/head_32.S | 6
-rw-r--r--  arch/x86/kernel/head_64.S | 2
-rw-r--r--  arch/x86/kernel/hpet.c | 20
-rw-r--r--  arch/x86/kernel/hw_breakpoint.c | 38
-rw-r--r--  arch/x86/kernel/i387.c | 72
-rw-r--r--  arch/x86/kernel/i8259.c | 95
-rw-r--r--  arch/x86/kernel/irqinit.c | 59
-rw-r--r--  arch/x86/kernel/k8.c | 16
-rw-r--r--  arch/x86/kernel/kdebugfs.c | 1
-rw-r--r--  arch/x86/kernel/kgdb.c | 222
-rw-r--r--  arch/x86/kernel/kprobes.c | 614
-rw-r--r--  arch/x86/kernel/ldt.c | 1
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c | 1
-rw-r--r--  arch/x86/kernel/mca_32.c | 1
-rw-r--r--  arch/x86/kernel/microcode_amd.c | 44
-rw-r--r--  arch/x86/kernel/microcode_core.c | 6
-rw-r--r--  arch/x86/kernel/microcode_intel.c | 2
-rw-r--r--  arch/x86/kernel/mmconf-fam10h_64.c | 7
-rw-r--r--  arch/x86/kernel/module.c | 1
-rw-r--r--  arch/x86/kernel/mpparse.c | 11
-rw-r--r--  arch/x86/kernel/mrst.c | 216
-rw-r--r--  arch/x86/kernel/msr.c | 3
-rw-r--r--  arch/x86/kernel/olpc.c | 10
-rw-r--r--  arch/x86/kernel/paravirt.c | 4
-rw-r--r--  arch/x86/kernel/pci-calgary_64.c | 2
-rw-r--r--  arch/x86/kernel/pci-dma.c | 16
-rw-r--r--  arch/x86/kernel/pci-gart_64.c | 6
-rw-r--r--  arch/x86/kernel/pci-nommu.c | 1
-rw-r--r--  arch/x86/kernel/process.c | 52
-rw-r--r--  arch/x86/kernel/process_32.c | 14
-rw-r--r--  arch/x86/kernel/process_64.c | 36
-rw-r--r--  arch/x86/kernel/ptrace.c | 68
-rw-r--r--  arch/x86/kernel/quirks.c | 13
-rw-r--r--  arch/x86/kernel/reboot.c | 17
-rw-r--r--  arch/x86/kernel/setup.c | 70
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 6
-rw-r--r--  arch/x86/kernel/smp.c | 1
-rw-r--r--  arch/x86/kernel/smpboot.c | 28
-rw-r--r--  arch/x86/kernel/sys_i386_32.c | 185
-rw-r--r--  arch/x86/kernel/sys_x86_64.c | 12
-rw-r--r--  arch/x86/kernel/syscall_table_32.S | 4
-rw-r--r--  arch/x86/kernel/time.c | 4
-rw-r--r--  arch/x86/kernel/tlb_uv.c | 1
-rw-r--r--  arch/x86/kernel/traps.c | 3
-rw-r--r--  arch/x86/kernel/tsc.c | 6
-rw-r--r--  arch/x86/kernel/uv_irq.c | 1
-rw-r--r--  arch/x86/kernel/uv_sysfs.c | 6
-rw-r--r--  arch/x86/kernel/uv_time.c | 14
-rw-r--r--  arch/x86/kernel/visws_quirks.c | 27
-rw-r--r--  arch/x86/kernel/vmi_32.c | 36
-rw-r--r--  arch/x86/kernel/vmiclock_32.c | 8
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 6
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 3
-rw-r--r--  arch/x86/kernel/x86_init.c | 11
-rw-r--r--  arch/x86/kernel/xsave.c | 1
126 files changed, 6079 insertions, 3876 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d87f09bc5a52..4c58352209e0 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -87,6 +87,7 @@ obj-$(CONFIG_VM86) += vm86_32.o
 obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
 
 obj-$(CONFIG_HPET_TIMER) += hpet.o
+obj-$(CONFIG_APB_TIMER) += apb_timer.o
 
 obj-$(CONFIG_K8_NB) += k8.o
 obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index fb1035cd9a6a..cd40aba6aa95 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -31,10 +31,12 @@
 #include <linux/module.h>
 #include <linux/dmi.h>
 #include <linux/irq.h>
+#include <linux/slab.h>
 #include <linux/bootmem.h>
 #include <linux/ioport.h>
 #include <linux/pci.h>
 
+#include <asm/pci_x86.h>
 #include <asm/pgtable.h>
 #include <asm/io_apic.h>
 #include <asm/apic.h>
@@ -49,6 +51,7 @@ EXPORT_SYMBOL(acpi_disabled);
 
 #ifdef CONFIG_X86_64
 # include <asm/proto.h>
+# include <asm/numa_64.h>
 #endif /* X86 */
 
 #define BAD_MADT_ENTRY(entry, end) ( \
@@ -446,6 +449,12 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
 int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
 {
         *irq = gsi;
+
+#ifdef CONFIG_X86_IO_APIC
+        if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC)
+                setup_IO_APIC_irq_extra(gsi);
+#endif
+
         return 0;
 }
 
@@ -473,7 +482,8 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
                 plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
         }
 #endif
-        acpi_gsi_to_irq(plat_gsi, &irq);
+        irq = plat_gsi;
+
         return irq;
 }
 
@@ -481,6 +491,26 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
  * ACPI based hotplug support for CPU
  */
 #ifdef CONFIG_ACPI_HOTPLUG_CPU
+#include <acpi/processor.h>
+
+static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
+{
+#ifdef CONFIG_ACPI_NUMA
+        int nid;
+
+        nid = acpi_get_node(handle);
+        if (nid == -1 || !node_online(nid))
+                return;
+#ifdef CONFIG_X86_64
+        apicid_to_node[physid] = nid;
+        numa_set_node(cpu, nid);
+#else /* CONFIG_X86_32 */
+        apicid_2_node[physid] = nid;
+        cpu_to_node_map[cpu] = nid;
+#endif
+
+#endif
+}
 
 static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
 {
@@ -539,7 +569,10 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
                 goto free_new_map;
         }
 
+        acpi_processor_set_pdc(handle);
+
         cpu = cpumask_first(new_map);
+        acpi_map_cpu2node(handle, cpu, physid);
 
         *pcpu = cpu;
         retval = 0;
@@ -1185,9 +1218,6 @@ static void __init acpi_process_madt(void)
         if (!error) {
                 acpi_lapic = 1;
 
-#ifdef CONFIG_X86_BIGSMP
-                generic_bigsmp_probe();
-#endif
                 /*
                  * Parse MADT IO-APIC entries
                  */
@@ -1197,8 +1227,6 @@ static void __init acpi_process_madt(void)
                         acpi_ioapic = 1;
 
                         smp_found_config = 1;
-                        if (apic->setup_apic_routing)
-                                apic->setup_apic_routing();
                 }
         }
         if (error == -EINVAL) {
@@ -1269,23 +1297,6 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d)
 }
 
 /*
- * Limit ACPI to CPU enumeration for HT
- */
-static int __init force_acpi_ht(const struct dmi_system_id *d)
-{
-        if (!acpi_force) {
-                printk(KERN_NOTICE "%s detected: force use of acpi=ht\n",
-                       d->ident);
-                disable_acpi();
-                acpi_ht = 1;
-        } else {
-                printk(KERN_NOTICE
-                       "Warning: acpi=force overrules DMI blacklist: acpi=ht\n");
-        }
-        return 0;
-}
-
-/*
  * Force ignoring BIOS IRQ0 pin2 override
  */
 static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
@@ -1321,90 +1332,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
         },
 
         /*
-         * Boxes that need acpi=ht
-         */
-        {
-                .callback = force_acpi_ht,
-                .ident = "FSC Primergy T850",
-                .matches = {
-                        DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
-                        DMI_MATCH(DMI_PRODUCT_NAME, "PRIMERGY T850"),
-                },
-        },
-        {
-                .callback = force_acpi_ht,
-                .ident = "HP VISUALIZE NT Workstation",
-                .matches = {
-                        DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"),
-                        DMI_MATCH(DMI_PRODUCT_NAME, "HP VISUALIZE NT Workstation"),
-                },
-        },
-        {
-                .callback = force_acpi_ht,
-                .ident = "Compaq Workstation W8000",
-                .matches = {
-                        DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
-                        DMI_MATCH(DMI_PRODUCT_NAME, "Workstation W8000"),
-                },
-        },
-        {
-                .callback = force_acpi_ht,
-                .ident = "ASUS P2B-DS",
-                .matches = {
-                        DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
-                        DMI_MATCH(DMI_BOARD_NAME, "P2B-DS"),
-                },
-        },
-        {
-                .callback = force_acpi_ht,
-                .ident = "ASUS CUR-DLS",
-                .matches = {
-                        DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
-                        DMI_MATCH(DMI_BOARD_NAME, "CUR-DLS"),
-                },
-        },
-        {
-                .callback = force_acpi_ht,
-                .ident = "ABIT i440BX-W83977",
-                .matches = {
-                        DMI_MATCH(DMI_BOARD_VENDOR, "ABIT <http://www.abit.com>"),
-                        DMI_MATCH(DMI_BOARD_NAME, "i440BX-W83977 (BP6)"),
-                },
-        },
-        {
-                .callback = force_acpi_ht,
-                .ident = "IBM Bladecenter",
-                .matches = {
-                        DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
-                        DMI_MATCH(DMI_BOARD_NAME, "IBM eServer BladeCenter HS20"),
-                },
-        },
-        {
-                .callback = force_acpi_ht,
-                .ident = "IBM eServer xSeries 360",
-                .matches = {
-                        DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
-                        DMI_MATCH(DMI_BOARD_NAME, "eServer xSeries 360"),
-                },
-        },
-        {
-                .callback = force_acpi_ht,
-                .ident = "IBM eserver xSeries 330",
-                .matches = {
-                        DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
-                        DMI_MATCH(DMI_BOARD_NAME, "eserver xSeries 330"),
-                },
-        },
-        {
-                .callback = force_acpi_ht,
-                .ident = "IBM eserver xSeries 440",
-                .matches = {
-                        DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
-                        DMI_MATCH(DMI_PRODUCT_NAME, "eserver xSeries 440"),
-                },
-        },
-
-        /*
          * Boxes that need ACPI PCI IRQ routing disabled
          */
         {
@@ -1529,16 +1456,10 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
  * if acpi_blacklisted() acpi_disabled = 1;
  * acpi_irq_model=...
  * ...
- *
- * return value: (currently ignored)
- *        0: success
- *        !0: failure
  */
 
-int __init acpi_boot_table_init(void)
+void __init acpi_boot_table_init(void)
 {
-        int error;
-
         dmi_check_system(acpi_dmi_table);
 
         /*
@@ -1546,15 +1467,14 @@ int __init acpi_boot_table_init(void)
          * One exception: acpi=ht continues far enough to enumerate LAPICs
          */
         if (acpi_disabled && !acpi_ht)
-                return 1;
+                return;
 
         /*
          * Initialize the ACPI boot-time table parser.
          */
-        error = acpi_table_init();
-        if (error) {
+        if (acpi_table_init()) {
                 disable_acpi();
-                return error;
+                return;
         }
 
         acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
@@ -1562,18 +1482,15 @@ int __init acpi_boot_table_init(void)
         /*
          * blacklist may disable ACPI entirely
          */
-        error = acpi_blacklisted();
-        if (error) {
+        if (acpi_blacklisted()) {
                 if (acpi_force) {
                         printk(KERN_WARNING PREFIX "acpi=force override\n");
                 } else {
                         printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
                         disable_acpi();
-                        return error;
+                        return;
                 }
         }
-
-        return 0;
 }
 
 int __init early_acpi_boot_init(void)
@@ -1619,6 +1536,9 @@ int __init acpi_boot_init(void)
 
         acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet);
 
+        if (!acpi_noirq)
+                x86_init.pci.init = pci_acpi_init;
+
         return 0;
 }
 
@@ -1643,8 +1563,10 @@ static int __init parse_acpi(char *arg)
         }
         /* Limit ACPI just to boot-time to enable HT */
         else if (strcmp(arg, "ht") == 0) {
-                if (!acpi_force)
+                if (!acpi_force) {
+                        printk(KERN_WARNING "acpi=ht will be removed in Linux-2.6.35\n");
                         disable_acpi();
+                }
                 acpi_ht = 1;
         }
         /* acpi=rsdt use RSDT instead of XSDT */
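The acpi_gsi_to_irq() hunk above makes the x86 GSI-to-IRQ mapping an explicit identity (with an extra IO-APIC routing entry set up when the IOAPIC interrupt model is active), which is why acpi_register_gsi() can now return plat_gsi directly. A hedged user-space sketch of that contract, with a hypothetical stand-in for the kernel function:

#include <stdio.h>

/* Hypothetical stand-in for the kernel's acpi_gsi_to_irq(): on x86 the
 * mapping is the identity, so a device whose ACPI tables report GSI 17
 * ends up on IRQ 17. */
static int gsi_to_irq(unsigned int gsi, unsigned int *irq)
{
        *irq = gsi;             /* identity, as in the hunk above */
        return 0;
}

int main(void)
{
        unsigned int irq;

        gsi_to_irq(17, &irq);
        printf("GSI 17 -> IRQ %u\n", irq);
        return 0;
}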
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 82e508677b91..f9961034e557 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -162,6 +162,8 @@ static int __init acpi_sleep_setup(char *str)
 #endif
         if (strncmp(str, "old_ordering", 12) == 0)
                 acpi_old_suspend_ordering();
+        if (strncmp(str, "sci_force_enable", 16) == 0)
+                acpi_set_sci_en_on_resume();
         str = strchr(str, ',');
         if (str != NULL)
                 str += strspn(str, ", \t");
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 80b222ea4cf6..70237732a6c7 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -7,6 +7,8 @@
 #include <linux/mm.h>
 #include <linux/vmalloc.h>
 #include <linux/memory.h>
+#include <linux/stop_machine.h>
+#include <linux/slab.h>
 #include <asm/alternative.h>
 #include <asm/sections.h>
 #include <asm/pgtable.h>
@@ -192,7 +194,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
 }
 
 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
-extern u8 *__smp_locks[], *__smp_locks_end[];
+extern s32 __smp_locks[], __smp_locks_end[];
 static void *text_poke_early(void *addr, const void *opcode, size_t len);
 
 /* Replace instructions with better alternatives for this CPU type.
@@ -233,39 +235,41 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 
 #ifdef CONFIG_SMP
 
-static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
+static void alternatives_smp_lock(const s32 *start, const s32 *end,
+                                  u8 *text, u8 *text_end)
 {
-        u8 **ptr;
+        const s32 *poff;
 
         mutex_lock(&text_mutex);
-        for (ptr = start; ptr < end; ptr++) {
-                if (*ptr < text)
-                        continue;
-                if (*ptr > text_end)
+        for (poff = start; poff < end; poff++) {
+                u8 *ptr = (u8 *)poff + *poff;
+
+                if (!*poff || ptr < text || ptr >= text_end)
                         continue;
                 /* turn DS segment override prefix into lock prefix */
-                if (**ptr == 0x3e)
-                        text_poke(*ptr, ((unsigned char []){0xf0}), 1);
+                if (*ptr == 0x3e)
+                        text_poke(ptr, ((unsigned char []){0xf0}), 1);
         };
         mutex_unlock(&text_mutex);
 }
 
-static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
+static void alternatives_smp_unlock(const s32 *start, const s32 *end,
+                                    u8 *text, u8 *text_end)
 {
-        u8 **ptr;
+        const s32 *poff;
 
         if (noreplace_smp)
                 return;
 
         mutex_lock(&text_mutex);
-        for (ptr = start; ptr < end; ptr++) {
-                if (*ptr < text)
-                        continue;
-                if (*ptr > text_end)
+        for (poff = start; poff < end; poff++) {
+                u8 *ptr = (u8 *)poff + *poff;
+
+                if (!*poff || ptr < text || ptr >= text_end)
                         continue;
                 /* turn lock prefix into DS segment override prefix */
-                if (**ptr == 0xf0)
-                        text_poke(*ptr, ((unsigned char []){0x3E}), 1);
+                if (*ptr == 0xf0)
+                        text_poke(ptr, ((unsigned char []){0x3E}), 1);
         };
         mutex_unlock(&text_mutex);
 }
@@ -276,8 +280,8 @@ struct smp_alt_module {
         char *name;
 
         /* ptrs to lock prefixes */
-        u8 **locks;
-        u8 **locks_end;
+        const s32 *locks;
+        const s32 *locks_end;
 
         /* .text segment, needed to avoid patching init code ;) */
         u8 *text;
@@ -394,6 +398,27 @@ void alternatives_smp_switch(int smp)
         mutex_unlock(&smp_alt);
 }
 
+/* Return 1 if the address range is reserved for smp-alternatives */
+int alternatives_text_reserved(void *start, void *end)
+{
+        struct smp_alt_module *mod;
+        const s32 *poff;
+        u8 *text_start = start;
+        u8 *text_end = end;
+
+        list_for_each_entry(mod, &smp_alt_modules, next) {
+                if (mod->text > text_end || mod->text_end < text_start)
+                        continue;
+                for (poff = mod->locks; poff < mod->locks_end; poff++) {
+                        const u8 *ptr = (const u8 *)poff + *poff;
+
+                        if (text_start <= ptr && text_end > ptr)
+                                return 1;
+                }
+        }
+
+        return 0;
+}
 #endif
 
 #ifdef CONFIG_PARAVIRT
@@ -556,3 +581,62 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
         local_irq_restore(flags);
         return addr;
 }
+
+/*
+ * Cross-modifying kernel text with stop_machine().
+ * This code originally comes from immediate value.
+ */
+static atomic_t stop_machine_first;
+static int wrote_text;
+
+struct text_poke_params {
+        void *addr;
+        const void *opcode;
+        size_t len;
+};
+
+static int __kprobes stop_machine_text_poke(void *data)
+{
+        struct text_poke_params *tpp = data;
+
+        if (atomic_dec_and_test(&stop_machine_first)) {
+                text_poke(tpp->addr, tpp->opcode, tpp->len);
+                smp_wmb();      /* Make sure other cpus see that this has run */
+                wrote_text = 1;
+        } else {
+                while (!wrote_text)
+                        cpu_relax();
+                smp_mb();      /* Load wrote_text before following execution */
+        }
+
+        flush_icache_range((unsigned long)tpp->addr,
+                           (unsigned long)tpp->addr + tpp->len);
+        return 0;
+}
+
+/**
+ * text_poke_smp - Update instructions on a live kernel on SMP
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy
+ *
+ * Modify multi-byte instruction by using stop_machine() on SMP. This allows
+ * user to poke/set multi-byte text on SMP. Only non-NMI/MCE code modifying
+ * should be allowed, since stop_machine() does _not_ protect code against
+ * NMI and MCE.
+ *
+ * Note: Must be called under get_online_cpus() and text_mutex.
+ */
+void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
+{
+        struct text_poke_params tpp;
+
+        tpp.addr = addr;
+        tpp.opcode = opcode;
+        tpp.len = len;
+        atomic_set(&stop_machine_first, 1);
+        wrote_text = 0;
+        stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
+        return addr;
+}
+
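text_poke_smp() above hinges on a rendezvous inside stop_machine(): exactly one of the stopped CPUs performs the write while the rest spin until it is published. A user-space sketch of that protocol with C11 atomics and threads standing in for the stopped CPUs; this illustrates the pattern, not stop_machine() itself:

#include <stdatomic.h>
#include <stdio.h>
#include <pthread.h>

static atomic_int first = 1;     /* counterpart of stop_machine_first */
static atomic_int wrote;         /* counterpart of wrote_text */
static unsigned char insn;       /* the byte being patched */

static void *rendezvous(void *arg)
{
        if (atomic_fetch_sub(&first, 1) == 1) {
                insn = 0x90;                     /* the single winner patches */
                atomic_store(&wrote, 1);         /* publish completion */
        } else {
                while (!atomic_load(&wrote))     /* everyone else spins */
                        ;
        }
        return NULL;
}

int main(void)
{
        pthread_t t[4];
        int i;

        for (i = 0; i < 4; i++)
                pthread_create(&t[i], NULL, rendezvous, NULL);
        for (i = 0; i < 4; i++)
                pthread_join(t[i], NULL);
        printf("patched byte: 0x%02x\n", insn);
        return 0;
}

The atomic decrement plays the role of atomic_dec_and_test(): whoever takes the counter from 1 to 0 becomes the writer, so the write happens exactly once no matter how many CPUs rendezvous.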
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 23824fef789c..f854d89b7edf 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -18,8 +18,8 @@
  */
 
 #include <linux/pci.h>
-#include <linux/gfp.h>
 #include <linux/bitmap.h>
+#include <linux/slab.h>
 #include <linux/debugfs.h>
 #include <linux/scatterlist.h>
 #include <linux/dma-mapping.h>
@@ -118,7 +118,7 @@ static bool check_device(struct device *dev)
                 return false;
 
         /* No device or no PCI device */
-        if (!dev || dev->bus != &pci_bus_type)
+        if (dev->bus != &pci_bus_type)
                 return false;
 
         devid = get_device_id(dev);
@@ -392,6 +392,7 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
         u32 tail, head;
         u8 *target;
 
+        WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
         tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
         target = iommu->cmd_buf + tail;
         memcpy_toio(target, cmd, sizeof(*cmd));
@@ -980,7 +981,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
 {
         int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
         struct amd_iommu *iommu;
-        int i;
+        unsigned long i;
 
 #ifdef CONFIG_IOMMU_STRESS
         populate = false;
@@ -1489,11 +1490,14 @@ static void __detach_device(struct device *dev)
 {
         struct iommu_dev_data *dev_data = get_dev_data(dev);
         struct iommu_dev_data *alias_data;
+        struct protection_domain *domain;
         unsigned long flags;
 
         BUG_ON(!dev_data->domain);
 
-        spin_lock_irqsave(&dev_data->domain->lock, flags);
+        domain = dev_data->domain;
+
+        spin_lock_irqsave(&domain->lock, flags);
 
         if (dev_data->alias != dev) {
                 alias_data = get_dev_data(dev_data->alias);
@@ -1504,13 +1508,15 @@ static void __detach_device(struct device *dev)
         if (atomic_dec_and_test(&dev_data->bind))
                 do_detach(dev);
 
-        spin_unlock_irqrestore(&dev_data->domain->lock, flags);
+        spin_unlock_irqrestore(&domain->lock, flags);
 
         /*
          * If we run in passthrough mode the device must be assigned to the
-         * passthrough domain if it is detached from any other domain
+         * passthrough domain if it is detached from any other domain.
+         * Make sure we can deassign from the pt_domain itself.
          */
-        if (iommu_pass_through && dev_data->domain == NULL)
+        if (iommu_pass_through &&
+            (dev_data->domain == NULL && domain != pt_domain))
                 __attach_device(dev, pt_domain);
 }
 
@@ -2181,7 +2187,7 @@ static void prealloc_protection_domains(void)
         struct dma_ops_domain *dma_dom;
         u16 devid;
 
-        while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
+        for_each_pci_dev(dev) {
 
                 /* Do we handle this device? */
                 if (!check_device(&dev->dev))
@@ -2218,6 +2224,12 @@ static struct dma_map_ops amd_iommu_dma_ops = {
 /*
  * The function which clues the AMD IOMMU driver into dma_ops.
  */
+
+void __init amd_iommu_init_api(void)
+{
+        register_iommu(&amd_iommu_ops);
+}
+
 int __init amd_iommu_init_dma_ops(void)
 {
         struct amd_iommu *iommu;
@@ -2253,8 +2265,6 @@ int __init amd_iommu_init_dma_ops(void)
         /* Make the driver finally visible to the drivers */
         dma_ops = &amd_iommu_dma_ops;
 
-        register_iommu(&amd_iommu_ops);
-
         amd_iommu_stats_init();
 
         return 0;
@@ -2289,7 +2299,7 @@ static void cleanup_domain(struct protection_domain *domain)
         list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) {
                 struct device *dev = dev_data->dev;
 
-                do_detach(dev);
+                __detach_device(dev);
                 atomic_set(&dev_data->bind, 0);
         }
 
@@ -2318,6 +2328,7 @@ static struct protection_domain *protection_domain_alloc(void)
                 return NULL;
 
         spin_lock_init(&domain->lock);
+        mutex_init(&domain->api_lock);
         domain->id = domain_id_alloc();
         if (!domain->id)
                 goto out_err;
@@ -2370,9 +2381,7 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom)
 
         free_pagetable(domain);
 
-        domain_id_free(domain->id);
-
-        kfree(domain);
+        protection_domain_free(domain);
 
         dom->priv = NULL;
 }
@@ -2447,6 +2456,8 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
         iova &= PAGE_MASK;
         paddr &= PAGE_MASK;
 
+        mutex_lock(&domain->api_lock);
+
         for (i = 0; i < npages; ++i) {
                 ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k);
                 if (ret)
@@ -2456,6 +2467,8 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
                 paddr += PAGE_SIZE;
         }
 
+        mutex_unlock(&domain->api_lock);
+
         return 0;
 }
 
@@ -2468,12 +2481,16 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom,
 
         iova &= PAGE_MASK;
 
+        mutex_lock(&domain->api_lock);
+
         for (i = 0; i < npages; ++i) {
                 iommu_unmap_page(domain, iova, PM_MAP_4k);
                 iova += PAGE_SIZE;
         }
 
         iommu_flush_tlb_pde(domain);
+
+        mutex_unlock(&domain->api_lock);
 }
 
 static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 1dca9c34eaeb..6360abf993d4 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -19,8 +19,8 @@
 
 #include <linux/pci.h>
 #include <linux/acpi.h>
-#include <linux/gfp.h>
 #include <linux/list.h>
+#include <linux/slab.h>
 #include <linux/sysdev.h>
 #include <linux/interrupt.h>
 #include <linux/msi.h>
@@ -138,6 +138,11 @@ int amd_iommus_present;
 bool amd_iommu_np_cache __read_mostly;
 
 /*
+ * The ACPI table parsing functions set this variable on an error
+ */
+static int __initdata amd_iommu_init_err;
+
+/*
  * List of protection domains - used during resume
  */
 LIST_HEAD(amd_iommu_pd_list);
@@ -386,9 +391,11 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table)
          */
         for (i = 0; i < table->length; ++i)
                 checksum += p[i];
-        if (checksum != 0)
+        if (checksum != 0) {
                 /* ACPI table corrupt */
-                return -ENODEV;
+                amd_iommu_init_err = -ENODEV;
+                return 0;
+        }
 
         p += IVRS_HEADER_LENGTH;
 
@@ -431,7 +438,7 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
         if (cmd_buf == NULL)
                 return NULL;
 
-        iommu->cmd_buf_size = CMD_BUFFER_SIZE;
+        iommu->cmd_buf_size = CMD_BUFFER_SIZE | CMD_BUFFER_UNINITIALIZED;
 
         return cmd_buf;
 }
@@ -467,12 +474,13 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu)
                     &entry, sizeof(entry));
 
         amd_iommu_reset_cmd_buffer(iommu);
+        iommu->cmd_buf_size &= ~(CMD_BUFFER_UNINITIALIZED);
 }
 
 static void __init free_command_buffer(struct amd_iommu *iommu)
 {
         free_pages((unsigned long)iommu->cmd_buf,
-                   get_order(iommu->cmd_buf_size));
+                   get_order(iommu->cmd_buf_size & ~(CMD_BUFFER_UNINITIALIZED)));
 }
 
 /* allocates the memory where the IOMMU will log its events to */
@@ -915,11 +923,16 @@ static int __init init_iommu_all(struct acpi_table_header *table)
                             h->mmio_phys);
 
                         iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
-                        if (iommu == NULL)
-                                return -ENOMEM;
+                        if (iommu == NULL) {
+                                amd_iommu_init_err = -ENOMEM;
+                                return 0;
+                        }
+
                         ret = init_iommu_one(iommu, h);
-                        if (ret)
-                                return ret;
+                        if (ret) {
+                                amd_iommu_init_err = ret;
+                                return 0;
+                        }
                         break;
                 default:
                         break;
@@ -1204,6 +1217,10 @@ static int __init amd_iommu_init(void)
         if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
                 return -ENODEV;
 
+        ret = amd_iommu_init_err;
+        if (ret)
+                goto out;
+
         dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE);
         alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
         rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
@@ -1263,9 +1280,19 @@ static int __init amd_iommu_init(void)
         if (acpi_table_parse("IVRS", init_iommu_all) != 0)
                 goto free;
 
+        if (amd_iommu_init_err) {
+                ret = amd_iommu_init_err;
+                goto free;
+        }
+
         if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
                 goto free;
 
+        if (amd_iommu_init_err) {
+                ret = amd_iommu_init_err;
+                goto free;
+        }
+
         ret = sysdev_class_register(&amd_iommu_sysdev_class);
         if (ret)
                 goto free;
@@ -1278,16 +1305,19 @@ static int __init amd_iommu_init(void)
         if (ret)
                 goto free;
 
+        enable_iommus();
+
         if (iommu_pass_through)
                 ret = amd_iommu_init_passthrough();
         else
                 ret = amd_iommu_init_dma_ops();
+
         if (ret)
                 goto free;
 
-        amd_iommu_init_notifier();
+        amd_iommu_init_api();
 
-        enable_iommus();
+        amd_iommu_init_notifier();
 
         if (iommu_pass_through)
                 goto out;
@@ -1302,6 +1332,7 @@ out:
         return ret;
 
 free:
+        disable_iommus();
 
         amd_iommu_uninit_devices();
 
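The amd_iommu_init_err plumbing above exists because acpi_table_parse() invokes its callback but does not hand the callback's error code back to the caller, so the parsers record failures in a file-scope variable that amd_iommu_init() checks after each parse. A condensed sketch of the pattern, with hypothetical names standing in for the kernel interfaces:

#include <errno.h>
#include <stdio.h>

static int init_err;    /* deferred error, like amd_iommu_init_err */

/* Stand-in for an ACPI table handler whose return value the parser
 * swallows: record the failure on the side, report success anyway. */
static int parse_cb(void)
{
        init_err = -ENODEV;
        return 0;
}

/* Stand-in for acpi_table_parse(): the callback's result never
 * reaches the caller. */
static void table_parse(int (*cb)(void))
{
        cb();
}

int main(void)
{
        table_parse(parse_cb);
        if (init_err) {         /* check the side channel afterwards */
                printf("init failed: %d\n", init_err);
                return 1;
        }
        return 0;
}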
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
new file mode 100644
index 000000000000..ff469e470059
--- /dev/null
+++ b/arch/x86/kernel/apb_timer.c
@@ -0,0 +1,785 @@
1/*
2 * apb_timer.c: Driver for Langwell APB timers
3 *
4 * (C) Copyright 2009 Intel Corporation
5 * Author: Jacob Pan (jacob.jun.pan@intel.com)
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 *
12 * Note:
13 * Langwell is the south complex of Intel Moorestown MID platform. There are
14 * eight external timers in total that can be used by the operating system.
15 * The timer information, such as frequency and addresses, is provided to the
16 * OS via SFI tables.
17 * Timer interrupts are routed via FW/HW emulated IOAPIC independently via
18 * individual redirection table entries (RTE).
19 * Unlike HPET, there is no master counter, therefore one of the timers are
20 * used as clocksource. The overall allocation looks like:
21 * - timer 0 - NR_CPUs for per cpu timer
22 * - one timer for clocksource
23 * - one timer for watchdog driver.
24 * It is also worth notice that APB timer does not support true one-shot mode,
25 * free-running mode will be used here to emulate one-shot mode.
26 * APB timer can also be used as broadcast timer along with per cpu local APIC
27 * timer, but by default APB timer has higher rating than local APIC timers.
28 */
29
30#include <linux/clocksource.h>
31#include <linux/clockchips.h>
32#include <linux/delay.h>
33#include <linux/errno.h>
34#include <linux/init.h>
35#include <linux/sysdev.h>
36#include <linux/slab.h>
37#include <linux/pm.h>
38#include <linux/pci.h>
39#include <linux/sfi.h>
40#include <linux/interrupt.h>
41#include <linux/cpu.h>
42#include <linux/irq.h>
43
44#include <asm/fixmap.h>
45#include <asm/apb_timer.h>
46
47#define APBT_MASK CLOCKSOURCE_MASK(32)
48#define APBT_SHIFT 22
49#define APBT_CLOCKEVENT_RATING 150
50#define APBT_CLOCKSOURCE_RATING 250
51#define APBT_MIN_DELTA_USEC 200
52
53#define EVT_TO_APBT_DEV(evt) container_of(evt, struct apbt_dev, evt)
54#define APBT_CLOCKEVENT0_NUM (0)
55#define APBT_CLOCKEVENT1_NUM (1)
56#define APBT_CLOCKSOURCE_NUM (2)
57
58static unsigned long apbt_address;
59static int apb_timer_block_enabled;
60static void __iomem *apbt_virt_address;
61static int phy_cs_timer_id;
62
63/*
64 * Common DW APB timer info
65 */
66static uint64_t apbt_freq;
67
68static void apbt_set_mode(enum clock_event_mode mode,
69 struct clock_event_device *evt);
70static int apbt_next_event(unsigned long delta,
71 struct clock_event_device *evt);
72static cycle_t apbt_read_clocksource(struct clocksource *cs);
73static void apbt_restart_clocksource(struct clocksource *cs);
74
75struct apbt_dev {
76 struct clock_event_device evt;
77 unsigned int num;
78 int cpu;
79 unsigned int irq;
80 unsigned int tick;
81 unsigned int count;
82 unsigned int flags;
83 char name[10];
84};
85
86int disable_apbt_percpu __cpuinitdata;
87
88static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
89
90#ifdef CONFIG_SMP
91static unsigned int apbt_num_timers_used;
92static struct apbt_dev *apbt_devs;
93#endif
94
95static inline unsigned long apbt_readl_reg(unsigned long a)
96{
97 return readl(apbt_virt_address + a);
98}
99
100static inline void apbt_writel_reg(unsigned long d, unsigned long a)
101{
102 writel(d, apbt_virt_address + a);
103}
104
105static inline unsigned long apbt_readl(int n, unsigned long a)
106{
107 return readl(apbt_virt_address + a + n * APBTMRS_REG_SIZE);
108}
109
110static inline void apbt_writel(int n, unsigned long d, unsigned long a)
111{
112 writel(d, apbt_virt_address + a + n * APBTMRS_REG_SIZE);
113}
114
115static inline void apbt_set_mapping(void)
116{
117 struct sfi_timer_table_entry *mtmr;
118
119 if (apbt_virt_address) {
120 pr_debug("APBT base already mapped\n");
121 return;
122 }
123 mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM);
124 if (mtmr == NULL) {
125 printk(KERN_ERR "Failed to get MTMR %d from SFI\n",
126 APBT_CLOCKEVENT0_NUM);
127 return;
128 }
129 apbt_address = (unsigned long)mtmr->phys_addr;
130 if (!apbt_address) {
131 printk(KERN_WARNING "No timer base from SFI, use default\n");
132 apbt_address = APBT_DEFAULT_BASE;
133 }
134 apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE);
135 if (apbt_virt_address) {
136 pr_debug("Mapped APBT physical addr %p at virtual addr %p\n",\
137 (void *)apbt_address, (void *)apbt_virt_address);
138 } else {
139 pr_debug("Failed mapping APBT phy address at %p\n",\
140 (void *)apbt_address);
141 goto panic_noapbt;
142 }
143 apbt_freq = mtmr->freq_hz / USEC_PER_SEC;
144 sfi_free_mtmr(mtmr);
145
146 /* Now figure out the physical timer id for clocksource device */
147 mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM);
148 if (mtmr == NULL)
149 goto panic_noapbt;
150
151 /* Now figure out the physical timer id */
152 phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff)
153 / APBTMRS_REG_SIZE;
154 pr_debug("Use timer %d for clocksource\n", phy_cs_timer_id);
155 return;
156
157panic_noapbt:
158 panic("Failed to setup APB system timer\n");
159
160}
161
162static inline void apbt_clear_mapping(void)
163{
164 iounmap(apbt_virt_address);
165 apbt_virt_address = NULL;
166}
167
168/*
169 * APBT timer interrupt enable / disable
170 */
171static inline int is_apbt_capable(void)
172{
173 return apbt_virt_address ? 1 : 0;
174}
175
176static struct clocksource clocksource_apbt = {
177 .name = "apbt",
178 .rating = APBT_CLOCKSOURCE_RATING,
179 .read = apbt_read_clocksource,
180 .mask = APBT_MASK,
181 .shift = APBT_SHIFT,
182 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
183 .resume = apbt_restart_clocksource,
184};
185
186/* boot APB clock event device */
187static struct clock_event_device apbt_clockevent = {
188 .name = "apbt0",
189 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
190 .set_mode = apbt_set_mode,
191 .set_next_event = apbt_next_event,
192 .shift = APBT_SHIFT,
193 .irq = 0,
194 .rating = APBT_CLOCKEVENT_RATING,
195};
196
197/*
198 * if user does not want to use per CPU apb timer, just give it a lower rating
199 * than local apic timer and skip the late per cpu timer init.
200 */
201static inline int __init setup_x86_mrst_timer(char *arg)
202{
203 if (!arg)
204 return -EINVAL;
205
206 if (strcmp("apbt_only", arg) == 0)
207 disable_apbt_percpu = 0;
208 else if (strcmp("lapic_and_apbt", arg) == 0)
209 disable_apbt_percpu = 1;
210 else {
211 pr_warning("X86 MRST timer option %s not recognised"
212 " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
213 arg);
214 return -EINVAL;
215 }
216 return 0;
217}
218__setup("x86_mrst_timer=", setup_x86_mrst_timer);
219
220/*
221 * start count down from 0xffff_ffff. this is done by toggling the enable bit
222 * then load initial load count to ~0.
223 */
224static void apbt_start_counter(int n)
225{
226 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
227
228 ctrl &= ~APBTMR_CONTROL_ENABLE;
229 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
230 apbt_writel(n, ~0, APBTMR_N_LOAD_COUNT);
231 /* enable, mask interrupt */
232 ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
233 ctrl |= (APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_INT);
234 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
235 /* read it once to get cached counter value initialized */
236 apbt_read_clocksource(&clocksource_apbt);
237}
238
239static irqreturn_t apbt_interrupt_handler(int irq, void *data)
240{
241 struct apbt_dev *dev = (struct apbt_dev *)data;
242 struct clock_event_device *aevt = &dev->evt;
243
244 if (!aevt->event_handler) {
245 printk(KERN_INFO "Spurious APBT timer interrupt on %d\n",
246 dev->num);
247 return IRQ_NONE;
248 }
249 aevt->event_handler(aevt);
250 return IRQ_HANDLED;
251}
252
253static void apbt_restart_clocksource(struct clocksource *cs)
254{
255 apbt_start_counter(phy_cs_timer_id);
256}
257
258/* Setup IRQ routing via IOAPIC */
259#ifdef CONFIG_SMP
260static void apbt_setup_irq(struct apbt_dev *adev)
261{
262 struct irq_chip *chip;
263 struct irq_desc *desc;
264
265 /* timer0 irq has been setup early */
266 if (adev->irq == 0)
267 return;
268 desc = irq_to_desc(adev->irq);
269 chip = get_irq_chip(adev->irq);
270 disable_irq(adev->irq);
271 desc->status |= IRQ_MOVE_PCNTXT;
272 irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
273 /* APB timer irqs are set up as mp_irqs, timer is edge triggerred */
274 set_irq_chip_and_handler_name(adev->irq, chip, handle_edge_irq, "edge");
275 enable_irq(adev->irq);
276 if (system_state == SYSTEM_BOOTING)
277 if (request_irq(adev->irq, apbt_interrupt_handler,
278 IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
279 adev->name, adev)) {
280 printk(KERN_ERR "Failed request IRQ for APBT%d\n",
281 adev->num);
282 }
283}
284#endif
285
286static void apbt_enable_int(int n)
287{
288 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
289 /* clear pending intr */
290 apbt_readl(n, APBTMR_N_EOI);
291 ctrl &= ~APBTMR_CONTROL_INT;
292 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
293}
294
295static void apbt_disable_int(int n)
296{
297 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
298
299 ctrl |= APBTMR_CONTROL_INT;
300 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
301}
302
303
304static int __init apbt_clockevent_register(void)
305{
306 struct sfi_timer_table_entry *mtmr;
307 struct apbt_dev *adev = &__get_cpu_var(cpu_apbt_dev);
308
309 mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM);
310 if (mtmr == NULL) {
311 printk(KERN_ERR "Failed to get MTMR %d from SFI\n",
312 APBT_CLOCKEVENT0_NUM);
313 return -ENODEV;
314 }
315
316 /*
317 * We need to calculate the scaled math multiplication factor for
318 * nanosecond to apbt tick conversion.
319 * mult = (nsec/cycle)*2^APBT_SHIFT
320 */
321 apbt_clockevent.mult = div_sc((unsigned long) mtmr->freq_hz
322 , NSEC_PER_SEC, APBT_SHIFT);
323
324 /* Calculate the min / max delta */
325 apbt_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
326 &apbt_clockevent);
327 apbt_clockevent.min_delta_ns = clockevent_delta2ns(
328 APBT_MIN_DELTA_USEC*apbt_freq,
329 &apbt_clockevent);
330 /*
331 * Start apbt with the boot cpu mask and make it
332 * global if not used for per cpu timer.
333 */
334 apbt_clockevent.cpumask = cpumask_of(smp_processor_id());
335 adev->num = smp_processor_id();
336 memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));
337
338 if (disable_apbt_percpu) {
339 apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100;
340 global_clock_event = &adev->evt;
341 printk(KERN_DEBUG "%s clockevent registered as global\n",
342 global_clock_event->name);
343 }
344
345 if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler,
346 IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
347 apbt_clockevent.name, adev)) {
348 printk(KERN_ERR "Failed request IRQ for APBT%d\n",
349 apbt_clockevent.irq);
350 }
351
352 clockevents_register_device(&adev->evt);
353 /* Start APBT 0 interrupts */
354 apbt_enable_int(APBT_CLOCKEVENT0_NUM);
355
356 sfi_free_mtmr(mtmr);
357 return 0;
358}
359
360#ifdef CONFIG_SMP
361/* Should be called with per cpu */
362void apbt_setup_secondary_clock(void)
363{
364 struct apbt_dev *adev;
365 struct clock_event_device *aevt;
366 int cpu;
367
368 /* Don't register boot CPU clockevent */
369 cpu = smp_processor_id();
370 if (cpu == boot_cpu_id)
371 return;
372 /*
373 * We need to calculate the scaled math multiplication factor for
374 * nanosecond to apbt tick conversion.
375 * mult = (nsec/cycle)*2^APBT_SHIFT
376 */
377 printk(KERN_INFO "Init per CPU clockevent %d\n", cpu);
378 adev = &per_cpu(cpu_apbt_dev, cpu);
379 aevt = &adev->evt;
380
381 memcpy(aevt, &apbt_clockevent, sizeof(*aevt));
382 aevt->cpumask = cpumask_of(cpu);
383 aevt->name = adev->name;
384 aevt->mode = CLOCK_EVT_MODE_UNUSED;
385
386 printk(KERN_INFO "Registering CPU %d clockevent device %s, mask %08x\n",
387 cpu, aevt->name, *(u32 *)aevt->cpumask);
388
389 apbt_setup_irq(adev);
390
391 clockevents_register_device(aevt);
392
393 apbt_enable_int(cpu);
394
395 return;
396}
397
398/*
399 * this notify handler process CPU hotplug events. in case of S0i3, nonboot
400 * cpus are disabled/enabled frequently, for performance reasons, we keep the
401 * per cpu timer irq registered so that we do need to do free_irq/request_irq.
402 *
403 * TODO: it might be more reliable to directly disable percpu clockevent device
404 * without the notifier chain. currently, cpu 0 may get interrupts from other
405 * cpu timers during the offline process due to the ordering of notification.
406 * the extra interrupt is harmless.
407 */
408static int apbt_cpuhp_notify(struct notifier_block *n,
409 unsigned long action, void *hcpu)
410{
411 unsigned long cpu = (unsigned long)hcpu;
412 struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu);
413
414 switch (action & 0xf) {
415 case CPU_DEAD:
416 apbt_disable_int(cpu);
417 if (system_state == SYSTEM_RUNNING)
418 pr_debug("skipping APBT CPU %lu offline\n", cpu);
419 else if (adev) {
420 pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
421 free_irq(adev->irq, adev);
422 }
423 break;
424 default:
425 pr_debug(KERN_INFO "APBT notified %lu, no action\n", action);
426 }
427 return NOTIFY_OK;
428}
429
430static __init int apbt_late_init(void)
431{
432 if (disable_apbt_percpu)
433 return 0;
434 /* This notifier should be called after workqueue is ready */
435 hotcpu_notifier(apbt_cpuhp_notify, -20);
436 return 0;
437}
438fs_initcall(apbt_late_init);
439#else
440
441void apbt_setup_secondary_clock(void) {}
442
443#endif /* CONFIG_SMP */
444
445static void apbt_set_mode(enum clock_event_mode mode,
446 struct clock_event_device *evt)
447{
448 unsigned long ctrl;
449 uint64_t delta;
450 int timer_num;
451 struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
452
453 timer_num = adev->num;
454 pr_debug("%s CPU %d timer %d mode=%d\n",
455 __func__, first_cpu(*evt->cpumask), timer_num, mode);
456
457 switch (mode) {
458 case CLOCK_EVT_MODE_PERIODIC:
459 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * apbt_clockevent.mult;
460 delta >>= apbt_clockevent.shift;
461 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
462 ctrl |= APBTMR_CONTROL_MODE_PERIODIC;
463 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
464 /*
465 * DW APB p. 46, have to disable timer before load counter,
466 * may cause sync problem.
467 */
468 ctrl &= ~APBTMR_CONTROL_ENABLE;
469 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
470 udelay(1);
471 pr_debug("Setting clock period %d for HZ %d\n", (int)delta, HZ);
472 apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
473 ctrl |= APBTMR_CONTROL_ENABLE;
474 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
475 break;
476 /* APB timer does not have one-shot mode, use free running mode */
477 case CLOCK_EVT_MODE_ONESHOT:
478 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
479 /*
480 * set free running mode, this mode will let timer reload max
481 * timeout which will give time (3min on 25MHz clock) to rearm
482 * the next event, therefore emulate the one-shot mode.
483 */
484 ctrl &= ~APBTMR_CONTROL_ENABLE;
485 ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
486
487 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
488 /* write again to set free running mode */
489 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
490
491 /*
492 * DW APB p. 46, load counter with all 1s before starting free
493 * running mode.
494 */
495 apbt_writel(timer_num, ~0, APBTMR_N_LOAD_COUNT);
496 ctrl &= ~APBTMR_CONTROL_INT;
497 ctrl |= APBTMR_CONTROL_ENABLE;
498 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
499 break;
500
501 case CLOCK_EVT_MODE_UNUSED:
502 case CLOCK_EVT_MODE_SHUTDOWN:
503 apbt_disable_int(timer_num);
504 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
505 ctrl &= ~APBTMR_CONTROL_ENABLE;
506 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
507 break;
508
509 case CLOCK_EVT_MODE_RESUME:
510 apbt_enable_int(timer_num);
511 break;
512 }
513}
514
515static int apbt_next_event(unsigned long delta,
516 struct clock_event_device *evt)
517{
518 unsigned long ctrl;
519 int timer_num;
520
521 struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
522
523 timer_num = adev->num;
524 /* Disable timer */
525 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
526 ctrl &= ~APBTMR_CONTROL_ENABLE;
527 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
528 /* write new count */
529 apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
530 ctrl |= APBTMR_CONTROL_ENABLE;
531 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
532 return 0;
533}
534
535/*
536 * APB timer clock is not in sync with pclk on Langwell, which translates to
537 * unreliable read value caused by sampling error. the error does not add up
538 * overtime and only happens when sampling a 0 as a 1 by mistake. so the time
539 * would go backwards. the following code is trying to prevent time traveling
540 * backwards. little bit paranoid.
541 */
542static cycle_t apbt_read_clocksource(struct clocksource *cs)
543{
544 unsigned long t0, t1, t2;
545 static unsigned long last_read;
546
547bad_count:
548 t1 = apbt_readl(phy_cs_timer_id,
549 APBTMR_N_CURRENT_VALUE);
550 t2 = apbt_readl(phy_cs_timer_id,
551 APBTMR_N_CURRENT_VALUE);
552 if (unlikely(t1 < t2)) {
553 pr_debug("APBT: read current count error %lx:%lx:%lx\n",
554 t1, t2, t2 - t1);
555 goto bad_count;
556 }
557 /*
558 * check against cached last read, makes sure time does not go back.
559 * it could be a normal rollover but we will do tripple check anyway
560 */
561 if (unlikely(t2 > last_read)) {
562 /* check if we have a normal rollover */
563 unsigned long raw_intr_status =
564 apbt_readl_reg(APBTMRS_RAW_INT_STATUS);
565 /*
566 * cs timer interrupt is masked but raw intr bit is set if
567 * rollover occurs. then we read EOI reg to clear it.
568 */
569 if (raw_intr_status & (1 << phy_cs_timer_id)) {
570 apbt_readl(phy_cs_timer_id, APBTMR_N_EOI);
571 goto out;
572 }
573 pr_debug("APB CS going back %lx:%lx:%lx ",
574 t2, last_read, t2 - last_read);
575bad_count_x3:
576 pr_debug(KERN_INFO "tripple check enforced\n");
577 t0 = apbt_readl(phy_cs_timer_id,
578 APBTMR_N_CURRENT_VALUE);
579 udelay(1);
580 t1 = apbt_readl(phy_cs_timer_id,
581 APBTMR_N_CURRENT_VALUE);
582 udelay(1);
583 t2 = apbt_readl(phy_cs_timer_id,
584 APBTMR_N_CURRENT_VALUE);
585 if ((t2 > t1) || (t1 > t0)) {
586 printk(KERN_ERR "Error: APB CS tripple check failed\n");
587 goto bad_count_x3;
588 }
589 }
590out:
591 last_read = t2;
592 return (cycle_t)~t2;
593}
594
595static int apbt_clocksource_register(void)
596{
597 u64 start, now;
598 cycle_t t1;
599
600 /* Start the counter, use timer 2 as source, timer 0/1 for event */
601 apbt_start_counter(phy_cs_timer_id);
602
603 /* Verify whether apbt counter works */
604 t1 = apbt_read_clocksource(&clocksource_apbt);
605 rdtscll(start);
606
607 /*
608 * We don't know the TSC frequency yet, but waiting for
609 * 200000 TSC cycles is safe:
610 * 4 GHz == 50us
611 * 1 GHz == 200us
612 */
613 do {
614 rep_nop();
615 rdtscll(now);
616 } while ((now - start) < 200000UL);
617
618 /* APBT is the only always on clocksource, it has to work! */
619 if (t1 == apbt_read_clocksource(&clocksource_apbt))
620 panic("APBT counter not counting. APBT disabled\n");
621
622 /*
623 * initialize and register APBT clocksource
624 * convert that to ns/clock cycle
625 * mult = (ns/c) * 2^APBT_SHIFT
626 */
627 clocksource_apbt.mult = div_sc(MSEC_PER_SEC,
628 (unsigned long) apbt_freq, APBT_SHIFT);
629 clocksource_register(&clocksource_apbt);
630
631 return 0;
632}
633
634/*
635 * Early setup the APBT timer, only use timer 0 for booting then switch to
636 * per CPU timer if possible.
637 * returns 1 if per cpu apbt is setup
638 * returns 0 if no per cpu apbt is chosen
639 * panic if set up failed, this is the only platform timer on Moorestown.
640 */
641void __init apbt_time_init(void)
642{
643#ifdef CONFIG_SMP
644 int i;
645 struct sfi_timer_table_entry *p_mtmr;
646 unsigned int percpu_timer;
647 struct apbt_dev *adev;
648#endif
649
650 if (apb_timer_block_enabled)
651 return;
652 apbt_set_mapping();
653 if (apbt_virt_address) {
654 pr_debug("Found APBT version 0x%lx\n",\
655 apbt_readl_reg(APBTMRS_COMP_VERSION));
656 } else
657 goto out_noapbt;
658 /*
659 * Read the frequency and check for a sane value; for the ESL model
660 * we extend the possible clock range to allow time scaling.
661 */
662
663 if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) {
664 pr_debug("APBT has invalid freq 0x%llx\n", apbt_freq);
665 goto out_noapbt;
666 }
667 if (apbt_clocksource_register()) {
668 pr_debug("APBT has failed to register clocksource\n");
669 goto out_noapbt;
670 }
671 if (!apbt_clockevent_register())
672 apb_timer_block_enabled = 1;
673 else {
674 pr_debug("APBT has failed to register clockevent\n");
675 goto out_noapbt;
676 }
677#ifdef CONFIG_SMP
678 /* the APB timer was disabled on the kernel cmdline, so use LAPIC timers */
679 if (disable_apbt_percpu) {
680 printk(KERN_INFO "apbt: disabled per cpu timer\n");
681 return;
682 }
683 pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus());
684 if (num_possible_cpus() <= sfi_mtimer_num) {
685 percpu_timer = 1;
686 apbt_num_timers_used = num_possible_cpus();
687 } else {
688 percpu_timer = 0;
689 apbt_num_timers_used = 1;
690 adev = &per_cpu(cpu_apbt_dev, 0);
691 adev->flags &= ~APBT_DEV_USED;
692 }
693 pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used);
694
695 /* here we set up per CPU timer data structure */
696 apbt_devs = kzalloc(sizeof(struct apbt_dev) * apbt_num_timers_used,
697 GFP_KERNEL);
698 if (!apbt_devs) {
699 printk(KERN_ERR "Failed to allocate APB timer devices\n");
700 return;
701 }
702 for (i = 0; i < apbt_num_timers_used; i++) {
703 adev = &per_cpu(cpu_apbt_dev, i);
704 adev->num = i;
705 adev->cpu = i;
706 p_mtmr = sfi_get_mtmr(i);
707 if (p_mtmr) {
708 adev->tick = p_mtmr->freq_hz;
709 adev->irq = p_mtmr->irq;
710 } else
711 printk(KERN_ERR "Failed to get timer for cpu %d\n", i);
712 adev->count = 0;
713 sprintf(adev->name, "apbt%d", i);
714 }
715#endif
716
717 return;
718
719out_noapbt:
720 apbt_clear_mapping();
721 apb_timer_block_enabled = 0;
722 panic("failed to enable APB timer\n");
723}
724
725static inline void apbt_disable(int n)
726{
727 if (is_apbt_capable()) {
728 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
729 ctrl &= ~APBTMR_CONTROL_ENABLE;
730 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
731 }
732}
733
734/* called before apb_timer_enable, use early map */
735unsigned long apbt_quick_calibrate(void)
736{
737 int i, scale;
738 u64 old, new;
739 cycle_t t1, t2;
740 unsigned long khz = 0;
741 u32 loop, shift;
742
743 apbt_set_mapping();
744 apbt_start_counter(phy_cs_timer_id);
745
746 /* check if the timer can count down, otherwise return */
747 old = apbt_read_clocksource(&clocksource_apbt);
748 i = 10000;
749 while (--i) {
750 if (old != apbt_read_clocksource(&clocksource_apbt))
751 break;
752 }
753 if (!i)
754 goto failed;
755
756 /* count 16 ms worth of timer ticks (apbt_freq is in MHz) */
757 loop = (apbt_freq * 1000) << 4;
758
759 /* restart the timer to ensure it won't get to 0 in the calibration */
760 apbt_start_counter(phy_cs_timer_id);
761
762 old = apbt_read_clocksource(&clocksource_apbt);
763 old += loop;
764
765 t1 = __native_read_tsc();
766
767 do {
768 new = apbt_read_clocksource(&clocksource_apbt);
769 } while (new < old);
770
771 t2 = __native_read_tsc();
772
773 shift = 5;
774 if (unlikely(loop >> shift == 0)) {
775 printk(KERN_INFO
776 "APBT TSC calibration failed, not enough resolution\n");
777 return 0;
778 }
779 scale = (int)div_u64((t2 - t1), loop >> shift);
780 khz = (scale * apbt_freq * 1000) >> shift;
781 printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz);
782 return khz;
783failed:
784 return 0;
785}
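
The closing arithmetic is easier to check with numbers plugged in; the values below (25 MHz APB clock, 1.6 GHz TSC) are assumptions for illustration, not data from this patch:

/*
 * Worked example with assumed values: loop = (25 * 1000) << 4 = 400000
 * timer ticks, i.e. 16 ms.  loop >> 5 = 12500 ticks.  Over those 16 ms
 * a 1.6 GHz TSC advances t2 - t1 = 25600000 cycles, so
 * scale = 25600000 / 12500 = 2048 and
 * khz = (2048 * 25 * 1000) >> 5 = 1600000 kHz = 1.6 GHz.
 */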
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 3704997e8b25..b5d8b0bcf235 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -393,6 +393,7 @@ void __init gart_iommu_hole_init(void)
393 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 393 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
394 int bus; 394 int bus;
395 int dev_base, dev_limit; 395 int dev_base, dev_limit;
396 u32 ctl;
396 397
397 bus = bus_dev_ranges[i].bus; 398 bus = bus_dev_ranges[i].bus;
398 dev_base = bus_dev_ranges[i].dev_base; 399 dev_base = bus_dev_ranges[i].dev_base;
@@ -406,7 +407,19 @@ void __init gart_iommu_hole_init(void)
406 gart_iommu_aperture = 1; 407 gart_iommu_aperture = 1;
407 x86_init.iommu.iommu_init = gart_iommu_init; 408 x86_init.iommu.iommu_init = gart_iommu_init;
408 409
409 aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7; 410 ctl = read_pci_config(bus, slot, 3,
411 AMD64_GARTAPERTURECTL);
412
413 /*
414 * Before we do anything else disable the GART. It may
415 * still be enabled if we boot into a crash-kernel here.
416 * Reconfiguring the GART while it is enabled could have
417 * unknown side-effects.
418 */
419 ctl &= ~GARTEN;
420 write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
421
422 aper_order = (ctl >> 1) & 7;
410 aper_size = (32 * 1024 * 1024) << aper_order; 423 aper_size = (32 * 1024 * 1024) << aper_order;
411 aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; 424 aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
412 aper_base <<= 25; 425 aper_base <<= 25;
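
The hunk above disables the GART via a read-modify-write of the aperture control register before reprogramming it. The same pattern in isolation; the helper name is hypothetical, while the register and bit names are the ones the patch itself uses:

/* Sketch: never reconfigure a GART that may still be enabled. */
static void example_disable_gart(int bus, int slot)
{
	u32 ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);

	ctl &= ~GARTEN;		/* clear the enable bit first */
	write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
}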
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index aa57c079c98f..e5a4a1e01618 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -61,12 +61,6 @@ unsigned int boot_cpu_physical_apicid = -1U;
61 61
62/* 62/*
63 * The highest APIC ID seen during enumeration. 63 * The highest APIC ID seen during enumeration.
64 *
65 * On AMD, this determines the messaging protocol we can use: if all APIC IDs
66 * are in the 0 ... 7 range, then we can use logical addressing which
67 * has some performance advantages (better broadcasting).
68 *
69 * If there's an APIC ID above 8, we use physical addressing.
70 */ 64 */
71unsigned int max_physical_apicid; 65unsigned int max_physical_apicid;
72 66
@@ -587,7 +581,7 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
587 res = (((u64)(*deltatsc)) * pm_100ms); 581 res = (((u64)(*deltatsc)) * pm_100ms);
588 do_div(res, deltapm); 582 do_div(res, deltapm);
589 apic_printk(APIC_VERBOSE, "TSC delta adjusted to " 583 apic_printk(APIC_VERBOSE, "TSC delta adjusted to "
590 "PM-Timer: %lu (%ld) \n", 584 "PM-Timer: %lu (%ld)\n",
591 (unsigned long)res, *deltatsc); 585 (unsigned long)res, *deltatsc);
592 *deltatsc = (long)res; 586 *deltatsc = (long)res;
593 } 587 }
@@ -1396,7 +1390,7 @@ void __init enable_IR_x2apic(void)
1396 } 1390 }
1397 1391
1398 local_irq_save(flags); 1392 local_irq_save(flags);
1399 mask_8259A(); 1393 legacy_pic->mask_all();
1400 mask_IO_APIC_setup(ioapic_entries); 1394 mask_IO_APIC_setup(ioapic_entries);
1401 1395
1402 if (dmar_table_init_ret) 1396 if (dmar_table_init_ret)
@@ -1428,7 +1422,7 @@ void __init enable_IR_x2apic(void)
1428nox2apic: 1422nox2apic:
1429 if (!ret) /* IR enabling failed */ 1423 if (!ret) /* IR enabling failed */
1430 restore_IO_APIC_setup(ioapic_entries); 1424 restore_IO_APIC_setup(ioapic_entries);
1431 unmask_8259A(); 1425 legacy_pic->restore_mask();
1432 local_irq_restore(flags); 1426 local_irq_restore(flags);
1433 1427
1434out: 1428out:
@@ -1646,8 +1640,8 @@ int __init APIC_init_uniprocessor(void)
1646 } 1640 }
1647#endif 1641#endif
1648 1642
1643#ifndef CONFIG_SMP
1649 enable_IR_x2apic(); 1644 enable_IR_x2apic();
1650#ifdef CONFIG_X86_64
1651 default_setup_apic_routing(); 1645 default_setup_apic_routing();
1652#endif 1646#endif
1653 1647
@@ -1897,18 +1891,6 @@ void __cpuinit generic_processor_info(int apicid, int version)
1897 if (apicid > max_physical_apicid) 1891 if (apicid > max_physical_apicid)
1898 max_physical_apicid = apicid; 1892 max_physical_apicid = apicid;
1899 1893
1900#ifdef CONFIG_X86_32
1901 switch (boot_cpu_data.x86_vendor) {
1902 case X86_VENDOR_INTEL:
1903 if (num_processors > 8)
1904 def_to_bigsmp = 1;
1905 break;
1906 case X86_VENDOR_AMD:
1907 if (max_physical_apicid >= 8)
1908 def_to_bigsmp = 1;
1909 }
1910#endif
1911
1912#if defined(CONFIG_SMP) || defined(CONFIG_X86_64) 1894#if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
1913 early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; 1895 early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
1914 early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid; 1896 early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
@@ -2038,7 +2020,7 @@ static int lapic_resume(struct sys_device *dev)
2038 } 2020 }
2039 2021
2040 mask_IO_APIC_setup(ioapic_entries); 2022 mask_IO_APIC_setup(ioapic_entries);
2041 mask_8259A(); 2023 legacy_pic->mask_all();
2042 } 2024 }
2043 2025
2044 if (x2apic_mode) 2026 if (x2apic_mode)
@@ -2082,7 +2064,7 @@ static int lapic_resume(struct sys_device *dev)
2082 2064
2083 if (intr_remapping_enabled) { 2065 if (intr_remapping_enabled) {
2084 reenable_intr_remapping(x2apic_mode); 2066 reenable_intr_remapping(x2apic_mode);
2085 unmask_8259A(); 2067 legacy_pic->restore_mask();
2086 restore_IO_APIC_setup(ioapic_entries); 2068 restore_IO_APIC_setup(ioapic_entries);
2087 free_ioapic_entries(ioapic_entries); 2069 free_ioapic_entries(ioapic_entries);
2088 } 2070 }
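
Throughout this patch, direct 8259 calls (mask_8259A(), init_8259A(), disable_8259A_irq(), ...) become indirect calls through legacy_pic, so platforms without an 8259, such as Moorestown, can plug in a null implementation. A rough sketch of such an ops table, with the field list inferred from the call sites in this diff (the real definition lives in arch/x86/include/asm/i8259.h):

/* Sketch of the legacy PIC abstraction used by the new call sites. */
struct example_legacy_pic {
	int nr_legacy_irqs;		/* 16 on PCs, 0 on Moorestown */
	struct irq_chip *chip;		/* per-irq mask()/unmask() */
	void (*mask_all)(void);
	void (*restore_mask)(void);
	void (*init)(int auto_eoi);
	int (*irq_pending)(unsigned int irq);
	void (*make_irq)(unsigned int irq);
};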
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index eacbd2b31d27..09d3b17ce0c2 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -223,7 +223,7 @@ struct apic apic_flat = {
223}; 223};
224 224
225/* 225/*
226 * Physflat mode is used when there are more than 8 CPUs on a AMD system. 226 * Physflat mode is used when there are more than 8 CPUs on a system.
227 * We cannot use logical delivery in this case because the mask 227 * We cannot use logical delivery in this case because the mask
228 * overflows, so use physical mode. 228 * overflows, so use physical mode.
229 */ 229 */
@@ -240,6 +240,11 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
240 printk(KERN_DEBUG "system APIC only can use physical flat"); 240 printk(KERN_DEBUG "system APIC only can use physical flat");
241 return 1; 241 return 1;
242 } 242 }
243
244 if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "EXA", 3)) {
245 printk(KERN_DEBUG "IBM Summit detected, will use apic physical");
246 return 1;
247 }
243#endif 248#endif
244 249
245 return 0; 250 return 0;
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index dd2b5f264643..03ba1b895f5e 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -42,6 +42,7 @@
42#include <linux/errno.h> 42#include <linux/errno.h>
43#include <linux/acpi.h> 43#include <linux/acpi.h>
44#include <linux/init.h> 44#include <linux/init.h>
45#include <linux/gfp.h>
45#include <linux/nmi.h> 46#include <linux/nmi.h>
46#include <linux/smp.h> 47#include <linux/smp.h>
47#include <linux/io.h> 48#include <linux/io.h>
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index de00c4619a55..127b8718abfb 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -36,6 +36,7 @@
36#include <linux/freezer.h> 36#include <linux/freezer.h>
37#include <linux/kthread.h> 37#include <linux/kthread.h>
38#include <linux/jiffies.h> /* time_after() */ 38#include <linux/jiffies.h> /* time_after() */
39#include <linux/slab.h>
39#ifdef CONFIG_ACPI 40#ifdef CONFIG_ACPI
40#include <acpi/acpi_bus.h> 41#include <acpi/acpi_bus.h>
41#endif 42#endif
@@ -73,8 +74,8 @@
73 */ 74 */
74int sis_apic_bug = -1; 75int sis_apic_bug = -1;
75 76
76static DEFINE_SPINLOCK(ioapic_lock); 77static DEFINE_RAW_SPINLOCK(ioapic_lock);
77static DEFINE_SPINLOCK(vector_lock); 78static DEFINE_RAW_SPINLOCK(vector_lock);
78 79
79/* 80/*
80 * # of IRQ routing registers 81 * # of IRQ routing registers
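
ioapic_lock and vector_lock become raw spinlocks here because they guard low-level interrupt hardware and must never sleep; with the -rt patch set a plain spinlock_t can turn into a sleeping lock, while a raw_spinlock_t always spins. A minimal usage sketch with a hypothetical lock name:

/* Sketch: usage is unchanged apart from the raw_ prefix. */
static DEFINE_RAW_SPINLOCK(example_lock);

static void example_critical_section(void)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&example_lock, flags);
	/* hardware register access that must not sleep */
	raw_spin_unlock_irqrestore(&example_lock, flags);
}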
@@ -94,8 +95,6 @@ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
94/* # of MP IRQ source entries */ 95/* # of MP IRQ source entries */
95int mp_irq_entries; 96int mp_irq_entries;
96 97
97/* Number of legacy interrupts */
98static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY;
99/* GSI interrupts */ 98/* GSI interrupts */
100static int nr_irqs_gsi = NR_IRQS_LEGACY; 99static int nr_irqs_gsi = NR_IRQS_LEGACY;
101 100
@@ -140,33 +139,10 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node)
140 139
141/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ 140/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
142#ifdef CONFIG_SPARSE_IRQ 141#ifdef CONFIG_SPARSE_IRQ
143static struct irq_cfg irq_cfgx[] = { 142static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY];
144#else 143#else
145static struct irq_cfg irq_cfgx[NR_IRQS] = { 144static struct irq_cfg irq_cfgx[NR_IRQS];
146#endif 145#endif
147 [0] = { .vector = IRQ0_VECTOR, },
148 [1] = { .vector = IRQ1_VECTOR, },
149 [2] = { .vector = IRQ2_VECTOR, },
150 [3] = { .vector = IRQ3_VECTOR, },
151 [4] = { .vector = IRQ4_VECTOR, },
152 [5] = { .vector = IRQ5_VECTOR, },
153 [6] = { .vector = IRQ6_VECTOR, },
154 [7] = { .vector = IRQ7_VECTOR, },
155 [8] = { .vector = IRQ8_VECTOR, },
156 [9] = { .vector = IRQ9_VECTOR, },
157 [10] = { .vector = IRQ10_VECTOR, },
158 [11] = { .vector = IRQ11_VECTOR, },
159 [12] = { .vector = IRQ12_VECTOR, },
160 [13] = { .vector = IRQ13_VECTOR, },
161 [14] = { .vector = IRQ14_VECTOR, },
162 [15] = { .vector = IRQ15_VECTOR, },
163};
164
165void __init io_apic_disable_legacy(void)
166{
167 nr_legacy_irqs = 0;
168 nr_irqs_gsi = 0;
169}
170 146
171int __init arch_early_irq_init(void) 147int __init arch_early_irq_init(void)
172{ 148{
@@ -176,6 +152,11 @@ int __init arch_early_irq_init(void)
176 int node; 152 int node;
177 int i; 153 int i;
178 154
155 if (!legacy_pic->nr_legacy_irqs) {
156 nr_irqs_gsi = 0;
157 io_apic_irqs = ~0UL;
158 }
159
179 cfg = irq_cfgx; 160 cfg = irq_cfgx;
180 count = ARRAY_SIZE(irq_cfgx); 161 count = ARRAY_SIZE(irq_cfgx);
181 node= cpu_to_node(boot_cpu_id); 162 node= cpu_to_node(boot_cpu_id);
@@ -185,8 +166,14 @@ int __init arch_early_irq_init(void)
185 desc->chip_data = &cfg[i]; 166 desc->chip_data = &cfg[i];
186 zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); 167 zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
187 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); 168 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
188 if (i < nr_legacy_irqs) 169 /*
189 cpumask_setall(cfg[i].domain); 170 * For legacy IRQ's, start with assigning irq0 to irq15 to
171 * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0.
172 */
173 if (i < legacy_pic->nr_legacy_irqs) {
174 cfg[i].vector = IRQ0_VECTOR + i;
175 cpumask_set_cpu(0, cfg[i].domain);
176 }
190 } 177 }
191 178
192 return 0; 179 return 0;
@@ -406,7 +393,7 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
406 struct irq_pin_list *entry; 393 struct irq_pin_list *entry;
407 unsigned long flags; 394 unsigned long flags;
408 395
409 spin_lock_irqsave(&ioapic_lock, flags); 396 raw_spin_lock_irqsave(&ioapic_lock, flags);
410 for_each_irq_pin(entry, cfg->irq_2_pin) { 397 for_each_irq_pin(entry, cfg->irq_2_pin) {
411 unsigned int reg; 398 unsigned int reg;
412 int pin; 399 int pin;
@@ -415,11 +402,11 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
415 reg = io_apic_read(entry->apic, 0x10 + pin*2); 402 reg = io_apic_read(entry->apic, 0x10 + pin*2);
416 /* Is the remote IRR bit set? */ 403 /* Is the remote IRR bit set? */
417 if (reg & IO_APIC_REDIR_REMOTE_IRR) { 404 if (reg & IO_APIC_REDIR_REMOTE_IRR) {
418 spin_unlock_irqrestore(&ioapic_lock, flags); 405 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
419 return true; 406 return true;
420 } 407 }
421 } 408 }
422 spin_unlock_irqrestore(&ioapic_lock, flags); 409 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
423 410
424 return false; 411 return false;
425} 412}
@@ -433,10 +420,10 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
433{ 420{
434 union entry_union eu; 421 union entry_union eu;
435 unsigned long flags; 422 unsigned long flags;
436 spin_lock_irqsave(&ioapic_lock, flags); 423 raw_spin_lock_irqsave(&ioapic_lock, flags);
437 eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); 424 eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
438 eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); 425 eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
439 spin_unlock_irqrestore(&ioapic_lock, flags); 426 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
440 return eu.entry; 427 return eu.entry;
441} 428}
442 429
@@ -459,9 +446,9 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
459void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) 446void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
460{ 447{
461 unsigned long flags; 448 unsigned long flags;
462 spin_lock_irqsave(&ioapic_lock, flags); 449 raw_spin_lock_irqsave(&ioapic_lock, flags);
463 __ioapic_write_entry(apic, pin, e); 450 __ioapic_write_entry(apic, pin, e);
464 spin_unlock_irqrestore(&ioapic_lock, flags); 451 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
465} 452}
466 453
467/* 454/*
@@ -474,10 +461,10 @@ static void ioapic_mask_entry(int apic, int pin)
474 unsigned long flags; 461 unsigned long flags;
475 union entry_union eu = { .entry.mask = 1 }; 462 union entry_union eu = { .entry.mask = 1 };
476 463
477 spin_lock_irqsave(&ioapic_lock, flags); 464 raw_spin_lock_irqsave(&ioapic_lock, flags);
478 io_apic_write(apic, 0x10 + 2*pin, eu.w1); 465 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
479 io_apic_write(apic, 0x11 + 2*pin, eu.w2); 466 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
480 spin_unlock_irqrestore(&ioapic_lock, flags); 467 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
481} 468}
482 469
483/* 470/*
@@ -604,9 +591,9 @@ static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
604 591
605 BUG_ON(!cfg); 592 BUG_ON(!cfg);
606 593
607 spin_lock_irqsave(&ioapic_lock, flags); 594 raw_spin_lock_irqsave(&ioapic_lock, flags);
608 __mask_IO_APIC_irq(cfg); 595 __mask_IO_APIC_irq(cfg);
609 spin_unlock_irqrestore(&ioapic_lock, flags); 596 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
610} 597}
611 598
612static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) 599static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
@@ -614,9 +601,9 @@ static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
614 struct irq_cfg *cfg = desc->chip_data; 601 struct irq_cfg *cfg = desc->chip_data;
615 unsigned long flags; 602 unsigned long flags;
616 603
617 spin_lock_irqsave(&ioapic_lock, flags); 604 raw_spin_lock_irqsave(&ioapic_lock, flags);
618 __unmask_IO_APIC_irq(cfg); 605 __unmask_IO_APIC_irq(cfg);
619 spin_unlock_irqrestore(&ioapic_lock, flags); 606 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
620} 607}
621 608
622static void mask_IO_APIC_irq(unsigned int irq) 609static void mask_IO_APIC_irq(unsigned int irq)
@@ -865,7 +852,7 @@ static int __init find_isa_irq_apic(int irq, int type)
865 */ 852 */
866static int EISA_ELCR(unsigned int irq) 853static int EISA_ELCR(unsigned int irq)
867{ 854{
868 if (irq < nr_legacy_irqs) { 855 if (irq < legacy_pic->nr_legacy_irqs) {
869 unsigned int port = 0x4d0 + (irq >> 3); 856 unsigned int port = 0x4d0 + (irq >> 3);
870 return (inb(port) >> (irq & 7)) & 1; 857 return (inb(port) >> (irq & 7)) & 1;
871 } 858 }
@@ -1140,12 +1127,12 @@ void lock_vector_lock(void)
1140 /* Used so that the online set of cpus does not change 1127
1141 * during assign_irq_vector. 1128 * during assign_irq_vector.
1142 */ 1129 */
1143 spin_lock(&vector_lock); 1130 raw_spin_lock(&vector_lock);
1144} 1131}
1145 1132
1146void unlock_vector_lock(void) 1133void unlock_vector_lock(void)
1147{ 1134{
1148 spin_unlock(&vector_lock); 1135 raw_spin_unlock(&vector_lock);
1149} 1136}
1150 1137
1151static int 1138static int
@@ -1162,7 +1149,8 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1162 * Also, we've got to be careful not to trash gate 1149 * Also, we've got to be careful not to trash gate
1163 * 0x80, because int 0x80 is hm, kind of importantish. ;) 1150 * 0x80, because int 0x80 is hm, kind of importantish. ;)
1164 */ 1151 */
1165 static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; 1152 static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
1153 static int current_offset = VECTOR_OFFSET_START % 8;
1166 unsigned int old_vector; 1154 unsigned int old_vector;
1167 int cpu, err; 1155 int cpu, err;
1168 cpumask_var_t tmp_mask; 1156 cpumask_var_t tmp_mask;
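
The vector search now starts at FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START rather than FIRST_DEVICE_VECTOR, since the legacy vectors are no longer reserved on every CPU. A short note on the arithmetic, with constant values assumed rather than taken from this diff:

/*
 * Assuming FIRST_EXTERNAL_VECTOR = 0x20 and VECTOR_OFFSET_START = 0x10
 * (check asm/irq_vectors.h), the search begins at vector 0x30.  The
 * legacy vectors are no longer blocked on every CPU up front; a
 * collision on a given CPU is still impossible because
 * __assign_irq_vector() skips any vector whose per-cpu vector_irq
 * slot is already in use.
 */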
@@ -1198,7 +1186,7 @@ next:
1198 if (vector >= first_system_vector) { 1186 if (vector >= first_system_vector) {
1199 /* If out of vectors on large boxen, must share them. */ 1187 /* If out of vectors on large boxen, must share them. */
1200 offset = (offset + 1) % 8; 1188 offset = (offset + 1) % 8;
1201 vector = FIRST_DEVICE_VECTOR + offset; 1189 vector = FIRST_EXTERNAL_VECTOR + offset;
1202 } 1190 }
1203 if (unlikely(current_vector == vector)) 1191 if (unlikely(current_vector == vector))
1204 continue; 1192 continue;
@@ -1232,9 +1220,9 @@ int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1232 int err; 1220 int err;
1233 unsigned long flags; 1221 unsigned long flags;
1234 1222
1235 spin_lock_irqsave(&vector_lock, flags); 1223 raw_spin_lock_irqsave(&vector_lock, flags);
1236 err = __assign_irq_vector(irq, cfg, mask); 1224 err = __assign_irq_vector(irq, cfg, mask);
1237 spin_unlock_irqrestore(&vector_lock, flags); 1225 raw_spin_unlock_irqrestore(&vector_lock, flags);
1238 return err; 1226 return err;
1239} 1227}
1240 1228
@@ -1268,14 +1256,27 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
1268void __setup_vector_irq(int cpu) 1256void __setup_vector_irq(int cpu)
1269{ 1257{
1270 /* Initialize vector_irq on a new cpu */ 1258 /* Initialize vector_irq on a new cpu */
1271 /* This function must be called with vector_lock held */
1272 int irq, vector; 1259 int irq, vector;
1273 struct irq_cfg *cfg; 1260 struct irq_cfg *cfg;
1274 struct irq_desc *desc; 1261 struct irq_desc *desc;
1275 1262
1263 /*
1264 * vector_lock makes sure that we don't run into irq vector
1265 * assignments that might be happening on another cpu in parallel
1266 * while we set up our initial vector-to-irq mappings.
1267 */
1268 raw_spin_lock(&vector_lock);
1276 /* Mark the inuse vectors */ 1269 /* Mark the inuse vectors */
1277 for_each_irq_desc(irq, desc) { 1270 for_each_irq_desc(irq, desc) {
1278 cfg = desc->chip_data; 1271 cfg = desc->chip_data;
1272
1273 /*
1274 * If it is a legacy IRQ handled by the legacy PIC, this cpu
1275 * will be part of the irq_cfg's domain.
1276 */
1277 if (irq < legacy_pic->nr_legacy_irqs && !IO_APIC_IRQ(irq))
1278 cpumask_set_cpu(cpu, cfg->domain);
1279
1279 if (!cpumask_test_cpu(cpu, cfg->domain)) 1280 if (!cpumask_test_cpu(cpu, cfg->domain))
1280 continue; 1281 continue;
1281 vector = cfg->vector; 1282 vector = cfg->vector;
@@ -1291,6 +1292,7 @@ void __setup_vector_irq(int cpu)
1291 if (!cpumask_test_cpu(cpu, cfg->domain)) 1292 if (!cpumask_test_cpu(cpu, cfg->domain))
1292 per_cpu(vector_irq, cpu)[vector] = -1; 1293 per_cpu(vector_irq, cpu)[vector] = -1;
1293 } 1294 }
1295 raw_spin_unlock(&vector_lock);
1294} 1296}
1295 1297
1296static struct irq_chip ioapic_chip; 1298static struct irq_chip ioapic_chip;
@@ -1440,6 +1442,14 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
1440 1442
1441 cfg = desc->chip_data; 1443 cfg = desc->chip_data;
1442 1444
1445 /*
1446 * For legacy irqs, cfg->domain starts with cpu 0 for legacy
1447 * controllers like the 8259. Now that the IO-APIC handles this irq,
1448 * update cfg->domain accordingly.
1449 */
1450 if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain))
1451 apic->vector_allocation_domain(0, cfg->domain);
1452
1443 if (assign_irq_vector(irq, cfg, apic->target_cpus())) 1453 if (assign_irq_vector(irq, cfg, apic->target_cpus()))
1444 return; 1454 return;
1445 1455
@@ -1461,8 +1471,8 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
1461 } 1471 }
1462 1472
1463 ioapic_register_intr(irq, desc, trigger); 1473 ioapic_register_intr(irq, desc, trigger);
1464 if (irq < nr_legacy_irqs) 1474 if (irq < legacy_pic->nr_legacy_irqs)
1465 disable_8259A_irq(irq); 1475 legacy_pic->chip->mask(irq);
1466 1476
1467 ioapic_write_entry(apic_id, pin, entry); 1477 ioapic_write_entry(apic_id, pin, entry);
1468} 1478}
@@ -1473,7 +1483,7 @@ static struct {
1473 1483
1474static void __init setup_IO_APIC_irqs(void) 1484static void __init setup_IO_APIC_irqs(void)
1475{ 1485{
1476 int apic_id = 0, pin, idx, irq; 1486 int apic_id, pin, idx, irq;
1477 int notcon = 0; 1487 int notcon = 0;
1478 struct irq_desc *desc; 1488 struct irq_desc *desc;
1479 struct irq_cfg *cfg; 1489 struct irq_cfg *cfg;
@@ -1481,14 +1491,7 @@ static void __init setup_IO_APIC_irqs(void)
1481 1491
1482 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); 1492 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1483 1493
1484#ifdef CONFIG_ACPI 1494 for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
1485 if (!acpi_disabled && acpi_ioapic) {
1486 apic_id = mp_find_ioapic(0);
1487 if (apic_id < 0)
1488 apic_id = 0;
1489 }
1490#endif
1491
1492 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { 1495 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
1493 idx = find_irq_entry(apic_id, pin, mp_INT); 1496 idx = find_irq_entry(apic_id, pin, mp_INT);
1494 if (idx == -1) { 1497 if (idx == -1) {
@@ -1510,6 +1513,9 @@ static void __init setup_IO_APIC_irqs(void)
1510 1513
1511 irq = pin_2_irq(idx, apic_id, pin); 1514 irq = pin_2_irq(idx, apic_id, pin);
1512 1515
1516 if ((apic_id > 0) && (irq > 16))
1517 continue;
1518
1513 /* 1519 /*
1514 * Skip the timer IRQ if there's a quirk handler 1520 * Skip the timer IRQ if there's a quirk handler
1515 * installed and if it returns 1: 1521 * installed and if it returns 1:
@@ -1539,6 +1545,56 @@ static void __init setup_IO_APIC_irqs(void)
1539} 1545}
1540 1546
1541/* 1547/*
1548 * For a GSI that is not in the first IO-APIC but cannot use
1549 * acpi_register_gsi(), like the special SCI on some IBM x3330
1550 * systems.
1551 */
1552void setup_IO_APIC_irq_extra(u32 gsi)
1553{
1554 int apic_id = 0, pin, idx, irq;
1555 int node = cpu_to_node(boot_cpu_id);
1556 struct irq_desc *desc;
1557 struct irq_cfg *cfg;
1558
1559 /*
1560 * Convert 'gsi' to 'ioapic.pin'.
1561 */
1562 apic_id = mp_find_ioapic(gsi);
1563 if (apic_id < 0)
1564 return;
1565
1566 pin = mp_find_ioapic_pin(apic_id, gsi);
1567 idx = find_irq_entry(apic_id, pin, mp_INT);
1568 if (idx == -1)
1569 return;
1570
1571 irq = pin_2_irq(idx, apic_id, pin);
1572#ifdef CONFIG_SPARSE_IRQ
1573 desc = irq_to_desc(irq);
1574 if (desc)
1575 return;
1576#endif
1577 desc = irq_to_desc_alloc_node(irq, node);
1578 if (!desc) {
1579 printk(KERN_INFO "cannot get irq_desc for %d\n", irq);
1580 return;
1581 }
1582
1583 cfg = desc->chip_data;
1584 add_pin_to_irq_node(cfg, node, apic_id, pin);
1585
1586 if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) {
1587 pr_debug("Pin %d-%d already programmed\n",
1588 mp_ioapics[apic_id].apicid, pin);
1589 return;
1590 }
1591 set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed);
1592
1593 setup_IO_APIC_irq(apic_id, pin, irq, desc,
1594 irq_trigger(idx), irq_polarity(idx));
1595}
1596
1597/*
1542 * Set up the timer pin, possibly with the 8259A-master behind. 1598 * Set up the timer pin, possibly with the 8259A-master behind.
1543 */ 1599 */
1544static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin, 1600static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
@@ -1601,14 +1657,14 @@ __apicdebuginit(void) print_IO_APIC(void)
1601 1657
1602 for (apic = 0; apic < nr_ioapics; apic++) { 1658 for (apic = 0; apic < nr_ioapics; apic++) {
1603 1659
1604 spin_lock_irqsave(&ioapic_lock, flags); 1660 raw_spin_lock_irqsave(&ioapic_lock, flags);
1605 reg_00.raw = io_apic_read(apic, 0); 1661 reg_00.raw = io_apic_read(apic, 0);
1606 reg_01.raw = io_apic_read(apic, 1); 1662 reg_01.raw = io_apic_read(apic, 1);
1607 if (reg_01.bits.version >= 0x10) 1663 if (reg_01.bits.version >= 0x10)
1608 reg_02.raw = io_apic_read(apic, 2); 1664 reg_02.raw = io_apic_read(apic, 2);
1609 if (reg_01.bits.version >= 0x20) 1665 if (reg_01.bits.version >= 0x20)
1610 reg_03.raw = io_apic_read(apic, 3); 1666 reg_03.raw = io_apic_read(apic, 3);
1611 spin_unlock_irqrestore(&ioapic_lock, flags); 1667 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
1612 1668
1613 printk("\n"); 1669 printk("\n");
1614 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid); 1670 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid);
@@ -1647,7 +1703,7 @@ __apicdebuginit(void) print_IO_APIC(void)
1647 printk(KERN_DEBUG ".... IRQ redirection table:\n"); 1703 printk(KERN_DEBUG ".... IRQ redirection table:\n");
1648 1704
1649 printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" 1705 printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
1650 " Stat Dmod Deli Vect: \n"); 1706 " Stat Dmod Deli Vect:\n");
1651 1707
1652 for (i = 0; i <= reg_01.bits.entries; i++) { 1708 for (i = 0; i <= reg_01.bits.entries; i++) {
1653 struct IO_APIC_route_entry entry; 1709 struct IO_APIC_route_entry entry;
@@ -1825,12 +1881,12 @@ __apicdebuginit(void) print_PIC(void)
1825 unsigned int v; 1881 unsigned int v;
1826 unsigned long flags; 1882 unsigned long flags;
1827 1883
1828 if (!nr_legacy_irqs) 1884 if (!legacy_pic->nr_legacy_irqs)
1829 return; 1885 return;
1830 1886
1831 printk(KERN_DEBUG "\nprinting PIC contents\n"); 1887 printk(KERN_DEBUG "\nprinting PIC contents\n");
1832 1888
1833 spin_lock_irqsave(&i8259A_lock, flags); 1889 raw_spin_lock_irqsave(&i8259A_lock, flags);
1834 1890
1835 v = inb(0xa1) << 8 | inb(0x21); 1891 v = inb(0xa1) << 8 | inb(0x21);
1836 printk(KERN_DEBUG "... PIC IMR: %04x\n", v); 1892 printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
@@ -1844,7 +1900,7 @@ __apicdebuginit(void) print_PIC(void)
1844 outb(0x0a,0xa0); 1900 outb(0x0a,0xa0);
1845 outb(0x0a,0x20); 1901 outb(0x0a,0x20);
1846 1902
1847 spin_unlock_irqrestore(&i8259A_lock, flags); 1903 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
1848 1904
1849 printk(KERN_DEBUG "... PIC ISR: %04x\n", v); 1905 printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
1850 1906
@@ -1903,13 +1959,13 @@ void __init enable_IO_APIC(void)
1903 * The number of IO-APIC IRQ registers (== #pins): 1959 * The number of IO-APIC IRQ registers (== #pins):
1904 */ 1960 */
1905 for (apic = 0; apic < nr_ioapics; apic++) { 1961 for (apic = 0; apic < nr_ioapics; apic++) {
1906 spin_lock_irqsave(&ioapic_lock, flags); 1962 raw_spin_lock_irqsave(&ioapic_lock, flags);
1907 reg_01.raw = io_apic_read(apic, 1); 1963 reg_01.raw = io_apic_read(apic, 1);
1908 spin_unlock_irqrestore(&ioapic_lock, flags); 1964 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
1909 nr_ioapic_registers[apic] = reg_01.bits.entries+1; 1965 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
1910 } 1966 }
1911 1967
1912 if (!nr_legacy_irqs) 1968 if (!legacy_pic->nr_legacy_irqs)
1913 return; 1969 return;
1914 1970
1915 for(apic = 0; apic < nr_ioapics; apic++) { 1971 for(apic = 0; apic < nr_ioapics; apic++) {
@@ -1966,7 +2022,7 @@ void disable_IO_APIC(void)
1966 */ 2022 */
1967 clear_IO_APIC(); 2023 clear_IO_APIC();
1968 2024
1969 if (!nr_legacy_irqs) 2025 if (!legacy_pic->nr_legacy_irqs)
1970 return; 2026 return;
1971 2027
1972 /* 2028 /*
@@ -2045,9 +2101,9 @@ void __init setup_ioapic_ids_from_mpc(void)
2045 for (apic_id = 0; apic_id < nr_ioapics; apic_id++) { 2101 for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
2046 2102
2047 /* Read the register 0 value */ 2103 /* Read the register 0 value */
2048 spin_lock_irqsave(&ioapic_lock, flags); 2104 raw_spin_lock_irqsave(&ioapic_lock, flags);
2049 reg_00.raw = io_apic_read(apic_id, 0); 2105 reg_00.raw = io_apic_read(apic_id, 0);
2050 spin_unlock_irqrestore(&ioapic_lock, flags); 2106 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2051 2107
2052 old_id = mp_ioapics[apic_id].apicid; 2108 old_id = mp_ioapics[apic_id].apicid;
2053 2109
@@ -2106,16 +2162,16 @@ void __init setup_ioapic_ids_from_mpc(void)
2106 mp_ioapics[apic_id].apicid); 2162 mp_ioapics[apic_id].apicid);
2107 2163
2108 reg_00.bits.ID = mp_ioapics[apic_id].apicid; 2164 reg_00.bits.ID = mp_ioapics[apic_id].apicid;
2109 spin_lock_irqsave(&ioapic_lock, flags); 2165 raw_spin_lock_irqsave(&ioapic_lock, flags);
2110 io_apic_write(apic_id, 0, reg_00.raw); 2166 io_apic_write(apic_id, 0, reg_00.raw);
2111 spin_unlock_irqrestore(&ioapic_lock, flags); 2167 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2112 2168
2113 /* 2169 /*
2114 * Sanity check 2170 * Sanity check
2115 */ 2171 */
2116 spin_lock_irqsave(&ioapic_lock, flags); 2172 raw_spin_lock_irqsave(&ioapic_lock, flags);
2117 reg_00.raw = io_apic_read(apic_id, 0); 2173 reg_00.raw = io_apic_read(apic_id, 0);
2118 spin_unlock_irqrestore(&ioapic_lock, flags); 2174 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2119 if (reg_00.bits.ID != mp_ioapics[apic_id].apicid) 2175 if (reg_00.bits.ID != mp_ioapics[apic_id].apicid)
2120 printk("could not set ID!\n"); 2176 printk("could not set ID!\n");
2121 else 2177 else
@@ -2198,15 +2254,15 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
2198 unsigned long flags; 2254 unsigned long flags;
2199 struct irq_cfg *cfg; 2255 struct irq_cfg *cfg;
2200 2256
2201 spin_lock_irqsave(&ioapic_lock, flags); 2257 raw_spin_lock_irqsave(&ioapic_lock, flags);
2202 if (irq < nr_legacy_irqs) { 2258 if (irq < legacy_pic->nr_legacy_irqs) {
2203 disable_8259A_irq(irq); 2259 legacy_pic->chip->mask(irq);
2204 if (i8259A_irq_pending(irq)) 2260 if (legacy_pic->irq_pending(irq))
2205 was_pending = 1; 2261 was_pending = 1;
2206 } 2262 }
2207 cfg = irq_cfg(irq); 2263 cfg = irq_cfg(irq);
2208 __unmask_IO_APIC_irq(cfg); 2264 __unmask_IO_APIC_irq(cfg);
2209 spin_unlock_irqrestore(&ioapic_lock, flags); 2265 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2210 2266
2211 return was_pending; 2267 return was_pending;
2212} 2268}
@@ -2217,9 +2273,9 @@ static int ioapic_retrigger_irq(unsigned int irq)
2217 struct irq_cfg *cfg = irq_cfg(irq); 2273 struct irq_cfg *cfg = irq_cfg(irq);
2218 unsigned long flags; 2274 unsigned long flags;
2219 2275
2220 spin_lock_irqsave(&vector_lock, flags); 2276 raw_spin_lock_irqsave(&vector_lock, flags);
2221 apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); 2277 apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
2222 spin_unlock_irqrestore(&vector_lock, flags); 2278 raw_spin_unlock_irqrestore(&vector_lock, flags);
2223 2279
2224 return 1; 2280 return 1;
2225} 2281}
@@ -2312,14 +2368,14 @@ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2312 irq = desc->irq; 2368 irq = desc->irq;
2313 cfg = desc->chip_data; 2369 cfg = desc->chip_data;
2314 2370
2315 spin_lock_irqsave(&ioapic_lock, flags); 2371 raw_spin_lock_irqsave(&ioapic_lock, flags);
2316 ret = set_desc_affinity(desc, mask, &dest); 2372 ret = set_desc_affinity(desc, mask, &dest);
2317 if (!ret) { 2373 if (!ret) {
2318 /* Only the high 8 bits are valid. */ 2374 /* Only the high 8 bits are valid. */
2319 dest = SET_APIC_LOGICAL_ID(dest); 2375 dest = SET_APIC_LOGICAL_ID(dest);
2320 __target_IO_APIC_irq(irq, dest, cfg); 2376 __target_IO_APIC_irq(irq, dest, cfg);
2321 } 2377 }
2322 spin_unlock_irqrestore(&ioapic_lock, flags); 2378 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2323 2379
2324 return ret; 2380 return ret;
2325} 2381}
@@ -2434,6 +2490,13 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2434 cfg = irq_cfg(irq); 2490 cfg = irq_cfg(irq);
2435 raw_spin_lock(&desc->lock); 2491 raw_spin_lock(&desc->lock);
2436 2492
2493 /*
2494 * Check if the irq migration is in progress. If so, we
2495 * haven't received the cleanup request yet for this irq.
2496 */
2497 if (cfg->move_in_progress)
2498 goto unlock;
2499
2437 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) 2500 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
2438 goto unlock; 2501 goto unlock;
2439 2502
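
A short note on why the added move_in_progress test is needed, as inferred from the hunk itself rather than the changelog:

/*
 * While cfg->move_in_progress is set, the cleanup IPI for the old
 * vector has not been issued yet, so releasing the vector here would
 * race with the pending migration; bailing out defers the work to a
 * later cleanup interrupt.
 */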
@@ -2547,9 +2610,9 @@ static void eoi_ioapic_irq(struct irq_desc *desc)
2547 irq = desc->irq; 2610 irq = desc->irq;
2548 cfg = desc->chip_data; 2611 cfg = desc->chip_data;
2549 2612
2550 spin_lock_irqsave(&ioapic_lock, flags); 2613 raw_spin_lock_irqsave(&ioapic_lock, flags);
2551 __eoi_ioapic_irq(irq, cfg); 2614 __eoi_ioapic_irq(irq, cfg);
2552 spin_unlock_irqrestore(&ioapic_lock, flags); 2615 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2553} 2616}
2554 2617
2555static void ack_apic_level(unsigned int irq) 2618static void ack_apic_level(unsigned int irq)
@@ -2727,8 +2790,8 @@ static inline void init_IO_APIC_traps(void)
2727 * so default to an old-fashioned 8259 2790 * so default to an old-fashioned 8259
2728 * interrupt if we can.. 2791 * interrupt if we can..
2729 */ 2792 */
2730 if (irq < nr_legacy_irqs) 2793 if (irq < legacy_pic->nr_legacy_irqs)
2731 make_8259A_irq(irq); 2794 legacy_pic->make_irq(irq);
2732 else 2795 else
2733 /* Strange. Oh, well.. */ 2796 /* Strange. Oh, well.. */
2734 desc->chip = &no_irq_chip; 2797 desc->chip = &no_irq_chip;
@@ -2885,7 +2948,7 @@ static inline void __init check_timer(void)
2885 /* 2948 /*
2886 * get/set the timer IRQ vector: 2949 * get/set the timer IRQ vector:
2887 */ 2950 */
2888 disable_8259A_irq(0); 2951 legacy_pic->chip->mask(0);
2889 assign_irq_vector(0, cfg, apic->target_cpus()); 2952 assign_irq_vector(0, cfg, apic->target_cpus());
2890 2953
2891 /* 2954 /*
@@ -2898,7 +2961,7 @@ static inline void __init check_timer(void)
2898 * automatically. 2961 * automatically.
2899 */ 2962 */
2900 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); 2963 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2901 init_8259A(1); 2964 legacy_pic->init(1);
2902#ifdef CONFIG_X86_32 2965#ifdef CONFIG_X86_32
2903 { 2966 {
2904 unsigned int ver; 2967 unsigned int ver;
@@ -2957,7 +3020,7 @@ static inline void __init check_timer(void)
2957 if (timer_irq_works()) { 3020 if (timer_irq_works()) {
2958 if (nmi_watchdog == NMI_IO_APIC) { 3021 if (nmi_watchdog == NMI_IO_APIC) {
2959 setup_nmi(); 3022 setup_nmi();
2960 enable_8259A_irq(0); 3023 legacy_pic->chip->unmask(0);
2961 } 3024 }
2962 if (disable_timer_pin_1 > 0) 3025 if (disable_timer_pin_1 > 0)
2963 clear_IO_APIC_pin(0, pin1); 3026 clear_IO_APIC_pin(0, pin1);
@@ -2980,14 +3043,14 @@ static inline void __init check_timer(void)
2980 */ 3043 */
2981 replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); 3044 replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
2982 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); 3045 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
2983 enable_8259A_irq(0); 3046 legacy_pic->chip->unmask(0);
2984 if (timer_irq_works()) { 3047 if (timer_irq_works()) {
2985 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); 3048 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
2986 timer_through_8259 = 1; 3049 timer_through_8259 = 1;
2987 if (nmi_watchdog == NMI_IO_APIC) { 3050 if (nmi_watchdog == NMI_IO_APIC) {
2988 disable_8259A_irq(0); 3051 legacy_pic->chip->mask(0);
2989 setup_nmi(); 3052 setup_nmi();
2990 enable_8259A_irq(0); 3053 legacy_pic->chip->unmask(0);
2991 } 3054 }
2992 goto out; 3055 goto out;
2993 } 3056 }
@@ -2995,7 +3058,7 @@ static inline void __init check_timer(void)
2995 * Cleanup, just in case ... 3058 * Cleanup, just in case ...
2996 */ 3059 */
2997 local_irq_disable(); 3060 local_irq_disable();
2998 disable_8259A_irq(0); 3061 legacy_pic->chip->mask(0);
2999 clear_IO_APIC_pin(apic2, pin2); 3062 clear_IO_APIC_pin(apic2, pin2);
3000 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); 3063 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
3001 } 3064 }
@@ -3014,22 +3077,22 @@ static inline void __init check_timer(void)
3014 3077
3015 lapic_register_intr(0, desc); 3078 lapic_register_intr(0, desc);
3016 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ 3079 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
3017 enable_8259A_irq(0); 3080 legacy_pic->chip->unmask(0);
3018 3081
3019 if (timer_irq_works()) { 3082 if (timer_irq_works()) {
3020 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); 3083 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
3021 goto out; 3084 goto out;
3022 } 3085 }
3023 local_irq_disable(); 3086 local_irq_disable();
3024 disable_8259A_irq(0); 3087 legacy_pic->chip->mask(0);
3025 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); 3088 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
3026 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); 3089 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
3027 3090
3028 apic_printk(APIC_QUIET, KERN_INFO 3091 apic_printk(APIC_QUIET, KERN_INFO
3029 "...trying to set up timer as ExtINT IRQ...\n"); 3092 "...trying to set up timer as ExtINT IRQ...\n");
3030 3093
3031 init_8259A(0); 3094 legacy_pic->init(0);
3032 make_8259A_irq(0); 3095 legacy_pic->make_irq(0);
3033 apic_write(APIC_LVT0, APIC_DM_EXTINT); 3096 apic_write(APIC_LVT0, APIC_DM_EXTINT);
3034 3097
3035 unlock_ExtINT_logic(); 3098 unlock_ExtINT_logic();
@@ -3071,7 +3134,7 @@ void __init setup_IO_APIC(void)
3071 /* 3134 /*
3072 * calling enable_IO_APIC() is moved to setup_local_APIC for BP 3135 * calling enable_IO_APIC() is moved to setup_local_APIC for BP
3073 */ 3136 */
3074 io_apic_irqs = nr_legacy_irqs ? ~PIC_IRQS : ~0UL; 3137 io_apic_irqs = legacy_pic->nr_legacy_irqs ? ~PIC_IRQS : ~0UL;
3075 3138
3076 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); 3139 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
3077 /* 3140 /*
@@ -3082,7 +3145,7 @@ void __init setup_IO_APIC(void)
3082 sync_Arb_IDs(); 3145 sync_Arb_IDs();
3083 setup_IO_APIC_irqs(); 3146 setup_IO_APIC_irqs();
3084 init_IO_APIC_traps(); 3147 init_IO_APIC_traps();
3085 if (nr_legacy_irqs) 3148 if (legacy_pic->nr_legacy_irqs)
3086 check_timer(); 3149 check_timer();
3087} 3150}
3088 3151
@@ -3131,13 +3194,13 @@ static int ioapic_resume(struct sys_device *dev)
3131 data = container_of(dev, struct sysfs_ioapic_data, dev); 3194 data = container_of(dev, struct sysfs_ioapic_data, dev);
3132 entry = data->entry; 3195 entry = data->entry;
3133 3196
3134 spin_lock_irqsave(&ioapic_lock, flags); 3197 raw_spin_lock_irqsave(&ioapic_lock, flags);
3135 reg_00.raw = io_apic_read(dev->id, 0); 3198 reg_00.raw = io_apic_read(dev->id, 0);
3136 if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) { 3199 if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) {
3137 reg_00.bits.ID = mp_ioapics[dev->id].apicid; 3200 reg_00.bits.ID = mp_ioapics[dev->id].apicid;
3138 io_apic_write(dev->id, 0, reg_00.raw); 3201 io_apic_write(dev->id, 0, reg_00.raw);
3139 } 3202 }
3140 spin_unlock_irqrestore(&ioapic_lock, flags); 3203 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
3141 for (i = 0; i < nr_ioapic_registers[dev->id]; i++) 3204 for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
3142 ioapic_write_entry(dev->id, i, entry[i]); 3205 ioapic_write_entry(dev->id, i, entry[i]);
3143 3206
@@ -3200,7 +3263,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
3200 if (irq_want < nr_irqs_gsi) 3263 if (irq_want < nr_irqs_gsi)
3201 irq_want = nr_irqs_gsi; 3264 irq_want = nr_irqs_gsi;
3202 3265
3203 spin_lock_irqsave(&vector_lock, flags); 3266 raw_spin_lock_irqsave(&vector_lock, flags);
3204 for (new = irq_want; new < nr_irqs; new++) { 3267 for (new = irq_want; new < nr_irqs; new++) {
3205 desc_new = irq_to_desc_alloc_node(new, node); 3268 desc_new = irq_to_desc_alloc_node(new, node);
3206 if (!desc_new) { 3269 if (!desc_new) {
@@ -3219,14 +3282,11 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
3219 irq = new; 3282 irq = new;
3220 break; 3283 break;
3221 } 3284 }
3222 spin_unlock_irqrestore(&vector_lock, flags); 3285 raw_spin_unlock_irqrestore(&vector_lock, flags);
3286
3287 if (irq > 0)
3288 dynamic_irq_init_keep_chip_data(irq);
3223 3289
3224 if (irq > 0) {
3225 dynamic_irq_init(irq);
3226 /* restore it, in case dynamic_irq_init clear it */
3227 if (desc_new)
3228 desc_new->chip_data = cfg_new;
3229 }
3230 return irq; 3290 return irq;
3231} 3291}
3232 3292
@@ -3248,20 +3308,13 @@ int create_irq(void)
3248void destroy_irq(unsigned int irq) 3308void destroy_irq(unsigned int irq)
3249{ 3309{
3250 unsigned long flags; 3310 unsigned long flags;
3251 struct irq_cfg *cfg;
3252 struct irq_desc *desc;
3253 3311
3254 /* store it, in case dynamic_irq_cleanup clear it */ 3312 dynamic_irq_cleanup_keep_chip_data(irq);
3255 desc = irq_to_desc(irq);
3256 cfg = desc->chip_data;
3257 dynamic_irq_cleanup(irq);
3258 /* connect back irq_cfg */
3259 desc->chip_data = cfg;
3260 3313
3261 free_irte(irq); 3314 free_irte(irq);
3262 spin_lock_irqsave(&vector_lock, flags); 3315 raw_spin_lock_irqsave(&vector_lock, flags);
3263 __clear_irq_vector(irq, cfg); 3316 __clear_irq_vector(irq, get_irq_chip_data(irq));
3264 spin_unlock_irqrestore(&vector_lock, flags); 3317 raw_spin_unlock_irqrestore(&vector_lock, flags);
3265} 3318}
3266 3319
3267/* 3320/*
@@ -3798,9 +3851,9 @@ int __init io_apic_get_redir_entries (int ioapic)
3798 union IO_APIC_reg_01 reg_01; 3851 union IO_APIC_reg_01 reg_01;
3799 unsigned long flags; 3852 unsigned long flags;
3800 3853
3801 spin_lock_irqsave(&ioapic_lock, flags); 3854 raw_spin_lock_irqsave(&ioapic_lock, flags);
3802 reg_01.raw = io_apic_read(ioapic, 1); 3855 reg_01.raw = io_apic_read(ioapic, 1);
3803 spin_unlock_irqrestore(&ioapic_lock, flags); 3856 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
3804 3857
3805 return reg_01.bits.entries; 3858 return reg_01.bits.entries;
3806} 3859}
@@ -3883,7 +3936,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq,
3883 /* 3936 /*
3884 * IRQs < 16 are already in the irq_2_pin[] map 3937 * IRQs < 16 are already in the irq_2_pin[] map
3885 */ 3938 */
3886 if (irq >= nr_legacy_irqs) { 3939 if (irq >= legacy_pic->nr_legacy_irqs) {
3887 cfg = desc->chip_data; 3940 cfg = desc->chip_data;
3888 if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) { 3941 if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) {
3889 printk(KERN_INFO "can not add pin %d for irq %d\n", 3942 printk(KERN_INFO "can not add pin %d for irq %d\n",
@@ -3962,9 +4015,9 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3962 if (physids_empty(apic_id_map)) 4015 if (physids_empty(apic_id_map))
3963 apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map); 4016 apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map);
3964 4017
3965 spin_lock_irqsave(&ioapic_lock, flags); 4018 raw_spin_lock_irqsave(&ioapic_lock, flags);
3966 reg_00.raw = io_apic_read(ioapic, 0); 4019 reg_00.raw = io_apic_read(ioapic, 0);
3967 spin_unlock_irqrestore(&ioapic_lock, flags); 4020 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
3968 4021
3969 if (apic_id >= get_physical_broadcast()) { 4022 if (apic_id >= get_physical_broadcast()) {
3970 printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " 4023 printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
@@ -3998,10 +4051,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3998 if (reg_00.bits.ID != apic_id) { 4051 if (reg_00.bits.ID != apic_id) {
3999 reg_00.bits.ID = apic_id; 4052 reg_00.bits.ID = apic_id;
4000 4053
4001 spin_lock_irqsave(&ioapic_lock, flags); 4054 raw_spin_lock_irqsave(&ioapic_lock, flags);
4002 io_apic_write(ioapic, 0, reg_00.raw); 4055 io_apic_write(ioapic, 0, reg_00.raw);
4003 reg_00.raw = io_apic_read(ioapic, 0); 4056 reg_00.raw = io_apic_read(ioapic, 0);
4004 spin_unlock_irqrestore(&ioapic_lock, flags); 4057 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
4005 4058
4006 /* Sanity check */ 4059 /* Sanity check */
4007 if (reg_00.bits.ID != apic_id) { 4060 if (reg_00.bits.ID != apic_id) {
@@ -4022,9 +4075,9 @@ int __init io_apic_get_version(int ioapic)
4022 union IO_APIC_reg_01 reg_01; 4075 union IO_APIC_reg_01 reg_01;
4023 unsigned long flags; 4076 unsigned long flags;
4024 4077
4025 spin_lock_irqsave(&ioapic_lock, flags); 4078 raw_spin_lock_irqsave(&ioapic_lock, flags);
4026 reg_01.raw = io_apic_read(ioapic, 1); 4079 reg_01.raw = io_apic_read(ioapic, 1);
4027 spin_unlock_irqrestore(&ioapic_lock, flags); 4080 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
4028 4081
4029 return reg_01.bits.version; 4082 return reg_01.bits.version;
4030} 4083}
@@ -4056,27 +4109,23 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
4056#ifdef CONFIG_SMP 4109#ifdef CONFIG_SMP
4057void __init setup_ioapic_dest(void) 4110void __init setup_ioapic_dest(void)
4058{ 4111{
4059 int pin, ioapic = 0, irq, irq_entry; 4112 int pin, ioapic, irq, irq_entry;
4060 struct irq_desc *desc; 4113 struct irq_desc *desc;
4061 const struct cpumask *mask; 4114 const struct cpumask *mask;
4062 4115
4063 if (skip_ioapic_setup == 1) 4116 if (skip_ioapic_setup == 1)
4064 return; 4117 return;
4065 4118
4066#ifdef CONFIG_ACPI 4119 for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
4067 if (!acpi_disabled && acpi_ioapic) {
4068 ioapic = mp_find_ioapic(0);
4069 if (ioapic < 0)
4070 ioapic = 0;
4071 }
4072#endif
4073
4074 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { 4120 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
4075 irq_entry = find_irq_entry(ioapic, pin, mp_INT); 4121 irq_entry = find_irq_entry(ioapic, pin, mp_INT);
4076 if (irq_entry == -1) 4122 if (irq_entry == -1)
4077 continue; 4123 continue;
4078 irq = pin_2_irq(irq_entry, ioapic, pin); 4124 irq = pin_2_irq(irq_entry, ioapic, pin);
4079 4125
4126 if ((ioapic > 0) && (irq > 16))
4127 continue;
4128
4080 desc = irq_to_desc(irq); 4129 desc = irq_to_desc(irq);
4081 4130
4082 /* 4131 /*
@@ -4261,3 +4310,24 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
4261 4310
4262 nr_ioapics++; 4311 nr_ioapics++;
4263} 4312}
4313
4314/* Enable IOAPIC early just for system timer */
4315void __init pre_init_apic_IRQ0(void)
4316{
4317 struct irq_cfg *cfg;
4318 struct irq_desc *desc;
4319
4320 printk(KERN_INFO "Early APIC setup for system timer0\n");
4321#ifndef CONFIG_SMP
4322 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
4323#endif
4324 desc = irq_to_desc_alloc_node(0, 0);
4325
4326 setup_local_APIC();
4327
4328 cfg = irq_cfg(0);
4329 add_pin_to_irq_node(cfg, 0, 0, 0);
4330 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
4331
4332 setup_IO_APIC_irq(0, 0, 0, desc, 0, 0);
4333}
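
pre_init_apic_IRQ0() lets Moorestown route the system timer interrupt through the IO-APIC before the regular APIC bring-up, which the APB timer code in this series depends on. A sketch of the assumed call order; the actual call sites live in the platform setup code, not in this file:

/*
 * Assumed early boot order on Moorestown:
 *
 *	pre_init_apic_IRQ0();	routes IO-APIC pin 0 to irq 0, edge
 *	apbt_time_init();	then registers the APB timer
 */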
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 0159a69396cb..1edaf15c0b8e 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -18,6 +18,7 @@
18#include <linux/delay.h> 18#include <linux/delay.h>
19#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/slab.h>
21#include <linux/sysdev.h> 22#include <linux/sysdev.h>
22#include <linux/sysctl.h> 23#include <linux/sysctl.h>
23#include <linux/percpu.h> 24#include <linux/percpu.h>
@@ -177,7 +178,7 @@ int __init check_nmi_watchdog(void)
177error: 178error:
178 if (nmi_watchdog == NMI_IO_APIC) { 179 if (nmi_watchdog == NMI_IO_APIC) {
179 if (!timer_through_8259) 180 if (!timer_through_8259)
180 disable_8259A_irq(0); 181 legacy_pic->chip->mask(0);
181 on_each_cpu(__acpi_nmi_disable, NULL, 1); 182 on_each_cpu(__acpi_nmi_disable, NULL, 1);
182 } 183 }
183 184
@@ -416,13 +417,13 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
416 417
417 /* We can be called before check_nmi_watchdog, hence NULL check. */ 418 /* We can be called before check_nmi_watchdog, hence NULL check. */
418 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { 419 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
419 static DEFINE_SPINLOCK(lock); /* Serialise the printks */ 420 static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */
420 421
421 spin_lock(&lock); 422 raw_spin_lock(&lock);
422 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); 423 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
423 show_regs(regs); 424 show_regs(regs);
424 dump_stack(); 425 dump_stack();
425 spin_unlock(&lock); 426 raw_spin_unlock(&lock);
426 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); 427 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
427 428
428 rc = 1; 429 rc = 1;
@@ -438,8 +439,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
438 * Ayiee, looks like this CPU is stuck ... 439 * Ayiee, looks like this CPU is stuck ...
439 * wait a few IRQs (5 seconds) before doing the oops ... 440 * wait a few IRQs (5 seconds) before doing the oops ...
440 */ 441 */
441 __this_cpu_inc(per_cpu_var(alert_counter)); 442 __this_cpu_inc(alert_counter);
442 if (__this_cpu_read(per_cpu_var(alert_counter)) == 5 * nmi_hz) 443 if (__this_cpu_read(alert_counter) == 5 * nmi_hz)
443 /* 444 /*
444 * die_nmi will return ONLY if NOTIFY_STOP happens.. 445 * die_nmi will return ONLY if NOTIFY_STOP happens..
445 */ 446 */
@@ -447,7 +448,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
447 regs, panic_on_timeout); 448 regs, panic_on_timeout);
448 } else { 449 } else {
449 __get_cpu_var(last_irq_sum) = sum; 450 __get_cpu_var(last_irq_sum) = sum;
450 __this_cpu_write(per_cpu_var(alert_counter), 0); 451 __this_cpu_write(alert_counter, 0);
451 } 452 }
452 453
453 /* see if the nmi watchdog went off */ 454 /* see if the nmi watchdog went off */
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 98c4665f251c..3e28401f161c 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -225,7 +225,7 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc)
225 225
226 mpc_record = 0; 226 mpc_record = 0;
227 printk(KERN_INFO 227 printk(KERN_INFO
228 "Found an OEM MPC table at %8p - parsing it ... \n", oemtable); 228 "Found an OEM MPC table at %8p - parsing it...\n", oemtable);
229 229
230 if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) { 230 if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) {
231 printk(KERN_WARNING 231 printk(KERN_WARNING
@@ -277,6 +277,7 @@ static __init void early_check_numaq(void)
277 x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus; 277 x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus;
278 x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info; 278 x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info;
279 x86_init.timers.tsc_pre_init = numaq_tsc_init; 279 x86_init.timers.tsc_pre_init = numaq_tsc_init;
280 x86_init.pci.init = pci_numaq_init;
280 } 281 }
281} 282}
282 283
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 1a6559f6768c..99d2fe016084 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -52,7 +52,32 @@ static int __init print_ipi_mode(void)
52} 52}
53late_initcall(print_ipi_mode); 53late_initcall(print_ipi_mode);
54 54
55void default_setup_apic_routing(void) 55void __init default_setup_apic_routing(void)
56{
57 int version = apic_version[boot_cpu_physical_apicid];
58
59 if (num_possible_cpus() > 8) {
60 switch (boot_cpu_data.x86_vendor) {
61 case X86_VENDOR_INTEL:
62 if (!APIC_XAPIC(version)) {
63 def_to_bigsmp = 0;
64 break;
65 }
66 /* If P4 and above, fall through */
67 case X86_VENDOR_AMD:
68 def_to_bigsmp = 1;
69 }
70 }
71
72#ifdef CONFIG_X86_BIGSMP
73 generic_bigsmp_probe();
74#endif
75
76 if (apic->setup_apic_routing)
77 apic->setup_apic_routing();
78}
79
80static void setup_apic_flat_routing(void)
56{ 81{
57#ifdef CONFIG_X86_IO_APIC 82#ifdef CONFIG_X86_IO_APIC
58 printk(KERN_INFO 83 printk(KERN_INFO
@@ -103,7 +128,7 @@ struct apic apic_default = {
103 .init_apic_ldr = default_init_apic_ldr, 128 .init_apic_ldr = default_init_apic_ldr,
104 129
105 .ioapic_phys_id_map = default_ioapic_phys_id_map, 130 .ioapic_phys_id_map = default_ioapic_phys_id_map,
106 .setup_apic_routing = default_setup_apic_routing, 131 .setup_apic_routing = setup_apic_flat_routing,
107 .multi_timer_check = NULL, 132 .multi_timer_check = NULL,
108 .apicid_to_node = default_apicid_to_node, 133 .apicid_to_node = default_apicid_to_node,
109 .cpu_to_logical_apicid = default_cpu_to_logical_apicid, 134 .cpu_to_logical_apicid = default_cpu_to_logical_apicid,
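
The new 32-bit default_setup_apic_routing() falls back to bigsmp once more than 8 CPUs are possible, because flat logical mode encodes each CPU as one bit of an 8-bit logical APIC ID. A short note on the arithmetic, mirroring what default_cpu_to_logical_apicid() does:

/*
 * In flat logical mode each CPU owns one bit of the 8-bit logical
 * APIC ID:
 *
 *	logical_id = 1UL << cpu;	valid only for cpu 0..7
 *
 * A ninth possible CPU therefore forces bigsmp/physical routing,
 * which is what the num_possible_cpus() > 8 check above implements.
 */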
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index c4cbd3080c1c..83e9be4778e2 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -67,17 +67,8 @@ void __init default_setup_apic_routing(void)
67 } 67 }
68#endif 68#endif
69 69
70 if (apic == &apic_flat) { 70 if (apic == &apic_flat && num_possible_cpus() > 8)
71 switch (boot_cpu_data.x86_vendor) { 71 apic = &apic_physflat;
72 case X86_VENDOR_INTEL:
73 if (num_processors > 8)
74 apic = &apic_physflat;
75 break;
76 case X86_VENDOR_AMD:
77 if (max_physical_apicid >= 8)
78 apic = &apic_physflat;
79 }
80 }
81 72
82 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); 73 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
83 74
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index d56b0efb2057..c085d52dbaf2 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -5,7 +5,7 @@
5 * 5 *
6 * SGI UV APIC functions (note: not an Intel compatible APIC) 6 * SGI UV APIC functions (note: not an Intel compatible APIC)
7 * 7 *
8 * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. 8 * Copyright (C) 2007-2009 Silicon Graphics, Inc. All rights reserved.
9 */ 9 */
10#include <linux/cpumask.h> 10#include <linux/cpumask.h>
11#include <linux/hardirq.h> 11#include <linux/hardirq.h>
@@ -17,9 +17,12 @@
17#include <linux/ctype.h> 17#include <linux/ctype.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/timer.h> 19#include <linux/timer.h>
20#include <linux/slab.h>
20#include <linux/cpu.h> 21#include <linux/cpu.h>
21#include <linux/init.h> 22#include <linux/init.h>
22#include <linux/io.h> 23#include <linux/io.h>
24#include <linux/pci.h>
25#include <linux/kdebug.h>
23 26
24#include <asm/uv/uv_mmrs.h> 27#include <asm/uv/uv_mmrs.h>
25#include <asm/uv/uv_hub.h> 28#include <asm/uv/uv_hub.h>
@@ -34,8 +37,13 @@
34 37
35DEFINE_PER_CPU(int, x2apic_extra_bits); 38DEFINE_PER_CPU(int, x2apic_extra_bits);
36 39
40#define PR_DEVEL(fmt, args...) pr_devel("%s: " fmt, __func__, args)
41
37static enum uv_system_type uv_system_type; 42static enum uv_system_type uv_system_type;
38static u64 gru_start_paddr, gru_end_paddr; 43static u64 gru_start_paddr, gru_end_paddr;
44int uv_min_hub_revision_id;
45EXPORT_SYMBOL_GPL(uv_min_hub_revision_id);
46static DEFINE_SPINLOCK(uv_nmi_lock);
39 47
40static inline bool is_GRU_range(u64 start, u64 end) 48static inline bool is_GRU_range(u64 start, u64 end)
41{ 49{
@@ -55,20 +63,28 @@ static int early_get_nodeid(void)
55 mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr)); 63 mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr));
56 node_id.v = *mmr; 64 node_id.v = *mmr;
57 early_iounmap(mmr, sizeof(*mmr)); 65 early_iounmap(mmr, sizeof(*mmr));
66
 67	/* Currently, all blades have the same revision number */
68 uv_min_hub_revision_id = node_id.s.revision;
69
58 return node_id.s.node_id; 70 return node_id.s.node_id;
59} 71}
60 72
61static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 73static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
62{ 74{
75 int nodeid;
76
63 if (!strcmp(oem_id, "SGI")) { 77 if (!strcmp(oem_id, "SGI")) {
78 nodeid = early_get_nodeid();
64 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; 79 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
80 x86_platform.nmi_init = uv_nmi_init;
65 if (!strcmp(oem_table_id, "UVL")) 81 if (!strcmp(oem_table_id, "UVL"))
66 uv_system_type = UV_LEGACY_APIC; 82 uv_system_type = UV_LEGACY_APIC;
67 else if (!strcmp(oem_table_id, "UVX")) 83 else if (!strcmp(oem_table_id, "UVX"))
68 uv_system_type = UV_X2APIC; 84 uv_system_type = UV_X2APIC;
69 else if (!strcmp(oem_table_id, "UVH")) { 85 else if (!strcmp(oem_table_id, "UVH")) {
70 __get_cpu_var(x2apic_extra_bits) = 86 __get_cpu_var(x2apic_extra_bits) =
71 early_get_nodeid() << (UV_APIC_PNODE_SHIFT - 1); 87 nodeid << (UV_APIC_PNODE_SHIFT - 1);
72 uv_system_type = UV_NON_UNIQUE_APIC; 88 uv_system_type = UV_NON_UNIQUE_APIC;
73 return 1; 89 return 1;
74 } 90 }
@@ -105,11 +121,9 @@ EXPORT_SYMBOL_GPL(uv_possible_blades);
105unsigned long sn_rtc_cycles_per_second; 121unsigned long sn_rtc_cycles_per_second;
106EXPORT_SYMBOL(sn_rtc_cycles_per_second); 122EXPORT_SYMBOL(sn_rtc_cycles_per_second);
107 123
108/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
109
110static const struct cpumask *uv_target_cpus(void) 124static const struct cpumask *uv_target_cpus(void)
111{ 125{
112 return cpumask_of(0); 126 return cpu_online_mask;
113} 127}
114 128
115static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) 129static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
@@ -374,13 +388,13 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
374 388
375enum map_type {map_wb, map_uc}; 389enum map_type {map_wb, map_uc};
376 390
377static __init void map_high(char *id, unsigned long base, int shift, 391static __init void map_high(char *id, unsigned long base, int pshift,
378 int max_pnode, enum map_type map_type) 392 int bshift, int max_pnode, enum map_type map_type)
379{ 393{
380 unsigned long bytes, paddr; 394 unsigned long bytes, paddr;
381 395
382 paddr = base << shift; 396 paddr = base << pshift;
383 bytes = (1UL << shift) * (max_pnode + 1); 397 bytes = (1UL << bshift) * (max_pnode + 1);
384 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, 398 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr,
385 paddr + bytes); 399 paddr + bytes);
386 if (map_type == map_uc) 400 if (map_type == map_uc)
@@ -396,7 +410,7 @@ static __init void map_gru_high(int max_pnode)
396 410
397 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); 411 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
398 if (gru.s.enable) { 412 if (gru.s.enable) {
399 map_high("GRU", gru.s.base, shift, max_pnode, map_wb); 413 map_high("GRU", gru.s.base, shift, shift, max_pnode, map_wb);
400 gru_start_paddr = ((u64)gru.s.base << shift); 414 gru_start_paddr = ((u64)gru.s.base << shift);
401 gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1); 415 gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1);
402 416
@@ -410,7 +424,7 @@ static __init void map_mmr_high(int max_pnode)
410 424
411 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); 425 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
412 if (mmr.s.enable) 426 if (mmr.s.enable)
413 map_high("MMR", mmr.s.base, shift, max_pnode, map_uc); 427 map_high("MMR", mmr.s.base, shift, shift, max_pnode, map_uc);
414} 428}
415 429
416static __init void map_mmioh_high(int max_pnode) 430static __init void map_mmioh_high(int max_pnode)
@@ -420,7 +434,8 @@ static __init void map_mmioh_high(int max_pnode)
420 434
421 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); 435 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
422 if (mmioh.s.enable) 436 if (mmioh.s.enable)
423 map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc); 437 map_high("MMIOH", mmioh.s.base, shift, mmioh.s.m_io,
438 max_pnode, map_uc);
424} 439}
425 440
426static __init void map_low_mmrs(void) 441static __init void map_low_mmrs(void)
@@ -472,7 +487,7 @@ static void uv_heartbeat(unsigned long ignored)
472 487
473static void __cpuinit uv_heartbeat_enable(int cpu) 488static void __cpuinit uv_heartbeat_enable(int cpu)
474{ 489{
475 if (!uv_cpu_hub_info(cpu)->scir.enabled) { 490 while (!uv_cpu_hub_info(cpu)->scir.enabled) {
476 struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer; 491 struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer;
477 492
478 uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); 493 uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY);
@@ -480,11 +495,10 @@ static void __cpuinit uv_heartbeat_enable(int cpu)
480 timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; 495 timer->expires = jiffies + SCIR_CPU_HB_INTERVAL;
481 add_timer_on(timer, cpu); 496 add_timer_on(timer, cpu);
482 uv_cpu_hub_info(cpu)->scir.enabled = 1; 497 uv_cpu_hub_info(cpu)->scir.enabled = 1;
483 }
484 498
485 /* check boot cpu */ 499 /* also ensure that boot cpu is enabled */
486 if (!uv_cpu_hub_info(0)->scir.enabled) 500 cpu = 0;
487 uv_heartbeat_enable(0); 501 }
488} 502}
489 503
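
The uv_heartbeat_enable() rewrite above replaces a tail recursion (re-invoking itself with cpu 0) with a loop that resets cpu to 0 and re-tests the while condition, so the boot CPU is enabled on the way out without a second stack frame. A toy restatement of the control flow, with a fixed array standing in for the per-cpu SCIR state (names and harness are illustrative only):

#include <stdbool.h>
#include <stdio.h>

#define NCPUS 4
static bool enabled[NCPUS];

/* Same shape as the reworked uv_heartbeat_enable(): instead of
 * recursing to handle the boot CPU, loop back with cpu = 0. */
static void heartbeat_enable(int cpu)
{
	while (!enabled[cpu]) {
		enabled[cpu] = true;
		printf("enabled cpu %d\n", cpu);
		/* also ensure that the boot cpu is enabled */
		cpu = 0;
	}
}

int main(void)
{
	heartbeat_enable(2);	/* enables cpu 2, then cpu 0, then stops */
	heartbeat_enable(1);	/* enables cpu 1; cpu 0 is already on */
	return 0;
}
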
490#ifdef CONFIG_HOTPLUG_CPU 504#ifdef CONFIG_HOTPLUG_CPU
@@ -543,6 +557,30 @@ late_initcall(uv_init_heartbeat);
543 557
544#endif /* !CONFIG_HOTPLUG_CPU */ 558#endif /* !CONFIG_HOTPLUG_CPU */
545 559
560/* Direct Legacy VGA I/O traffic to designated IOH */
561int uv_set_vga_state(struct pci_dev *pdev, bool decode,
562 unsigned int command_bits, bool change_bridge)
563{
564 int domain, bus, rc;
565
566 PR_DEVEL("devfn %x decode %d cmd %x chg_brdg %d\n",
567 pdev->devfn, decode, command_bits, change_bridge);
568
569 if (!change_bridge)
570 return 0;
571
572 if ((command_bits & PCI_COMMAND_IO) == 0)
573 return 0;
574
575 domain = pci_domain_nr(pdev->bus);
576 bus = pdev->bus->number;
577
578 rc = uv_bios_set_legacy_vga_target(decode, domain, bus);
579 PR_DEVEL("vga decode %d %x:%x, rc: %d\n", decode, domain, bus, rc);
580
581 return rc;
582}
583
546/* 584/*
547 * Called on each cpu to initialize the per_cpu UV data area. 585 * Called on each cpu to initialize the per_cpu UV data area.
548 * FIXME: hotplug not supported yet 586 * FIXME: hotplug not supported yet
@@ -559,6 +597,46 @@ void __cpuinit uv_cpu_init(void)
559 set_x2apic_extra_bits(uv_hub_info->pnode); 597 set_x2apic_extra_bits(uv_hub_info->pnode);
560} 598}
561 599
600/*
601 * When NMI is received, print a stack trace.
602 */
603int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
604{
605 if (reason != DIE_NMI_IPI)
606 return NOTIFY_OK;
607 /*
608 * Use a lock so only one cpu prints at a time
609 * to prevent intermixed output.
610 */
611 spin_lock(&uv_nmi_lock);
612 pr_info("NMI stack dump cpu %u:\n", smp_processor_id());
613 dump_stack();
614 spin_unlock(&uv_nmi_lock);
615
616 return NOTIFY_STOP;
617}
618
619static struct notifier_block uv_dump_stack_nmi_nb = {
620 .notifier_call = uv_handle_nmi
621};
622
623void uv_register_nmi_notifier(void)
624{
625 if (register_die_notifier(&uv_dump_stack_nmi_nb))
626 printk(KERN_WARNING "UV NMI handler failed to register\n");
627}
628
629void uv_nmi_init(void)
630{
631 unsigned int value;
632
633 /*
634 * Unmask NMI on all cpus
635 */
636 value = apic_read(APIC_LVT1) | APIC_DM_NMI;
637 value &= ~APIC_LVT_MASKED;
638 apic_write(APIC_LVT1, value);
639}
562 640
563void __init uv_system_init(void) 641void __init uv_system_init(void)
564{ 642{
@@ -624,13 +702,15 @@ void __init uv_system_init(void)
624 } 702 }
625 703
626 uv_bios_init(); 704 uv_bios_init();
627 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, 705 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id,
628 &sn_coherency_id, &sn_region_size); 706 &sn_region_size, &system_serial_number);
629 uv_rtc_init(); 707 uv_rtc_init();
630 708
631 for_each_present_cpu(cpu) { 709 for_each_present_cpu(cpu) {
710 int apicid = per_cpu(x86_cpu_to_apicid, cpu);
711
632 nid = cpu_to_node(cpu); 712 nid = cpu_to_node(cpu);
633 pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); 713 pnode = uv_apicid_to_pnode(apicid);
634 blade = boot_pnode_to_blade(pnode); 714 blade = boot_pnode_to_blade(pnode);
635 lcpu = uv_blade_info[blade].nr_possible_cpus; 715 lcpu = uv_blade_info[blade].nr_possible_cpus;
636 uv_blade_info[blade].nr_possible_cpus++; 716 uv_blade_info[blade].nr_possible_cpus++;
@@ -651,15 +731,13 @@ void __init uv_system_init(void)
651 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; 731 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
652 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; 732 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
653 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; 733 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
654 uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; 734 uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid);
655 uv_node_to_blade[nid] = blade; 735 uv_node_to_blade[nid] = blade;
656 uv_cpu_to_blade[cpu] = blade; 736 uv_cpu_to_blade[cpu] = blade;
657 max_pnode = max(pnode, max_pnode); 737 max_pnode = max(pnode, max_pnode);
658 738
659 printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, " 739 printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, lcpu %d, blade %d\n",
660 "lcpu %d, blade %d\n", 740 cpu, apicid, pnode, nid, lcpu, blade);
661 cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid,
662 lcpu, blade);
663 } 741 }
664 742
665 /* Add blade/pnode info for nodes without cpus */ 743 /* Add blade/pnode info for nodes without cpus */
@@ -680,5 +758,9 @@ void __init uv_system_init(void)
680 758
681 uv_cpu_init(); 759 uv_cpu_init();
682 uv_scir_register_cpu_notifier(); 760 uv_scir_register_cpu_notifier();
761 uv_register_nmi_notifier();
683 proc_mkdir("sgi_uv", NULL); 762 proc_mkdir("sgi_uv", NULL);
763
764 /* register Legacy VGA I/O redirection handler */
765 pci_register_set_vga_state(uv_set_vga_state);
684} 766}
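
The key change in map_high() above is splitting the single shift into pshift (scales the base into a physical address) and bshift (sizes each pnode's window), so MMIOH can size its window from mmioh.s.m_io while GRU and MMR keep passing the same shift for both. The arithmetic, restated stand-alone in plain C (the example values are invented, and 64-bit longs are assumed):

#include <stdio.h>

/* Physical address and length of an overlay window, as map_high()
 * now computes them: base scaled by pshift, size by bshift. */
static void window(unsigned long base, int pshift, int bshift,
		   int max_pnode, unsigned long *paddr, unsigned long *bytes)
{
	*paddr = base << pshift;
	*bytes = (1UL << bshift) * (max_pnode + 1);
}

int main(void)
{
	unsigned long paddr, bytes;

	/* GRU/MMR style: same shift for address and size. */
	window(0x40, 28, 28, 3, &paddr, &bytes);
	printf("0x%lx + 0x%lx\n", paddr, bytes);

	/* MMIOH style: size taken from a separate field (m_io). */
	window(0x40, 28, 24, 3, &paddr, &bytes);
	printf("0x%lx + 0x%lx\n", paddr, bytes);
	return 0;
}
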
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index b5b6b23bce53..031aa887b0eb 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1992,8 +1992,8 @@ static int __init apm_is_horked_d850md(const struct dmi_system_id *d)
1992 apm_info.disabled = 1; 1992 apm_info.disabled = 1;
1993 printk(KERN_INFO "%s machine detected. " 1993 printk(KERN_INFO "%s machine detected. "
1994 "Disabling APM.\n", d->ident); 1994 "Disabling APM.\n", d->ident);
1995 printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n"); 1995 printk(KERN_INFO "This bug is fixed in bios P15 which is available for\n");
1996 printk(KERN_INFO "download from support.intel.com \n"); 1996 printk(KERN_INFO "download from support.intel.com\n");
1997 } 1997 }
1998 return 0; 1998 return 0;
1999} 1999}
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index b0206a211b09..8bc57baaa9ad 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -15,8 +15,8 @@
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 * 17 *
18 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. 18 * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved.
19 * Copyright (c) Russ Anderson 19 * Copyright (c) Russ Anderson <rja@sgi.com>
20 */ 20 */
21 21
22#include <linux/efi.h> 22#include <linux/efi.h>
@@ -30,6 +30,7 @@ static struct uv_systab uv_systab;
30s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) 30s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
31{ 31{
32 struct uv_systab *tab = &uv_systab; 32 struct uv_systab *tab = &uv_systab;
33 s64 ret;
33 34
34 if (!tab->function) 35 if (!tab->function)
35 /* 36 /*
@@ -37,9 +38,11 @@ s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
37 */ 38 */
38 return BIOS_STATUS_UNIMPLEMENTED; 39 return BIOS_STATUS_UNIMPLEMENTED;
39 40
40 return efi_call6((void *)__va(tab->function), 41 ret = efi_call6((void *)__va(tab->function), (u64)which,
41 (u64)which, a1, a2, a3, a4, a5); 42 a1, a2, a3, a4, a5);
43 return ret;
42} 44}
45EXPORT_SYMBOL_GPL(uv_bios_call);
43 46
44s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, 47s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
45 u64 a4, u64 a5) 48 u64 a4, u64 a5)
@@ -73,11 +76,14 @@ long sn_coherency_id;
73EXPORT_SYMBOL_GPL(sn_coherency_id); 76EXPORT_SYMBOL_GPL(sn_coherency_id);
74long sn_region_size; 77long sn_region_size;
75EXPORT_SYMBOL_GPL(sn_region_size); 78EXPORT_SYMBOL_GPL(sn_region_size);
79long system_serial_number;
80EXPORT_SYMBOL_GPL(system_serial_number);
76int uv_type; 81int uv_type;
82EXPORT_SYMBOL_GPL(uv_type);
77 83
78 84
79s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, 85s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
80 long *region) 86 long *region, long *ssn)
81{ 87{
82 s64 ret; 88 s64 ret;
83 u64 v0, v1; 89 u64 v0, v1;
@@ -97,8 +103,11 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
97 *coher = part.coherence_id; 103 *coher = part.coherence_id;
98 if (region) 104 if (region)
99 *region = part.region_size; 105 *region = part.region_size;
106 if (ssn)
107 *ssn = v1;
100 return ret; 108 return ret;
101} 109}
110EXPORT_SYMBOL_GPL(uv_bios_get_sn_info);
102 111
103int 112int
104uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size, 113uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size,
@@ -154,6 +163,25 @@ s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second)
154} 163}
155EXPORT_SYMBOL_GPL(uv_bios_freq_base); 164EXPORT_SYMBOL_GPL(uv_bios_freq_base);
156 165
166/*
167 * uv_bios_set_legacy_vga_target - Set Legacy VGA I/O Target
168 * @decode: true to enable target, false to disable target
169 * @domain: PCI domain number
170 * @bus: PCI bus number
171 *
172 * Returns:
173 * 0: Success
174 * -EINVAL: Invalid domain or bus number
175 * -ENOSYS: Capability not available
176 * -EBUSY: Legacy VGA I/O cannot be retargeted at this time
177 */
178int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus)
179{
180 return uv_bios_call(UV_BIOS_SET_LEGACY_VGA_TARGET,
181 (u64)decode, (u64)domain, (u64)bus, 0, 0);
182}
183EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target);
184
157 185
158#ifdef CONFIG_EFI 186#ifdef CONFIG_EFI
159void uv_bios_init(void) 187void uv_bios_init(void)
@@ -185,4 +213,3 @@ void uv_bios_init(void)
185 213
186void uv_bios_init(void) { } 214void uv_bios_init(void) { }
187#endif 215#endif
188
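
The bios_uv.c changes widen the get_sn_info contract (adding the system serial number) and export the call helpers, but the calling convention itself stays simple: every wrapper packs its arguments into the six u64 slots of uv_bios_call(), which forwards them to the firmware entry point via efi_call6(). A hedged sketch of that marshaling pattern with a stubbed transport; fw_dispatch() and the command value here are stand-ins, not the real efi_call6() or UV_BIOS enum.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum cmd { SET_LEGACY_VGA_TARGET = 26 };	/* illustrative value */

/* Stand-in for the firmware entry point reached via efi_call6(). */
static int64_t fw_dispatch(enum cmd which, uint64_t a1, uint64_t a2,
			   uint64_t a3, uint64_t a4, uint64_t a5)
{
	printf("cmd=%d args=%llu,%llu,%llu,%llu,%llu\n", which,
	       (unsigned long long)a1, (unsigned long long)a2,
	       (unsigned long long)a3, (unsigned long long)a4,
	       (unsigned long long)a5);
	return 0;
}

/* Mirrors uv_bios_set_legacy_vga_target(): bool/int arguments are
 * simply widened into the generic u64 slots, unused slots are 0. */
static int64_t set_legacy_vga_target(bool decode, int domain, int bus)
{
	return fw_dispatch(SET_LEGACY_VGA_TARGET,
			   (uint64_t)decode, (uint64_t)domain, (uint64_t)bus,
			   0, 0);
}

int main(void)
{
	return (int)set_legacy_vga_target(true, 0, 3);
}
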
diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c
index 30f25a75fe28..5de7f4c56971 100644
--- a/arch/x86/kernel/bootflag.c
+++ b/arch/x86/kernel/bootflag.c
@@ -5,7 +5,6 @@
5#include <linux/kernel.h> 5#include <linux/kernel.h>
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/string.h> 7#include <linux/string.h>
8#include <linux/slab.h>
9#include <linux/spinlock.h> 8#include <linux/spinlock.h>
10#include <linux/acpi.h> 9#include <linux/acpi.h>
11#include <asm/io.h> 10#include <asm/io.h>
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 1d2cb383410e..c202b62f3671 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -19,8 +19,6 @@ obj-y += vmware.o hypervisor.o sched.o
19obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o 19obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
20obj-$(CONFIG_X86_64) += bugs_64.o 20obj-$(CONFIG_X86_64) += bugs_64.o
21 21
22obj-$(CONFIG_X86_CPU_DEBUG) += cpu_debug.o
23
24obj-$(CONFIG_CPU_SUP_INTEL) += intel.o 22obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
25obj-$(CONFIG_CPU_SUP_AMD) += amd.o 23obj-$(CONFIG_CPU_SUP_AMD) += amd.o
26obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o 24obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 468489b57aae..97ad79cdf688 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -32,6 +32,10 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
32 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { 32 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, 33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
34 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 }, 34 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
35 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a },
36 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a },
37 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a },
38 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a },
35 { 0, 0, 0, 0 } 39 { 0, 0, 0, 0 }
36 }; 40 };
37 41
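
The four new entries above all live in CPUID leaf 0x8000000a, EDX bits 0-3 (AMD's SVM feature leaf). Outside the kernel's scattered-feature table the same bits can be probed directly from user space; a small sketch using GCC's cpuid.h, assuming an AMD CPU that implements the leaf:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Leaf 0x8000000a: SVM revision and feature identification. */
	if (!__get_cpuid(0x8000000a, &eax, &ebx, &ecx, &edx))
		return 1;	/* leaf not implemented on this CPU */

	printf("NPT:   %u\n", (edx >> 0) & 1);
	printf("LBRV:  %u\n", (edx >> 1) & 1);
	printf("SVML:  %u\n", (edx >> 2) & 1);
	printf("NRIPS: %u\n", (edx >> 3) & 1);
	return 0;
}
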
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
deleted file mode 100644
index b368cd862997..000000000000
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ /dev/null
@@ -1,688 +0,0 @@
1/*
2 * CPU x86 architecture debug code
3 *
4 * Copyright(C) 2009 Jaswinder Singh Rajput
5 *
6 * For licencing details see kernel-base/COPYING
7 */
8
9#include <linux/interrupt.h>
10#include <linux/compiler.h>
11#include <linux/seq_file.h>
12#include <linux/debugfs.h>
13#include <linux/kprobes.h>
14#include <linux/uaccess.h>
15#include <linux/kernel.h>
16#include <linux/module.h>
17#include <linux/percpu.h>
18#include <linux/signal.h>
19#include <linux/errno.h>
20#include <linux/sched.h>
21#include <linux/types.h>
22#include <linux/init.h>
23#include <linux/slab.h>
24#include <linux/smp.h>
25
26#include <asm/cpu_debug.h>
27#include <asm/paravirt.h>
28#include <asm/system.h>
29#include <asm/traps.h>
30#include <asm/apic.h>
31#include <asm/desc.h>
32
33static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpud_arr);
34static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], cpud_priv_arr);
35static DEFINE_PER_CPU(int, cpud_priv_count);
36
37static DEFINE_MUTEX(cpu_debug_lock);
38
39static struct dentry *cpu_debugfs_dir;
40
41static struct cpu_debug_base cpu_base[] = {
42 { "mc", CPU_MC, 0 },
43 { "monitor", CPU_MONITOR, 0 },
44 { "time", CPU_TIME, 0 },
45 { "pmc", CPU_PMC, 1 },
46 { "platform", CPU_PLATFORM, 0 },
47 { "apic", CPU_APIC, 0 },
48 { "poweron", CPU_POWERON, 0 },
49 { "control", CPU_CONTROL, 0 },
50 { "features", CPU_FEATURES, 0 },
51 { "lastbranch", CPU_LBRANCH, 0 },
52 { "bios", CPU_BIOS, 0 },
53 { "freq", CPU_FREQ, 0 },
54 { "mtrr", CPU_MTRR, 0 },
55 { "perf", CPU_PERF, 0 },
56 { "cache", CPU_CACHE, 0 },
57 { "sysenter", CPU_SYSENTER, 0 },
58 { "therm", CPU_THERM, 0 },
59 { "misc", CPU_MISC, 0 },
60 { "debug", CPU_DEBUG, 0 },
61 { "pat", CPU_PAT, 0 },
62 { "vmx", CPU_VMX, 0 },
63 { "call", CPU_CALL, 0 },
64 { "base", CPU_BASE, 0 },
65 { "ver", CPU_VER, 0 },
66 { "conf", CPU_CONF, 0 },
67 { "smm", CPU_SMM, 0 },
68 { "svm", CPU_SVM, 0 },
69 { "osvm", CPU_OSVM, 0 },
70 { "tss", CPU_TSS, 0 },
71 { "cr", CPU_CR, 0 },
72 { "dt", CPU_DT, 0 },
73 { "registers", CPU_REG_ALL, 0 },
74};
75
76static struct cpu_file_base cpu_file[] = {
77 { "index", CPU_REG_ALL, 0 },
78 { "value", CPU_REG_ALL, 1 },
79};
80
81/* CPU Registers Range */
82static struct cpu_debug_range cpu_reg_range[] = {
83 { 0x00000000, 0x00000001, CPU_MC, },
84 { 0x00000006, 0x00000007, CPU_MONITOR, },
85 { 0x00000010, 0x00000010, CPU_TIME, },
86 { 0x00000011, 0x00000013, CPU_PMC, },
87 { 0x00000017, 0x00000017, CPU_PLATFORM, },
88 { 0x0000001B, 0x0000001B, CPU_APIC, },
89 { 0x0000002A, 0x0000002B, CPU_POWERON, },
90 { 0x0000002C, 0x0000002C, CPU_FREQ, },
91 { 0x0000003A, 0x0000003A, CPU_CONTROL, },
92 { 0x00000040, 0x00000047, CPU_LBRANCH, },
93 { 0x00000060, 0x00000067, CPU_LBRANCH, },
94 { 0x00000079, 0x00000079, CPU_BIOS, },
95 { 0x00000088, 0x0000008A, CPU_CACHE, },
96 { 0x0000008B, 0x0000008B, CPU_BIOS, },
97 { 0x0000009B, 0x0000009B, CPU_MONITOR, },
98 { 0x000000C1, 0x000000C4, CPU_PMC, },
99 { 0x000000CD, 0x000000CD, CPU_FREQ, },
100 { 0x000000E7, 0x000000E8, CPU_PERF, },
101 { 0x000000FE, 0x000000FE, CPU_MTRR, },
102
103 { 0x00000116, 0x0000011E, CPU_CACHE, },
104 { 0x00000174, 0x00000176, CPU_SYSENTER, },
105 { 0x00000179, 0x0000017B, CPU_MC, },
106 { 0x00000186, 0x00000189, CPU_PMC, },
107 { 0x00000198, 0x00000199, CPU_PERF, },
108 { 0x0000019A, 0x0000019A, CPU_TIME, },
109 { 0x0000019B, 0x0000019D, CPU_THERM, },
110 { 0x000001A0, 0x000001A0, CPU_MISC, },
111 { 0x000001C9, 0x000001C9, CPU_LBRANCH, },
112 { 0x000001D7, 0x000001D8, CPU_LBRANCH, },
113 { 0x000001D9, 0x000001D9, CPU_DEBUG, },
114 { 0x000001DA, 0x000001E0, CPU_LBRANCH, },
115
116 { 0x00000200, 0x0000020F, CPU_MTRR, },
117 { 0x00000250, 0x00000250, CPU_MTRR, },
118 { 0x00000258, 0x00000259, CPU_MTRR, },
119 { 0x00000268, 0x0000026F, CPU_MTRR, },
120 { 0x00000277, 0x00000277, CPU_PAT, },
121 { 0x000002FF, 0x000002FF, CPU_MTRR, },
122
123 { 0x00000300, 0x00000311, CPU_PMC, },
124 { 0x00000345, 0x00000345, CPU_PMC, },
125 { 0x00000360, 0x00000371, CPU_PMC, },
126 { 0x0000038D, 0x00000390, CPU_PMC, },
127 { 0x000003A0, 0x000003BE, CPU_PMC, },
128 { 0x000003C0, 0x000003CD, CPU_PMC, },
129 { 0x000003E0, 0x000003E1, CPU_PMC, },
130 { 0x000003F0, 0x000003F2, CPU_PMC, },
131
132 { 0x00000400, 0x00000417, CPU_MC, },
133 { 0x00000480, 0x0000048B, CPU_VMX, },
134
135 { 0x00000600, 0x00000600, CPU_DEBUG, },
136 { 0x00000680, 0x0000068F, CPU_LBRANCH, },
137 { 0x000006C0, 0x000006CF, CPU_LBRANCH, },
138
139 { 0x000107CC, 0x000107D3, CPU_PMC, },
140
141 { 0xC0000080, 0xC0000080, CPU_FEATURES, },
142 { 0xC0000081, 0xC0000084, CPU_CALL, },
143 { 0xC0000100, 0xC0000102, CPU_BASE, },
144 { 0xC0000103, 0xC0000103, CPU_TIME, },
145
146 { 0xC0010000, 0xC0010007, CPU_PMC, },
147 { 0xC0010010, 0xC0010010, CPU_CONF, },
148 { 0xC0010015, 0xC0010015, CPU_CONF, },
149 { 0xC0010016, 0xC001001A, CPU_MTRR, },
150 { 0xC001001D, 0xC001001D, CPU_MTRR, },
151 { 0xC001001F, 0xC001001F, CPU_CONF, },
152 { 0xC0010030, 0xC0010035, CPU_BIOS, },
153 { 0xC0010044, 0xC0010048, CPU_MC, },
154 { 0xC0010050, 0xC0010056, CPU_SMM, },
155 { 0xC0010058, 0xC0010058, CPU_CONF, },
156 { 0xC0010060, 0xC0010060, CPU_CACHE, },
157 { 0xC0010061, 0xC0010068, CPU_SMM, },
158 { 0xC0010069, 0xC001006B, CPU_SMM, },
159 { 0xC0010070, 0xC0010071, CPU_SMM, },
160 { 0xC0010111, 0xC0010113, CPU_SMM, },
161 { 0xC0010114, 0xC0010118, CPU_SVM, },
162 { 0xC0010140, 0xC0010141, CPU_OSVM, },
163 { 0xC0011022, 0xC0011023, CPU_CONF, },
164};
165
166static int is_typeflag_valid(unsigned cpu, unsigned flag)
167{
168 int i;
169
170 /* Standard Registers should be always valid */
171 if (flag >= CPU_TSS)
172 return 1;
173
174 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
175 if (cpu_reg_range[i].flag == flag)
176 return 1;
177 }
178
179 /* Invalid */
180 return 0;
181}
182
183static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
184 int index, unsigned flag)
185{
186 if (cpu_reg_range[index].flag == flag) {
187 *min = cpu_reg_range[index].min;
188 *max = cpu_reg_range[index].max;
189 } else
190 *max = 0;
191
192 return *max;
193}
194
195/* This function can also be called with seq = NULL for printk */
196static void print_cpu_data(struct seq_file *seq, unsigned type,
197 u32 low, u32 high)
198{
199 struct cpu_private *priv;
200 u64 val = high;
201
202 if (seq) {
203 priv = seq->private;
204 if (priv->file) {
205 val = (val << 32) | low;
206 seq_printf(seq, "0x%llx\n", val);
207 } else
208 seq_printf(seq, " %08x: %08x_%08x\n",
209 type, high, low);
210 } else
211 printk(KERN_INFO " %08x: %08x_%08x\n", type, high, low);
212}
213
214/* This function can also be called with seq = NULL for printk */
215static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
216{
217 unsigned msr, msr_min, msr_max;
218 struct cpu_private *priv;
219 u32 low, high;
220 int i;
221
222 if (seq) {
223 priv = seq->private;
224 if (priv->file) {
225 if (!rdmsr_safe_on_cpu(priv->cpu, priv->reg,
226 &low, &high))
227 print_cpu_data(seq, priv->reg, low, high);
228 return;
229 }
230 }
231
232 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
233 if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
234 continue;
235
236 for (msr = msr_min; msr <= msr_max; msr++) {
237 if (rdmsr_safe_on_cpu(cpu, msr, &low, &high))
238 continue;
239 print_cpu_data(seq, msr, low, high);
240 }
241 }
242}
243
244static void print_tss(void *arg)
245{
246 struct pt_regs *regs = task_pt_regs(current);
247 struct seq_file *seq = arg;
248 unsigned int seg;
249
250 seq_printf(seq, " RAX\t: %016lx\n", regs->ax);
251 seq_printf(seq, " RBX\t: %016lx\n", regs->bx);
252 seq_printf(seq, " RCX\t: %016lx\n", regs->cx);
253 seq_printf(seq, " RDX\t: %016lx\n", regs->dx);
254
255 seq_printf(seq, " RSI\t: %016lx\n", regs->si);
256 seq_printf(seq, " RDI\t: %016lx\n", regs->di);
257 seq_printf(seq, " RBP\t: %016lx\n", regs->bp);
258 seq_printf(seq, " ESP\t: %016lx\n", regs->sp);
259
260#ifdef CONFIG_X86_64
261 seq_printf(seq, " R08\t: %016lx\n", regs->r8);
262 seq_printf(seq, " R09\t: %016lx\n", regs->r9);
263 seq_printf(seq, " R10\t: %016lx\n", regs->r10);
264 seq_printf(seq, " R11\t: %016lx\n", regs->r11);
265 seq_printf(seq, " R12\t: %016lx\n", regs->r12);
266 seq_printf(seq, " R13\t: %016lx\n", regs->r13);
267 seq_printf(seq, " R14\t: %016lx\n", regs->r14);
268 seq_printf(seq, " R15\t: %016lx\n", regs->r15);
269#endif
270
271 asm("movl %%cs,%0" : "=r" (seg));
272 seq_printf(seq, " CS\t: %04x\n", seg);
273 asm("movl %%ds,%0" : "=r" (seg));
274 seq_printf(seq, " DS\t: %04x\n", seg);
275 seq_printf(seq, " SS\t: %04lx\n", regs->ss & 0xffff);
276 asm("movl %%es,%0" : "=r" (seg));
277 seq_printf(seq, " ES\t: %04x\n", seg);
278 asm("movl %%fs,%0" : "=r" (seg));
279 seq_printf(seq, " FS\t: %04x\n", seg);
280 asm("movl %%gs,%0" : "=r" (seg));
281 seq_printf(seq, " GS\t: %04x\n", seg);
282
283 seq_printf(seq, " EFLAGS\t: %016lx\n", regs->flags);
284
285 seq_printf(seq, " EIP\t: %016lx\n", regs->ip);
286}
287
288static void print_cr(void *arg)
289{
290 struct seq_file *seq = arg;
291
292 seq_printf(seq, " cr0\t: %016lx\n", read_cr0());
293 seq_printf(seq, " cr2\t: %016lx\n", read_cr2());
294 seq_printf(seq, " cr3\t: %016lx\n", read_cr3());
295 seq_printf(seq, " cr4\t: %016lx\n", read_cr4_safe());
296#ifdef CONFIG_X86_64
297 seq_printf(seq, " cr8\t: %016lx\n", read_cr8());
298#endif
299}
300
301static void print_desc_ptr(char *str, struct seq_file *seq, struct desc_ptr dt)
302{
303 seq_printf(seq, " %s\t: %016llx\n", str, (u64)(dt.address | dt.size));
304}
305
306static void print_dt(void *seq)
307{
308 struct desc_ptr dt;
309 unsigned long ldt;
310
311 /* IDT */
312 store_idt((struct desc_ptr *)&dt);
313 print_desc_ptr("IDT", seq, dt);
314
315 /* GDT */
316 store_gdt((struct desc_ptr *)&dt);
317 print_desc_ptr("GDT", seq, dt);
318
319 /* LDT */
320 store_ldt(ldt);
321 seq_printf(seq, " LDT\t: %016lx\n", ldt);
322
323 /* TR */
324 store_tr(ldt);
325 seq_printf(seq, " TR\t: %016lx\n", ldt);
326}
327
328static void print_dr(void *arg)
329{
330 struct seq_file *seq = arg;
331 unsigned long dr;
332 int i;
333
334 for (i = 0; i < 8; i++) {
335 /* Ignore db4, db5 */
336 if ((i == 4) || (i == 5))
337 continue;
338 get_debugreg(dr, i);
339 seq_printf(seq, " dr%d\t: %016lx\n", i, dr);
340 }
341
342 seq_printf(seq, "\n MSR\t:\n");
343}
344
345static void print_apic(void *arg)
346{
347 struct seq_file *seq = arg;
348
349#ifdef CONFIG_X86_LOCAL_APIC
350 seq_printf(seq, " LAPIC\t:\n");
351 seq_printf(seq, " ID\t\t: %08x\n", apic_read(APIC_ID) >> 24);
352 seq_printf(seq, " LVR\t\t: %08x\n", apic_read(APIC_LVR));
353 seq_printf(seq, " TASKPRI\t: %08x\n", apic_read(APIC_TASKPRI));
354 seq_printf(seq, " ARBPRI\t\t: %08x\n", apic_read(APIC_ARBPRI));
355 seq_printf(seq, " PROCPRI\t: %08x\n", apic_read(APIC_PROCPRI));
356 seq_printf(seq, " LDR\t\t: %08x\n", apic_read(APIC_LDR));
357 seq_printf(seq, " DFR\t\t: %08x\n", apic_read(APIC_DFR));
358 seq_printf(seq, " SPIV\t\t: %08x\n", apic_read(APIC_SPIV));
359 seq_printf(seq, " ISR\t\t: %08x\n", apic_read(APIC_ISR));
360 seq_printf(seq, " ESR\t\t: %08x\n", apic_read(APIC_ESR));
361 seq_printf(seq, " ICR\t\t: %08x\n", apic_read(APIC_ICR));
362 seq_printf(seq, " ICR2\t\t: %08x\n", apic_read(APIC_ICR2));
363 seq_printf(seq, " LVTT\t\t: %08x\n", apic_read(APIC_LVTT));
364 seq_printf(seq, " LVTTHMR\t: %08x\n", apic_read(APIC_LVTTHMR));
365 seq_printf(seq, " LVTPC\t\t: %08x\n", apic_read(APIC_LVTPC));
366 seq_printf(seq, " LVT0\t\t: %08x\n", apic_read(APIC_LVT0));
367 seq_printf(seq, " LVT1\t\t: %08x\n", apic_read(APIC_LVT1));
368 seq_printf(seq, " LVTERR\t\t: %08x\n", apic_read(APIC_LVTERR));
369 seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT));
370 seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT));
371 seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR));
372 if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
373 unsigned int i, v, maxeilvt;
374
375 v = apic_read(APIC_EFEAT);
376 maxeilvt = (v >> 16) & 0xff;
377 seq_printf(seq, " EFEAT\t\t: %08x\n", v);
378 seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL));
379
380 for (i = 0; i < maxeilvt; i++) {
381 v = apic_read(APIC_EILVTn(i));
382 seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v);
383 }
384 }
385#endif /* CONFIG_X86_LOCAL_APIC */
386 seq_printf(seq, "\n MSR\t:\n");
387}
388
389static int cpu_seq_show(struct seq_file *seq, void *v)
390{
391 struct cpu_private *priv = seq->private;
392
393 if (priv == NULL)
394 return -EINVAL;
395
396 switch (cpu_base[priv->type].flag) {
397 case CPU_TSS:
398 smp_call_function_single(priv->cpu, print_tss, seq, 1);
399 break;
400 case CPU_CR:
401 smp_call_function_single(priv->cpu, print_cr, seq, 1);
402 break;
403 case CPU_DT:
404 smp_call_function_single(priv->cpu, print_dt, seq, 1);
405 break;
406 case CPU_DEBUG:
407 if (priv->file == CPU_INDEX_BIT)
408 smp_call_function_single(priv->cpu, print_dr, seq, 1);
409 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
410 break;
411 case CPU_APIC:
412 if (priv->file == CPU_INDEX_BIT)
413 smp_call_function_single(priv->cpu, print_apic, seq, 1);
414 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
415 break;
416
417 default:
418 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
419 break;
420 }
421 seq_printf(seq, "\n");
422
423 return 0;
424}
425
426static void *cpu_seq_start(struct seq_file *seq, loff_t *pos)
427{
428 if (*pos == 0) /* One time is enough ;-) */
429 return seq;
430
431 return NULL;
432}
433
434static void *cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
435{
436 (*pos)++;
437
438 return cpu_seq_start(seq, pos);
439}
440
441static void cpu_seq_stop(struct seq_file *seq, void *v)
442{
443}
444
445static const struct seq_operations cpu_seq_ops = {
446 .start = cpu_seq_start,
447 .next = cpu_seq_next,
448 .stop = cpu_seq_stop,
449 .show = cpu_seq_show,
450};
451
452static int cpu_seq_open(struct inode *inode, struct file *file)
453{
454 struct cpu_private *priv = inode->i_private;
455 struct seq_file *seq;
456 int err;
457
458 err = seq_open(file, &cpu_seq_ops);
459 if (!err) {
460 seq = file->private_data;
461 seq->private = priv;
462 }
463
464 return err;
465}
466
467static int write_msr(struct cpu_private *priv, u64 val)
468{
469 u32 low, high;
470
471 high = (val >> 32) & 0xffffffff;
472 low = val & 0xffffffff;
473
474 if (!wrmsr_safe_on_cpu(priv->cpu, priv->reg, low, high))
475 return 0;
476
477 return -EPERM;
478}
479
480static int write_cpu_register(struct cpu_private *priv, const char *buf)
481{
482 int ret = -EPERM;
483 u64 val;
484
485 ret = strict_strtoull(buf, 0, &val);
486 if (ret < 0)
487 return ret;
488
489 /* Supporting only MSRs */
490 if (priv->type < CPU_TSS_BIT)
491 return write_msr(priv, val);
492
493 return ret;
494}
495
496static ssize_t cpu_write(struct file *file, const char __user *ubuf,
497 size_t count, loff_t *off)
498{
499 struct seq_file *seq = file->private_data;
500 struct cpu_private *priv = seq->private;
501 char buf[19];
502
503 if ((priv == NULL) || (count >= sizeof(buf)))
504 return -EINVAL;
505
506 if (copy_from_user(&buf, ubuf, count))
507 return -EFAULT;
508
509 buf[count] = 0;
510
511 if ((cpu_base[priv->type].write) && (cpu_file[priv->file].write))
512 if (!write_cpu_register(priv, buf))
513 return count;
514
515 return -EACCES;
516}
517
518static const struct file_operations cpu_fops = {
519 .owner = THIS_MODULE,
520 .open = cpu_seq_open,
521 .read = seq_read,
522 .write = cpu_write,
523 .llseek = seq_lseek,
524 .release = seq_release,
525};
526
527static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg,
528 unsigned file, struct dentry *dentry)
529{
530 struct cpu_private *priv = NULL;
531
532	/* Already initialized */
533 if (file == CPU_INDEX_BIT)
534 if (per_cpu(cpud_arr[type].init, cpu))
535 return 0;
536
537 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
538 if (priv == NULL)
539 return -ENOMEM;
540
541 priv->cpu = cpu;
542 priv->type = type;
543 priv->reg = reg;
544 priv->file = file;
545 mutex_lock(&cpu_debug_lock);
546 per_cpu(cpud_priv_arr[type], cpu) = priv;
547 per_cpu(cpud_priv_count, cpu)++;
548 mutex_unlock(&cpu_debug_lock);
549
550 if (file)
551 debugfs_create_file(cpu_file[file].name, S_IRUGO,
552 dentry, (void *)priv, &cpu_fops);
553 else {
554 debugfs_create_file(cpu_base[type].name, S_IRUGO,
555 per_cpu(cpud_arr[type].dentry, cpu),
556 (void *)priv, &cpu_fops);
557 mutex_lock(&cpu_debug_lock);
558 per_cpu(cpud_arr[type].init, cpu) = 1;
559 mutex_unlock(&cpu_debug_lock);
560 }
561
562 return 0;
563}
564
565static int cpu_init_regfiles(unsigned cpu, unsigned int type, unsigned reg,
566 struct dentry *dentry)
567{
568 unsigned file;
569 int err = 0;
570
571 for (file = 0; file < ARRAY_SIZE(cpu_file); file++) {
572 err = cpu_create_file(cpu, type, reg, file, dentry);
573 if (err)
574 return err;
575 }
576
577 return err;
578}
579
580static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
581{
582 struct dentry *cpu_dentry = NULL;
583 unsigned reg, reg_min, reg_max;
584 int i, err = 0;
585 char reg_dir[12];
586 u32 low, high;
587
588 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
589 if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
590 cpu_base[type].flag))
591 continue;
592
593 for (reg = reg_min; reg <= reg_max; reg++) {
594 if (rdmsr_safe_on_cpu(cpu, reg, &low, &high))
595 continue;
596
597 sprintf(reg_dir, "0x%x", reg);
598 cpu_dentry = debugfs_create_dir(reg_dir, dentry);
599 err = cpu_init_regfiles(cpu, type, reg, cpu_dentry);
600 if (err)
601 return err;
602 }
603 }
604
605 return err;
606}
607
608static int cpu_init_allreg(unsigned cpu, struct dentry *dentry)
609{
610 struct dentry *cpu_dentry = NULL;
611 unsigned type;
612 int err = 0;
613
614 for (type = 0; type < ARRAY_SIZE(cpu_base) - 1; type++) {
615 if (!is_typeflag_valid(cpu, cpu_base[type].flag))
616 continue;
617 cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry);
618 per_cpu(cpud_arr[type].dentry, cpu) = cpu_dentry;
619
620 if (type < CPU_TSS_BIT)
621 err = cpu_init_msr(cpu, type, cpu_dentry);
622 else
623 err = cpu_create_file(cpu, type, 0, CPU_INDEX_BIT,
624 cpu_dentry);
625 if (err)
626 return err;
627 }
628
629 return err;
630}
631
632static int cpu_init_cpu(void)
633{
634 struct dentry *cpu_dentry = NULL;
635 struct cpuinfo_x86 *cpui;
636 char cpu_dir[12];
637 unsigned cpu;
638 int err = 0;
639
640 for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
641 cpui = &cpu_data(cpu);
642 if (!cpu_has(cpui, X86_FEATURE_MSR))
643 continue;
644
645 sprintf(cpu_dir, "cpu%d", cpu);
646 cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
647 err = cpu_init_allreg(cpu, cpu_dentry);
648
649 pr_info("cpu%d(%d) debug files %d\n",
650 cpu, nr_cpu_ids, per_cpu(cpud_priv_count, cpu));
651 if (per_cpu(cpud_priv_count, cpu) > MAX_CPU_FILES) {
652 pr_err("Register files count %d exceeds limit %d\n",
653 per_cpu(cpud_priv_count, cpu), MAX_CPU_FILES);
654 per_cpu(cpud_priv_count, cpu) = MAX_CPU_FILES;
655 err = -ENFILE;
656 }
657 if (err)
658 return err;
659 }
660
661 return err;
662}
663
664static int __init cpu_debug_init(void)
665{
666 cpu_debugfs_dir = debugfs_create_dir("cpu", arch_debugfs_dir);
667
668 return cpu_init_cpu();
669}
670
671static void __exit cpu_debug_exit(void)
672{
673 int i, cpu;
674
675 if (cpu_debugfs_dir)
676 debugfs_remove_recursive(cpu_debugfs_dir);
677
678 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
679 for (i = 0; i < per_cpu(cpud_priv_count, cpu); i++)
680 kfree(per_cpu(cpud_priv_arr[i], cpu));
681}
682
683module_init(cpu_debug_init);
684module_exit(cpu_debug_exit);
685
686MODULE_AUTHOR("Jaswinder Singh Rajput");
687MODULE_DESCRIPTION("CPU Debug module");
688MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index f138c6c389b9..870e6cc6ad28 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -10,6 +10,20 @@ if CPU_FREQ
10 10
11comment "CPUFreq processor drivers" 11comment "CPUFreq processor drivers"
12 12
13config X86_PCC_CPUFREQ
14 tristate "Processor Clocking Control interface driver"
15 depends on ACPI && ACPI_PROCESSOR
16 help
 17	  This driver adds support for the Processor Clocking Control (PCC)
 	  interface.
18
19 For details, take a look at:
20 <file:Documentation/cpu-freq/pcc-cpufreq.txt>.
21
22 To compile this driver as a module, choose M here: the
23 module will be called pcc-cpufreq.
24
25 If in doubt, say N.
26
13config X86_ACPI_CPUFREQ 27config X86_ACPI_CPUFREQ
14 tristate "ACPI Processor P-States driver" 28 tristate "ACPI Processor P-States driver"
15 select CPU_FREQ_TABLE 29 select CPU_FREQ_TABLE
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
index 509296df294d..1840c0a5170b 100644
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ b/arch/x86/kernel/cpu/cpufreq/Makefile
@@ -4,6 +4,7 @@
4 4
5obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o 5obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o
6obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o 6obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o
7obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o
7obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o 8obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
8obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o 9obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
9obj-$(CONFIG_X86_LONGHAUL) += longhaul.o 10obj-$(CONFIG_X86_LONGHAUL) += longhaul.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 1b1920fa7c80..459168083b77 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -33,6 +33,7 @@
33#include <linux/cpufreq.h> 33#include <linux/cpufreq.h>
34#include <linux/compiler.h> 34#include <linux/compiler.h>
35#include <linux/dmi.h> 35#include <linux/dmi.h>
36#include <linux/slab.h>
36#include <trace/events/power.h> 37#include <trace/events/power.h>
37 38
38#include <linux/acpi.h> 39#include <linux/acpi.h>
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
index 006b278b0d5d..c587db472a75 100644
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
@@ -20,7 +20,6 @@
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/init.h> 21#include <linux/init.h>
22 22
23#include <linux/slab.h>
24#include <linux/delay.h> 23#include <linux/delay.h>
25#include <linux/cpufreq.h> 24#include <linux/cpufreq.h>
26 25
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
index ac27ec2264d5..16e3483be9e3 100644
--- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
@@ -80,6 +80,7 @@
80#include <linux/cpufreq.h> 80#include <linux/cpufreq.h>
81#include <linux/pci.h> 81#include <linux/pci.h>
82#include <linux/errno.h> 82#include <linux/errno.h>
83#include <linux/slab.h>
83 84
84#include <asm/processor-cyrix.h> 85#include <asm/processor-cyrix.h>
85 86
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
index da5f70fcb766..e7b559d74c52 100644
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ b/arch/x86/kernel/cpu/cpufreq/longrun.c
@@ -9,7 +9,6 @@
9#include <linux/kernel.h> 9#include <linux/kernel.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/slab.h>
13#include <linux/cpufreq.h> 12#include <linux/cpufreq.h>
14#include <linux/timex.h> 13#include <linux/timex.h>
15 14
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index 869615193720..7b8a8ba67b07 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -25,7 +25,6 @@
25#include <linux/init.h> 25#include <linux/init.h>
26#include <linux/smp.h> 26#include <linux/smp.h>
27#include <linux/cpufreq.h> 27#include <linux/cpufreq.h>
28#include <linux/slab.h>
29#include <linux/cpumask.h> 28#include <linux/cpumask.h>
30#include <linux/timex.h> 29#include <linux/timex.h>
31 30
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
new file mode 100644
index 000000000000..ce7cde713e71
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
@@ -0,0 +1,621 @@
1/*
2 * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface
3 *
4 * Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com>
5 * Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
6 * Nagananda Chumbalkar <nagananda.chumbalkar@hp.com>
7 *
8 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; version 2 of the License.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON
17 * INFRINGEMENT. See the GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write to the Free Software Foundation, Inc.,
21 * 675 Mass Ave, Cambridge, MA 02139, USA.
22 *
23 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24 */
25
26#include <linux/kernel.h>
27#include <linux/module.h>
28#include <linux/init.h>
29#include <linux/smp.h>
30#include <linux/sched.h>
31#include <linux/cpufreq.h>
32#include <linux/compiler.h>
33#include <linux/slab.h>
34
35#include <linux/acpi.h>
36#include <linux/io.h>
37#include <linux/spinlock.h>
38#include <linux/uaccess.h>
39
40#include <acpi/processor.h>
41
42#define PCC_VERSION "1.00.00"
43#define POLL_LOOPS 300
44
45#define CMD_COMPLETE 0x1
46#define CMD_GET_FREQ 0x0
47#define CMD_SET_FREQ 0x1
48
49#define BUF_SZ 4
50
51#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
52 "pcc-cpufreq", msg)
53
54struct pcc_register_resource {
55 u8 descriptor;
56 u16 length;
57 u8 space_id;
58 u8 bit_width;
59 u8 bit_offset;
60 u8 access_size;
61 u64 address;
62} __attribute__ ((packed));
63
64struct pcc_memory_resource {
65 u8 descriptor;
66 u16 length;
67 u8 space_id;
68 u8 resource_usage;
69 u8 type_specific;
70 u64 granularity;
71 u64 minimum;
72 u64 maximum;
73 u64 translation_offset;
74 u64 address_length;
75} __attribute__ ((packed));
76
77static struct cpufreq_driver pcc_cpufreq_driver;
78
79struct pcc_header {
80 u32 signature;
81 u16 length;
82 u8 major;
83 u8 minor;
84 u32 features;
85 u16 command;
86 u16 status;
87 u32 latency;
88 u32 minimum_time;
89 u32 maximum_time;
90 u32 nominal;
91 u32 throttled_frequency;
92 u32 minimum_frequency;
93};
94
95static void __iomem *pcch_virt_addr;
96static struct pcc_header __iomem *pcch_hdr;
97
98static DEFINE_SPINLOCK(pcc_lock);
99
100static struct acpi_generic_address doorbell;
101
102static u64 doorbell_preserve;
103static u64 doorbell_write;
104
105static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f,
106 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46};
107
108struct pcc_cpu {
109 u32 input_offset;
110 u32 output_offset;
111};
112
113static struct pcc_cpu *pcc_cpu_info;
114
115static int pcc_cpufreq_verify(struct cpufreq_policy *policy)
116{
117 cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
118 policy->cpuinfo.max_freq);
119 return 0;
120}
121
122static inline void pcc_cmd(void)
123{
124 u64 doorbell_value;
125 int i;
126
127 acpi_read(&doorbell_value, &doorbell);
128 acpi_write((doorbell_value & doorbell_preserve) | doorbell_write,
129 &doorbell);
130
131 for (i = 0; i < POLL_LOOPS; i++) {
132 if (ioread16(&pcch_hdr->status) & CMD_COMPLETE)
133 break;
134 }
135}
136
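
pcc_cmd() above rings the doorbell by rewriting the register as (current & doorbell_preserve) | doorbell_write: the firmware dictates, via the PCCH package, which bits the OS must keep and which it must set. A self-contained restatement of that read-modify-write; the mask values are invented for illustration.

#include <stdint.h>
#include <stdio.h>

/* Masks the firmware would supply via the PCCH package. */
static const uint64_t preserve   = 0xffffffffffffff00ULL;
static const uint64_t write_bits = 0x0000000000000001ULL;

static uint64_t ring_doorbell(uint64_t current_value)
{
	/* Keep the preserved bits, force the write bits on. */
	return (current_value & preserve) | write_bits;
}

int main(void)
{
	printf("0x%llx\n", (unsigned long long)ring_doorbell(0x1234));
	/* -> 0x1201: low byte cleared except for the doorbell bit. */
	return 0;
}
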
137static inline void pcc_clear_mapping(void)
138{
139 if (pcch_virt_addr)
140 iounmap(pcch_virt_addr);
141 pcch_virt_addr = NULL;
142}
143
144static unsigned int pcc_get_freq(unsigned int cpu)
145{
146 struct pcc_cpu *pcc_cpu_data;
147 unsigned int curr_freq;
148 unsigned int freq_limit;
149 u16 status;
150 u32 input_buffer;
151 u32 output_buffer;
152
153 spin_lock(&pcc_lock);
154
155 dprintk("get: get_freq for CPU %d\n", cpu);
156 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
157
158 input_buffer = 0x1;
159 iowrite32(input_buffer,
160 (pcch_virt_addr + pcc_cpu_data->input_offset));
161 iowrite16(CMD_GET_FREQ, &pcch_hdr->command);
162
163 pcc_cmd();
164
165 output_buffer =
166 ioread32(pcch_virt_addr + pcc_cpu_data->output_offset);
167
168 /* Clear the input buffer - we are done with the current command */
169 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
170
171 status = ioread16(&pcch_hdr->status);
172 if (status != CMD_COMPLETE) {
173 dprintk("get: FAILED: for CPU %d, status is %d\n",
174 cpu, status);
175 goto cmd_incomplete;
176 }
177 iowrite16(0, &pcch_hdr->status);
178 curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff))
179 / 100) * 1000);
180
181 dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is "
182 "0x%x, contains a value of: 0x%x. Speed is: %d MHz\n",
183 cpu, (pcch_virt_addr + pcc_cpu_data->output_offset),
184 output_buffer, curr_freq);
185
186 freq_limit = (output_buffer >> 8) & 0xff;
187 if (freq_limit != 0xff) {
188 dprintk("get: frequency for cpu %d is being temporarily"
189 " capped at %d\n", cpu, curr_freq);
190 }
191
192 spin_unlock(&pcc_lock);
193 return curr_freq;
194
195cmd_incomplete:
196 iowrite16(0, &pcch_hdr->status);
197 spin_unlock(&pcc_lock);
198	return 0;	/* ->get() returns unsigned int; 0 signals failure */
199}
200
201static int pcc_cpufreq_target(struct cpufreq_policy *policy,
202 unsigned int target_freq,
203 unsigned int relation)
204{
205 struct pcc_cpu *pcc_cpu_data;
206 struct cpufreq_freqs freqs;
207 u16 status;
208 u32 input_buffer;
209 int cpu;
210
211 spin_lock(&pcc_lock);
212 cpu = policy->cpu;
213 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
214
215 dprintk("target: CPU %d should go to target freq: %d "
216 "(virtual) input_offset is 0x%x\n",
217 cpu, target_freq,
218 (pcch_virt_addr + pcc_cpu_data->input_offset));
219
220 freqs.new = target_freq;
221 freqs.cpu = cpu;
222 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
223
224 input_buffer = 0x1 | (((target_freq * 100)
225 / (ioread32(&pcch_hdr->nominal) * 1000)) << 8);
226 iowrite32(input_buffer,
227 (pcch_virt_addr + pcc_cpu_data->input_offset));
228 iowrite16(CMD_SET_FREQ, &pcch_hdr->command);
229
230 pcc_cmd();
231
232 /* Clear the input buffer - we are done with the current command */
233 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
234
235 status = ioread16(&pcch_hdr->status);
236 if (status != CMD_COMPLETE) {
237 dprintk("target: FAILED for cpu %d, with status: 0x%x\n",
238 cpu, status);
239 goto cmd_incomplete;
240 }
241 iowrite16(0, &pcch_hdr->status);
242
243 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
244 dprintk("target: was SUCCESSFUL for cpu %d\n", cpu);
245 spin_unlock(&pcc_lock);
246
247 return 0;
248
249cmd_incomplete:
250 iowrite16(0, &pcch_hdr->status);
251 spin_unlock(&pcc_lock);
252 return -EINVAL;
253}
254
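
pcc_get_freq() and pcc_cpufreq_target() above share one encoding: frequency travels as a percentage of the nominal frequency, carried in a single byte (bits 8-15 of the input buffer on set; bits 0-7 of the output buffer on get, with bits 8-15 reporting any temporary cap). The conversion arithmetic, restated stand-alone with the same units the driver uses (PCCH nominal in MHz, cpufreq in kHz):

#include <stdint.h>
#include <stdio.h>

/* Decode: output byte is percent of nominal; cpufreq wants kHz. */
static unsigned int pcc_decode_khz(uint32_t nominal_mhz, uint32_t output)
{
	return nominal_mhz * (output & 0xff) / 100 * 1000;
}

/* Encode: command bit 0 set, percent of nominal in bits 8-15. */
static uint32_t pcc_encode(uint32_t nominal_mhz, unsigned int target_khz)
{
	return 0x1 | ((target_khz * 100 / (nominal_mhz * 1000)) << 8);
}

int main(void)
{
	/* 2 GHz nominal, firmware reports 75% -> 1.5 GHz. */
	printf("%u kHz\n", pcc_decode_khz(2000, 75));
	/* Ask for 1.5 GHz -> percent byte 75, command bit set. */
	printf("0x%x\n", pcc_encode(2000, 1500000));
	return 0;
}
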
255static int pcc_get_offset(int cpu)
256{
257 acpi_status status;
258 struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
259 union acpi_object *pccp, *offset;
260 struct pcc_cpu *pcc_cpu_data;
261 struct acpi_processor *pr;
262 int ret = 0;
263
264 pr = per_cpu(processors, cpu);
265 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
266
267 status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer);
268 if (ACPI_FAILURE(status))
269 return -ENODEV;
270
271 pccp = buffer.pointer;
272 if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) {
273 ret = -ENODEV;
274 goto out_free;
275	}
276
277 offset = &(pccp->package.elements[0]);
278 if (!offset || offset->type != ACPI_TYPE_INTEGER) {
279 ret = -ENODEV;
280 goto out_free;
281 }
282
283 pcc_cpu_data->input_offset = offset->integer.value;
284
285 offset = &(pccp->package.elements[1]);
286 if (!offset || offset->type != ACPI_TYPE_INTEGER) {
287 ret = -ENODEV;
288 goto out_free;
289 }
290
291 pcc_cpu_data->output_offset = offset->integer.value;
292
293 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
294 memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ);
295
296 dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data "
297 "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n",
298 cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset);
299out_free:
300 kfree(buffer.pointer);
301 return ret;
302}
303
304static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
305{
306 acpi_status status;
307 struct acpi_object_list input;
308 struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
309 union acpi_object in_params[4];
310 union acpi_object *out_obj;
311 u32 capabilities[2];
312 u32 errors;
313 u32 supported;
314 int ret = 0;
315
316 input.count = 4;
317 input.pointer = in_params;
320 in_params[0].type = ACPI_TYPE_BUFFER;
321 in_params[0].buffer.length = 16;
322 in_params[0].buffer.pointer = OSC_UUID;
323 in_params[1].type = ACPI_TYPE_INTEGER;
324 in_params[1].integer.value = 1;
325 in_params[2].type = ACPI_TYPE_INTEGER;
326 in_params[2].integer.value = 2;
327 in_params[3].type = ACPI_TYPE_BUFFER;
328 in_params[3].buffer.length = 8;
329 in_params[3].buffer.pointer = (u8 *)&capabilities;
330
331 capabilities[0] = OSC_QUERY_ENABLE;
332 capabilities[1] = 0x1;
333
334 status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
335 if (ACPI_FAILURE(status))
336 return -ENODEV;
337
338 if (!output.length)
339 return -ENODEV;
340
341 out_obj = output.pointer;
342 if (out_obj->type != ACPI_TYPE_BUFFER) {
343 ret = -ENODEV;
344 goto out_free;
345 }
346
347 errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
348 if (errors) {
349 ret = -ENODEV;
350 goto out_free;
351 }
352
353 supported = *((u32 *)(out_obj->buffer.pointer + 4));
354 if (!(supported & 0x1)) {
355 ret = -ENODEV;
356 goto out_free;
357 }
358
359 kfree(output.pointer);
360 capabilities[0] = 0x0;
361 capabilities[1] = 0x1;
362
363 status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
364 if (ACPI_FAILURE(status))
365 return -ENODEV;
366
367 if (!output.length)
368 return -ENODEV;
369
370 out_obj = output.pointer;
371 if (out_obj->type != ACPI_TYPE_BUFFER) {
372 ret = -ENODEV;
373 goto out_free;
374 }
375
376 errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
377 if (errors) {
378 ret = -ENODEV;
379 goto out_free;
380 }
381
382 supported = *((u32 *)(out_obj->buffer.pointer + 4));
383 if (!(supported & 0x1)) {
384 ret = -ENODEV;
385 goto out_free;
386 }
387
388out_free:
389 kfree(output.pointer);
390 return ret;
391}
392
393static int __init pcc_cpufreq_probe(void)
394{
395 acpi_status status;
396 struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
397 struct pcc_memory_resource *mem_resource;
398 struct pcc_register_resource *reg_resource;
399 union acpi_object *out_obj, *member;
400 acpi_handle handle, osc_handle;
401 int ret = 0;
402
403 status = acpi_get_handle(NULL, "\\_SB", &handle);
404 if (ACPI_FAILURE(status))
405 return -ENODEV;
406
407 status = acpi_get_handle(handle, "_OSC", &osc_handle);
408 if (ACPI_SUCCESS(status)) {
409 ret = pcc_cpufreq_do_osc(&osc_handle);
410 if (ret)
411 dprintk("probe: _OSC evaluation did not succeed\n");
412 /* Firmware's use of _OSC is optional */
413 ret = 0;
414 }
415
416 status = acpi_evaluate_object(handle, "PCCH", NULL, &output);
417 if (ACPI_FAILURE(status))
418 return -ENODEV;
419
420 out_obj = output.pointer;
421 if (out_obj->type != ACPI_TYPE_PACKAGE) {
422 ret = -ENODEV;
423 goto out_free;
424 }
425
426 member = &out_obj->package.elements[0];
427 if (member->type != ACPI_TYPE_BUFFER) {
428 ret = -ENODEV;
429 goto out_free;
430 }
431
432 mem_resource = (struct pcc_memory_resource *)member->buffer.pointer;
433
434 dprintk("probe: mem_resource descriptor: 0x%x,"
435 " length: %d, space_id: %d, resource_usage: %d,"
436 " type_specific: %d, granularity: 0x%llx,"
437 " minimum: 0x%llx, maximum: 0x%llx,"
438 " translation_offset: 0x%llx, address_length: 0x%llx\n",
439 mem_resource->descriptor, mem_resource->length,
440 mem_resource->space_id, mem_resource->resource_usage,
441 mem_resource->type_specific, mem_resource->granularity,
442 mem_resource->minimum, mem_resource->maximum,
443 mem_resource->translation_offset,
444 mem_resource->address_length);
445
446 if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) {
447 ret = -ENODEV;
448 goto out_free;
449 }
450
451 pcch_virt_addr = ioremap_nocache(mem_resource->minimum,
452 mem_resource->address_length);
453 if (pcch_virt_addr == NULL) {
454 dprintk("probe: could not map shared mem region\n");
455 goto out_free;
456 }
457 pcch_hdr = pcch_virt_addr;
458
459 dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr);
460 dprintk("probe: PCCH header is at physical address: 0x%llx,"
461 " signature: 0x%x, length: %d bytes, major: %d, minor: %d,"
462 " supported features: 0x%x, command field: 0x%x,"
463 " status field: 0x%x, nominal latency: %d us\n",
464 mem_resource->minimum, ioread32(&pcch_hdr->signature),
465 ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major),
466 ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features),
467 ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status),
468 ioread32(&pcch_hdr->latency));
469
470 dprintk("probe: min time between commands: %d us,"
471 " max time between commands: %d us,"
472 " nominal CPU frequency: %d MHz,"
473 " minimum CPU frequency: %d MHz,"
474 " minimum CPU frequency without throttling: %d MHz\n",
475 ioread32(&pcch_hdr->minimum_time),
476 ioread32(&pcch_hdr->maximum_time),
477 ioread32(&pcch_hdr->nominal),
478 ioread32(&pcch_hdr->throttled_frequency),
479 ioread32(&pcch_hdr->minimum_frequency));
480
481 member = &out_obj->package.elements[1];
482 if (member->type != ACPI_TYPE_BUFFER) {
483 ret = -ENODEV;
484 goto pcch_free;
485 }
486
487 reg_resource = (struct pcc_register_resource *)member->buffer.pointer;
488
489 doorbell.space_id = reg_resource->space_id;
490 doorbell.bit_width = reg_resource->bit_width;
491 doorbell.bit_offset = reg_resource->bit_offset;
492 doorbell.access_width = 64;
493 doorbell.address = reg_resource->address;
494
495 dprintk("probe: doorbell: space_id is %d, bit_width is %d, "
496 "bit_offset is %d, access_width is %d, address is 0x%llx\n",
497 doorbell.space_id, doorbell.bit_width, doorbell.bit_offset,
498 doorbell.access_width, reg_resource->address);
499
500 member = &out_obj->package.elements[2];
501 if (member->type != ACPI_TYPE_INTEGER) {
502 ret = -ENODEV;
503 goto pcch_free;
504 }
505
506 doorbell_preserve = member->integer.value;
507
508 member = &out_obj->package.elements[3];
509 if (member->type != ACPI_TYPE_INTEGER) {
510 ret = -ENODEV;
511 goto pcch_free;
512 }
513
514 doorbell_write = member->integer.value;
515
516 dprintk("probe: doorbell_preserve: 0x%llx,"
517 " doorbell_write: 0x%llx\n",
518 doorbell_preserve, doorbell_write);
519
520 pcc_cpu_info = alloc_percpu(struct pcc_cpu);
521 if (!pcc_cpu_info) {
522 ret = -ENOMEM;
523 goto pcch_free;
524 }
525
526 printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency"
527 " limits: %d MHz, %d MHz\n", PCC_VERSION,
528 ioread32(&pcch_hdr->minimum_frequency),
529 ioread32(&pcch_hdr->nominal));
530 kfree(output.pointer);
531 return ret;
532pcch_free:
533 pcc_clear_mapping();
534out_free:
535 kfree(output.pointer);
536 return ret;
537}
538
539static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy)
540{
541 unsigned int cpu = policy->cpu;
542 unsigned int result = 0;
543
544 if (!pcch_virt_addr) {
545 result = -1;
546 goto pcch_null;
547 }
548
549 result = pcc_get_offset(cpu);
550 if (result) {
551 dprintk("init: PCCP evaluation failed\n");
552 goto free;
553 }
554
555 policy->max = policy->cpuinfo.max_freq =
556 ioread32(&pcch_hdr->nominal) * 1000;
557 policy->min = policy->cpuinfo.min_freq =
558 ioread32(&pcch_hdr->minimum_frequency) * 1000;
559 policy->cur = pcc_get_freq(cpu);
560
561 dprintk("init: policy->max is %d, policy->min is %d\n",
562 policy->max, policy->min);
563
564 return 0;
565free:
566 pcc_clear_mapping();
567 free_percpu(pcc_cpu_info);
568pcch_null:
569 return result;
570}
571
572static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy)
573{
574 return 0;
575}
576
577static struct cpufreq_driver pcc_cpufreq_driver = {
578 .flags = CPUFREQ_CONST_LOOPS,
579 .get = pcc_get_freq,
580 .verify = pcc_cpufreq_verify,
581 .target = pcc_cpufreq_target,
582 .init = pcc_cpufreq_cpu_init,
583 .exit = pcc_cpufreq_cpu_exit,
584 .name = "pcc-cpufreq",
585 .owner = THIS_MODULE,
586};
587
588static int __init pcc_cpufreq_init(void)
589{
590 int ret;
591
592 if (acpi_disabled)
593 return 0;
594
595 ret = pcc_cpufreq_probe();
596 if (ret) {
597 dprintk("pcc_cpufreq_init: PCCH evaluation failed\n");
598 return ret;
599 }
600
601 ret = cpufreq_register_driver(&pcc_cpufreq_driver);
602
603 return ret;
604}
605
606static void __exit pcc_cpufreq_exit(void)
607{
608 cpufreq_unregister_driver(&pcc_cpufreq_driver);
609
610 pcc_clear_mapping();
611
612 free_percpu(pcc_cpu_info);
613}
614
615MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar");
616MODULE_VERSION(PCC_VERSION);
617MODULE_DESCRIPTION("Processor Clocking Control interface driver");
618MODULE_LICENSE("GPL");
619
620late_initcall(pcc_cpufreq_init);
621module_exit(pcc_cpufreq_exit);
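
A note on the _OSC dance in pcc_cpufreq_do_osc() above: the driver parses the same result buffer twice, once for the query pass and once for the real capability request. Below is a minimal stand-alone sketch of just the buffer check; the layout (dword 0 = error flags, with bit 0 reserved for the query flag, dword 1 = granted capabilities) is taken from the code above, while check_osc_result() and the test buffer are hypothetical names for illustration, and the memcpy decoding assumes a little-endian host as on x86.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Validate an _OSC result buffer: dword 0 carries error flags,
 * dword 1 the capabilities the platform granted. */
static int check_osc_result(const uint8_t *buf, size_t len)
{
	uint32_t errors, supported;

	if (len < 8)
		return -1;
	memcpy(&errors, buf, 4);
	memcpy(&supported, buf + 4, 4);
	errors &= ~(1u << 0);	/* bit 0 is the query flag, not an error */
	if (errors)
		return -1;	/* firmware flagged a failure */
	if (!(supported & 0x1))
		return -1;	/* capability was not granted */
	return 0;
}

int main(void)
{
	uint8_t ok[8] = { 0x01, 0, 0, 0, 0x01, 0, 0, 0 };

	printf("%d\n", check_osc_result(ok, sizeof(ok)));	/* prints 0 */
	return 0;
}
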
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
index cb01dac267d3..b3379d6a5c57 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -13,7 +13,6 @@
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/cpufreq.h> 14#include <linux/cpufreq.h>
15#include <linux/ioport.h> 15#include <linux/ioport.h>
16#include <linux/slab.h>
17#include <linux/timex.h> 16#include <linux/timex.h>
18#include <linux/io.h> 17#include <linux/io.h>
19 18
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index f125e5c551c0..d360b56e9825 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -806,7 +806,7 @@ static int find_psb_table(struct powernow_k8_data *data)
806static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, 806static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data,
807 unsigned int index) 807 unsigned int index)
808{ 808{
809 acpi_integer control; 809 u64 control;
810 810
811 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) 811 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
812 return; 812 return;
@@ -824,7 +824,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
824{ 824{
825 struct cpufreq_frequency_table *powernow_table; 825 struct cpufreq_frequency_table *powernow_table;
826 int ret_val = -ENODEV; 826 int ret_val = -ENODEV;
827 acpi_integer control, status; 827 u64 control, status;
828 828
829 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { 829 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
830 dprintk("register performance failed: bad ACPI data\n"); 830 dprintk("register performance failed: bad ACPI data\n");
@@ -948,7 +948,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
948 u32 fid; 948 u32 fid;
949 u32 vid; 949 u32 vid;
950 u32 freq, index; 950 u32 freq, index;
951 acpi_integer status, control; 951 u64 status, control;
952 952
953 if (data->exttype) { 953 if (data->exttype) {
954 status = data->acpi_data.states[i].status; 954 status = data->acpi_data.states[i].status;
@@ -1356,6 +1356,7 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)
1356 1356
1357 kfree(data->powernow_table); 1357 kfree(data->powernow_table);
1358 kfree(data); 1358 kfree(data);
1359 per_cpu(powernow_data, pol->cpu) = NULL;
1359 1360
1360 return 0; 1361 return 0;
1361} 1362}
@@ -1375,7 +1376,7 @@ static unsigned int powernowk8_get(unsigned int cpu)
1375 int err; 1376 int err;
1376 1377
1377 if (!data) 1378 if (!data)
1378 return -EINVAL; 1379 return 0;
1379 1380
1380 smp_call_function_single(cpu, query_values_on_cpu, &err, true); 1381 smp_call_function_single(cpu, query_values_on_cpu, &err, true);
1381 if (err) 1382 if (err)
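
Why the powernowk8_get() change above matters: a cpufreq ->get() callback returns unsigned int (a frequency in kHz), so the old "return -EINVAL" would be read back as an enormous frequency rather than an error. A stand-alone sketch of the pitfall, with made-up function names:

#include <stdio.h>

#define EINVAL 22

/* Mimics a ->get() callback that wrongly reports an errno. */
static unsigned int get_freq_bad(void)
{
	return -EINVAL;		/* wraps to 4294967274 */
}

/* 0 kHz is the conventional "frequency unknown" answer. */
static unsigned int get_freq_fixed(void)
{
	return 0;
}

int main(void)
{
	printf("bad:   %u kHz\n", get_freq_bad());
	printf("fixed: %u kHz\n", get_freq_fixed());
	return 0;
}
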
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index 8d672ef162ce..9b1ff37de46a 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -20,6 +20,7 @@
20#include <linux/sched.h> /* current */ 20#include <linux/sched.h> /* current */
21#include <linux/delay.h> 21#include <linux/delay.h>
22#include <linux/compiler.h> 22#include <linux/compiler.h>
23#include <linux/gfp.h>
23 24
24#include <asm/msr.h> 25#include <asm/msr.h>
25#include <asm/processor.h> 26#include <asm/processor.h>
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 2ce8e0b5cc54..561758e95180 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -23,7 +23,6 @@
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/cpufreq.h> 24#include <linux/cpufreq.h>
25#include <linux/pci.h> 25#include <linux/pci.h>
26#include <linux/slab.h>
27#include <linux/sched.h> 26#include <linux/sched.h>
28 27
29#include "speedstep-lib.h" 28#include "speedstep-lib.h"
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
index ad0083abfa23..a94ec6be69fa 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
@@ -13,7 +13,6 @@
13#include <linux/moduleparam.h> 13#include <linux/moduleparam.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/cpufreq.h> 15#include <linux/cpufreq.h>
16#include <linux/slab.h>
17 16
18#include <asm/msr.h> 17#include <asm/msr.h>
19#include <asm/tsc.h> 18#include <asm/tsc.h>
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
index 04d73c114e49..8abd869baabf 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
@@ -17,7 +17,6 @@
17#include <linux/moduleparam.h> 17#include <linux/moduleparam.h>
18#include <linux/init.h> 18#include <linux/init.h>
19#include <linux/cpufreq.h> 19#include <linux/cpufreq.h>
20#include <linux/slab.h>
21#include <linux/delay.h> 20#include <linux/delay.h>
22#include <linux/io.h> 21#include <linux/io.h>
23#include <asm/ist.h> 22#include <asm/ist.h>
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 879666f4d871..7e1cca13af35 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -70,7 +70,8 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
70 if (c->x86_power & (1 << 8)) { 70 if (c->x86_power & (1 << 8)) {
71 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 71 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
72 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); 72 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
73 sched_clock_stable = 1; 73 if (!check_tsc_unstable())
74 sched_clock_stable = 1;
74 } 75 }
75 76
76 /* 77 /*
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index c2b722d5a722..b3eeb66c0a51 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -18,6 +18,7 @@
18#include <asm/processor.h> 18#include <asm/processor.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <asm/k8.h> 20#include <asm/k8.h>
21#include <asm/smp.h>
21 22
22#define LVL_1_INST 1 23#define LVL_1_INST 1
23#define LVL_1_DATA 2 24#define LVL_1_DATA 2
@@ -152,7 +153,8 @@ struct _cpuid4_info {
152 union _cpuid4_leaf_ebx ebx; 153 union _cpuid4_leaf_ebx ebx;
153 union _cpuid4_leaf_ecx ecx; 154 union _cpuid4_leaf_ecx ecx;
154 unsigned long size; 155 unsigned long size;
155 unsigned long can_disable; 156 bool can_disable;
157 unsigned int l3_indices;
156 DECLARE_BITMAP(shared_cpu_map, NR_CPUS); 158 DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
157}; 159};
158 160
@@ -162,7 +164,8 @@ struct _cpuid4_info_regs {
162 union _cpuid4_leaf_ebx ebx; 164 union _cpuid4_leaf_ebx ebx;
163 union _cpuid4_leaf_ecx ecx; 165 union _cpuid4_leaf_ecx ecx;
164 unsigned long size; 166 unsigned long size;
165 unsigned long can_disable; 167 bool can_disable;
168 unsigned int l3_indices;
166}; 169};
167 170
168unsigned short num_cache_leaves; 171unsigned short num_cache_leaves;
@@ -292,6 +295,36 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
292 (ebx->split.ways_of_associativity + 1) - 1; 295 (ebx->split.ways_of_associativity + 1) - 1;
293} 296}
294 297
298struct _cache_attr {
299 struct attribute attr;
300 ssize_t (*show)(struct _cpuid4_info *, char *);
301 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
302};
303
304#ifdef CONFIG_CPU_SUP_AMD
305static unsigned int __cpuinit amd_calc_l3_indices(void)
306{
307 /*
 308 * We're called via smp_call_function_single() and therefore
309 * are on the correct cpu.
310 */
311 int cpu = smp_processor_id();
312 int node = cpu_to_node(cpu);
313 struct pci_dev *dev = node_to_k8_nb_misc(node);
314 unsigned int sc0, sc1, sc2, sc3;
315 u32 val = 0;
316
317 pci_read_config_dword(dev, 0x1C4, &val);
318
319 /* calculate subcache sizes */
320 sc0 = !(val & BIT(0));
321 sc1 = !(val & BIT(4));
322 sc2 = !(val & BIT(8)) + !(val & BIT(9));
323 sc3 = !(val & BIT(12)) + !(val & BIT(13));
324
325 return (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
326}
327
295static void __cpuinit 328static void __cpuinit
296amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) 329amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
297{ 330{
@@ -301,12 +334,103 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
301 if (boot_cpu_data.x86 == 0x11) 334 if (boot_cpu_data.x86 == 0x11)
302 return; 335 return;
303 336
304 /* see erratum #382 */ 337 /* see errata #382 and #388 */
305 if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8)) 338 if ((boot_cpu_data.x86 == 0x10) &&
339 ((boot_cpu_data.x86_model < 0x8) ||
340 (boot_cpu_data.x86_mask < 0x1)))
306 return; 341 return;
307 342
308 this_leaf->can_disable = 1; 343 this_leaf->can_disable = true;
344 this_leaf->l3_indices = amd_calc_l3_indices();
345}
346
347static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
348 unsigned int index)
349{
350 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
351 int node = amd_get_nb_id(cpu);
352 struct pci_dev *dev = node_to_k8_nb_misc(node);
353 unsigned int reg = 0;
354
355 if (!this_leaf->can_disable)
356 return -EINVAL;
357
358 if (!dev)
359 return -EINVAL;
360
361 pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
362 return sprintf(buf, "0x%08x\n", reg);
363}
364
365#define SHOW_CACHE_DISABLE(index) \
366static ssize_t \
367show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \
368{ \
369 return show_cache_disable(this_leaf, buf, index); \
370}
371SHOW_CACHE_DISABLE(0)
372SHOW_CACHE_DISABLE(1)
373
374static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
375 const char *buf, size_t count, unsigned int index)
376{
377 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
378 int node = amd_get_nb_id(cpu);
379 struct pci_dev *dev = node_to_k8_nb_misc(node);
380 unsigned long val = 0;
381
382#define SUBCACHE_MASK (3UL << 20)
383#define SUBCACHE_INDEX 0xfff
384
385 if (!this_leaf->can_disable)
386 return -EINVAL;
387
388 if (!capable(CAP_SYS_ADMIN))
389 return -EPERM;
390
391 if (!dev)
392 return -EINVAL;
393
394 if (strict_strtoul(buf, 10, &val) < 0)
395 return -EINVAL;
396
397 /* do not allow writes outside of allowed bits */
398 if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
399 ((val & SUBCACHE_INDEX) > this_leaf->l3_indices))
400 return -EINVAL;
401
402 val |= BIT(30);
403 pci_write_config_dword(dev, 0x1BC + index * 4, val);
404 /*
 405 * We need to WBINVD on a core on the node containing the L3 cache whose
 406 * indices we disable; a simple wbinvd() is therefore not sufficient.
407 */
408 wbinvd_on_cpu(cpu);
409 pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31));
410 return count;
411}
412
413#define STORE_CACHE_DISABLE(index) \
414static ssize_t \
415store_cache_disable_##index(struct _cpuid4_info *this_leaf, \
416 const char *buf, size_t count) \
417{ \
418 return store_cache_disable(this_leaf, buf, count, index); \
309} 419}
420STORE_CACHE_DISABLE(0)
421STORE_CACHE_DISABLE(1)
422
423static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
424 show_cache_disable_0, store_cache_disable_0);
425static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
426 show_cache_disable_1, store_cache_disable_1);
427
428#else /* CONFIG_CPU_SUP_AMD */
429static void __cpuinit
430amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
431{
 432}
433#endif /* CONFIG_CPU_SUP_AMD */
310 434
311static int 435static int
312__cpuinit cpuid4_cache_lookup_regs(int index, 436__cpuinit cpuid4_cache_lookup_regs(int index,
@@ -713,82 +837,6 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
713#define to_object(k) container_of(k, struct _index_kobject, kobj) 837#define to_object(k) container_of(k, struct _index_kobject, kobj)
714#define to_attr(a) container_of(a, struct _cache_attr, attr) 838#define to_attr(a) container_of(a, struct _cache_attr, attr)
715 839
716static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
717 unsigned int index)
718{
719 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
720 int node = cpu_to_node(cpu);
721 struct pci_dev *dev = node_to_k8_nb_misc(node);
722 unsigned int reg = 0;
723
724 if (!this_leaf->can_disable)
725 return -EINVAL;
726
727 if (!dev)
728 return -EINVAL;
729
730 pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
731 return sprintf(buf, "%x\n", reg);
732}
733
734#define SHOW_CACHE_DISABLE(index) \
735static ssize_t \
736show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \
737{ \
738 return show_cache_disable(this_leaf, buf, index); \
739}
740SHOW_CACHE_DISABLE(0)
741SHOW_CACHE_DISABLE(1)
742
743static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
744 const char *buf, size_t count, unsigned int index)
745{
746 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
747 int node = cpu_to_node(cpu);
748 struct pci_dev *dev = node_to_k8_nb_misc(node);
749 unsigned long val = 0;
750 unsigned int scrubber = 0;
751
752 if (!this_leaf->can_disable)
753 return -EINVAL;
754
755 if (!capable(CAP_SYS_ADMIN))
756 return -EPERM;
757
758 if (!dev)
759 return -EINVAL;
760
761 if (strict_strtoul(buf, 10, &val) < 0)
762 return -EINVAL;
763
764 val |= 0xc0000000;
765
766 pci_read_config_dword(dev, 0x58, &scrubber);
767 scrubber &= ~0x1f000000;
768 pci_write_config_dword(dev, 0x58, scrubber);
769
770 pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
771 wbinvd();
772 pci_write_config_dword(dev, 0x1BC + index * 4, val);
773 return count;
774}
775
776#define STORE_CACHE_DISABLE(index) \
777static ssize_t \
778store_cache_disable_##index(struct _cpuid4_info *this_leaf, \
779 const char *buf, size_t count) \
780{ \
781 return store_cache_disable(this_leaf, buf, count, index); \
782}
783STORE_CACHE_DISABLE(0)
784STORE_CACHE_DISABLE(1)
785
786struct _cache_attr {
787 struct attribute attr;
788 ssize_t (*show)(struct _cpuid4_info *, char *);
789 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
790};
791
792#define define_one_ro(_name) \ 840#define define_one_ro(_name) \
793static struct _cache_attr _name = \ 841static struct _cache_attr _name = \
794 __ATTR(_name, 0444, show_##_name, NULL) 842 __ATTR(_name, 0444, show_##_name, NULL)
@@ -803,23 +851,28 @@ define_one_ro(size);
803define_one_ro(shared_cpu_map); 851define_one_ro(shared_cpu_map);
804define_one_ro(shared_cpu_list); 852define_one_ro(shared_cpu_list);
805 853
806static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, 854#define DEFAULT_SYSFS_CACHE_ATTRS \
807 show_cache_disable_0, store_cache_disable_0); 855 &type.attr, \
808static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, 856 &level.attr, \
809 show_cache_disable_1, store_cache_disable_1); 857 &coherency_line_size.attr, \
858 &physical_line_partition.attr, \
859 &ways_of_associativity.attr, \
860 &number_of_sets.attr, \
861 &size.attr, \
862 &shared_cpu_map.attr, \
863 &shared_cpu_list.attr
810 864
811static struct attribute *default_attrs[] = { 865static struct attribute *default_attrs[] = {
812 &type.attr, 866 DEFAULT_SYSFS_CACHE_ATTRS,
813 &level.attr, 867 NULL
814 &coherency_line_size.attr, 868};
815 &physical_line_partition.attr, 869
816 &ways_of_associativity.attr, 870static struct attribute *default_l3_attrs[] = {
817 &number_of_sets.attr, 871 DEFAULT_SYSFS_CACHE_ATTRS,
818 &size.attr, 872#ifdef CONFIG_CPU_SUP_AMD
819 &shared_cpu_map.attr,
820 &shared_cpu_list.attr,
821 &cache_disable_0.attr, 873 &cache_disable_0.attr,
822 &cache_disable_1.attr, 874 &cache_disable_1.attr,
875#endif
823 NULL 876 NULL
824}; 877};
825 878
@@ -850,7 +903,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
850 return ret; 903 return ret;
851} 904}
852 905
853static struct sysfs_ops sysfs_ops = { 906static const struct sysfs_ops sysfs_ops = {
854 .show = show, 907 .show = show,
855 .store = store, 908 .store = store,
856}; 909};
@@ -910,6 +963,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
910 unsigned int cpu = sys_dev->id; 963 unsigned int cpu = sys_dev->id;
911 unsigned long i, j; 964 unsigned long i, j;
912 struct _index_kobject *this_object; 965 struct _index_kobject *this_object;
966 struct _cpuid4_info *this_leaf;
913 int retval; 967 int retval;
914 968
915 retval = cpuid4_cache_sysfs_init(cpu); 969 retval = cpuid4_cache_sysfs_init(cpu);
@@ -928,6 +982,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
928 this_object = INDEX_KOBJECT_PTR(cpu, i); 982 this_object = INDEX_KOBJECT_PTR(cpu, i);
929 this_object->cpu = cpu; 983 this_object->cpu = cpu;
930 this_object->index = i; 984 this_object->index = i;
985
986 this_leaf = CPUID4_INFO_IDX(cpu, i);
987
988 if (this_leaf->can_disable)
989 ktype_cache.default_attrs = default_l3_attrs;
990 else
991 ktype_cache.default_attrs = default_attrs;
992
931 retval = kobject_init_and_add(&(this_object->kobj), 993 retval = kobject_init_and_add(&(this_object->kobj),
932 &ktype_cache, 994 &ktype_cache,
933 per_cpu(ici_cache_kobject, cpu), 995 per_cpu(ici_cache_kobject, cpu),
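
For reference, the subcache arithmetic in amd_calc_l3_indices() above can be sanity-checked in isolation. The sketch below mirrors it with the PCI config read replaced by a parameter; the bit layout (subcache-disable bits at 0, 4, 8/9 and 12/13 of the dword at offset 0x1C4) is taken from the hunk above, everything else is illustrative.

#include <stdio.h>

#define BIT(n)	(1u << (n))

static unsigned int max2(unsigned int a, unsigned int b)
{
	return a > b ? a : b;
}

/* val stands in for the dword read from the northbridge at 0x1C4;
 * a set bit means the corresponding subcache is disabled. */
static unsigned int calc_l3_indices(unsigned int val)
{
	unsigned int sc0, sc1, sc2, sc3;

	sc0 = !(val & BIT(0));
	sc1 = !(val & BIT(4));
	sc2 = !(val & BIT(8))  + !(val & BIT(9));
	sc3 = !(val & BIT(12)) + !(val & BIT(13));

	return (max2(max2(sc0, sc1), max2(sc2, sc3)) << 10) - 1;
}

int main(void)
{
	/* all subcaches enabled: weight 2, so (2 << 10) - 1 = 0x7ff */
	printf("0x%x\n", calc_l3_indices(0x0));
	return 0;
}
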
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index 73734baa50f2..e7dbde7bfedb 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -22,6 +22,7 @@
22#include <linux/kdebug.h> 22#include <linux/kdebug.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/gfp.h>
25#include <asm/mce.h> 26#include <asm/mce.h>
26#include <asm/apic.h> 27#include <asm/apic.h>
27 28
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index a8aacd4b513c..7a355ddcc64b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -26,6 +26,7 @@
26#include <linux/sched.h> 26#include <linux/sched.h>
27#include <linux/sysfs.h> 27#include <linux/sysfs.h>
28#include <linux/types.h> 28#include <linux/types.h>
29#include <linux/slab.h>
29#include <linux/init.h> 30#include <linux/init.h>
30#include <linux/kmod.h> 31#include <linux/kmod.h>
31#include <linux/poll.h> 32#include <linux/poll.h>
@@ -46,6 +47,13 @@
46 47
47#include "mce-internal.h" 48#include "mce-internal.h"
48 49
50static DEFINE_MUTEX(mce_read_mutex);
51
52#define rcu_dereference_check_mce(p) \
53 rcu_dereference_check((p), \
54 rcu_read_lock_sched_held() || \
55 lockdep_is_held(&mce_read_mutex))
56
49#define CREATE_TRACE_POINTS 57#define CREATE_TRACE_POINTS
50#include <trace/events/mce.h> 58#include <trace/events/mce.h>
51 59
@@ -158,7 +166,7 @@ void mce_log(struct mce *mce)
158 mce->finished = 0; 166 mce->finished = 0;
159 wmb(); 167 wmb();
160 for (;;) { 168 for (;;) {
161 entry = rcu_dereference(mcelog.next); 169 entry = rcu_dereference_check_mce(mcelog.next);
162 for (;;) { 170 for (;;) {
163 /* 171 /*
164 * When the buffer fills up discard new entries. 172 * When the buffer fills up discard new entries.
@@ -531,7 +539,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
531 struct mce m; 539 struct mce m;
532 int i; 540 int i;
533 541
534 __get_cpu_var(mce_poll_count)++; 542 percpu_inc(mce_poll_count);
535 543
536 mce_setup(&m); 544 mce_setup(&m);
537 545
@@ -926,7 +934,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
926 934
927 atomic_inc(&mce_entry); 935 atomic_inc(&mce_entry);
928 936
929 __get_cpu_var(mce_exception_count)++; 937 percpu_inc(mce_exception_count);
930 938
931 if (notify_die(DIE_NMI, "machine check", regs, error_code, 939 if (notify_die(DIE_NMI, "machine check", regs, error_code,
932 18, SIGKILL) == NOTIFY_STOP) 940 18, SIGKILL) == NOTIFY_STOP)
@@ -1485,8 +1493,6 @@ static void collect_tscs(void *data)
1485 rdtscll(cpu_tsc[smp_processor_id()]); 1493 rdtscll(cpu_tsc[smp_processor_id()]);
1486} 1494}
1487 1495
1488static DEFINE_MUTEX(mce_read_mutex);
1489
1490static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, 1496static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1491 loff_t *off) 1497 loff_t *off)
1492{ 1498{
@@ -1500,7 +1506,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1500 return -ENOMEM; 1506 return -ENOMEM;
1501 1507
1502 mutex_lock(&mce_read_mutex); 1508 mutex_lock(&mce_read_mutex);
1503 next = rcu_dereference(mcelog.next); 1509 next = rcu_dereference_check_mce(mcelog.next);
1504 1510
1505 /* Only supports full reads right now */ 1511 /* Only supports full reads right now */
1506 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { 1512 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
@@ -1565,7 +1571,7 @@ timeout:
1565static unsigned int mce_poll(struct file *file, poll_table *wait) 1571static unsigned int mce_poll(struct file *file, poll_table *wait)
1566{ 1572{
1567 poll_wait(file, &mce_wait, wait); 1573 poll_wait(file, &mce_wait, wait);
1568 if (rcu_dereference(mcelog.next)) 1574 if (rcu_dereference_check_mce(mcelog.next))
1569 return POLLIN | POLLRDNORM; 1575 return POLLIN | POLLRDNORM;
1570 return 0; 1576 return 0;
1571} 1577}
@@ -2044,6 +2050,7 @@ static __init void mce_init_banks(void)
2044 struct mce_bank *b = &mce_banks[i]; 2050 struct mce_bank *b = &mce_banks[i];
2045 struct sysdev_attribute *a = &b->attr; 2051 struct sysdev_attribute *a = &b->attr;
2046 2052
2053 sysfs_attr_init(&a->attr);
2047 a->attr.name = b->attrname; 2054 a->attr.name = b->attrname;
2048 snprintf(b->attrname, ATTR_LEN, "bank%d", i); 2055 snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2049 2056
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 83a3d1f4efca..224392d8fe8c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -21,6 +21,7 @@
21#include <linux/errno.h> 21#include <linux/errno.h>
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/sysfs.h> 23#include <linux/sysfs.h>
24#include <linux/slab.h>
24#include <linux/init.h> 25#include <linux/init.h>
25#include <linux/cpu.h> 26#include <linux/cpu.h>
26#include <linux/smp.h> 27#include <linux/smp.h>
@@ -388,7 +389,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
388 return ret; 389 return ret;
389} 390}
390 391
391static struct sysfs_ops threshold_ops = { 392static const struct sysfs_ops threshold_ops = {
392 .show = show, 393 .show = show,
393 .store = store, 394 .store = store,
394}; 395};
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 7c785634af2b..62b48e40920a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -5,6 +5,7 @@
5 * Author: Andi Kleen 5 * Author: Andi Kleen
6 */ 6 */
7 7
8#include <linux/gfp.h>
8#include <linux/init.h> 9#include <linux/init.h>
9#include <linux/interrupt.h> 10#include <linux/interrupt.h>
10#include <linux/percpu.h> 11#include <linux/percpu.h>
@@ -95,7 +96,7 @@ static void cmci_discover(int banks, int boot)
95 96
96 /* Already owned by someone else? */ 97 /* Already owned by someone else? */
97 if (val & CMCI_EN) { 98 if (val & CMCI_EN) {
98 if (test_and_clear_bit(i, owned) || boot) 99 if (test_and_clear_bit(i, owned) && !boot)
99 print_update("SHD", &hdr, i); 100 print_update("SHD", &hdr, i);
100 __clear_bit(i, __get_cpu_var(mce_poll_banks)); 101 __clear_bit(i, __get_cpu_var(mce_poll_banks));
101 continue; 102 continue;
@@ -107,7 +108,7 @@ static void cmci_discover(int banks, int boot)
107 108
108 /* Did the enable bit stick? -- the bank supports CMCI */ 109 /* Did the enable bit stick? -- the bank supports CMCI */
109 if (val & CMCI_EN) { 110 if (val & CMCI_EN) {
110 if (!test_and_set_bit(i, owned) || boot) 111 if (!test_and_set_bit(i, owned) && !boot)
111 print_update("CMCI", &hdr, i); 112 print_update("CMCI", &hdr, i);
112 __clear_bit(i, __get_cpu_var(mce_poll_banks)); 113 __clear_bit(i, __get_cpu_var(mce_poll_banks));
113 } else { 114 } else {
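
The cmci_discover() hunks above rely on a write-then-read-back probe: the CMCI_EN bit only sticks in the bank's MSR when the hardware supports CMCI for that bank. A stand-alone model with a fake MSR (all names hypothetical):

#include <stdint.h>
#include <stdio.h>

#define CMCI_EN	(1ull << 30)

static uint64_t fake_msr;
static int bank_supports_cmci = 1;

/* The fake hardware silently drops CMCI_EN on unsupported banks. */
static void wrmsr_model(uint64_t val)
{
	fake_msr = bank_supports_cmci ? val : (val & ~CMCI_EN);
}

static uint64_t rdmsr_model(void)
{
	return fake_msr;
}

int main(void)
{
	wrmsr_model(rdmsr_model() | CMCI_EN);	/* try to take ownership */

	if (rdmsr_model() & CMCI_EN)		/* did the enable bit stick? */
		printf("bank supports CMCI\n");
	else
		printf("fall back to polling\n");
	return 0;
}
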
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
index f4361b56f8e9..ad9e5ed81181 100644
--- a/arch/x86/kernel/cpu/mtrr/Makefile
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -1,3 +1,3 @@
1obj-y := main.o if.o generic.o state.o cleanup.o 1obj-y := main.o if.o generic.o cleanup.o
2obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o 2obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
3 3
diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c
index 33af14110dfd..92ba9cd31c9a 100644
--- a/arch/x86/kernel/cpu/mtrr/amd.c
+++ b/arch/x86/kernel/cpu/mtrr/amd.c
@@ -108,7 +108,7 @@ amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
108 return 0; 108 return 0;
109} 109}
110 110
111static struct mtrr_ops amd_mtrr_ops = { 111static const struct mtrr_ops amd_mtrr_ops = {
112 .vendor = X86_VENDOR_AMD, 112 .vendor = X86_VENDOR_AMD,
113 .set = amd_set_mtrr, 113 .set = amd_set_mtrr,
114 .get = amd_get_mtrr, 114 .get = amd_get_mtrr,
diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c
index de89f14eff3a..316fe3e60a97 100644
--- a/arch/x86/kernel/cpu/mtrr/centaur.c
+++ b/arch/x86/kernel/cpu/mtrr/centaur.c
@@ -110,7 +110,7 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t
110 return 0; 110 return 0;
111} 111}
112 112
113static struct mtrr_ops centaur_mtrr_ops = { 113static const struct mtrr_ops centaur_mtrr_ops = {
114 .vendor = X86_VENDOR_CENTAUR, 114 .vendor = X86_VENDOR_CENTAUR,
115 .set = centaur_set_mcr, 115 .set = centaur_set_mcr,
116 .get = centaur_get_mcr, 116 .get = centaur_get_mcr,
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 09b1698e0466..06130b52f012 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -22,10 +22,10 @@
22#include <linux/pci.h> 22#include <linux/pci.h>
23#include <linux/smp.h> 23#include <linux/smp.h>
24#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/sort.h>
26#include <linux/mutex.h> 25#include <linux/mutex.h>
27#include <linux/uaccess.h> 26#include <linux/uaccess.h>
28#include <linux/kvm_para.h> 27#include <linux/kvm_para.h>
28#include <linux/range.h>
29 29
30#include <asm/processor.h> 30#include <asm/processor.h>
31#include <asm/e820.h> 31#include <asm/e820.h>
@@ -34,11 +34,6 @@
34 34
35#include "mtrr.h" 35#include "mtrr.h"
36 36
37struct res_range {
38 unsigned long start;
39 unsigned long end;
40};
41
42struct var_mtrr_range_state { 37struct var_mtrr_range_state {
43 unsigned long base_pfn; 38 unsigned long base_pfn;
44 unsigned long size_pfn; 39 unsigned long size_pfn;
@@ -56,7 +51,7 @@ struct var_mtrr_state {
56/* Should be related to MTRR_VAR_RANGES nums */ 51/* Should be related to MTRR_VAR_RANGES nums */
57#define RANGE_NUM 256 52#define RANGE_NUM 256
58 53
59static struct res_range __initdata range[RANGE_NUM]; 54static struct range __initdata range[RANGE_NUM];
60static int __initdata nr_range; 55static int __initdata nr_range;
61 56
62static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; 57static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
@@ -64,152 +59,11 @@ static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
64static int __initdata debug_print; 59static int __initdata debug_print;
65#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0) 60#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0)
66 61
67
68static int __init
69add_range(struct res_range *range, int nr_range,
70 unsigned long start, unsigned long end)
71{
72 /* Out of slots: */
73 if (nr_range >= RANGE_NUM)
74 return nr_range;
75
76 range[nr_range].start = start;
77 range[nr_range].end = end;
78
79 nr_range++;
80
81 return nr_range;
82}
83
84static int __init
85add_range_with_merge(struct res_range *range, int nr_range,
86 unsigned long start, unsigned long end)
87{
88 int i;
89
90 /* Try to merge it with old one: */
91 for (i = 0; i < nr_range; i++) {
92 unsigned long final_start, final_end;
93 unsigned long common_start, common_end;
94
95 if (!range[i].end)
96 continue;
97
98 common_start = max(range[i].start, start);
99 common_end = min(range[i].end, end);
100 if (common_start > common_end + 1)
101 continue;
102
103 final_start = min(range[i].start, start);
104 final_end = max(range[i].end, end);
105
106 range[i].start = final_start;
107 range[i].end = final_end;
108 return nr_range;
109 }
110
111 /* Need to add it: */
112 return add_range(range, nr_range, start, end);
113}
114
115static void __init
116subtract_range(struct res_range *range, unsigned long start, unsigned long end)
117{
118 int i, j;
119
120 for (j = 0; j < RANGE_NUM; j++) {
121 if (!range[j].end)
122 continue;
123
124 if (start <= range[j].start && end >= range[j].end) {
125 range[j].start = 0;
126 range[j].end = 0;
127 continue;
128 }
129
130 if (start <= range[j].start && end < range[j].end &&
131 range[j].start < end + 1) {
132 range[j].start = end + 1;
133 continue;
134 }
135
136
137 if (start > range[j].start && end >= range[j].end &&
138 range[j].end > start - 1) {
139 range[j].end = start - 1;
140 continue;
141 }
142
143 if (start > range[j].start && end < range[j].end) {
144 /* Find the new spare: */
145 for (i = 0; i < RANGE_NUM; i++) {
146 if (range[i].end == 0)
147 break;
148 }
149 if (i < RANGE_NUM) {
150 range[i].end = range[j].end;
151 range[i].start = end + 1;
152 } else {
 153 printk(KERN_ERR "ran out of slots in ranges\n");
154 }
155 range[j].end = start - 1;
156 continue;
157 }
158 }
159}
160
161static int __init cmp_range(const void *x1, const void *x2)
162{
163 const struct res_range *r1 = x1;
164 const struct res_range *r2 = x2;
165 long start1, start2;
166
167 start1 = r1->start;
168 start2 = r2->start;
169
170 return start1 - start2;
171}
172
173static int __init clean_sort_range(struct res_range *range, int az)
174{
175 int i, j, k = az - 1, nr_range = 0;
176
177 for (i = 0; i < k; i++) {
178 if (range[i].end)
179 continue;
180 for (j = k; j > i; j--) {
181 if (range[j].end) {
182 k = j;
183 break;
184 }
185 }
186 if (j == i)
187 break;
188 range[i].start = range[k].start;
189 range[i].end = range[k].end;
190 range[k].start = 0;
191 range[k].end = 0;
192 k--;
193 }
194 /* count it */
195 for (i = 0; i < az; i++) {
196 if (!range[i].end) {
197 nr_range = i;
198 break;
199 }
200 }
201
202 /* sort them */
203 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
204
205 return nr_range;
206}
207
208#define BIOS_BUG_MSG KERN_WARNING \ 62#define BIOS_BUG_MSG KERN_WARNING \
209 "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" 63 "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
210 64
211static int __init 65static int __init
212x86_get_mtrr_mem_range(struct res_range *range, int nr_range, 66x86_get_mtrr_mem_range(struct range *range, int nr_range,
213 unsigned long extra_remove_base, 67 unsigned long extra_remove_base,
214 unsigned long extra_remove_size) 68 unsigned long extra_remove_size)
215{ 69{
@@ -223,14 +77,14 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
223 continue; 77 continue;
224 base = range_state[i].base_pfn; 78 base = range_state[i].base_pfn;
225 size = range_state[i].size_pfn; 79 size = range_state[i].size_pfn;
226 nr_range = add_range_with_merge(range, nr_range, base, 80 nr_range = add_range_with_merge(range, RANGE_NUM, nr_range,
227 base + size - 1); 81 base, base + size);
228 } 82 }
229 if (debug_print) { 83 if (debug_print) {
230 printk(KERN_DEBUG "After WB checking\n"); 84 printk(KERN_DEBUG "After WB checking\n");
231 for (i = 0; i < nr_range; i++) 85 for (i = 0; i < nr_range; i++)
232 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", 86 printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
233 range[i].start, range[i].end + 1); 87 range[i].start, range[i].end);
234 } 88 }
235 89
236 /* Take out UC ranges: */ 90 /* Take out UC ranges: */
@@ -252,19 +106,19 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
252 size -= (1<<(20-PAGE_SHIFT)) - base; 106 size -= (1<<(20-PAGE_SHIFT)) - base;
253 base = 1<<(20-PAGE_SHIFT); 107 base = 1<<(20-PAGE_SHIFT);
254 } 108 }
255 subtract_range(range, base, base + size - 1); 109 subtract_range(range, RANGE_NUM, base, base + size);
256 } 110 }
257 if (extra_remove_size) 111 if (extra_remove_size)
258 subtract_range(range, extra_remove_base, 112 subtract_range(range, RANGE_NUM, extra_remove_base,
259 extra_remove_base + extra_remove_size - 1); 113 extra_remove_base + extra_remove_size);
260 114
261 if (debug_print) { 115 if (debug_print) {
262 printk(KERN_DEBUG "After UC checking\n"); 116 printk(KERN_DEBUG "After UC checking\n");
263 for (i = 0; i < RANGE_NUM; i++) { 117 for (i = 0; i < RANGE_NUM; i++) {
264 if (!range[i].end) 118 if (!range[i].end)
265 continue; 119 continue;
266 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", 120 printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
267 range[i].start, range[i].end + 1); 121 range[i].start, range[i].end);
268 } 122 }
269 } 123 }
270 124
@@ -273,26 +127,22 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
273 if (debug_print) { 127 if (debug_print) {
274 printk(KERN_DEBUG "After sorting\n"); 128 printk(KERN_DEBUG "After sorting\n");
275 for (i = 0; i < nr_range; i++) 129 for (i = 0; i < nr_range; i++)
276 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", 130 printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
277 range[i].start, range[i].end + 1); 131 range[i].start, range[i].end);
278 } 132 }
279 133
 280 /* clear those that are not used */
281 for (i = nr_range; i < RANGE_NUM; i++)
282 memset(&range[i], 0, sizeof(range[i]));
283
284 return nr_range; 134 return nr_range;
285} 135}
286 136
287#ifdef CONFIG_MTRR_SANITIZER 137#ifdef CONFIG_MTRR_SANITIZER
288 138
289static unsigned long __init sum_ranges(struct res_range *range, int nr_range) 139static unsigned long __init sum_ranges(struct range *range, int nr_range)
290{ 140{
291 unsigned long sum = 0; 141 unsigned long sum = 0;
292 int i; 142 int i;
293 143
294 for (i = 0; i < nr_range; i++) 144 for (i = 0; i < nr_range; i++)
295 sum += range[i].end + 1 - range[i].start; 145 sum += range[i].end - range[i].start;
296 146
297 return sum; 147 return sum;
298} 148}
@@ -621,7 +471,7 @@ static int __init parse_mtrr_spare_reg(char *arg)
621early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); 471early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
622 472
623static int __init 473static int __init
624x86_setup_var_mtrrs(struct res_range *range, int nr_range, 474x86_setup_var_mtrrs(struct range *range, int nr_range,
625 u64 chunk_size, u64 gran_size) 475 u64 chunk_size, u64 gran_size)
626{ 476{
627 struct var_mtrr_state var_state; 477 struct var_mtrr_state var_state;
@@ -639,7 +489,7 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range,
639 /* Write the range: */ 489 /* Write the range: */
640 for (i = 0; i < nr_range; i++) { 490 for (i = 0; i < nr_range; i++) {
641 set_var_mtrr_range(&var_state, range[i].start, 491 set_var_mtrr_range(&var_state, range[i].start,
642 range[i].end - range[i].start + 1); 492 range[i].end - range[i].start);
643 } 493 }
644 494
645 /* Write the last range: */ 495 /* Write the last range: */
@@ -742,7 +592,7 @@ mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
742 unsigned long x_remove_base, 592 unsigned long x_remove_base,
743 unsigned long x_remove_size, int i) 593 unsigned long x_remove_size, int i)
744{ 594{
745 static struct res_range range_new[RANGE_NUM]; 595 static struct range range_new[RANGE_NUM];
746 unsigned long range_sums_new; 596 unsigned long range_sums_new;
747 static int nr_range_new; 597 static int nr_range_new;
748 int num_reg; 598 int num_reg;
@@ -869,10 +719,10 @@ int __init mtrr_cleanup(unsigned address_bits)
869 * [0, 1M) should always be covered by var mtrr with WB 719 * [0, 1M) should always be covered by var mtrr with WB
870 * and fixed mtrrs should take effect before var mtrr for it: 720 * and fixed mtrrs should take effect before var mtrr for it:
871 */ 721 */
872 nr_range = add_range_with_merge(range, nr_range, 0, 722 nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, 0,
873 (1ULL<<(20 - PAGE_SHIFT)) - 1); 723 1ULL<<(20 - PAGE_SHIFT));
874 /* Sort the ranges: */ 724 /* Sort the ranges: */
875 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); 725 sort_range(range, nr_range);
876 726
877 range_sums = sum_ranges(range, nr_range); 727 range_sums = sum_ranges(range, nr_range);
878 printk(KERN_INFO "total RAM covered: %ldM\n", 728 printk(KERN_INFO "total RAM covered: %ldM\n",
@@ -1089,9 +939,9 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1089 nr_range = 0; 939 nr_range = 0;
1090 if (mtrr_tom2) { 940 if (mtrr_tom2) {
1091 range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT)); 941 range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
1092 range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1; 942 range[nr_range].end = mtrr_tom2 >> PAGE_SHIFT;
1093 if (highest_pfn < range[nr_range].end + 1) 943 if (highest_pfn < range[nr_range].end)
1094 highest_pfn = range[nr_range].end + 1; 944 highest_pfn = range[nr_range].end;
1095 nr_range++; 945 nr_range++;
1096 } 946 }
1097 nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); 947 nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
@@ -1103,15 +953,15 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1103 953
1104 /* Check the holes: */ 954 /* Check the holes: */
1105 for (i = 0; i < nr_range - 1; i++) { 955 for (i = 0; i < nr_range - 1; i++) {
1106 if (range[i].end + 1 < range[i+1].start) 956 if (range[i].end < range[i+1].start)
1107 total_trim_size += real_trim_memory(range[i].end + 1, 957 total_trim_size += real_trim_memory(range[i].end,
1108 range[i+1].start); 958 range[i+1].start);
1109 } 959 }
1110 960
1111 /* Check the top: */ 961 /* Check the top: */
1112 i = nr_range - 1; 962 i = nr_range - 1;
1113 if (range[i].end + 1 < end_pfn) 963 if (range[i].end < end_pfn)
1114 total_trim_size += real_trim_memory(range[i].end + 1, 964 total_trim_size += real_trim_memory(range[i].end,
1115 end_pfn); 965 end_pfn);
1116 966
1117 if (total_trim_size) { 967 if (total_trim_size) {
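
The recurring "+ 1"/"- 1" churn in this file comes from switching the local res_range type (inclusive end, end = start + size - 1) to the generic struct range (exclusive end, end = start + size). A small stand-alone comparison of the exclusive-end arithmetic:

#include <stdio.h>

struct range {
	unsigned long start;
	unsigned long end;	/* exclusive: one past the last pfn */
};

static unsigned long sum_ranges(const struct range *r, int n)
{
	unsigned long sum = 0;
	int i;

	for (i = 0; i < n; i++)
		sum += r[i].end - r[i].start;	/* no "+ 1" needed */
	return sum;
}

int main(void)
{
	struct range r[2] = { { 0, 256 }, { 4096, 8192 } };

	printf("%lu pages\n", sum_ranges(r, 2));	/* 256 + 4096 = 4352 */
	return 0;
}
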
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
index 228d982ce09c..68a3343e5798 100644
--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -265,7 +265,7 @@ static void cyrix_set_all(void)
265 post_set(); 265 post_set();
266} 266}
267 267
268static struct mtrr_ops cyrix_mtrr_ops = { 268static const struct mtrr_ops cyrix_mtrr_ops = {
269 .vendor = X86_VENDOR_CYRIX, 269 .vendor = X86_VENDOR_CYRIX,
270 .set_all = cyrix_set_all, 270 .set_all = cyrix_set_all,
271 .set = cyrix_set_arr, 271 .set = cyrix_set_arr,
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 55da0c5f68dd..fd31a441c61c 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -6,7 +6,6 @@
6 6
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/init.h> 8#include <linux/init.h>
9#include <linux/slab.h>
10#include <linux/io.h> 9#include <linux/io.h>
11#include <linux/mm.h> 10#include <linux/mm.h>
12 11
@@ -464,7 +463,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
464 tmp |= ~((1<<(hi - 1)) - 1); 463 tmp |= ~((1<<(hi - 1)) - 1);
465 464
466 if (tmp != mask_lo) { 465 if (tmp != mask_lo) {
467 WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n"); 466 printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
468 mask_lo = tmp; 467 mask_lo = tmp;
469 } 468 }
470 } 469 }
@@ -570,7 +569,7 @@ static unsigned long set_mtrr_state(void)
570 569
571 570
572static unsigned long cr4; 571static unsigned long cr4;
573static DEFINE_SPINLOCK(set_atomicity_lock); 572static DEFINE_RAW_SPINLOCK(set_atomicity_lock);
574 573
575/* 574/*
576 * Since we are disabling the cache don't allow any interrupts, 575 * Since we are disabling the cache don't allow any interrupts,
@@ -590,7 +589,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
590 * changes to the way the kernel boots 589 * changes to the way the kernel boots
591 */ 590 */
592 591
593 spin_lock(&set_atomicity_lock); 592 raw_spin_lock(&set_atomicity_lock);
594 593
595 /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ 594 /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
596 cr0 = read_cr0() | X86_CR0_CD; 595 cr0 = read_cr0() | X86_CR0_CD;
@@ -627,7 +626,7 @@ static void post_set(void) __releases(set_atomicity_lock)
627 /* Restore value of CR4 */ 626 /* Restore value of CR4 */
628 if (cpu_has_pge) 627 if (cpu_has_pge)
629 write_cr4(cr4); 628 write_cr4(cr4);
630 spin_unlock(&set_atomicity_lock); 629 raw_spin_unlock(&set_atomicity_lock);
631} 630}
632 631
633static void generic_set_all(void) 632static void generic_set_all(void)
@@ -752,7 +751,7 @@ int positive_have_wrcomb(void)
752/* 751/*
753 * Generic structure... 752 * Generic structure...
754 */ 753 */
755struct mtrr_ops generic_mtrr_ops = { 754const struct mtrr_ops generic_mtrr_ops = {
756 .use_intel_if = 1, 755 .use_intel_if = 1,
757 .set_all = generic_set_all, 756 .set_all = generic_set_all,
758 .get = generic_get_mtrr, 757 .get = generic_get_mtrr,
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index e006e56f699c..79289632cb27 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -5,6 +5,7 @@
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/ctype.h> 6#include <linux/ctype.h>
7#include <linux/string.h> 7#include <linux/string.h>
8#include <linux/slab.h>
8#include <linux/init.h> 9#include <linux/init.h>
9 10
10#define LINE_SIZE 80 11#define LINE_SIZE 80
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 84e83de54575..79556bd9b602 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -60,14 +60,14 @@ static DEFINE_MUTEX(mtrr_mutex);
60u64 size_or_mask, size_and_mask; 60u64 size_or_mask, size_and_mask;
61static bool mtrr_aps_delayed_init; 61static bool mtrr_aps_delayed_init;
62 62
63static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; 63static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM];
64 64
65struct mtrr_ops *mtrr_if; 65const struct mtrr_ops *mtrr_if;
66 66
67static void set_mtrr(unsigned int reg, unsigned long base, 67static void set_mtrr(unsigned int reg, unsigned long base,
68 unsigned long size, mtrr_type type); 68 unsigned long size, mtrr_type type);
69 69
70void set_mtrr_ops(struct mtrr_ops *ops) 70void set_mtrr_ops(const struct mtrr_ops *ops)
71{ 71{
72 if (ops->vendor && ops->vendor < X86_VENDOR_NUM) 72 if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
73 mtrr_ops[ops->vendor] = ops; 73 mtrr_ops[ops->vendor] = ops;
@@ -145,6 +145,7 @@ struct set_mtrr_data {
145 145
146/** 146/**
147 * ipi_handler - Synchronisation handler. Executed by "other" CPUs. 147 * ipi_handler - Synchronisation handler. Executed by "other" CPUs.
148 * @info: pointer to mtrr configuration data
148 * 149 *
149 * Returns nothing. 150 * Returns nothing.
150 */ 151 */
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index a501dee9a87a..df5e41f31a27 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -32,7 +32,7 @@ extern int generic_get_free_region(unsigned long base, unsigned long size,
32extern int generic_validate_add_page(unsigned long base, unsigned long size, 32extern int generic_validate_add_page(unsigned long base, unsigned long size,
33 unsigned int type); 33 unsigned int type);
34 34
35extern struct mtrr_ops generic_mtrr_ops; 35extern const struct mtrr_ops generic_mtrr_ops;
36 36
37extern int positive_have_wrcomb(void); 37extern int positive_have_wrcomb(void);
38 38
@@ -53,10 +53,10 @@ void fill_mtrr_var_range(unsigned int index,
53 u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); 53 u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
54void get_mtrr_state(void); 54void get_mtrr_state(void);
55 55
56extern void set_mtrr_ops(struct mtrr_ops *ops); 56extern void set_mtrr_ops(const struct mtrr_ops *ops);
57 57
58extern u64 size_or_mask, size_and_mask; 58extern u64 size_or_mask, size_and_mask;
59extern struct mtrr_ops *mtrr_if; 59extern const struct mtrr_ops *mtrr_if;
60 60
61#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) 61#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)
62#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) 62#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1)
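
The struct mtrr_ops constification across these files follows a common kernel idiom: an ops table that is never written after initialization can be declared const, so it lands in read-only memory and stray writes become compile errors. A trimmed stand-alone illustration with demo names:

#include <stdio.h>

struct demo_ops {
	int	vendor;
	void	(*set)(void);
};

static void demo_set(void)
{
	puts("set");
}

/* const: placed in .rodata, fields cannot be reassigned */
static const struct demo_ops demo_mtrr_ops = {
	.vendor	= 1,
	.set	= demo_set,
};

int main(void)
{
	const struct demo_ops *ops = &demo_mtrr_ops;

	ops->set();
	return 0;
}
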
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c
deleted file mode 100644
index dfc80b4e6b0d..000000000000
--- a/arch/x86/kernel/cpu/mtrr/state.c
+++ /dev/null
@@ -1,94 +0,0 @@
1#include <linux/init.h>
2#include <linux/io.h>
3#include <linux/mm.h>
4
5#include <asm/processor-cyrix.h>
6#include <asm/processor-flags.h>
7#include <asm/mtrr.h>
8#include <asm/msr.h>
9
10#include "mtrr.h"
11
12/* Put the processor into a state where MTRRs can be safely set */
13void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
14{
15 unsigned int cr0;
16
17 /* Disable interrupts locally */
18 local_irq_save(ctxt->flags);
19
20 if (use_intel() || is_cpu(CYRIX)) {
21
22 /* Save value of CR4 and clear Page Global Enable (bit 7) */
23 if (cpu_has_pge) {
24 ctxt->cr4val = read_cr4();
25 write_cr4(ctxt->cr4val & ~X86_CR4_PGE);
26 }
27
28 /*
29 * Disable and flush caches. Note that wbinvd flushes the TLBs
30 * as a side-effect
31 */
32 cr0 = read_cr0() | X86_CR0_CD;
33 wbinvd();
34 write_cr0(cr0);
35 wbinvd();
36
37 if (use_intel()) {
38 /* Save MTRR state */
39 rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
40 } else {
41 /*
42 * Cyrix ARRs -
 43 * everything else was excluded at the top
44 */
45 ctxt->ccr3 = getCx86(CX86_CCR3);
46 }
47 }
48}
49
50void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)
51{
52 if (use_intel()) {
53 /* Disable MTRRs, and set the default type to uncached */
54 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL,
55 ctxt->deftype_hi);
56 } else {
57 if (is_cpu(CYRIX)) {
 58 /* Cyrix ARRs - everything else was excluded at the top */
59 setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10);
60 }
61 }
62}
63
64/* Restore the processor after a set_mtrr_prepare */
65void set_mtrr_done(struct set_mtrr_context *ctxt)
66{
67 if (use_intel() || is_cpu(CYRIX)) {
68
69 /* Flush caches and TLBs */
70 wbinvd();
71
72 /* Restore MTRRdefType */
73 if (use_intel()) {
74 /* Intel (P6) standard MTRRs */
75 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo,
76 ctxt->deftype_hi);
77 } else {
78 /*
79 * Cyrix ARRs -
80 * everything else was excluded at the top
81 */
82 setCx86(CX86_CCR3, ctxt->ccr3);
83 }
84
85 /* Enable caches */
86 write_cr0(read_cr0() & 0xbfffffff);
87
88 /* Restore value of CR4 */
89 if (cpu_has_pge)
90 write_cr4(ctxt->cr4val);
91 }
92 /* Re-enable interrupts locally (if enabled previously) */
93 local_irq_restore(ctxt->flags);
94}
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index c223b7e895d9..db5bdc8addf8 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -7,6 +7,7 @@
7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter 7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
9 * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> 9 * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
10 * Copyright (C) 2009 Google, Inc., Stephane Eranian
10 * 11 *
11 * For licencing details see kernel-base/COPYING 12 * For licencing details see kernel-base/COPYING
12 */ 13 */
@@ -20,12 +21,15 @@
20#include <linux/kdebug.h> 21#include <linux/kdebug.h>
21#include <linux/sched.h> 22#include <linux/sched.h>
22#include <linux/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/slab.h>
23#include <linux/highmem.h> 25#include <linux/highmem.h>
24#include <linux/cpu.h> 26#include <linux/cpu.h>
27#include <linux/bitops.h>
25 28
26#include <asm/apic.h> 29#include <asm/apic.h>
27#include <asm/stacktrace.h> 30#include <asm/stacktrace.h>
28#include <asm/nmi.h> 31#include <asm/nmi.h>
32#include <asm/compat.h>
29 33
30static u64 perf_event_mask __read_mostly; 34static u64 perf_event_mask __read_mostly;
31 35
@@ -68,26 +72,59 @@ struct debug_store {
68 u64 pebs_event_reset[MAX_PEBS_EVENTS]; 72 u64 pebs_event_reset[MAX_PEBS_EVENTS];
69}; 73};
70 74
75struct event_constraint {
76 union {
77 unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
78 u64 idxmsk64;
79 };
80 u64 code;
81 u64 cmask;
82 int weight;
83};
84
85struct amd_nb {
86 int nb_id; /* NorthBridge id */
87 int refcnt; /* reference count */
88 struct perf_event *owners[X86_PMC_IDX_MAX];
89 struct event_constraint event_constraints[X86_PMC_IDX_MAX];
90};
91
71struct cpu_hw_events { 92struct cpu_hw_events {
72 struct perf_event *events[X86_PMC_IDX_MAX]; 93 struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
73 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
74 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 94 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
75 unsigned long interrupts; 95 unsigned long interrupts;
76 int enabled; 96 int enabled;
77 struct debug_store *ds; 97 struct debug_store *ds;
78};
79 98
80struct event_constraint { 99 int n_events;
81 unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 100 int n_added;
82 int code; 101 int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
102 u64 tags[X86_PMC_IDX_MAX];
103 struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
104 struct amd_nb *amd_nb;
83}; 105};
84 106
85#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) } 107#define __EVENT_CONSTRAINT(c, n, m, w) {\
86#define EVENT_CONSTRAINT_END { .code = 0, .idxmsk[0] = 0 } 108 { .idxmsk64 = (n) }, \
109 .code = (c), \
110 .cmask = (m), \
111 .weight = (w), \
112}
113
114#define EVENT_CONSTRAINT(c, n, m) \
115 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
87 116
88#define for_each_event_constraint(e, c) \ 117#define INTEL_EVENT_CONSTRAINT(c, n) \
89 for ((e) = (c); (e)->idxmsk[0]; (e)++) 118 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK)
90 119
120#define FIXED_EVENT_CONSTRAINT(c, n) \
121 EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK)
122
123#define EVENT_CONSTRAINT_END \
124 EVENT_CONSTRAINT(0, 0, 0)
125
126#define for_each_event_constraint(e, c) \
127 for ((e) = (c); (e)->cmask; (e)++)
91 128
92/* 129/*
93 * struct x86_pmu - generic x86 pmu 130 * struct x86_pmu - generic x86 pmu
@@ -98,8 +135,8 @@ struct x86_pmu {
98 int (*handle_irq)(struct pt_regs *); 135 int (*handle_irq)(struct pt_regs *);
99 void (*disable_all)(void); 136 void (*disable_all)(void);
100 void (*enable_all)(void); 137 void (*enable_all)(void);
101 void (*enable)(struct hw_perf_event *, int); 138 void (*enable)(struct perf_event *);
102 void (*disable)(struct hw_perf_event *, int); 139 void (*disable)(struct perf_event *);
103 unsigned eventsel; 140 unsigned eventsel;
104 unsigned perfctr; 141 unsigned perfctr;
105 u64 (*event_map)(int); 142 u64 (*event_map)(int);
@@ -114,121 +151,28 @@ struct x86_pmu {
114 u64 intel_ctrl; 151 u64 intel_ctrl;
115 void (*enable_bts)(u64 config); 152 void (*enable_bts)(u64 config);
116 void (*disable_bts)(void); 153 void (*disable_bts)(void);
117 int (*get_event_idx)(struct cpu_hw_events *cpuc,
118 struct hw_perf_event *hwc);
119};
120 154
121static struct x86_pmu x86_pmu __read_mostly; 155 struct event_constraint *
156 (*get_event_constraints)(struct cpu_hw_events *cpuc,
157 struct perf_event *event);
122 158
123static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { 159 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
124 .enabled = 1, 160 struct perf_event *event);
125}; 161 struct event_constraint *event_constraints;
126 162
127static const struct event_constraint *event_constraints; 163 int (*cpu_prepare)(int cpu);
128 164 void (*cpu_starting)(int cpu);
129/* 165 void (*cpu_dying)(int cpu);
130 * Not sure about some of these 166 void (*cpu_dead)(int cpu);
131 */
132static const u64 p6_perfmon_event_map[] =
133{
134 [PERF_COUNT_HW_CPU_CYCLES] = 0x0079,
135 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
136 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e,
137 [PERF_COUNT_HW_CACHE_MISSES] = 0x012e,
138 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
139 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
140 [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
141};
142
143static u64 p6_pmu_event_map(int hw_event)
144{
145 return p6_perfmon_event_map[hw_event];
146}
147
148/*
149 * Event setting that is specified not to count anything.
150 * We use this to effectively disable a counter.
151 *
152 * L2_RQSTS with 0 MESI unit mask.
153 */
154#define P6_NOP_EVENT 0x0000002EULL
155
156static u64 p6_pmu_raw_event(u64 hw_event)
157{
158#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
159#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
160#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
161#define P6_EVNTSEL_INV_MASK 0x00800000ULL
162#define P6_EVNTSEL_REG_MASK 0xFF000000ULL
163
164#define P6_EVNTSEL_MASK \
165 (P6_EVNTSEL_EVENT_MASK | \
166 P6_EVNTSEL_UNIT_MASK | \
167 P6_EVNTSEL_EDGE_MASK | \
168 P6_EVNTSEL_INV_MASK | \
169 P6_EVNTSEL_REG_MASK)
170
171 return hw_event & P6_EVNTSEL_MASK;
172}
173
174static const struct event_constraint intel_p6_event_constraints[] =
175{
176 EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
177 EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
178 EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */
179 EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
180 EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
181 EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
182 EVENT_CONSTRAINT_END
183};
184
185/*
186 * Intel PerfMon v3. Used on Core2 and later.
187 */
188static const u64 intel_perfmon_event_map[] =
189{
190 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
191 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
192 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
193 [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
194 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
195 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
196 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
197}; 167};
198 168
199static const struct event_constraint intel_core_event_constraints[] = 169static struct x86_pmu x86_pmu __read_mostly;
200{
201 EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
202 EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
203 EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
204 EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
205 EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
206 EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
207 EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
208 EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
209 EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
210 EVENT_CONSTRAINT_END
211};
212 170
213static const struct event_constraint intel_nehalem_event_constraints[] = 171static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
214{ 172 .enabled = 1,
215 EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
216 EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
217 EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
218 EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
219 EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
220 EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */
221 EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
222 EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */
223 EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */
224 EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */
225 EVENT_CONSTRAINT_END
226}; 173};
227 174
228static u64 intel_pmu_event_map(int hw_event) 175static int x86_perf_event_set_period(struct perf_event *event);
229{
230 return intel_perfmon_event_map[hw_event];
231}
232 176
233/* 177/*
234 * Generalized hw caching related hw_event table, filled 178 * Generalized hw caching related hw_event table, filled
@@ -245,435 +189,18 @@ static u64 __read_mostly hw_cache_event_ids
245 [PERF_COUNT_HW_CACHE_OP_MAX] 189 [PERF_COUNT_HW_CACHE_OP_MAX]
246 [PERF_COUNT_HW_CACHE_RESULT_MAX]; 190 [PERF_COUNT_HW_CACHE_RESULT_MAX];
247 191
248static __initconst u64 nehalem_hw_cache_event_ids
249 [PERF_COUNT_HW_CACHE_MAX]
250 [PERF_COUNT_HW_CACHE_OP_MAX]
251 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
252{
253 [ C(L1D) ] = {
254 [ C(OP_READ) ] = {
255 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
256 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
257 },
258 [ C(OP_WRITE) ] = {
259 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
260 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
261 },
262 [ C(OP_PREFETCH) ] = {
263 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
264 [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
265 },
266 },
267 [ C(L1I ) ] = {
268 [ C(OP_READ) ] = {
269 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
270 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
271 },
272 [ C(OP_WRITE) ] = {
273 [ C(RESULT_ACCESS) ] = -1,
274 [ C(RESULT_MISS) ] = -1,
275 },
276 [ C(OP_PREFETCH) ] = {
277 [ C(RESULT_ACCESS) ] = 0x0,
278 [ C(RESULT_MISS) ] = 0x0,
279 },
280 },
281 [ C(LL ) ] = {
282 [ C(OP_READ) ] = {
283 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
284 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
285 },
286 [ C(OP_WRITE) ] = {
287 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
288 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
289 },
290 [ C(OP_PREFETCH) ] = {
291 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
292 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
293 },
294 },
295 [ C(DTLB) ] = {
296 [ C(OP_READ) ] = {
297 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
298 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
299 },
300 [ C(OP_WRITE) ] = {
301 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
302 [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
303 },
304 [ C(OP_PREFETCH) ] = {
305 [ C(RESULT_ACCESS) ] = 0x0,
306 [ C(RESULT_MISS) ] = 0x0,
307 },
308 },
309 [ C(ITLB) ] = {
310 [ C(OP_READ) ] = {
311 [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
312 [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
313 },
314 [ C(OP_WRITE) ] = {
315 [ C(RESULT_ACCESS) ] = -1,
316 [ C(RESULT_MISS) ] = -1,
317 },
318 [ C(OP_PREFETCH) ] = {
319 [ C(RESULT_ACCESS) ] = -1,
320 [ C(RESULT_MISS) ] = -1,
321 },
322 },
323 [ C(BPU ) ] = {
324 [ C(OP_READ) ] = {
325 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
326 [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
327 },
328 [ C(OP_WRITE) ] = {
329 [ C(RESULT_ACCESS) ] = -1,
330 [ C(RESULT_MISS) ] = -1,
331 },
332 [ C(OP_PREFETCH) ] = {
333 [ C(RESULT_ACCESS) ] = -1,
334 [ C(RESULT_MISS) ] = -1,
335 },
336 },
337};
338
339static __initconst u64 core2_hw_cache_event_ids
340 [PERF_COUNT_HW_CACHE_MAX]
341 [PERF_COUNT_HW_CACHE_OP_MAX]
342 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
343{
344 [ C(L1D) ] = {
345 [ C(OP_READ) ] = {
346 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
347 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
348 },
349 [ C(OP_WRITE) ] = {
350 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
351 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
352 },
353 [ C(OP_PREFETCH) ] = {
354 [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
355 [ C(RESULT_MISS) ] = 0,
356 },
357 },
358 [ C(L1I ) ] = {
359 [ C(OP_READ) ] = {
360 [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
361 [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
362 },
363 [ C(OP_WRITE) ] = {
364 [ C(RESULT_ACCESS) ] = -1,
365 [ C(RESULT_MISS) ] = -1,
366 },
367 [ C(OP_PREFETCH) ] = {
368 [ C(RESULT_ACCESS) ] = 0,
369 [ C(RESULT_MISS) ] = 0,
370 },
371 },
372 [ C(LL ) ] = {
373 [ C(OP_READ) ] = {
374 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
375 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
376 },
377 [ C(OP_WRITE) ] = {
378 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
379 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
380 },
381 [ C(OP_PREFETCH) ] = {
382 [ C(RESULT_ACCESS) ] = 0,
383 [ C(RESULT_MISS) ] = 0,
384 },
385 },
386 [ C(DTLB) ] = {
387 [ C(OP_READ) ] = {
388 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
389 [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
390 },
391 [ C(OP_WRITE) ] = {
392 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
393 [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
394 },
395 [ C(OP_PREFETCH) ] = {
396 [ C(RESULT_ACCESS) ] = 0,
397 [ C(RESULT_MISS) ] = 0,
398 },
399 },
400 [ C(ITLB) ] = {
401 [ C(OP_READ) ] = {
402 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
403 [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
404 },
405 [ C(OP_WRITE) ] = {
406 [ C(RESULT_ACCESS) ] = -1,
407 [ C(RESULT_MISS) ] = -1,
408 },
409 [ C(OP_PREFETCH) ] = {
410 [ C(RESULT_ACCESS) ] = -1,
411 [ C(RESULT_MISS) ] = -1,
412 },
413 },
414 [ C(BPU ) ] = {
415 [ C(OP_READ) ] = {
416 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
417 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
418 },
419 [ C(OP_WRITE) ] = {
420 [ C(RESULT_ACCESS) ] = -1,
421 [ C(RESULT_MISS) ] = -1,
422 },
423 [ C(OP_PREFETCH) ] = {
424 [ C(RESULT_ACCESS) ] = -1,
425 [ C(RESULT_MISS) ] = -1,
426 },
427 },
428};
429
430static __initconst u64 atom_hw_cache_event_ids
431 [PERF_COUNT_HW_CACHE_MAX]
432 [PERF_COUNT_HW_CACHE_OP_MAX]
433 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
434{
435 [ C(L1D) ] = {
436 [ C(OP_READ) ] = {
437 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
438 [ C(RESULT_MISS) ] = 0,
439 },
440 [ C(OP_WRITE) ] = {
441 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
442 [ C(RESULT_MISS) ] = 0,
443 },
444 [ C(OP_PREFETCH) ] = {
445 [ C(RESULT_ACCESS) ] = 0x0,
446 [ C(RESULT_MISS) ] = 0,
447 },
448 },
449 [ C(L1I ) ] = {
450 [ C(OP_READ) ] = {
451 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
452 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
453 },
454 [ C(OP_WRITE) ] = {
455 [ C(RESULT_ACCESS) ] = -1,
456 [ C(RESULT_MISS) ] = -1,
457 },
458 [ C(OP_PREFETCH) ] = {
459 [ C(RESULT_ACCESS) ] = 0,
460 [ C(RESULT_MISS) ] = 0,
461 },
462 },
463 [ C(LL ) ] = {
464 [ C(OP_READ) ] = {
465 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
466 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
467 },
468 [ C(OP_WRITE) ] = {
469 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
470 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
471 },
472 [ C(OP_PREFETCH) ] = {
473 [ C(RESULT_ACCESS) ] = 0,
474 [ C(RESULT_MISS) ] = 0,
475 },
476 },
477 [ C(DTLB) ] = {
478 [ C(OP_READ) ] = {
479 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
480 [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
481 },
482 [ C(OP_WRITE) ] = {
483 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
484 [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
485 },
486 [ C(OP_PREFETCH) ] = {
487 [ C(RESULT_ACCESS) ] = 0,
488 [ C(RESULT_MISS) ] = 0,
489 },
490 },
491 [ C(ITLB) ] = {
492 [ C(OP_READ) ] = {
493 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
494 [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
495 },
496 [ C(OP_WRITE) ] = {
497 [ C(RESULT_ACCESS) ] = -1,
498 [ C(RESULT_MISS) ] = -1,
499 },
500 [ C(OP_PREFETCH) ] = {
501 [ C(RESULT_ACCESS) ] = -1,
502 [ C(RESULT_MISS) ] = -1,
503 },
504 },
505 [ C(BPU ) ] = {
506 [ C(OP_READ) ] = {
507 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
508 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
509 },
510 [ C(OP_WRITE) ] = {
511 [ C(RESULT_ACCESS) ] = -1,
512 [ C(RESULT_MISS) ] = -1,
513 },
514 [ C(OP_PREFETCH) ] = {
515 [ C(RESULT_ACCESS) ] = -1,
516 [ C(RESULT_MISS) ] = -1,
517 },
518 },
519};
520
521static u64 intel_pmu_raw_event(u64 hw_event)
522{
523#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
524#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
525#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
526#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
527#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
528
529#define CORE_EVNTSEL_MASK \
530 (CORE_EVNTSEL_EVENT_MASK | \
531 CORE_EVNTSEL_UNIT_MASK | \
532 CORE_EVNTSEL_EDGE_MASK | \
533 CORE_EVNTSEL_INV_MASK | \
534 CORE_EVNTSEL_REG_MASK)
535
536 return hw_event & CORE_EVNTSEL_MASK;
537}
538
539static __initconst u64 amd_hw_cache_event_ids
540 [PERF_COUNT_HW_CACHE_MAX]
541 [PERF_COUNT_HW_CACHE_OP_MAX]
542 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
543{
544 [ C(L1D) ] = {
545 [ C(OP_READ) ] = {
546 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
547 [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */
548 },
549 [ C(OP_WRITE) ] = {
550 [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
551 [ C(RESULT_MISS) ] = 0,
552 },
553 [ C(OP_PREFETCH) ] = {
554 [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */
555 [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */
556 },
557 },
558 [ C(L1I ) ] = {
559 [ C(OP_READ) ] = {
560 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
561 [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
562 },
563 [ C(OP_WRITE) ] = {
564 [ C(RESULT_ACCESS) ] = -1,
565 [ C(RESULT_MISS) ] = -1,
566 },
567 [ C(OP_PREFETCH) ] = {
568 [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
569 [ C(RESULT_MISS) ] = 0,
570 },
571 },
572 [ C(LL ) ] = {
573 [ C(OP_READ) ] = {
574 [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
575 [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */
576 },
577 [ C(OP_WRITE) ] = {
578 [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */
579 [ C(RESULT_MISS) ] = 0,
580 },
581 [ C(OP_PREFETCH) ] = {
582 [ C(RESULT_ACCESS) ] = 0,
583 [ C(RESULT_MISS) ] = 0,
584 },
585 },
586 [ C(DTLB) ] = {
587 [ C(OP_READ) ] = {
588 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
589 [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DTLB Miss */
590 },
591 [ C(OP_WRITE) ] = {
592 [ C(RESULT_ACCESS) ] = 0,
593 [ C(RESULT_MISS) ] = 0,
594 },
595 [ C(OP_PREFETCH) ] = {
596 [ C(RESULT_ACCESS) ] = 0,
597 [ C(RESULT_MISS) ] = 0,
598 },
599 },
600 [ C(ITLB) ] = {
601 [ C(OP_READ) ] = {
602 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fetches */
603 [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
604 },
605 [ C(OP_WRITE) ] = {
606 [ C(RESULT_ACCESS) ] = -1,
607 [ C(RESULT_MISS) ] = -1,
608 },
609 [ C(OP_PREFETCH) ] = {
610 [ C(RESULT_ACCESS) ] = -1,
611 [ C(RESULT_MISS) ] = -1,
612 },
613 },
614 [ C(BPU ) ] = {
615 [ C(OP_READ) ] = {
616 [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
617 [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
618 },
619 [ C(OP_WRITE) ] = {
620 [ C(RESULT_ACCESS) ] = -1,
621 [ C(RESULT_MISS) ] = -1,
622 },
623 [ C(OP_PREFETCH) ] = {
624 [ C(RESULT_ACCESS) ] = -1,
625 [ C(RESULT_MISS) ] = -1,
626 },
627 },
628};
629
630/*
631 * AMD Performance Monitor K7 and later.
632 */
633static const u64 amd_perfmon_event_map[] =
634{
635 [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
636 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
637 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
638 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
639 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
640 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
641};
642
643static u64 amd_pmu_event_map(int hw_event)
644{
645 return amd_perfmon_event_map[hw_event];
646}
647
648static u64 amd_pmu_raw_event(u64 hw_event)
649{
650#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL
651#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
652#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
653#define K7_EVNTSEL_INV_MASK 0x000800000ULL
654#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL
655
656#define K7_EVNTSEL_MASK \
657 (K7_EVNTSEL_EVENT_MASK | \
658 K7_EVNTSEL_UNIT_MASK | \
659 K7_EVNTSEL_EDGE_MASK | \
660 K7_EVNTSEL_INV_MASK | \
661 K7_EVNTSEL_REG_MASK)
662
663 return hw_event & K7_EVNTSEL_MASK;
664}
665
666/* 192/*
667 * Propagate event elapsed time into the generic event. 193 * Propagate event elapsed time into the generic event.
668 * Can only be executed on the CPU where the event is active. 194 * Can only be executed on the CPU where the event is active.
669 * Returns the delta events processed. 195 * Returns the delta events processed.
670 */ 196 */
671static u64 197static u64
672x86_perf_event_update(struct perf_event *event, 198x86_perf_event_update(struct perf_event *event)
673 struct hw_perf_event *hwc, int idx)
674{ 199{
200 struct hw_perf_event *hwc = &event->hw;
675 int shift = 64 - x86_pmu.event_bits; 201 int shift = 64 - x86_pmu.event_bits;
676 u64 prev_raw_count, new_raw_count; 202 u64 prev_raw_count, new_raw_count;
203 int idx = hwc->idx;
677 s64 delta; 204 s64 delta;
678 205
679 if (idx == X86_PMC_IDX_FIXED_BTS) 206 if (idx == X86_PMC_IDX_FIXED_BTS)
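The shift visible above is what makes the update safe across counter wraparound: both raw reads are shifted left by 64 - event_bits and the difference is shifted back arithmetically, so bits beyond the implemented counter width cancel out. A hedged user-space model of that arithmetic, assuming a 40-bit counter:

	#include <stdio.h>
	#include <stdint.h>

	/* model of the kernel's delta computation for a 'bits'-wide counter */
	static int64_t counter_delta(uint64_t prev, uint64_t now, int bits)
	{
		int shift = 64 - bits;
		int64_t delta;

		/* shift out the unimplemented high bits, then sign-extend back */
		delta = (int64_t)((now << shift) - (prev << shift));
		delta >>= shift;

		return delta;
	}

	int main(void)
	{
		/* the 40-bit counter wrapped: went from near the top back to 5 */
		uint64_t prev = (1ULL << 40) - 10;
		uint64_t now  = 5;

		printf("delta = %lld\n", (long long)counter_delta(prev, now, 40));
		return 0;	/* prints delta = 15 despite the wrap */
	}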
@@ -773,7 +300,7 @@ static inline bool bts_available(void)
773 return x86_pmu.enable_bts != NULL; 300 return x86_pmu.enable_bts != NULL;
774} 301}
775 302
776static inline void init_debug_store_on_cpu(int cpu) 303static void init_debug_store_on_cpu(int cpu)
777{ 304{
778 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; 305 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
779 306
@@ -785,7 +312,7 @@ static inline void init_debug_store_on_cpu(int cpu)
785 (u32)((u64)(unsigned long)ds >> 32)); 312 (u32)((u64)(unsigned long)ds >> 32));
786} 313}
787 314
788static inline void fini_debug_store_on_cpu(int cpu) 315static void fini_debug_store_on_cpu(int cpu)
789{ 316{
790 if (!per_cpu(cpu_hw_events, cpu).ds) 317 if (!per_cpu(cpu_hw_events, cpu).ds)
791 return; 318 return;
@@ -914,42 +441,6 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
914 return 0; 441 return 0;
915} 442}
916 443
917static void intel_pmu_enable_bts(u64 config)
918{
919 unsigned long debugctlmsr;
920
921 debugctlmsr = get_debugctlmsr();
922
923 debugctlmsr |= X86_DEBUGCTL_TR;
924 debugctlmsr |= X86_DEBUGCTL_BTS;
925 debugctlmsr |= X86_DEBUGCTL_BTINT;
926
927 if (!(config & ARCH_PERFMON_EVENTSEL_OS))
928 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
929
930 if (!(config & ARCH_PERFMON_EVENTSEL_USR))
931 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
932
933 update_debugctlmsr(debugctlmsr);
934}
935
936static void intel_pmu_disable_bts(void)
937{
938 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
939 unsigned long debugctlmsr;
940
941 if (!cpuc->ds)
942 return;
943
944 debugctlmsr = get_debugctlmsr();
945
946 debugctlmsr &=
947 ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
948 X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
949
950 update_debugctlmsr(debugctlmsr);
951}
952
953/* 444/*
954 * Setup the hardware configuration for a given attr_type 445 * Setup the hardware configuration for a given attr_type
955 */ 446 */
@@ -988,6 +479,8 @@ static int __hw_perf_event_init(struct perf_event *event)
988 hwc->config = ARCH_PERFMON_EVENTSEL_INT; 479 hwc->config = ARCH_PERFMON_EVENTSEL_INT;
989 480
990 hwc->idx = -1; 481 hwc->idx = -1;
482 hwc->last_cpu = -1;
483 hwc->last_tag = ~0ULL;
991 484
992 /* 485 /*
993 * Count user and OS events unless requested not to. 486 * Count user and OS events unless requested not to.
@@ -1017,6 +510,9 @@ static int __hw_perf_event_init(struct perf_event *event)
1017 */ 510 */
1018 if (attr->type == PERF_TYPE_RAW) { 511 if (attr->type == PERF_TYPE_RAW) {
1019 hwc->config |= x86_pmu.raw_event(attr->config); 512 hwc->config |= x86_pmu.raw_event(attr->config);
513 if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) &&
514 perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
515 return -EACCES;
1020 return 0; 516 return 0;
1021 } 517 }
1022 518
@@ -1056,216 +552,314 @@ static int __hw_perf_event_init(struct perf_event *event)
1056 return 0; 552 return 0;
1057} 553}
1058 554
1059static void p6_pmu_disable_all(void) 555static void x86_pmu_disable_all(void)
1060{ 556{
1061 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 557 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1062 u64 val; 558 int idx;
1063
1064 if (!cpuc->enabled)
1065 return;
1066 559
1067 cpuc->enabled = 0; 560 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1068 barrier(); 561 u64 val;
1069 562
1070 /* p6 only has one enable register */ 563 if (!test_bit(idx, cpuc->active_mask))
1071 rdmsrl(MSR_P6_EVNTSEL0, val); 564 continue;
1072 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; 565 rdmsrl(x86_pmu.eventsel + idx, val);
1073 wrmsrl(MSR_P6_EVNTSEL0, val); 566 if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
567 continue;
568 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
569 wrmsrl(x86_pmu.eventsel + idx, val);
570 }
1074} 571}
1075 572
1076static void intel_pmu_disable_all(void) 573void hw_perf_disable(void)
1077{ 574{
1078 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 575 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1079 576
577 if (!x86_pmu_initialized())
578 return;
579
1080 if (!cpuc->enabled) 580 if (!cpuc->enabled)
1081 return; 581 return;
1082 582
583 cpuc->n_added = 0;
1083 cpuc->enabled = 0; 584 cpuc->enabled = 0;
1084 barrier(); 585 barrier();
1085 586
1086 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); 587 x86_pmu.disable_all();
1087
1088 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
1089 intel_pmu_disable_bts();
1090} 588}
1091 589
1092static void amd_pmu_disable_all(void) 590static void x86_pmu_enable_all(void)
1093{ 591{
1094 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 592 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1095 int idx; 593 int idx;
1096 594
1097 if (!cpuc->enabled)
1098 return;
1099
1100 cpuc->enabled = 0;
1101 /*
1102 * ensure we write the disable before we start disabling the
1103 * events proper, so that amd_pmu_enable_event() does the
1104 * right thing.
1105 */
1106 barrier();
1107
1108 for (idx = 0; idx < x86_pmu.num_events; idx++) { 595 for (idx = 0; idx < x86_pmu.num_events; idx++) {
596 struct perf_event *event = cpuc->events[idx];
1109 u64 val; 597 u64 val;
1110 598
1111 if (!test_bit(idx, cpuc->active_mask)) 599 if (!test_bit(idx, cpuc->active_mask))
1112 continue; 600 continue;
1113 rdmsrl(MSR_K7_EVNTSEL0 + idx, val); 601
1114 if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) 602 val = event->hw.config;
1115 continue; 603 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
1116 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; 604 wrmsrl(x86_pmu.eventsel + idx, val);
1117 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
1118 } 605 }
1119} 606}
1120 607
1121void hw_perf_disable(void) 608static const struct pmu pmu;
609
610static inline int is_x86_event(struct perf_event *event)
1122{ 611{
1123 if (!x86_pmu_initialized()) 612 return event->pmu == &pmu;
1124 return;
1125 return x86_pmu.disable_all();
1126} 613}
1127 614
1128static void p6_pmu_enable_all(void) 615static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
1129{ 616{
1130 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 617 struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
1131 unsigned long val; 618 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
619 int i, j, w, wmax, num = 0;
620 struct hw_perf_event *hwc;
1132 621
1133 if (cpuc->enabled) 622 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
1134 return;
1135 623
1136 cpuc->enabled = 1; 624 for (i = 0; i < n; i++) {
1137 barrier(); 625 c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
626 constraints[i] = c;
627 }
1138 628
1139 /* p6 only has one enable register */ 629 /*
1140 rdmsrl(MSR_P6_EVNTSEL0, val); 630 * fastpath, try to reuse previous register
1141 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 631 */
1142 wrmsrl(MSR_P6_EVNTSEL0, val); 632 for (i = 0; i < n; i++) {
1143} 633 hwc = &cpuc->event_list[i]->hw;
634 c = constraints[i];
1144 635
1145static void intel_pmu_enable_all(void) 636 /* never assigned */
1146{ 637 if (hwc->idx == -1)
1147 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 638 break;
1148 639
1149 if (cpuc->enabled) 640 /* constraint still honored */
1150 return; 641 if (!test_bit(hwc->idx, c->idxmsk))
642 break;
1151 643
1152 cpuc->enabled = 1; 644 /* not already used */
1153 barrier(); 645 if (test_bit(hwc->idx, used_mask))
646 break;
1154 647
1155 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); 648 __set_bit(hwc->idx, used_mask);
649 if (assign)
650 assign[i] = hwc->idx;
651 }
652 if (i == n)
653 goto done;
1156 654
1157 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { 655 /*
1158 struct perf_event *event = 656 * begin slow path
1159 cpuc->events[X86_PMC_IDX_FIXED_BTS]; 657 */
1160 658
1161 if (WARN_ON_ONCE(!event)) 659 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
1162 return;
1163 660
1164 intel_pmu_enable_bts(event->hw.config); 661 /*
1165 } 662 * weight = number of possible counters
1166} 663 *
664 * 1 = most constrained, only works on one counter
665 * wmax = least constrained, works on any counter
666 *
667 * assign events to counters starting with most
668 * constrained events.
669 */
670 wmax = x86_pmu.num_events;
1167 671
1168static void amd_pmu_enable_all(void) 672 /*
1169{ 673 * when fixed event counters are present,
1170 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 674 * wmax is incremented by 1 to account
1171 int idx; 675 * for one more choice
676 */
677 if (x86_pmu.num_events_fixed)
678 wmax++;
1172 679
1173 if (cpuc->enabled) 680 for (w = 1, num = n; num && w <= wmax; w++) {
1174 return; 681 /* for each event */
682 for (i = 0; num && i < n; i++) {
683 c = constraints[i];
684 hwc = &cpuc->event_list[i]->hw;
1175 685
1176 cpuc->enabled = 1; 686 if (c->weight != w)
1177 barrier(); 687 continue;
1178 688
1179 for (idx = 0; idx < x86_pmu.num_events; idx++) { 689 for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) {
1180 struct perf_event *event = cpuc->events[idx]; 690 if (!test_bit(j, used_mask))
1181 u64 val; 691 break;
692 }
1182 693
1183 if (!test_bit(idx, cpuc->active_mask)) 694 if (j == X86_PMC_IDX_MAX)
1184 continue; 695 break;
1185 696
1186 val = event->hw.config; 697 __set_bit(j, used_mask);
1187 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 698
1188 wrmsrl(MSR_K7_EVNTSEL0 + idx, val); 699 if (assign)
700 assign[i] = j;
701 num--;
702 }
1189 } 703 }
704done:
705 /*
706 * scheduling failed or is just a simulation,
707 * free resources if necessary
708 */
709 if (!assign || num) {
710 for (i = 0; i < n; i++) {
711 if (x86_pmu.put_event_constraints)
712 x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
713 }
714 }
715 return num ? -ENOSPC : 0;
1190} 716}
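The weight loop above is the crux of the new scheduler: an event whose constraint admits a single counter (weight 1) is placed before events that can go anywhere, so a flexible event can never squat on the only legal home of a constrained one. A toy user-space model of the slow path; the event names and masks are hypothetical:

	#include <stdio.h>
	#include <stdint.h>

	#define NUM_COUNTERS 4

	/* toy event: a mask of the counters its constraint allows */
	struct toy_event {
		const char *name;
		uint32_t    idxmsk;
	};

	int main(void)
	{
		struct toy_event ev[] = {
			{ "cycles", 0xf },	/* any counter: weight 4 */
			{ "mul",    0x2 },	/* counter 1 only: weight 1 */
			{ "div",    0x2 },	/* counter 1 only: must lose */
		};
		int n = 3, assign[3] = { -1, -1, -1 };
		uint32_t used = 0;

		/* place most-constrained (lowest weight) events first */
		for (int w = 1; w <= NUM_COUNTERS; w++) {
			for (int i = 0; i < n; i++) {
				if (assign[i] != -1 ||
				    __builtin_popcount(ev[i].idxmsk) != w)
					continue;
				for (int j = 0; j < NUM_COUNTERS; j++) {
					if ((ev[i].idxmsk & (1u << j)) &&
					    !(used & (1u << j))) {
						used |= 1u << j;
						assign[i] = j;
						break;
					}
				}
			}
		}
		for (int i = 0; i < n; i++)
			printf("%-6s -> counter %d\n", ev[i].name, assign[i]);
		return 0;	/* mul gets 1, div fails (-1), cycles gets 0 */
	}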
1191 717
1192void hw_perf_enable(void) 718/*
719 * dogrp: true if sibling events (the whole group) must be collected too
720 * returns the total number of events, or a negative error code
721 */
722static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
1193{ 723{
1194 if (!x86_pmu_initialized()) 724 struct perf_event *event;
1195 return; 725 int n, max_count;
1196 x86_pmu.enable_all();
1197}
1198 726
1199static inline u64 intel_pmu_get_status(void) 727 max_count = x86_pmu.num_events + x86_pmu.num_events_fixed;
1200{
1201 u64 status;
1202 728
1203 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); 729 /* current number of events already accepted */
730 n = cpuc->n_events;
1204 731
1205 return status; 732 if (is_x86_event(leader)) {
1206} 733 if (n >= max_count)
734 return -ENOSPC;
735 cpuc->event_list[n] = leader;
736 n++;
737 }
738 if (!dogrp)
739 return n;
1207 740
1208static inline void intel_pmu_ack_status(u64 ack) 741 list_for_each_entry(event, &leader->sibling_list, group_entry) {
1209{ 742 if (!is_x86_event(event) ||
1210 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); 743 event->state <= PERF_EVENT_STATE_OFF)
1211} 744 continue;
1212 745
1213static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) 746 if (n >= max_count)
1214{ 747 return -ENOSPC;
1215 (void)checking_wrmsrl(hwc->config_base + idx,
1216 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
1217}
1218 748
1219static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx) 749 cpuc->event_list[n] = event;
1220{ 750 n++;
1221 (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); 751 }
752 return n;
1222} 753}
1223 754
1224static inline void 755static inline void x86_assign_hw_event(struct perf_event *event,
1225intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx) 756 struct cpu_hw_events *cpuc, int i)
1226{ 757{
1227 int idx = __idx - X86_PMC_IDX_FIXED; 758 struct hw_perf_event *hwc = &event->hw;
1228 u64 ctrl_val, mask;
1229 759
1230 mask = 0xfULL << (idx * 4); 760 hwc->idx = cpuc->assign[i];
761 hwc->last_cpu = smp_processor_id();
762 hwc->last_tag = ++cpuc->tags[i];
1231 763
1232 rdmsrl(hwc->config_base, ctrl_val); 764 if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
1233 ctrl_val &= ~mask; 765 hwc->config_base = 0;
1234 (void)checking_wrmsrl(hwc->config_base, ctrl_val); 766 hwc->event_base = 0;
767 } else if (hwc->idx >= X86_PMC_IDX_FIXED) {
768 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
769 /*
770 * We set it so that event_base + idx in wrmsr/rdmsr maps to
771 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
772 */
773 hwc->event_base =
774 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
775 } else {
776 hwc->config_base = x86_pmu.eventsel;
777 hwc->event_base = x86_pmu.perfctr;
778 }
1235} 779}
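Pre-biasing event_base by -X86_PMC_IDX_FIXED lets the common rdmsr/wrmsr paths address generic and fixed counters uniformly as event_base + idx. A worked check of the arithmetic, assuming X86_PMC_IDX_FIXED is 32 and the SDM value 0x309 for MSR_ARCH_PERFMON_FIXED_CTR0:

	#include <stdio.h>

	#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309	/* per the Intel SDM */
	#define X86_PMC_IDX_FIXED	    32		/* assumed kernel value */

	int main(void)
	{
		unsigned int event_base =
			MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;

		/* fixed counter k has idx = X86_PMC_IDX_FIXED + k */
		for (int idx = X86_PMC_IDX_FIXED; idx < X86_PMC_IDX_FIXED + 3; idx++)
			printf("idx %d -> MSR 0x%x\n", idx, event_base + idx);

		return 0;	/* prints 0x309, 0x30a, 0x30b: FIXED_CTR0..CTR2 */
	}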
1236 780
1237static inline void 781static inline int match_prev_assignment(struct hw_perf_event *hwc,
1238p6_pmu_disable_event(struct hw_perf_event *hwc, int idx) 782 struct cpu_hw_events *cpuc,
783 int i)
1239{ 784{
1240 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 785 return hwc->idx == cpuc->assign[i] &&
1241 u64 val = P6_NOP_EVENT; 786 hwc->last_cpu == smp_processor_id() &&
1242 787 hwc->last_tag == cpuc->tags[i];
1243 if (cpuc->enabled)
1244 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
1245
1246 (void)checking_wrmsrl(hwc->config_base + idx, val);
1247} 788}
1248 789
1249static inline void 790static int x86_pmu_start(struct perf_event *event);
1250intel_pmu_disable_event(struct hw_perf_event *hwc, int idx) 791static void x86_pmu_stop(struct perf_event *event);
792
793void hw_perf_enable(void)
1251{ 794{
1252 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { 795 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1253 intel_pmu_disable_bts(); 796 struct perf_event *event;
797 struct hw_perf_event *hwc;
798 int i;
799
800 if (!x86_pmu_initialized())
1254 return; 801 return;
1255 }
1256 802
1257 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { 803 if (cpuc->enabled)
1258 intel_pmu_disable_fixed(hwc, idx);
1259 return; 804 return;
805
806 if (cpuc->n_added) {
807 int n_running = cpuc->n_events - cpuc->n_added;
808 /*
809 * apply assignment obtained either from
810 * hw_perf_group_sched_in() or x86_pmu_enable()
811 *
812 * step1: save events moving to new counters
813 * step2: reprogram moved events into new counters
814 */
815 for (i = 0; i < n_running; i++) {
816 event = cpuc->event_list[i];
817 hwc = &event->hw;
818
819 /*
820 * we can avoid reprogramming counter if:
821 * - assigned same counter as last time
822 * - running on same CPU as last time
823 * - no other event has used the counter since
824 */
825 if (hwc->idx == -1 ||
826 match_prev_assignment(hwc, cpuc, i))
827 continue;
828
829 x86_pmu_stop(event);
830 }
831
832 for (i = 0; i < cpuc->n_events; i++) {
833 event = cpuc->event_list[i];
834 hwc = &event->hw;
835
836 if (!match_prev_assignment(hwc, cpuc, i))
837 x86_assign_hw_event(event, cpuc, i);
838 else if (i < n_running)
839 continue;
840
841 x86_pmu_start(event);
842 }
843 cpuc->n_added = 0;
844 perf_events_lapic_init();
1260 } 845 }
1261 846
1262 x86_pmu_disable_event(hwc, idx); 847 cpuc->enabled = 1;
848 barrier();
849
850 x86_pmu.enable_all();
1263} 851}
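Whether hw_perf_enable() may leave a running event untouched hinges on the (idx, last_cpu, last_tag) triple that x86_assign_hw_event() records: only when all three still match did nothing else touch that counter in the meantime. A toy model of the test, with made-up values:

	#include <stdio.h>
	#include <stdint.h>

	struct toy_hwc {
		int	 idx;		/* counter used last time around */
		int	 last_cpu;	/* CPU it last ran on */
		uint64_t last_tag;	/* bumped whenever the slot is handed out */
	};

	static int match_prev(const struct toy_hwc *hwc,
			      int assign_idx, int cpu, uint64_t tag)
	{
		return hwc->idx == assign_idx &&
		       hwc->last_cpu == cpu &&
		       hwc->last_tag == tag;
	}

	int main(void)
	{
		struct toy_hwc hwc = { .idx = 1, .last_cpu = 0, .last_tag = 7 };

		/* same slot, same CPU, nobody reused it: skip reprogramming */
		printf("unchanged: %d\n", match_prev(&hwc, 1, 0, 7));
		/* the tag moved on, so another event used counter 1 meanwhile */
		printf("reused:    %d\n", match_prev(&hwc, 1, 0, 8));
		return 0;
	}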
1264 852
1265static inline void 853static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc)
1266amd_pmu_disable_event(struct hw_perf_event *hwc, int idx)
1267{ 854{
1268 x86_pmu_disable_event(hwc, idx); 855 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
856 hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE);
857}
858
859static inline void x86_pmu_disable_event(struct perf_event *event)
860{
861 struct hw_perf_event *hwc = &event->hw;
862 (void)checking_wrmsrl(hwc->config_base + hwc->idx, hwc->config);
1269} 863}
1270 864
1271static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); 865static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -1275,12 +869,12 @@ static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
1275 * To be called with the event disabled in hw: 869 * To be called with the event disabled in hw:
1276 */ 870 */
1277static int 871static int
1278x86_perf_event_set_period(struct perf_event *event, 872x86_perf_event_set_period(struct perf_event *event)
1279 struct hw_perf_event *hwc, int idx)
1280{ 873{
874 struct hw_perf_event *hwc = &event->hw;
1281 s64 left = atomic64_read(&hwc->period_left); 875 s64 left = atomic64_read(&hwc->period_left);
1282 s64 period = hwc->sample_period; 876 s64 period = hwc->sample_period;
1283 int err, ret = 0; 877 int err, ret = 0, idx = hwc->idx;
1284 878
1285 if (idx == X86_PMC_IDX_FIXED_BTS) 879 if (idx == X86_PMC_IDX_FIXED_BTS)
1286 return 0; 880 return 0;
@@ -1326,212 +920,63 @@ x86_perf_event_set_period(struct perf_event *event,
1326 return ret; 920 return ret;
1327} 921}
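On these PMUs a sampling period is armed by writing the two's complement of the remaining count, truncated to the counter width, so the counter overflows after exactly 'left' increments. A user-space model of the value that ends up in the counter MSR, assuming the 48-bit AMD width:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t event_mask = (1ULL << 48) - 1;	/* counter width mask */
		int64_t  left = 100000;			/* events until overflow */

		/* the counter counts up, so start it at -left (truncated) */
		uint64_t programmed = (uint64_t)(-left) & event_mask;

		printf("counter start: 0x%llx\n", (unsigned long long)programmed);
		printf("overflows after %llu events\n",
		       (unsigned long long)((1ULL << 48) - programmed));
		return 0;	/* overflows after exactly 100000 events */
	}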
1328 922
1329static inline void 923static void x86_pmu_enable_event(struct perf_event *event)
1330intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx)
1331{
1332 int idx = __idx - X86_PMC_IDX_FIXED;
1333 u64 ctrl_val, bits, mask;
1334 int err;
1335
1336 /*
1337 * Enable IRQ generation (0x8),
1338 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
1339 * if requested:
1340 */
1341 bits = 0x8ULL;
1342 if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
1343 bits |= 0x2;
1344 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
1345 bits |= 0x1;
1346 bits <<= (idx * 4);
1347 mask = 0xfULL << (idx * 4);
1348
1349 rdmsrl(hwc->config_base, ctrl_val);
1350 ctrl_val &= ~mask;
1351 ctrl_val |= bits;
1352 err = checking_wrmsrl(hwc->config_base, ctrl_val);
1353}
1354
1355static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1356{ 924{
1357 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 925 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1358 u64 val;
1359
1360 val = hwc->config;
1361 if (cpuc->enabled) 926 if (cpuc->enabled)
1362 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 927 __x86_pmu_enable_event(&event->hw);
1363
1364 (void)checking_wrmsrl(hwc->config_base + idx, val);
1365} 928}
1366 929
1367 930/*
1368static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) 931 * activate a single event
1369{ 932 *
1370 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { 933 * The event is added to the group of enabled events
1371 if (!__get_cpu_var(cpu_hw_events).enabled) 934 * but only if it can be scheduled with existing events.
1372 return; 935 *
1373 936 * Called with the PMU disabled. If successful and the return value is 1,
1374 intel_pmu_enable_bts(hwc->config); 937 * the caller is then guaranteed to call perf_enable() and hw_perf_enable()
1375 return; 938 */
1376 } 939static int x86_pmu_enable(struct perf_event *event)
1377
1378 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
1379 intel_pmu_enable_fixed(hwc, idx);
1380 return;
1381 }
1382
1383 x86_pmu_enable_event(hwc, idx);
1384}
1385
1386static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1387{ 940{
1388 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 941 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
942 struct hw_perf_event *hwc;
943 int assign[X86_PMC_IDX_MAX];
944 int n, n0, ret;
1389 945
1390 if (cpuc->enabled) 946 hwc = &event->hw;
1391 x86_pmu_enable_event(hwc, idx);
1392}
1393
1394static int fixed_mode_idx(struct hw_perf_event *hwc)
1395{
1396 unsigned int hw_event;
1397
1398 hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK;
1399
1400 if (unlikely((hw_event ==
1401 x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
1402 (hwc->sample_period == 1)))
1403 return X86_PMC_IDX_FIXED_BTS;
1404 947
1405 if (!x86_pmu.num_events_fixed) 948 n0 = cpuc->n_events;
1406 return -1; 949 n = collect_events(cpuc, event, false);
950 if (n < 0)
951 return n;
1407 952
953 ret = x86_schedule_events(cpuc, n, assign);
954 if (ret)
955 return ret;
1408 /* 956 /*
1409 * fixed counters do not take all possible filters 957 * copy new assignment, now we know it is possible
958 * will be used by hw_perf_enable()
1410 */ 959 */
1411 if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK) 960 memcpy(cpuc->assign, assign, n*sizeof(int));
1412 return -1;
1413
1414 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
1415 return X86_PMC_IDX_FIXED_INSTRUCTIONS;
1416 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
1417 return X86_PMC_IDX_FIXED_CPU_CYCLES;
1418 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
1419 return X86_PMC_IDX_FIXED_BUS_CYCLES;
1420
1421 return -1;
1422}
1423
1424/*
1425 * generic counter allocator: get next free counter
1426 */
1427static int
1428gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1429{
1430 int idx;
1431
1432 idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events);
1433 return idx == x86_pmu.num_events ? -1 : idx;
1434}
1435 961
1436/* 962 cpuc->n_events = n;
1437 * intel-specific counter allocator: check event constraints 963 cpuc->n_added += n - n0;
1438 */
1439static int
1440intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1441{
1442 const struct event_constraint *event_constraint;
1443 int i, code;
1444 964
1445 if (!event_constraints) 965 return 0;
1446 goto skip;
1447
1448 code = hwc->config & CORE_EVNTSEL_EVENT_MASK;
1449
1450 for_each_event_constraint(event_constraint, event_constraints) {
1451 if (code == event_constraint->code) {
1452 for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) {
1453 if (!test_and_set_bit(i, cpuc->used_mask))
1454 return i;
1455 }
1456 return -1;
1457 }
1458 }
1459skip:
1460 return gen_get_event_idx(cpuc, hwc);
1461}
1462
1463static int
1464x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1465{
1466 int idx;
1467
1468 idx = fixed_mode_idx(hwc);
1469 if (idx == X86_PMC_IDX_FIXED_BTS) {
1470 /* BTS is already occupied. */
1471 if (test_and_set_bit(idx, cpuc->used_mask))
1472 return -EAGAIN;
1473
1474 hwc->config_base = 0;
1475 hwc->event_base = 0;
1476 hwc->idx = idx;
1477 } else if (idx >= 0) {
1478 /*
1479 * Try to get the fixed event, if that is already taken
1480 * then try to get a generic event:
1481 */
1482 if (test_and_set_bit(idx, cpuc->used_mask))
1483 goto try_generic;
1484
1485 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
1486 /*
1487 * We set it so that event_base + idx in wrmsr/rdmsr maps to
1488 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
1489 */
1490 hwc->event_base =
1491 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
1492 hwc->idx = idx;
1493 } else {
1494 idx = hwc->idx;
1495 /* Try to get the previous generic event again */
1496 if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) {
1497try_generic:
1498 idx = x86_pmu.get_event_idx(cpuc, hwc);
1499 if (idx == -1)
1500 return -EAGAIN;
1501
1502 set_bit(idx, cpuc->used_mask);
1503 hwc->idx = idx;
1504 }
1505 hwc->config_base = x86_pmu.eventsel;
1506 hwc->event_base = x86_pmu.perfctr;
1507 }
1508
1509 return idx;
1510} 966}
1511 967
1512/* 968static int x86_pmu_start(struct perf_event *event)
1513 * Find a PMC slot for the freshly enabled / scheduled in event:
1514 */
1515static int x86_pmu_enable(struct perf_event *event)
1516{ 969{
1517 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 970 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1518 struct hw_perf_event *hwc = &event->hw; 971 int idx = event->hw.idx;
1519 int idx;
1520
1521 idx = x86_schedule_event(cpuc, hwc);
1522 if (idx < 0)
1523 return idx;
1524
1525 perf_events_lapic_init();
1526 972
1527 x86_pmu.disable(hwc, idx); 973 if (idx == -1)
974 return -EAGAIN;
1528 975
976 x86_perf_event_set_period(event);
1529 cpuc->events[idx] = event; 977 cpuc->events[idx] = event;
1530 set_bit(idx, cpuc->active_mask); 978 __set_bit(idx, cpuc->active_mask);
1531 979 x86_pmu.enable(event);
1532 x86_perf_event_set_period(event, hwc, idx);
1533 x86_pmu.enable(hwc, idx);
1534
1535 perf_event_update_userpage(event); 980 perf_event_update_userpage(event);
1536 981
1537 return 0; 982 return 0;
@@ -1539,14 +984,8 @@ static int x86_pmu_enable(struct perf_event *event)
1539 984
1540static void x86_pmu_unthrottle(struct perf_event *event) 985static void x86_pmu_unthrottle(struct perf_event *event)
1541{ 986{
1542 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 987 int ret = x86_pmu_start(event);
1543 struct hw_perf_event *hwc = &event->hw; 988 WARN_ON_ONCE(ret);
1544
1545 if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
1546 cpuc->events[hwc->idx] != event))
1547 return;
1548
1549 x86_pmu.enable(hwc, hwc->idx);
1550} 989}
1551 990
1552void perf_event_print_debug(void) 991void perf_event_print_debug(void)
@@ -1576,7 +1015,7 @@ void perf_event_print_debug(void)
1576 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); 1015 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
1577 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); 1016 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
1578 } 1017 }
1579 pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); 1018 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1580 1019
1581 for (idx = 0; idx < x86_pmu.num_events; idx++) { 1020 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1582 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); 1021 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
@@ -1600,257 +1039,50 @@ void perf_event_print_debug(void)
1600 local_irq_restore(flags); 1039 local_irq_restore(flags);
1601} 1040}
1602 1041
1603static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc) 1042static void x86_pmu_stop(struct perf_event *event)
1604{
1605 struct debug_store *ds = cpuc->ds;
1606 struct bts_record {
1607 u64 from;
1608 u64 to;
1609 u64 flags;
1610 };
1611 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
1612 struct bts_record *at, *top;
1613 struct perf_output_handle handle;
1614 struct perf_event_header header;
1615 struct perf_sample_data data;
1616 struct pt_regs regs;
1617
1618 if (!event)
1619 return;
1620
1621 if (!ds)
1622 return;
1623
1624 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
1625 top = (struct bts_record *)(unsigned long)ds->bts_index;
1626
1627 if (top <= at)
1628 return;
1629
1630 ds->bts_index = ds->bts_buffer_base;
1631
1632
1633 data.period = event->hw.last_period;
1634 data.addr = 0;
1635 data.raw = NULL;
1636 regs.ip = 0;
1637
1638 /*
1639 * Prepare a generic sample, i.e. fill in the invariant fields.
1640 * We will overwrite the from and to address before we output
1641 * the sample.
1642 */
1643 perf_prepare_sample(&header, &data, event, &regs);
1644
1645 if (perf_output_begin(&handle, event,
1646 header.size * (top - at), 1, 1))
1647 return;
1648
1649 for (; at < top; at++) {
1650 data.ip = at->from;
1651 data.addr = at->to;
1652
1653 perf_output_sample(&handle, &header, &data, event);
1654 }
1655
1656 perf_output_end(&handle);
1657
1658 /* There's new data available. */
1659 event->hw.interrupts++;
1660 event->pending_kill = POLL_IN;
1661}
1662
1663static void x86_pmu_disable(struct perf_event *event)
1664{ 1043{
1665 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1044 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1666 struct hw_perf_event *hwc = &event->hw; 1045 struct hw_perf_event *hwc = &event->hw;
1667 int idx = hwc->idx; 1046 int idx = hwc->idx;
1668 1047
1669 /* 1048 if (!__test_and_clear_bit(idx, cpuc->active_mask))
1670 * Must be done before we disable, otherwise the nmi handler 1049 return;
1671 * could reenable again:
1672 */
1673 clear_bit(idx, cpuc->active_mask);
1674 x86_pmu.disable(hwc, idx);
1675 1050
1676 /* 1051 x86_pmu.disable(event);
1677 * Make sure the cleared pointer becomes visible before we
1678 * (potentially) free the event:
1679 */
1680 barrier();
1681 1052
1682 /* 1053 /*
1683 * Drain the remaining delta count out of a event 1054 * Drain the remaining delta count out of a event
1684 * that we are disabling: 1055 * that we are disabling:
1685 */ 1056 */
1686 x86_perf_event_update(event, hwc, idx); 1057 x86_perf_event_update(event);
1687
1688 /* Drain the remaining BTS records. */
1689 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS))
1690 intel_pmu_drain_bts_buffer(cpuc);
1691 1058
1692 cpuc->events[idx] = NULL; 1059 cpuc->events[idx] = NULL;
1693 clear_bit(idx, cpuc->used_mask);
1694
1695 perf_event_update_userpage(event);
1696}
1697
1698/*
1699 * Save and restart an expired event. Called by NMI contexts,
1700 * so it has to be careful about preempting normal event ops:
1701 */
1702static int intel_pmu_save_and_restart(struct perf_event *event)
1703{
1704 struct hw_perf_event *hwc = &event->hw;
1705 int idx = hwc->idx;
1706 int ret;
1707
1708 x86_perf_event_update(event, hwc, idx);
1709 ret = x86_perf_event_set_period(event, hwc, idx);
1710
1711 if (event->state == PERF_EVENT_STATE_ACTIVE)
1712 intel_pmu_enable_event(hwc, idx);
1713
1714 return ret;
1715}
1716
1717static void intel_pmu_reset(void)
1718{
1719 struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds;
1720 unsigned long flags;
1721 int idx;
1722
1723 if (!x86_pmu.num_events)
1724 return;
1725
1726 local_irq_save(flags);
1727
1728 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
1729
1730 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1731 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
1732 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
1733 }
1734 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
1735 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
1736 }
1737 if (ds)
1738 ds->bts_index = ds->bts_buffer_base;
1739
1740 local_irq_restore(flags);
1741}
1742
1743static int p6_pmu_handle_irq(struct pt_regs *regs)
1744{
1745 struct perf_sample_data data;
1746 struct cpu_hw_events *cpuc;
1747 struct perf_event *event;
1748 struct hw_perf_event *hwc;
1749 int idx, handled = 0;
1750 u64 val;
1751
1752 data.addr = 0;
1753 data.raw = NULL;
1754
1755 cpuc = &__get_cpu_var(cpu_hw_events);
1756
1757 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1758 if (!test_bit(idx, cpuc->active_mask))
1759 continue;
1760
1761 event = cpuc->events[idx];
1762 hwc = &event->hw;
1763
1764 val = x86_perf_event_update(event, hwc, idx);
1765 if (val & (1ULL << (x86_pmu.event_bits - 1)))
1766 continue;
1767
1768 /*
1769 * event overflow
1770 */
1771 handled = 1;
1772 data.period = event->hw.last_period;
1773
1774 if (!x86_perf_event_set_period(event, hwc, idx))
1775 continue;
1776
1777 if (perf_event_overflow(event, 1, &data, regs))
1778 p6_pmu_disable_event(hwc, idx);
1779 }
1780
1781 if (handled)
1782 inc_irq_stat(apic_perf_irqs);
1783
1784 return handled;
1785} 1060}
1786 1061
1787/* 1062static void x86_pmu_disable(struct perf_event *event)
1788 * This handler is triggered by the local APIC, so the APIC IRQ handling
1789 * rules apply:
1790 */
1791static int intel_pmu_handle_irq(struct pt_regs *regs)
1792{ 1063{
1793 struct perf_sample_data data; 1064 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1794 struct cpu_hw_events *cpuc; 1065 int i;
1795 int bit, loops;
1796 u64 ack, status;
1797
1798 data.addr = 0;
1799 data.raw = NULL;
1800
1801 cpuc = &__get_cpu_var(cpu_hw_events);
1802 1066
1803 perf_disable(); 1067 x86_pmu_stop(event);
1804 intel_pmu_drain_bts_buffer(cpuc);
1805 status = intel_pmu_get_status();
1806 if (!status) {
1807 perf_enable();
1808 return 0;
1809 }
1810 1068
1811 loops = 0; 1069 for (i = 0; i < cpuc->n_events; i++) {
1812again: 1070 if (event == cpuc->event_list[i]) {
1813 if (++loops > 100) {
1814 WARN_ONCE(1, "perfevents: irq loop stuck!\n");
1815 perf_event_print_debug();
1816 intel_pmu_reset();
1817 perf_enable();
1818 return 1;
1819 }
1820
1821 inc_irq_stat(apic_perf_irqs);
1822 ack = status;
1823 for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
1824 struct perf_event *event = cpuc->events[bit];
1825 1071
1826 clear_bit(bit, (unsigned long *) &status); 1072 if (x86_pmu.put_event_constraints)
1827 if (!test_bit(bit, cpuc->active_mask)) 1073 x86_pmu.put_event_constraints(cpuc, event);
1828 continue;
1829 1074
1830 if (!intel_pmu_save_and_restart(event)) 1075 while (++i < cpuc->n_events)
1831 continue; 1076 cpuc->event_list[i-1] = cpuc->event_list[i];
1832 1077
1833 data.period = event->hw.last_period; 1078 --cpuc->n_events;
1834 1079 break;
1835 if (perf_event_overflow(event, 1, &data, regs)) 1080 }
1836 intel_pmu_disable_event(&event->hw, bit);
1837 } 1081 }
1838 1082 perf_event_update_userpage(event);
1839 intel_pmu_ack_status(ack);
1840
1841 /*
1842 * Repeat if there is more work to be done:
1843 */
1844 status = intel_pmu_get_status();
1845 if (status)
1846 goto again;
1847
1848 perf_enable();
1849
1850 return 1;
1851} 1083}
1852 1084
1853static int amd_pmu_handle_irq(struct pt_regs *regs) 1085static int x86_pmu_handle_irq(struct pt_regs *regs)
1854{ 1086{
1855 struct perf_sample_data data; 1087 struct perf_sample_data data;
1856 struct cpu_hw_events *cpuc; 1088 struct cpu_hw_events *cpuc;
@@ -1859,8 +1091,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
1859 int idx, handled = 0; 1091 int idx, handled = 0;
1860 u64 val; 1092 u64 val;
1861 1093
1862 data.addr = 0; 1094 perf_sample_data_init(&data, 0);
1863 data.raw = NULL;
1864 1095
1865 cpuc = &__get_cpu_var(cpu_hw_events); 1096 cpuc = &__get_cpu_var(cpu_hw_events);
1866 1097
@@ -1871,7 +1102,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
1871 event = cpuc->events[idx]; 1102 event = cpuc->events[idx];
1872 hwc = &event->hw; 1103 hwc = &event->hw;
1873 1104
1874 val = x86_perf_event_update(event, hwc, idx); 1105 val = x86_perf_event_update(event);
1875 if (val & (1ULL << (x86_pmu.event_bits - 1))) 1106 if (val & (1ULL << (x86_pmu.event_bits - 1)))
1876 continue; 1107 continue;
1877 1108
@@ -1881,11 +1112,11 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
1881 handled = 1; 1112 handled = 1;
1882 data.period = event->hw.last_period; 1113 data.period = event->hw.last_period;
1883 1114
1884 if (!x86_perf_event_set_period(event, hwc, idx)) 1115 if (!x86_perf_event_set_period(event))
1885 continue; 1116 continue;
1886 1117
1887 if (perf_event_overflow(event, 1, &data, regs)) 1118 if (perf_event_overflow(event, 1, &data, regs))
1888 amd_pmu_disable_event(hwc, idx); 1119 x86_pmu_stop(event);
1889 } 1120 }
1890 1121
1891 if (handled) 1122 if (handled)
@@ -1968,193 +1199,171 @@ static __read_mostly struct notifier_block perf_event_nmi_notifier = {
1968 .priority = 1 1199 .priority = 1
1969}; 1200};
1970 1201
1971static __initconst struct x86_pmu p6_pmu = { 1202static struct event_constraint unconstrained;
1972 .name = "p6", 1203static struct event_constraint emptyconstraint;
1973 .handle_irq = p6_pmu_handle_irq,
1974 .disable_all = p6_pmu_disable_all,
1975 .enable_all = p6_pmu_enable_all,
1976 .enable = p6_pmu_enable_event,
1977 .disable = p6_pmu_disable_event,
1978 .eventsel = MSR_P6_EVNTSEL0,
1979 .perfctr = MSR_P6_PERFCTR0,
1980 .event_map = p6_pmu_event_map,
1981 .raw_event = p6_pmu_raw_event,
1982 .max_events = ARRAY_SIZE(p6_perfmon_event_map),
1983 .apic = 1,
1984 .max_period = (1ULL << 31) - 1,
1985 .version = 0,
1986 .num_events = 2,
1987 /*
1988 * Events have 40 bits implemented. However they are designed such
1989 * that bits [32-39] are sign extensions of bit 31. As such the
1990 * effective width of an event for a P6-like PMU is 32 bits only.
1991 *
1992 * See IA-32 Intel Architecture Software developer manual Vol 3B
1993 */
1994 .event_bits = 32,
1995 .event_mask = (1ULL << 32) - 1,
1996 .get_event_idx = intel_get_event_idx,
1997};
1998 1204
1999static __initconst struct x86_pmu intel_pmu = { 1205static struct event_constraint *
2000 .name = "Intel", 1206x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
2001 .handle_irq = intel_pmu_handle_irq, 1207{
2002 .disable_all = intel_pmu_disable_all, 1208 struct event_constraint *c;
2003 .enable_all = intel_pmu_enable_all,
2004 .enable = intel_pmu_enable_event,
2005 .disable = intel_pmu_disable_event,
2006 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
2007 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
2008 .event_map = intel_pmu_event_map,
2009 .raw_event = intel_pmu_raw_event,
2010 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
2011 .apic = 1,
2012 /*
2013 * Intel PMCs cannot be accessed sanely above 32 bit width,
2014 * so we install an artificial 1<<31 period regardless of
2015 * the generic event period:
2016 */
2017 .max_period = (1ULL << 31) - 1,
2018 .enable_bts = intel_pmu_enable_bts,
2019 .disable_bts = intel_pmu_disable_bts,
2020 .get_event_idx = intel_get_event_idx,
2021};
2022 1209
2023static __initconst struct x86_pmu amd_pmu = { 1210 if (x86_pmu.event_constraints) {
2024 .name = "AMD", 1211 for_each_event_constraint(c, x86_pmu.event_constraints) {
2025 .handle_irq = amd_pmu_handle_irq, 1212 if ((event->hw.config & c->cmask) == c->code)
2026 .disable_all = amd_pmu_disable_all, 1213 return c;
2027 .enable_all = amd_pmu_enable_all, 1214 }
2028 .enable = amd_pmu_enable_event, 1215 }
2029 .disable = amd_pmu_disable_event, 1216
2030 .eventsel = MSR_K7_EVNTSEL0, 1217 return &unconstrained;
2031 .perfctr = MSR_K7_PERFCTR0, 1218}
2032 .event_map = amd_pmu_event_map,
2033 .raw_event = amd_pmu_raw_event,
2034 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
2035 .num_events = 4,
2036 .event_bits = 48,
2037 .event_mask = (1ULL << 48) - 1,
2038 .apic = 1,
2039 /* use highest bit to detect overflow */
2040 .max_period = (1ULL << 47) - 1,
2041 .get_event_idx = gen_get_event_idx,
2042};
2043 1219
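When no table entry matches, the scheduler falls back to the 'unconstrained' sentinel, which permits every generic counter; its cmask of zero also means it can never be mistaken for a real table entry, since zero cmask terminates for_each_event_constraint(). A sketch of the values such a fallback would carry, assuming 4 generic counters:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		int num_events = 4;	/* generic counters reported by CPUID */

		/* the fallback permits every generic counter ... */
		uint64_t idxmsk64 = (1ULL << num_events) - 1;
		int weight = __builtin_popcountll(idxmsk64);

		/* ... and matches nothing in particular: cmask is 0 */
		printf("unconstrained: idxmsk 0x%llx, weight %d, cmask 0\n",
		       (unsigned long long)idxmsk64, weight);
		return 0;
	}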
2044static __init int p6_pmu_init(void) 1220static int x86_event_sched_in(struct perf_event *event,
1221 struct perf_cpu_context *cpuctx)
2045{ 1222{
2046 switch (boot_cpu_data.x86_model) { 1223 int ret = 0;
2047 case 1:
2048 case 3: /* Pentium Pro */
2049 case 5:
2050 case 6: /* Pentium II */
2051 case 7:
2052 case 8:
2053 case 11: /* Pentium III */
2054 event_constraints = intel_p6_event_constraints;
2055 break;
2056 case 9:
2057 case 13:
2058 /* Pentium M */
2059 event_constraints = intel_p6_event_constraints;
2060 break;
2061 default:
2062 pr_cont("unsupported p6 CPU model %d ",
2063 boot_cpu_data.x86_model);
2064 return -ENODEV;
2065 }
2066 1224
2067 x86_pmu = p6_pmu; 1225 event->state = PERF_EVENT_STATE_ACTIVE;
1226 event->oncpu = smp_processor_id();
1227 event->tstamp_running += event->ctx->time - event->tstamp_stopped;
2068 1228
2069 return 0; 1229 if (!is_x86_event(event))
1230 ret = event->pmu->enable(event);
1231
1232 if (!ret && !is_software_event(event))
1233 cpuctx->active_oncpu++;
1234
1235 if (!ret && event->attr.exclusive)
1236 cpuctx->exclusive = 1;
1237
1238 return ret;
2070} 1239}
2071 1240
2072static __init int intel_pmu_init(void) 1241static void x86_event_sched_out(struct perf_event *event,
1242 struct perf_cpu_context *cpuctx)
2073{ 1243{
2074 union cpuid10_edx edx; 1244 event->state = PERF_EVENT_STATE_INACTIVE;
2075 union cpuid10_eax eax; 1245 event->oncpu = -1;
2076 unsigned int unused;
2077 unsigned int ebx;
2078 int version;
2079
2080 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
2081 /* check for P6 processor family */
2082 if (boot_cpu_data.x86 == 6) {
2083 return p6_pmu_init();
2084 } else {
2085 return -ENODEV;
2086 }
2087 }
2088 1246
2089 /* 1247 if (!is_x86_event(event))
2090 * Check whether the Architectural PerfMon supports 1248 event->pmu->disable(event);
2091 * Branch Misses Retired hw_event or not.
2092 */
2093 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
2094 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
2095 return -ENODEV;
2096 1249
2097 version = eax.split.version_id; 1250 event->tstamp_running -= event->ctx->time - event->tstamp_stopped;
2098 if (version < 2)
2099 return -ENODEV;
2100 1251
2101 x86_pmu = intel_pmu; 1252 if (!is_software_event(event))
2102 x86_pmu.version = version; 1253 cpuctx->active_oncpu--;
2103 x86_pmu.num_events = eax.split.num_events;
2104 x86_pmu.event_bits = eax.split.bit_width;
2105 x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1;
2106 1254
1255 if (event->attr.exclusive || !cpuctx->active_oncpu)
1256 cpuctx->exclusive = 0;
1257}
1258
1259/*
1260 * Called to enable a whole group of events.
1261 * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
1262 * Assumes the caller has disabled interrupts and has
1263 * frozen the PMU with hw_perf_save_disable.
1264 *
1265 * Called with the PMU disabled. If successful and the return value is 1,
1266 * the caller is then guaranteed to call perf_enable() and hw_perf_enable()
1267 */
1268int hw_perf_group_sched_in(struct perf_event *leader,
1269 struct perf_cpu_context *cpuctx,
1270 struct perf_event_context *ctx)
1271{
1272 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1273 struct perf_event *sub;
1274 int assign[X86_PMC_IDX_MAX];
1275 int n0, n1, ret;
1276
1277 /* n0 = total number of events */
1278 n0 = collect_events(cpuc, leader, true);
1279 if (n0 < 0)
1280 return n0;
1281
1282 ret = x86_schedule_events(cpuc, n0, assign);
1283 if (ret)
1284 return ret;
1285
1286 ret = x86_event_sched_in(leader, cpuctx);
1287 if (ret)
1288 return ret;
1289
1290 n1 = 1;
1291 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1292 if (sub->state > PERF_EVENT_STATE_OFF) {
1293 ret = x86_event_sched_in(sub, cpuctx);
1294 if (ret)
1295 goto undo;
1296 ++n1;
1297 }
1298 }
2107 	/*
2108 	 * Quirk: v2 perfmon does not report fixed-purpose events, so
2109 	 * assume at least 3 events:
2110 	 */
2111 	x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
2112 
2113 	/*
2114 	 * Install the hw-cache-events table:
2115 	 */
2116 	switch (boot_cpu_data.x86_model) {
2117 	case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
2118 	case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
2119 	case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
2120 	case 29: /* six-core 45 nm xeon "Dunnington" */
2121 		memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
2122 		       sizeof(hw_cache_event_ids));
2123 
2124 		pr_cont("Core2 events, ");
2125 		event_constraints = intel_core_event_constraints;
2126 		break;
2127 	default:
2128 	case 26:
2129 		memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
2130 		       sizeof(hw_cache_event_ids));
2131 
2132 		event_constraints = intel_nehalem_event_constraints;
2133 		pr_cont("Nehalem/Corei7 events, ");
2134 		break;
2135 	case 28:
2136 		memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
2137 		       sizeof(hw_cache_event_ids));
2138 
2139 		pr_cont("Atom events, ");
2140 		break;
2141 	}
2142 	return 0;
2143 }
2144 
2145 static __init int amd_pmu_init(void)
2146 {
2147 	/* Performance-monitoring supported from K7 and later: */
2148 	if (boot_cpu_data.x86 < 6)
2149 		return -ENODEV;
2150 
2151 	x86_pmu = amd_pmu;
2152 
2153 	/* Events are common for all AMDs */
2154 	memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
2155 	       sizeof(hw_cache_event_ids));
2156 
2157 	return 0;
2158 }
2159 
1299 	/*
1300 	 * copy new assignment, now we know it is possible
1301 	 * will be used by hw_perf_enable()
1302 	 */
1303 	memcpy(cpuc->assign, assign, n0*sizeof(int));
1304 
1305 	cpuc->n_events = n0;
1306 	cpuc->n_added += n1;
1307 	ctx->nr_active += n1;
1308 
1309 	/*
1310 	 * 1 means successful and events are active
1311 	 * This is not quite true because we defer
1312 	 * actual activation until hw_perf_enable() but
1313 	 * this way we ensure caller won't try to enable
1314 	 * individual events
1315 	 */
1316 	return 1;
1317 undo:
1318 	x86_event_sched_out(leader, cpuctx);
1319 	n0 = 1;
1320 	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1321 		if (sub->state == PERF_EVENT_STATE_ACTIVE) {
1322 			x86_event_sched_out(sub, cpuctx);
1323 			if (++n0 == n1)
1324 				break;
1325 		}
1326 	}
1327 	return ret;
1328 }
1329 
1330 #include "perf_event_amd.c"
1331 #include "perf_event_p6.c"
1332 #include "perf_event_intel.c"
1333 
1334 static int __cpuinit
1335 x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
1336 {
1337 	unsigned int cpu = (long)hcpu;
1338 	int ret = NOTIFY_OK;
1339 
1340 	switch (action & ~CPU_TASKS_FROZEN) {
1341 	case CPU_UP_PREPARE:
1342 		if (x86_pmu.cpu_prepare)
1343 			ret = x86_pmu.cpu_prepare(cpu);
1344 		break;
1345 
1346 	case CPU_STARTING:
1347 		if (x86_pmu.cpu_starting)
1348 			x86_pmu.cpu_starting(cpu);
1349 		break;
1350 
1351 	case CPU_DYING:
1352 		if (x86_pmu.cpu_dying)
1353 			x86_pmu.cpu_dying(cpu);
1354 		break;
1355 
1356 	case CPU_UP_CANCELED:
1357 	case CPU_DEAD:
1358 		if (x86_pmu.cpu_dead)
1359 			x86_pmu.cpu_dead(cpu);
1360 		break;
1361 
1362 	default:
1363 		break;
1364 	}
1365 
1366 	return ret;
1367 }
1368 
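
The undo: label in hw_perf_group_sched_in() above is a standard partial-failure rollback: remember how many members were scheduled in (n1), then walk the sibling list again and back out exactly that many. The pattern in isolation (editor's sketch; names are illustrative):

	/* Try op() on n items; on failure, undo the ones already done. */
	static int apply_all(void **items, int n,
			     int (*op)(void *), void (*undo)(void *))
	{
		int i, ret = 0;

		for (i = 0; i < n; i++) {
			ret = op(items[i]);
			if (ret)
				break;
		}
		if (ret)
			while (i--)
				undo(items[i]);
		return ret;
	}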
2160static void __init pmu_check_apic(void) 1369static void __init pmu_check_apic(void)
@@ -2169,6 +1378,7 @@ static void __init pmu_check_apic(void)
2169 1378
2170void __init init_hw_perf_events(void) 1379void __init init_hw_perf_events(void)
2171{ 1380{
1381 struct event_constraint *c;
2172 int err; 1382 int err;
2173 1383
2174 pr_info("Performance Events: "); 1384 pr_info("Performance Events: ");
@@ -2213,6 +1423,20 @@ void __init init_hw_perf_events(void)
2213 perf_events_lapic_init(); 1423 perf_events_lapic_init();
2214 register_die_notifier(&perf_event_nmi_notifier); 1424 register_die_notifier(&perf_event_nmi_notifier);
2215 1425
1426 unconstrained = (struct event_constraint)
1427 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1,
1428 0, x86_pmu.num_events);
1429
1430 if (x86_pmu.event_constraints) {
1431 for_each_event_constraint(c, x86_pmu.event_constraints) {
1432 if (c->cmask != INTEL_ARCH_FIXED_MASK)
1433 continue;
1434
1435 c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1;
1436 c->weight += x86_pmu.num_events;
1437 }
1438 }
1439
2216 pr_info("... version: %d\n", x86_pmu.version); 1440 pr_info("... version: %d\n", x86_pmu.version);
2217 pr_info("... bit width: %d\n", x86_pmu.event_bits); 1441 pr_info("... bit width: %d\n", x86_pmu.event_bits);
2218 pr_info("... generic registers: %d\n", x86_pmu.num_events); 1442 pr_info("... generic registers: %d\n", x86_pmu.num_events);
@@ -2220,60 +1444,91 @@ void __init init_hw_perf_events(void)
2220 pr_info("... max period: %016Lx\n", x86_pmu.max_period); 1444 pr_info("... max period: %016Lx\n", x86_pmu.max_period);
2221 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed); 1445 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed);
2222 pr_info("... event mask: %016Lx\n", perf_event_mask); 1446 pr_info("... event mask: %016Lx\n", perf_event_mask);
1447
1448 perf_cpu_notifier(x86_pmu_notifier);
2223} 1449}
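
To make the constraint fix-up in init_hw_perf_events() concrete (editor's worked example, assuming 4 generic counters):

	/*
	 * unconstrained:                 idxmsk64 = 0xf, weight = 4
	 * fixed counter 0 (cmask == INTEL_ARCH_FIXED_MASK):
	 *     before the loop:  idxmsk64 = 1ULL << 32,         weight = 1
	 *     after the loop:   idxmsk64 = (1ULL << 32) | 0xf, weight = 5
	 *
	 * i.e. an event with a fixed-counter constraint may now fall back
	 * to any generic counter when its fixed counter is already taken.
	 */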
2224 1450
2225static inline void x86_pmu_read(struct perf_event *event) 1451static inline void x86_pmu_read(struct perf_event *event)
2226{ 1452{
2227 	x86_perf_event_update(event, &event->hw, event->hw.idx);
1453 	x86_perf_event_update(event);
2228} 1454}
2229 1455
2230static const struct pmu pmu = { 1456static const struct pmu pmu = {
2231 .enable = x86_pmu_enable, 1457 .enable = x86_pmu_enable,
2232 .disable = x86_pmu_disable, 1458 .disable = x86_pmu_disable,
1459 .start = x86_pmu_start,
1460 .stop = x86_pmu_stop,
2233 .read = x86_pmu_read, 1461 .read = x86_pmu_read,
2234 .unthrottle = x86_pmu_unthrottle, 1462 .unthrottle = x86_pmu_unthrottle,
2235}; 1463};
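
This ops table is what the core perf code calls through; ->enable/->disable add and remove an event, and the new ->start/->stop pair lets the core pause an event without a full reschedule. Roughly (editor's sketch of the calling convention, heavily simplified; the real call sites live in kernel/perf_event.c):

	const struct pmu *p = hw_perf_event_init(event);

	if (!IS_ERR(p) && !p->enable(event)) {	/* put it on a counter */
		p->read(event);			/* fold hw count into event->count */
		p->disable(event);		/* and take it off again */
	}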
2236 1464
2237 static int
2238 validate_event(struct cpu_hw_events *cpuc, struct perf_event *event)
2239 {
2240 	struct hw_perf_event fake_event = event->hw;
2241 
2242 	if (event->pmu && event->pmu != &pmu)
2243 		return 0;
2244 
2245 	return x86_schedule_event(cpuc, &fake_event) >= 0;
2246 }
2247 
2248 static int validate_group(struct perf_event *event)
2249 {
2250 	struct perf_event *sibling, *leader = event->group_leader;
2251 	struct cpu_hw_events fake_pmu;
2252 
2253 	memset(&fake_pmu, 0, sizeof(fake_pmu));
2254 
2255 	if (!validate_event(&fake_pmu, leader))
2256 		return -ENOSPC;
2257 
2258 	list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
2259 		if (!validate_event(&fake_pmu, sibling))
2260 			return -ENOSPC;
2261 	}
2262 
2263 	if (!validate_event(&fake_pmu, event))
2264 		return -ENOSPC;
2265 
2266 	return 0;
2267 }
1465 /*
1466  * validate a single event group
1467  *
1468  * validation includes:
1469  *	- check events are compatible with each other
1470  *	- events do not compete for the same counter
1471  *	- number of events <= number of counters
1472  *
1473  * validation ensures the group can be loaded onto the
1474  * PMU if it was the only group available.
1475  */
1476 static int validate_group(struct perf_event *event)
1477 {
1478 	struct perf_event *leader = event->group_leader;
1479 	struct cpu_hw_events *fake_cpuc;
1480 	int ret, n;
1481 
1482 	ret = -ENOMEM;
1483 	fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
1484 	if (!fake_cpuc)
1485 		goto out;
1486 
1487 	/*
1488 	 * the event is not yet connected with its
1489 	 * siblings therefore we must first collect
1490 	 * existing siblings, then add the new event
1491 	 * before we can simulate the scheduling
1492 	 */
1493 	ret = -ENOSPC;
1494 	n = collect_events(fake_cpuc, leader, true);
1495 	if (n < 0)
1496 		goto out_free;
1497 
1498 	fake_cpuc->n_events = n;
1499 	n = collect_events(fake_cpuc, event, false);
1500 	if (n < 0)
1501 		goto out_free;
1502 
1503 	fake_cpuc->n_events = n;
1504 
1505 	ret = x86_schedule_events(fake_cpuc, n, NULL);
1506 
1507 out_free:
1508 	kfree(fake_cpuc);
1509 out:
1510 	return ret;
1511 }
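
One small observation on the allocation above (editor's note): kmalloc() with GFP_KERNEL | __GFP_ZERO is equivalent to the more idiomatic kzalloc(), so this could also read:

	fake_cpuc = kzalloc(sizeof(*fake_cpuc), GFP_KERNEL);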
2268 1512
2269const struct pmu *hw_perf_event_init(struct perf_event *event) 1513const struct pmu *hw_perf_event_init(struct perf_event *event)
2270{ 1514{
1515 const struct pmu *tmp;
2271 int err; 1516 int err;
2272 1517
2273 err = __hw_perf_event_init(event); 1518 err = __hw_perf_event_init(event);
2274 if (!err) { 1519 if (!err) {
1520 /*
1521 * we temporarily connect event to its pmu
1522 * such that validate_group() can classify
1523 * it as an x86 event using is_x86_event()
1524 */
1525 tmp = event->pmu;
1526 event->pmu = &pmu;
1527
2275 if (event->group_leader != event) 1528 if (event->group_leader != event)
2276 err = validate_group(event); 1529 err = validate_group(event);
1530
1531 event->pmu = tmp;
2277 } 1532 }
2278 if (err) { 1533 if (err) {
2279 if (event->destroy) 1534 if (event->destroy)
@@ -2297,7 +1552,6 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip)
2297 1552
2298static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); 1553static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
2299static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); 1554static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
2300static DEFINE_PER_CPU(int, in_ignored_frame);
2301 1555
2302 1556
2303static void 1557static void
@@ -2313,10 +1567,6 @@ static void backtrace_warning(void *data, char *msg)
2313 1567
2314static int backtrace_stack(void *data, char *name) 1568static int backtrace_stack(void *data, char *name)
2315{ 1569{
2316 per_cpu(in_ignored_frame, smp_processor_id()) =
2317 x86_is_stack_id(NMI_STACK, name) ||
2318 x86_is_stack_id(DEBUG_STACK, name);
2319
2320 return 0; 1570 return 0;
2321} 1571}
2322 1572
@@ -2324,9 +1574,6 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
2324{ 1574{
2325 struct perf_callchain_entry *entry = data; 1575 struct perf_callchain_entry *entry = data;
2326 1576
2327 if (per_cpu(in_ignored_frame, smp_processor_id()))
2328 return;
2329
2330 if (reliable) 1577 if (reliable)
2331 callchain_store(entry, addr); 1578 callchain_store(entry, addr);
2332} 1579}
@@ -2347,7 +1594,7 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
2347 callchain_store(entry, PERF_CONTEXT_KERNEL); 1594 callchain_store(entry, PERF_CONTEXT_KERNEL);
2348 callchain_store(entry, regs->ip); 1595 callchain_store(entry, regs->ip);
2349 1596
2350 dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); 1597 dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
2351} 1598}
2352 1599
2353/* 1600/*
@@ -2385,14 +1632,42 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
2385 return len; 1632 return len;
2386} 1633}
2387 1634
2388 static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
2389 {
2390 	unsigned long bytes;
2391 
2392 	bytes = copy_from_user_nmi(frame, fp, sizeof(*frame));
2393 
2394 	return bytes == sizeof(*frame);
2395 }
1635 #ifdef CONFIG_COMPAT
1636 static inline int
1637 perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
1638 {
1639 	/* 32-bit process in 64-bit kernel. */
1640 	struct stack_frame_ia32 frame;
1641 	const void __user *fp;
1642 
1643 	if (!test_thread_flag(TIF_IA32))
1644 		return 0;
1645 
1646 	fp = compat_ptr(regs->bp);
1647 	while (entry->nr < PERF_MAX_STACK_DEPTH) {
1648 		unsigned long bytes;
1649 		frame.next_frame     = 0;
1650 		frame.return_address = 0;
1651 
1652 		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
1653 		if (bytes != sizeof(frame))
1654 			break;
1655 
1656 		if (fp < compat_ptr(regs->sp))
1657 			break;
1658 
1659 		callchain_store(entry, frame.return_address);
1660 		fp = compat_ptr(frame.next_frame);
1661 	}
1662 	return 1;
1663 }
1664 #else
1665 static inline int
1666 perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
1667 {
1668 	return 0;
1669 }
1670#endif
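
The compat walker above relies on the i386 frame-pointer convention: each frame begins with the caller's saved %ebp followed by the return address. The structure it copies, presumably declared alongside struct stack_frame (editor's sketch of the assumed layout):

	struct stack_frame_ia32 {
		u32 next_frame;		/* saved %ebp of the caller */
		u32 return_address;	/* %eip pushed by the call */
	};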
2396 1671
2397static void 1672static void
2398perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) 1673perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
@@ -2408,11 +1683,16 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
2408 callchain_store(entry, PERF_CONTEXT_USER); 1683 callchain_store(entry, PERF_CONTEXT_USER);
2409 callchain_store(entry, regs->ip); 1684 callchain_store(entry, regs->ip);
2410 1685
1686 if (perf_callchain_user32(regs, entry))
1687 return;
1688
2411 while (entry->nr < PERF_MAX_STACK_DEPTH) { 1689 while (entry->nr < PERF_MAX_STACK_DEPTH) {
1690 unsigned long bytes;
2412 frame.next_frame = NULL; 1691 frame.next_frame = NULL;
2413 frame.return_address = 0; 1692 frame.return_address = 0;
2414 1693
2415 		if (!copy_stack_frame(fp, &frame))
1694 		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
1695 if (bytes != sizeof(frame))
2416 break; 1696 break;
2417 1697
2418 if ((unsigned long)fp < regs->sp) 1698 if ((unsigned long)fp < regs->sp)
@@ -2433,9 +1713,6 @@ perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
2433 1713
2434 is_user = user_mode(regs); 1714 is_user = user_mode(regs);
2435 1715
2436 if (!current || current->pid == 0)
2437 return;
2438
2439 if (is_user && current->state != TASK_RUNNING) 1716 if (is_user && current->state != TASK_RUNNING)
2440 return; 1717 return;
2441 1718
@@ -2462,7 +1739,14 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2462 return entry; 1739 return entry;
2463} 1740}
2464 1741
2465 void hw_perf_event_setup_online(int cpu)
2466 {
2467 	init_debug_store_on_cpu(cpu);
1742 void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
1743 {
1744 	regs->ip = ip;
1745 /*
1746 * perf_arch_fetch_caller_regs adds another call, we need to increment
1747 * the skip level
1748 */
1749 regs->bp = rewind_frame_pointer(skip + 1);
1750 regs->cs = __KERNEL_CS;
1751 local_save_flags(regs->flags);
2468} 1752}
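
perf_arch_fetch_caller_regs() fabricates a pt_regs at the current call site so software events can feed the same callchain machinery as hardware NMIs; the skip + 1 discounts the helper's own frame. A hypothetical caller (editor's sketch; the real entry points are in the core perf code):

	struct pt_regs regs;

	/* skip == 0: attribute the sample to our immediate caller */
	perf_arch_fetch_caller_regs(&regs, _THIS_IP_, 0);
	perf_callchain_kernel(&regs, entry);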
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
new file mode 100644
index 000000000000..db6f7d4056e1
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -0,0 +1,422 @@
1#ifdef CONFIG_CPU_SUP_AMD
2
3static DEFINE_RAW_SPINLOCK(amd_nb_lock);
4
5static __initconst u64 amd_hw_cache_event_ids
6 [PERF_COUNT_HW_CACHE_MAX]
7 [PERF_COUNT_HW_CACHE_OP_MAX]
8 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
9{
10 [ C(L1D) ] = {
11 [ C(OP_READ) ] = {
12 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
13 [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */
14 },
15 [ C(OP_WRITE) ] = {
16 [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
17 [ C(RESULT_MISS) ] = 0,
18 },
19 [ C(OP_PREFETCH) ] = {
20 [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */
21 [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */
22 },
23 },
24 [ C(L1I ) ] = {
25 [ C(OP_READ) ] = {
26 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
27 [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
28 },
29 [ C(OP_WRITE) ] = {
30 [ C(RESULT_ACCESS) ] = -1,
31 [ C(RESULT_MISS) ] = -1,
32 },
33 [ C(OP_PREFETCH) ] = {
34 [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
35 [ C(RESULT_MISS) ] = 0,
36 },
37 },
38 [ C(LL ) ] = {
39 [ C(OP_READ) ] = {
40 [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
41 [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */
42 },
43 [ C(OP_WRITE) ] = {
44 [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */
45 [ C(RESULT_MISS) ] = 0,
46 },
47 [ C(OP_PREFETCH) ] = {
48 [ C(RESULT_ACCESS) ] = 0,
49 [ C(RESULT_MISS) ] = 0,
50 },
51 },
52 [ C(DTLB) ] = {
53 [ C(OP_READ) ] = {
54 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
55 [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */
56 },
57 [ C(OP_WRITE) ] = {
58 [ C(RESULT_ACCESS) ] = 0,
59 [ C(RESULT_MISS) ] = 0,
60 },
61 [ C(OP_PREFETCH) ] = {
62 [ C(RESULT_ACCESS) ] = 0,
63 [ C(RESULT_MISS) ] = 0,
64 },
65 },
66 [ C(ITLB) ] = {
67 [ C(OP_READ) ] = {
 68		[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fetches */
69 [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
70 },
71 [ C(OP_WRITE) ] = {
72 [ C(RESULT_ACCESS) ] = -1,
73 [ C(RESULT_MISS) ] = -1,
74 },
75 [ C(OP_PREFETCH) ] = {
76 [ C(RESULT_ACCESS) ] = -1,
77 [ C(RESULT_MISS) ] = -1,
78 },
79 },
80 [ C(BPU ) ] = {
81 [ C(OP_READ) ] = {
82 [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
83 [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
84 },
85 [ C(OP_WRITE) ] = {
86 [ C(RESULT_ACCESS) ] = -1,
87 [ C(RESULT_MISS) ] = -1,
88 },
89 [ C(OP_PREFETCH) ] = {
90 [ C(RESULT_ACCESS) ] = -1,
91 [ C(RESULT_MISS) ] = -1,
92 },
93 },
94};
95
96/*
97 * AMD Performance Monitor K7 and later.
98 */
99static const u64 amd_perfmon_event_map[] =
100{
101 [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
102 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
103 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
104 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
105 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
106 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
107};
108
109static u64 amd_pmu_event_map(int hw_event)
110{
111 return amd_perfmon_event_map[hw_event];
112}
113
114static u64 amd_pmu_raw_event(u64 hw_event)
115{
116#define K7_EVNTSEL_EVENT_MASK 0xF000000FFULL
117#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
118#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
119#define K7_EVNTSEL_INV_MASK 0x000800000ULL
120#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL
121
122#define K7_EVNTSEL_MASK \
123 (K7_EVNTSEL_EVENT_MASK | \
124 K7_EVNTSEL_UNIT_MASK | \
125 K7_EVNTSEL_EDGE_MASK | \
126 K7_EVNTSEL_INV_MASK | \
127 K7_EVNTSEL_REG_MASK)
128
129 return hw_event & K7_EVNTSEL_MASK;
130}
131
132/*
133 * AMD64 events are detected based on their event codes.
134 */
135static inline int amd_is_nb_event(struct hw_perf_event *hwc)
136{
137 return (hwc->config & 0xe0) == 0xe0;
138}
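
Concretely, the 0xe0 test matches event-select codes 0xe0-0xff in the low byte (editor's examples; event names per the AMD family-10h BKDG):

	/*
	 *   0x0e0  DRAM Accesses               & 0xe0 == 0xe0  ->  NB event
	 *   0x0e9  CPU/IO Requests to Mem/IO   & 0xe0 == 0xe0  ->  NB event
	 *   0x0c0  Retired Instructions        & 0xe0 == 0xc0  ->  core event
	 */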
139
140static inline int amd_has_nb(struct cpu_hw_events *cpuc)
141{
142 struct amd_nb *nb = cpuc->amd_nb;
143
144 return nb && nb->nb_id != -1;
145}
146
147static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
148 struct perf_event *event)
149{
150 struct hw_perf_event *hwc = &event->hw;
151 struct amd_nb *nb = cpuc->amd_nb;
152 int i;
153
154 /*
155 * only care about NB events
156 */
157 if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
158 return;
159
160 /*
161 * need to scan whole list because event may not have
162 * been assigned during scheduling
163 *
164 * no race condition possible because event can only
165 * be removed on one CPU at a time AND PMU is disabled
166 * when we come here
167 */
168 for (i = 0; i < x86_pmu.num_events; i++) {
169 if (nb->owners[i] == event) {
170 cmpxchg(nb->owners+i, event, NULL);
171 break;
172 }
173 }
174}
175
176 /*
177 * AMD64 NorthBridge events need special treatment because
178 * counter access needs to be synchronized across all cores
179 * of a package. Refer to BKDG section 3.12
180 *
 181 * NB events are events measuring L3 cache and HyperTransport
 182 * traffic. They are identified by an event code >= 0xe0.
 183 * They measure events on the NorthBridge, which is shared
184 * by all cores on a package. NB events are counted on a
185 * shared set of counters. When a NB event is programmed
186 * in a counter, the data actually comes from a shared
187 * counter. Thus, access to those counters needs to be
188 * synchronized.
189 *
190 * We implement the synchronization such that no two cores
191 * can be measuring NB events using the same counters. Thus,
192 * we maintain a per-NB allocation table. The available slot
193 * is propagated using the event_constraint structure.
194 *
195 * We provide only one choice for each NB event based on
196 * the fact that only NB events have restrictions. Consequently,
197 * if a counter is available, there is a guarantee the NB event
198 * will be assigned to it. If no slot is available, an empty
199 * constraint is returned and scheduling will eventually fail
200 * for this event.
201 *
 202 * Note that all cores attached to the same NB compete for the same
 203 * counters to host NB events; this is why we use atomic ops. Some
204 * multi-chip CPUs may have more than one NB.
205 *
206 * Given that resources are allocated (cmpxchg), they must be
207 * eventually freed for others to use. This is accomplished by
208 * calling amd_put_event_constraints().
209 *
210 * Non NB events are not impacted by this restriction.
211 */
212static struct event_constraint *
213amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
214{
215 struct hw_perf_event *hwc = &event->hw;
216 struct amd_nb *nb = cpuc->amd_nb;
217 struct perf_event *old = NULL;
218 int max = x86_pmu.num_events;
219 int i, j, k = -1;
220
221 /*
222 * if not NB event or no NB, then no constraints
223 */
224 if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
225 return &unconstrained;
226
227 /*
228 * detect if already present, if so reuse
229 *
230 * cannot merge with actual allocation
231 * because of possible holes
232 *
233 * event can already be present yet not assigned (in hwc->idx)
234 * because of successive calls to x86_schedule_events() from
235 * hw_perf_group_sched_in() without hw_perf_enable()
236 */
237 for (i = 0; i < max; i++) {
238 /*
239 * keep track of first free slot
240 */
241 if (k == -1 && !nb->owners[i])
242 k = i;
243
244 /* already present, reuse */
245 if (nb->owners[i] == event)
246 goto done;
247 }
248 /*
249 * not present, so grab a new slot
250 * starting either at:
251 */
252 if (hwc->idx != -1) {
253 /* previous assignment */
254 i = hwc->idx;
255 } else if (k != -1) {
256 /* start from free slot found */
257 i = k;
258 } else {
259 /*
260 * event not found, no slot found in
261 * first pass, try again from the
262 * beginning
263 */
264 i = 0;
265 }
266 j = i;
267 do {
268 old = cmpxchg(nb->owners+i, NULL, event);
269 if (!old)
270 break;
271 if (++i == max)
272 i = 0;
273 } while (i != j);
274done:
275 if (!old)
276 return &nb->event_constraints[i];
277
278 return &emptyconstraint;
279}
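
The allocation loop above is a lock-free claim over a shared owner table: scan from a preferred start, cmpxchg() NULL -> event into the first free slot, wrapping around once. Reduced to its essence (editor's sketch):

	/* Claim a slot in a table shared by all cores on one NB. */
	static int claim_slot(struct perf_event **owners, int max,
			      struct perf_event *event, int start)
	{
		int i = start;

		do {
			if (cmpxchg(owners + i, NULL, event) == NULL)
				return i;		/* claimed */
			if (++i == max)
				i = 0;
		} while (i != start);

		return -1;				/* table full */
	}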
280
281static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
282{
283 struct amd_nb *nb;
284 int i;
285
286 nb = kmalloc(sizeof(struct amd_nb), GFP_KERNEL);
287 if (!nb)
288 return NULL;
289
290 memset(nb, 0, sizeof(*nb));
291 nb->nb_id = nb_id;
292
293 /*
294 * initialize all possible NB constraints
295 */
296 for (i = 0; i < x86_pmu.num_events; i++) {
297 __set_bit(i, nb->event_constraints[i].idxmsk);
298 nb->event_constraints[i].weight = 1;
299 }
300 return nb;
301}
302
303static int amd_pmu_cpu_prepare(int cpu)
304{
305 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
306
307 WARN_ON_ONCE(cpuc->amd_nb);
308
309 if (boot_cpu_data.x86_max_cores < 2)
310 return NOTIFY_OK;
311
312 cpuc->amd_nb = amd_alloc_nb(cpu, -1);
313 if (!cpuc->amd_nb)
314 return NOTIFY_BAD;
315
316 return NOTIFY_OK;
317}
318
319static void amd_pmu_cpu_starting(int cpu)
320{
321 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
322 struct amd_nb *nb;
323 int i, nb_id;
324
325 if (boot_cpu_data.x86_max_cores < 2)
326 return;
327
328 nb_id = amd_get_nb_id(cpu);
329 WARN_ON_ONCE(nb_id == BAD_APICID);
330
331 raw_spin_lock(&amd_nb_lock);
332
333 for_each_online_cpu(i) {
334 nb = per_cpu(cpu_hw_events, i).amd_nb;
335 if (WARN_ON_ONCE(!nb))
336 continue;
337
338 if (nb->nb_id == nb_id) {
339 kfree(cpuc->amd_nb);
340 cpuc->amd_nb = nb;
341 break;
342 }
343 }
344
345 cpuc->amd_nb->nb_id = nb_id;
346 cpuc->amd_nb->refcnt++;
347
348 raw_spin_unlock(&amd_nb_lock);
349}
350
351static void amd_pmu_cpu_dead(int cpu)
352{
353 struct cpu_hw_events *cpuhw;
354
355 if (boot_cpu_data.x86_max_cores < 2)
356 return;
357
358 cpuhw = &per_cpu(cpu_hw_events, cpu);
359
360 raw_spin_lock(&amd_nb_lock);
361
362 if (cpuhw->amd_nb) {
363 struct amd_nb *nb = cpuhw->amd_nb;
364
365 if (nb->nb_id == -1 || --nb->refcnt == 0)
366 kfree(nb);
367
368 cpuhw->amd_nb = NULL;
369 }
370
371 raw_spin_unlock(&amd_nb_lock);
372}
373
374static __initconst struct x86_pmu amd_pmu = {
375 .name = "AMD",
376 .handle_irq = x86_pmu_handle_irq,
377 .disable_all = x86_pmu_disable_all,
378 .enable_all = x86_pmu_enable_all,
379 .enable = x86_pmu_enable_event,
380 .disable = x86_pmu_disable_event,
381 .eventsel = MSR_K7_EVNTSEL0,
382 .perfctr = MSR_K7_PERFCTR0,
383 .event_map = amd_pmu_event_map,
384 .raw_event = amd_pmu_raw_event,
385 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
386 .num_events = 4,
387 .event_bits = 48,
388 .event_mask = (1ULL << 48) - 1,
389 .apic = 1,
390 /* use highest bit to detect overflow */
391 .max_period = (1ULL << 47) - 1,
392 .get_event_constraints = amd_get_event_constraints,
393 .put_event_constraints = amd_put_event_constraints,
394
395 .cpu_prepare = amd_pmu_cpu_prepare,
396 .cpu_starting = amd_pmu_cpu_starting,
397 .cpu_dead = amd_pmu_cpu_dead,
398};
399
400static __init int amd_pmu_init(void)
401{
402 /* Performance-monitoring supported from K7 and later: */
403 if (boot_cpu_data.x86 < 6)
404 return -ENODEV;
405
406 x86_pmu = amd_pmu;
407
408 /* Events are common for all AMDs */
409 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
410 sizeof(hw_cache_event_ids));
411
412 return 0;
413}
414
415#else /* CONFIG_CPU_SUP_AMD */
416
417static int amd_pmu_init(void)
418{
419 return 0;
420}
421
422#endif
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
new file mode 100644
index 000000000000..9c794ac87837
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -0,0 +1,980 @@
1#ifdef CONFIG_CPU_SUP_INTEL
2
3/*
4 * Intel PerfMon, used on Core and later.
5 */
6static const u64 intel_perfmon_event_map[] =
7{
8 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
9 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
10 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
11 [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
12 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
13 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
14 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
15};
16
17static struct event_constraint intel_core_event_constraints[] =
18{
19 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
20 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
21 INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
22 INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
23 INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
24 INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */
25 EVENT_CONSTRAINT_END
26};
27
28static struct event_constraint intel_core2_event_constraints[] =
29{
30 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
31 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
32 /*
33 * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event
34 * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed
35 * ratio between these counters.
36 */
37 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
38 INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
39 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
40 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
41 INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
42 INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
43 INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
44 INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
45 INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
46 INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */
47 INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
48 EVENT_CONSTRAINT_END
49};
50
51static struct event_constraint intel_nehalem_event_constraints[] =
52{
53 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
54 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
55 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
56 INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
57 INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
58 INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
59 INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
60 INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */
61 INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
62 INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
63 INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
64 EVENT_CONSTRAINT_END
65};
66
67static struct event_constraint intel_westmere_event_constraints[] =
68{
69 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
70 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
71 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
72 INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
73 INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
74 INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
75 EVENT_CONSTRAINT_END
76};
77
78static struct event_constraint intel_gen_event_constraints[] =
79{
80 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
81 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
82 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
83 EVENT_CONSTRAINT_END
84};
85
86static u64 intel_pmu_event_map(int hw_event)
87{
88 return intel_perfmon_event_map[hw_event];
89}
90
91static __initconst u64 westmere_hw_cache_event_ids
92 [PERF_COUNT_HW_CACHE_MAX]
93 [PERF_COUNT_HW_CACHE_OP_MAX]
94 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
95{
96 [ C(L1D) ] = {
97 [ C(OP_READ) ] = {
98 [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
99 [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */
100 },
101 [ C(OP_WRITE) ] = {
 102		[ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETIRED.STORES */
103 [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */
104 },
105 [ C(OP_PREFETCH) ] = {
106 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
107 [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
108 },
109 },
110 [ C(L1I ) ] = {
111 [ C(OP_READ) ] = {
112 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
113 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
114 },
115 [ C(OP_WRITE) ] = {
116 [ C(RESULT_ACCESS) ] = -1,
117 [ C(RESULT_MISS) ] = -1,
118 },
119 [ C(OP_PREFETCH) ] = {
120 [ C(RESULT_ACCESS) ] = 0x0,
121 [ C(RESULT_MISS) ] = 0x0,
122 },
123 },
124 [ C(LL ) ] = {
125 [ C(OP_READ) ] = {
126 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
127 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
128 },
129 [ C(OP_WRITE) ] = {
130 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
131 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
132 },
133 [ C(OP_PREFETCH) ] = {
134 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
135 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
136 },
137 },
138 [ C(DTLB) ] = {
139 [ C(OP_READ) ] = {
140 [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
141 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
142 },
143 [ C(OP_WRITE) ] = {
 144		[ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETIRED.STORES */
145 [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
146 },
147 [ C(OP_PREFETCH) ] = {
148 [ C(RESULT_ACCESS) ] = 0x0,
149 [ C(RESULT_MISS) ] = 0x0,
150 },
151 },
152 [ C(ITLB) ] = {
153 [ C(OP_READ) ] = {
154 [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
155 [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */
156 },
157 [ C(OP_WRITE) ] = {
158 [ C(RESULT_ACCESS) ] = -1,
159 [ C(RESULT_MISS) ] = -1,
160 },
161 [ C(OP_PREFETCH) ] = {
162 [ C(RESULT_ACCESS) ] = -1,
163 [ C(RESULT_MISS) ] = -1,
164 },
165 },
166 [ C(BPU ) ] = {
167 [ C(OP_READ) ] = {
168 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
169 [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
170 },
171 [ C(OP_WRITE) ] = {
172 [ C(RESULT_ACCESS) ] = -1,
173 [ C(RESULT_MISS) ] = -1,
174 },
175 [ C(OP_PREFETCH) ] = {
176 [ C(RESULT_ACCESS) ] = -1,
177 [ C(RESULT_MISS) ] = -1,
178 },
179 },
180};
181
182static __initconst u64 nehalem_hw_cache_event_ids
183 [PERF_COUNT_HW_CACHE_MAX]
184 [PERF_COUNT_HW_CACHE_OP_MAX]
185 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
186{
187 [ C(L1D) ] = {
188 [ C(OP_READ) ] = {
189 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
190 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
191 },
192 [ C(OP_WRITE) ] = {
193 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
194 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
195 },
196 [ C(OP_PREFETCH) ] = {
197 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
198 [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
199 },
200 },
201 [ C(L1I ) ] = {
202 [ C(OP_READ) ] = {
203 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
204 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
205 },
206 [ C(OP_WRITE) ] = {
207 [ C(RESULT_ACCESS) ] = -1,
208 [ C(RESULT_MISS) ] = -1,
209 },
210 [ C(OP_PREFETCH) ] = {
211 [ C(RESULT_ACCESS) ] = 0x0,
212 [ C(RESULT_MISS) ] = 0x0,
213 },
214 },
215 [ C(LL ) ] = {
216 [ C(OP_READ) ] = {
217 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
218 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
219 },
220 [ C(OP_WRITE) ] = {
221 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
222 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
223 },
224 [ C(OP_PREFETCH) ] = {
225 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
226 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
227 },
228 },
229 [ C(DTLB) ] = {
230 [ C(OP_READ) ] = {
231 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
232 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
233 },
234 [ C(OP_WRITE) ] = {
235 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
236 [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
237 },
238 [ C(OP_PREFETCH) ] = {
239 [ C(RESULT_ACCESS) ] = 0x0,
240 [ C(RESULT_MISS) ] = 0x0,
241 },
242 },
243 [ C(ITLB) ] = {
244 [ C(OP_READ) ] = {
245 [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
246 [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
247 },
248 [ C(OP_WRITE) ] = {
249 [ C(RESULT_ACCESS) ] = -1,
250 [ C(RESULT_MISS) ] = -1,
251 },
252 [ C(OP_PREFETCH) ] = {
253 [ C(RESULT_ACCESS) ] = -1,
254 [ C(RESULT_MISS) ] = -1,
255 },
256 },
257 [ C(BPU ) ] = {
258 [ C(OP_READ) ] = {
259 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
260 [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
261 },
262 [ C(OP_WRITE) ] = {
263 [ C(RESULT_ACCESS) ] = -1,
264 [ C(RESULT_MISS) ] = -1,
265 },
266 [ C(OP_PREFETCH) ] = {
267 [ C(RESULT_ACCESS) ] = -1,
268 [ C(RESULT_MISS) ] = -1,
269 },
270 },
271};
272
273static __initconst u64 core2_hw_cache_event_ids
274 [PERF_COUNT_HW_CACHE_MAX]
275 [PERF_COUNT_HW_CACHE_OP_MAX]
276 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
277{
278 [ C(L1D) ] = {
279 [ C(OP_READ) ] = {
280 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
281 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
282 },
283 [ C(OP_WRITE) ] = {
284 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
285 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
286 },
287 [ C(OP_PREFETCH) ] = {
288 [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
289 [ C(RESULT_MISS) ] = 0,
290 },
291 },
292 [ C(L1I ) ] = {
293 [ C(OP_READ) ] = {
294 [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
295 [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
296 },
297 [ C(OP_WRITE) ] = {
298 [ C(RESULT_ACCESS) ] = -1,
299 [ C(RESULT_MISS) ] = -1,
300 },
301 [ C(OP_PREFETCH) ] = {
302 [ C(RESULT_ACCESS) ] = 0,
303 [ C(RESULT_MISS) ] = 0,
304 },
305 },
306 [ C(LL ) ] = {
307 [ C(OP_READ) ] = {
308 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
309 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
310 },
311 [ C(OP_WRITE) ] = {
312 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
313 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
314 },
315 [ C(OP_PREFETCH) ] = {
316 [ C(RESULT_ACCESS) ] = 0,
317 [ C(RESULT_MISS) ] = 0,
318 },
319 },
320 [ C(DTLB) ] = {
321 [ C(OP_READ) ] = {
322 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
323 [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
324 },
325 [ C(OP_WRITE) ] = {
326 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
327 [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
328 },
329 [ C(OP_PREFETCH) ] = {
330 [ C(RESULT_ACCESS) ] = 0,
331 [ C(RESULT_MISS) ] = 0,
332 },
333 },
334 [ C(ITLB) ] = {
335 [ C(OP_READ) ] = {
336 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
337 [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
338 },
339 [ C(OP_WRITE) ] = {
340 [ C(RESULT_ACCESS) ] = -1,
341 [ C(RESULT_MISS) ] = -1,
342 },
343 [ C(OP_PREFETCH) ] = {
344 [ C(RESULT_ACCESS) ] = -1,
345 [ C(RESULT_MISS) ] = -1,
346 },
347 },
348 [ C(BPU ) ] = {
349 [ C(OP_READ) ] = {
350 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
351 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
352 },
353 [ C(OP_WRITE) ] = {
354 [ C(RESULT_ACCESS) ] = -1,
355 [ C(RESULT_MISS) ] = -1,
356 },
357 [ C(OP_PREFETCH) ] = {
358 [ C(RESULT_ACCESS) ] = -1,
359 [ C(RESULT_MISS) ] = -1,
360 },
361 },
362};
363
364static __initconst u64 atom_hw_cache_event_ids
365 [PERF_COUNT_HW_CACHE_MAX]
366 [PERF_COUNT_HW_CACHE_OP_MAX]
367 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
368{
369 [ C(L1D) ] = {
370 [ C(OP_READ) ] = {
371 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
372 [ C(RESULT_MISS) ] = 0,
373 },
374 [ C(OP_WRITE) ] = {
375 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
376 [ C(RESULT_MISS) ] = 0,
377 },
378 [ C(OP_PREFETCH) ] = {
379 [ C(RESULT_ACCESS) ] = 0x0,
380 [ C(RESULT_MISS) ] = 0,
381 },
382 },
383 [ C(L1I ) ] = {
384 [ C(OP_READ) ] = {
385 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
386 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
387 },
388 [ C(OP_WRITE) ] = {
389 [ C(RESULT_ACCESS) ] = -1,
390 [ C(RESULT_MISS) ] = -1,
391 },
392 [ C(OP_PREFETCH) ] = {
393 [ C(RESULT_ACCESS) ] = 0,
394 [ C(RESULT_MISS) ] = 0,
395 },
396 },
397 [ C(LL ) ] = {
398 [ C(OP_READ) ] = {
399 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
400 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
401 },
402 [ C(OP_WRITE) ] = {
403 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
404 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
405 },
406 [ C(OP_PREFETCH) ] = {
407 [ C(RESULT_ACCESS) ] = 0,
408 [ C(RESULT_MISS) ] = 0,
409 },
410 },
411 [ C(DTLB) ] = {
412 [ C(OP_READ) ] = {
413 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
414 [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
415 },
416 [ C(OP_WRITE) ] = {
417 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
418 [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
419 },
420 [ C(OP_PREFETCH) ] = {
421 [ C(RESULT_ACCESS) ] = 0,
422 [ C(RESULT_MISS) ] = 0,
423 },
424 },
425 [ C(ITLB) ] = {
426 [ C(OP_READ) ] = {
427 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
428 [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
429 },
430 [ C(OP_WRITE) ] = {
431 [ C(RESULT_ACCESS) ] = -1,
432 [ C(RESULT_MISS) ] = -1,
433 },
434 [ C(OP_PREFETCH) ] = {
435 [ C(RESULT_ACCESS) ] = -1,
436 [ C(RESULT_MISS) ] = -1,
437 },
438 },
439 [ C(BPU ) ] = {
440 [ C(OP_READ) ] = {
441 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
442 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
443 },
444 [ C(OP_WRITE) ] = {
445 [ C(RESULT_ACCESS) ] = -1,
446 [ C(RESULT_MISS) ] = -1,
447 },
448 [ C(OP_PREFETCH) ] = {
449 [ C(RESULT_ACCESS) ] = -1,
450 [ C(RESULT_MISS) ] = -1,
451 },
452 },
453};
454
455static u64 intel_pmu_raw_event(u64 hw_event)
456{
457#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
458#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
459#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
460#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
461#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
462
463#define CORE_EVNTSEL_MASK \
464 (INTEL_ARCH_EVTSEL_MASK | \
465 INTEL_ARCH_UNIT_MASK | \
466 INTEL_ARCH_EDGE_MASK | \
467 INTEL_ARCH_INV_MASK | \
468 INTEL_ARCH_CNT_MASK)
469
470 return hw_event & CORE_EVNTSEL_MASK;
471}
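
Note that the CORE_EVNTSEL_*_MASK defines above appear unused: CORE_EVNTSEL_MASK is assembled from the INTEL_ARCH_* macros instead. A worked example of the filtering (editor's note):

	/*
	 * hw_event = 0x005300c4  (event 0xc4 plus stray USR/OS/INT/ENABLE bits)
	 * returned = 0x000000c4  (only event/umask/edge/inv/cmask survive;
	 *                         privilege and enable bits are set by the
	 *                         kernel, not taken from the raw encoding)
	 */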
472
473static void intel_pmu_enable_bts(u64 config)
474{
475 unsigned long debugctlmsr;
476
477 debugctlmsr = get_debugctlmsr();
478
479 debugctlmsr |= X86_DEBUGCTL_TR;
480 debugctlmsr |= X86_DEBUGCTL_BTS;
481 debugctlmsr |= X86_DEBUGCTL_BTINT;
482
483 if (!(config & ARCH_PERFMON_EVENTSEL_OS))
484 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
485
486 if (!(config & ARCH_PERFMON_EVENTSEL_USR))
487 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
488
489 update_debugctlmsr(debugctlmsr);
490}
491
492static void intel_pmu_disable_bts(void)
493{
494 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
495 unsigned long debugctlmsr;
496
497 if (!cpuc->ds)
498 return;
499
500 debugctlmsr = get_debugctlmsr();
501
502 debugctlmsr &=
503 ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
504 X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
505
506 update_debugctlmsr(debugctlmsr);
507}
508
509static void intel_pmu_disable_all(void)
510{
511 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
512
513 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
514
515 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
516 intel_pmu_disable_bts();
517}
518
519static void intel_pmu_enable_all(void)
520{
521 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
522
523 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
524
525 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
526 struct perf_event *event =
527 cpuc->events[X86_PMC_IDX_FIXED_BTS];
528
529 if (WARN_ON_ONCE(!event))
530 return;
531
532 intel_pmu_enable_bts(event->hw.config);
533 }
534}
535
536static inline u64 intel_pmu_get_status(void)
537{
538 u64 status;
539
540 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
541
542 return status;
543}
544
545static inline void intel_pmu_ack_status(u64 ack)
546{
547 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
548}
549
550static inline void
551intel_pmu_disable_fixed(struct hw_perf_event *hwc)
552{
553 int idx = hwc->idx - X86_PMC_IDX_FIXED;
554 u64 ctrl_val, mask;
555
556 mask = 0xfULL << (idx * 4);
557
558 rdmsrl(hwc->config_base, ctrl_val);
559 ctrl_val &= ~mask;
560 (void)checking_wrmsrl(hwc->config_base, ctrl_val);
561}
562
563static void intel_pmu_drain_bts_buffer(void)
564{
565 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
566 struct debug_store *ds = cpuc->ds;
567 struct bts_record {
568 u64 from;
569 u64 to;
570 u64 flags;
571 };
572 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
573 struct bts_record *at, *top;
574 struct perf_output_handle handle;
575 struct perf_event_header header;
576 struct perf_sample_data data;
577 struct pt_regs regs;
578
579 if (!event)
580 return;
581
582 if (!ds)
583 return;
584
585 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
586 top = (struct bts_record *)(unsigned long)ds->bts_index;
587
588 if (top <= at)
589 return;
590
591 ds->bts_index = ds->bts_buffer_base;
592
593 perf_sample_data_init(&data, 0);
594
595 data.period = event->hw.last_period;
596 regs.ip = 0;
597
598 /*
599 * Prepare a generic sample, i.e. fill in the invariant fields.
600 * We will overwrite the from and to address before we output
601 * the sample.
602 */
603 perf_prepare_sample(&header, &data, event, &regs);
604
605 if (perf_output_begin(&handle, event,
606 header.size * (top - at), 1, 1))
607 return;
608
609 for (; at < top; at++) {
610 data.ip = at->from;
611 data.addr = at->to;
612
613 perf_output_sample(&handle, &header, &data, event);
614 }
615
616 perf_output_end(&handle);
617
618 /* There's new data available. */
619 event->hw.interrupts++;
620 event->pending_kill = POLL_IN;
621}
622
623static inline void
624intel_pmu_disable_event(struct perf_event *event)
625{
626 struct hw_perf_event *hwc = &event->hw;
627
628 if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
629 intel_pmu_disable_bts();
630 intel_pmu_drain_bts_buffer();
631 return;
632 }
633
634 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
635 intel_pmu_disable_fixed(hwc);
636 return;
637 }
638
639 x86_pmu_disable_event(event);
640}
641
642static inline void
643intel_pmu_enable_fixed(struct hw_perf_event *hwc)
644{
645 int idx = hwc->idx - X86_PMC_IDX_FIXED;
646 u64 ctrl_val, bits, mask;
647 int err;
648
649 /*
650 * Enable IRQ generation (0x8),
651 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
652 * if requested:
653 */
654 bits = 0x8ULL;
655 if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
656 bits |= 0x2;
657 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
658 bits |= 0x1;
659
660 /*
661 * ANY bit is supported in v3 and up
662 */
663 if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
664 bits |= 0x4;
665
666 bits <<= (idx * 4);
667 mask = 0xfULL << (idx * 4);
668
669 rdmsrl(hwc->config_base, ctrl_val);
670 ctrl_val &= ~mask;
671 ctrl_val |= bits;
672 err = checking_wrmsrl(hwc->config_base, ctrl_val);
673}
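
A worked example for the fixed-counter control word (editor's note): fixed counter 1 (CPU_CLK_UNHALTED.CORE), counting user and kernel with PMI on overflow:

	/*
	 * bits = 0x8 | 0x2 | 0x1 = 0xb;  idx = 1
	 * ctrl_val = (ctrl_val & ~0xf0) | (0xb << 4)
	 *
	 * i.e. each fixed counter owns one nibble of
	 * MSR_ARCH_PERFMON_FIXED_CTR_CTRL.
	 */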
674
675static void intel_pmu_enable_event(struct perf_event *event)
676{
677 struct hw_perf_event *hwc = &event->hw;
678
679 if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
680 if (!__get_cpu_var(cpu_hw_events).enabled)
681 return;
682
683 intel_pmu_enable_bts(hwc->config);
684 return;
685 }
686
687 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
688 intel_pmu_enable_fixed(hwc);
689 return;
690 }
691
692 __x86_pmu_enable_event(hwc);
693}
694
695/*
696 * Save and restart an expired event. Called by NMI contexts,
697 * so it has to be careful about preempting normal event ops:
698 */
699static int intel_pmu_save_and_restart(struct perf_event *event)
700{
701 x86_perf_event_update(event);
702 return x86_perf_event_set_period(event);
703}
704
705static void intel_pmu_reset(void)
706{
707 struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds;
708 unsigned long flags;
709 int idx;
710
711 if (!x86_pmu.num_events)
712 return;
713
714 local_irq_save(flags);
715
716 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
717
718 for (idx = 0; idx < x86_pmu.num_events; idx++) {
719 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
720 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
721 }
722 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
723 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
724 }
725 if (ds)
726 ds->bts_index = ds->bts_buffer_base;
727
728 local_irq_restore(flags);
729}
730
731/*
732 * This handler is triggered by the local APIC, so the APIC IRQ handling
733 * rules apply:
734 */
735static int intel_pmu_handle_irq(struct pt_regs *regs)
736{
737 struct perf_sample_data data;
738 struct cpu_hw_events *cpuc;
739 int bit, loops;
740 u64 ack, status;
741
742 perf_sample_data_init(&data, 0);
743
744 cpuc = &__get_cpu_var(cpu_hw_events);
745
746 intel_pmu_disable_all();
747 intel_pmu_drain_bts_buffer();
748 status = intel_pmu_get_status();
749 if (!status) {
750 intel_pmu_enable_all();
751 return 0;
752 }
753
754 loops = 0;
755again:
756 if (++loops > 100) {
757 WARN_ONCE(1, "perfevents: irq loop stuck!\n");
758 perf_event_print_debug();
759 intel_pmu_reset();
760 goto done;
761 }
762
763 inc_irq_stat(apic_perf_irqs);
764 ack = status;
765 for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
766 struct perf_event *event = cpuc->events[bit];
767
768 if (!test_bit(bit, cpuc->active_mask))
769 continue;
770
771 if (!intel_pmu_save_and_restart(event))
772 continue;
773
774 data.period = event->hw.last_period;
775
776 if (perf_event_overflow(event, 1, &data, regs))
777 x86_pmu_stop(event);
778 }
779
780 intel_pmu_ack_status(ack);
781
782 /*
783 * Repeat if there is more work to be done:
784 */
785 status = intel_pmu_get_status();
786 if (status)
787 goto again;
788
789done:
790 intel_pmu_enable_all();
791 return 1;
792}
793
794static struct event_constraint bts_constraint =
795 EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
796
797static struct event_constraint *
798intel_special_constraints(struct perf_event *event)
799{
800 unsigned int hw_event;
801
802 hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK;
803
804 if (unlikely((hw_event ==
805 x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
806 (event->hw.sample_period == 1))) {
807
808 return &bts_constraint;
809 }
810 return NULL;
811}
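
This special case is what routes full branch tracing onto the BTS pseudo-counter (editor's note): a sample_period of 1 on BR_INST_RETIRED means "record every branch", which only the hardware branch trace store can sustain; any other period stays on a generic PMC. From userspace that is, for example:

	perf record -e branches -c 1 -- ./workload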
812
813static struct event_constraint *
814intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
815{
816 struct event_constraint *c;
817
818 c = intel_special_constraints(event);
819 if (c)
820 return c;
821
822 return x86_get_event_constraints(cpuc, event);
823}
824
825static __initconst struct x86_pmu core_pmu = {
826 .name = "core",
827 .handle_irq = x86_pmu_handle_irq,
828 .disable_all = x86_pmu_disable_all,
829 .enable_all = x86_pmu_enable_all,
830 .enable = x86_pmu_enable_event,
831 .disable = x86_pmu_disable_event,
832 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
833 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
834 .event_map = intel_pmu_event_map,
835 .raw_event = intel_pmu_raw_event,
836 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
837 .apic = 1,
838 /*
839 * Intel PMCs cannot be accessed sanely above 32 bit width,
840 * so we install an artificial 1<<31 period regardless of
841 * the generic event period:
842 */
843 .max_period = (1ULL << 31) - 1,
844 .get_event_constraints = intel_get_event_constraints,
845 .event_constraints = intel_core_event_constraints,
846};
847
848static __initconst struct x86_pmu intel_pmu = {
849 .name = "Intel",
850 .handle_irq = intel_pmu_handle_irq,
851 .disable_all = intel_pmu_disable_all,
852 .enable_all = intel_pmu_enable_all,
853 .enable = intel_pmu_enable_event,
854 .disable = intel_pmu_disable_event,
855 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
856 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
857 .event_map = intel_pmu_event_map,
858 .raw_event = intel_pmu_raw_event,
859 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
860 .apic = 1,
861 /*
862 * Intel PMCs cannot be accessed sanely above 32 bit width,
863 * so we install an artificial 1<<31 period regardless of
864 * the generic event period:
865 */
866 .max_period = (1ULL << 31) - 1,
867 .enable_bts = intel_pmu_enable_bts,
868 .disable_bts = intel_pmu_disable_bts,
869 .get_event_constraints = intel_get_event_constraints,
870
871 .cpu_starting = init_debug_store_on_cpu,
872 .cpu_dying = fini_debug_store_on_cpu,
873};
874
875static __init int intel_pmu_init(void)
876{
877 union cpuid10_edx edx;
878 union cpuid10_eax eax;
879 unsigned int unused;
880 unsigned int ebx;
881 int version;
882
883 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
884 /* check for P6 processor family */
885 if (boot_cpu_data.x86 == 6) {
886 return p6_pmu_init();
887 } else {
888 return -ENODEV;
889 }
890 }
891
892 /*
893 * Check whether the Architectural PerfMon supports
894 * Branch Misses Retired hw_event or not.
895 */
896 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
897 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
898 return -ENODEV;
899
900 version = eax.split.version_id;
901 if (version < 2)
902 x86_pmu = core_pmu;
903 else
904 x86_pmu = intel_pmu;
905
906 x86_pmu.version = version;
907 x86_pmu.num_events = eax.split.num_events;
908 x86_pmu.event_bits = eax.split.bit_width;
909 x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1;
910
911 /*
912 * Quirk: v2 perfmon does not report fixed-purpose events, so
913 * assume at least 3 events:
914 */
915 if (version > 1)
916 x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
917
918 /*
919 * Install the hw-cache-events table:
920 */
921 switch (boot_cpu_data.x86_model) {
922 case 14: /* 65 nm core solo/duo, "Yonah" */
923 pr_cont("Core events, ");
924 break;
925
926 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
927 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
928 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
929 case 29: /* six-core 45 nm xeon "Dunnington" */
930 memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
931 sizeof(hw_cache_event_ids));
932
933 x86_pmu.event_constraints = intel_core2_event_constraints;
934 pr_cont("Core2 events, ");
935 break;
936
937 case 26: /* 45 nm nehalem, "Bloomfield" */
938 case 30: /* 45 nm nehalem, "Lynnfield" */
939 case 46: /* 45 nm nehalem-ex, "Beckton" */
940 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
941 sizeof(hw_cache_event_ids));
942
943 x86_pmu.event_constraints = intel_nehalem_event_constraints;
944 pr_cont("Nehalem/Corei7 events, ");
945 break;
946 case 28: /* Atom */
947 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
948 sizeof(hw_cache_event_ids));
949
950 x86_pmu.event_constraints = intel_gen_event_constraints;
951 pr_cont("Atom events, ");
952 break;
953
954 case 37: /* 32 nm nehalem, "Clarkdale" */
955 case 44: /* 32 nm nehalem, "Gulftown" */
956 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
957 sizeof(hw_cache_event_ids));
958
959 x86_pmu.event_constraints = intel_westmere_event_constraints;
960 pr_cont("Westmere events, ");
961 break;
962
963 default:
964 /*
965 * default constraints for v2 and up
966 */
967 x86_pmu.event_constraints = intel_gen_event_constraints;
968 pr_cont("generic architected perfmon, ");
969 }
970 return 0;
971}
972
973#else /* CONFIG_CPU_SUP_INTEL */
974
975static int intel_pmu_init(void)
976{
977 return 0;
978}
979
980#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
new file mode 100644
index 000000000000..a330485d14da
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -0,0 +1,159 @@
1#ifdef CONFIG_CPU_SUP_INTEL
2
3/*
4 * Not sure about some of these
5 */
6static const u64 p6_perfmon_event_map[] =
7{
8 [PERF_COUNT_HW_CPU_CYCLES] = 0x0079,
9 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
10 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e,
11 [PERF_COUNT_HW_CACHE_MISSES] = 0x012e,
12 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
13 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
14 [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
15};
16
17static u64 p6_pmu_event_map(int hw_event)
18{
19 return p6_perfmon_event_map[hw_event];
20}
21
22/*
23 * Event setting that is specified not to count anything.
24 * We use this to effectively disable a counter.
25 *
26 * L2_RQSTS with 0 MESI unit mask.
27 */
28#define P6_NOP_EVENT 0x0000002EULL
29
30static u64 p6_pmu_raw_event(u64 hw_event)
31{
32#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
33#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
34#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
35#define P6_EVNTSEL_INV_MASK 0x00800000ULL
36#define P6_EVNTSEL_REG_MASK 0xFF000000ULL
37
38#define P6_EVNTSEL_MASK \
39 (P6_EVNTSEL_EVENT_MASK | \
40 P6_EVNTSEL_UNIT_MASK | \
41 P6_EVNTSEL_EDGE_MASK | \
42 P6_EVNTSEL_INV_MASK | \
43 P6_EVNTSEL_REG_MASK)
44
45 return hw_event & P6_EVNTSEL_MASK;
46}
47
48static struct event_constraint p6_event_constraints[] =
49{
50 INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
51 INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
52 INTEL_EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */
53 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
54 INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
55 INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
56 EVENT_CONSTRAINT_END
57};
58
59static void p6_pmu_disable_all(void)
60{
61 u64 val;
62
63 /* p6 only has one enable register */
64 rdmsrl(MSR_P6_EVNTSEL0, val);
65 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
66 wrmsrl(MSR_P6_EVNTSEL0, val);
67}
68
69static void p6_pmu_enable_all(void)
70{
71 unsigned long val;
72
73 /* p6 only has one enable register */
74 rdmsrl(MSR_P6_EVNTSEL0, val);
75 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
76 wrmsrl(MSR_P6_EVNTSEL0, val);
77}
78
79static inline void
80p6_pmu_disable_event(struct perf_event *event)
81{
82 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
83 struct hw_perf_event *hwc = &event->hw;
84 u64 val = P6_NOP_EVENT;
85
86 if (cpuc->enabled)
87 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
88
89 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
90}
91
92static void p6_pmu_enable_event(struct perf_event *event)
93{
94 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
95 struct hw_perf_event *hwc = &event->hw;
96 u64 val;
97
98 val = hwc->config;
99 if (cpuc->enabled)
100 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
101
102 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
103}
104
105static __initconst struct x86_pmu p6_pmu = {
106 .name = "p6",
107 .handle_irq = x86_pmu_handle_irq,
108 .disable_all = p6_pmu_disable_all,
109 .enable_all = p6_pmu_enable_all,
110 .enable = p6_pmu_enable_event,
111 .disable = p6_pmu_disable_event,
112 .eventsel = MSR_P6_EVNTSEL0,
113 .perfctr = MSR_P6_PERFCTR0,
114 .event_map = p6_pmu_event_map,
115 .raw_event = p6_pmu_raw_event,
116 .max_events = ARRAY_SIZE(p6_perfmon_event_map),
117 .apic = 1,
118 .max_period = (1ULL << 31) - 1,
119 .version = 0,
120 .num_events = 2,
121 /*
 122	 * Events have 40 bits implemented. However, they are designed such
 123	 * that bits [32-39] are sign extensions of bit 31. As such the
 124	 * effective width of an event for a P6-like PMU is 32 bits only.
125 *
126 * See IA-32 Intel Architecture Software developer manual Vol 3B
127 */
128 .event_bits = 32,
129 .event_mask = (1ULL << 32) - 1,
130 .get_event_constraints = x86_get_event_constraints,
131 .event_constraints = p6_event_constraints,
132};
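
A worked consequence of the 32-bit effective width (editor's note, following x86_perf_event_set_period()):

	/*
	 * With event_bits = 32, a period p is programmed as
	 *     (u64)(-p) & event_mask
	 * and the hardware sign-extends bit 31 into bits 32-39, so the
	 * overflow check on bit 31 keeps working even though the counter
	 * physically implements 40 bits.
	 */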
133
134static __init int p6_pmu_init(void)
135{
136 switch (boot_cpu_data.x86_model) {
137 case 1:
138 case 3: /* Pentium Pro */
139 case 5:
140 case 6: /* Pentium II */
141 case 7:
142 case 8:
143 case 11: /* Pentium III */
144 case 9:
145 case 13:
146 /* Pentium M */
147 break;
148 default:
149 pr_cont("unsupported p6 CPU model %d ",
150 boot_cpu_data.x86_model);
151 return -ENODEV;
152 }
153
154 x86_pmu = p6_pmu;
155
156 return 0;
157}
158
159#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 898df9719afb..fb329e9f8494 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -115,17 +115,6 @@ int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
115 115
116 return !test_bit(counter, perfctr_nmi_owner); 116 return !test_bit(counter, perfctr_nmi_owner);
117} 117}
118
119/* checks an msr for availability */
120int avail_to_resrv_perfctr_nmi(unsigned int msr)
121{
122 unsigned int counter;
123
124 counter = nmi_perfctr_msr_to_bit(msr);
125 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
126
127 return !test_bit(counter, perfctr_nmi_owner);
128}
129EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); 118EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
130 119
131int reserve_perfctr_nmi(unsigned int msr) 120int reserve_perfctr_nmi(unsigned int msr)
@@ -691,7 +680,7 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz)
691 cpu_nmi_set_wd_enabled(); 680 cpu_nmi_set_wd_enabled();
692 681
693 apic_write(APIC_LVTPC, APIC_DM_NMI); 682 apic_write(APIC_LVTPC, APIC_DM_NMI);
694 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; 683 evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE;
695 wrmsr(evntsel_msr, evntsel, 0); 684 wrmsr(evntsel_msr, evntsel, 0);
696 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); 685 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
697 return 1; 686 return 1;
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 1cbed97b59cf..dfdb4dba2320 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -22,6 +22,7 @@
22 */ 22 */
23 23
24#include <linux/dmi.h> 24#include <linux/dmi.h>
25#include <linux/module.h>
25#include <asm/div64.h> 26#include <asm/div64.h>
26#include <asm/vmware.h> 27#include <asm/vmware.h>
27#include <asm/x86_init.h> 28#include <asm/x86_init.h>
@@ -101,6 +102,7 @@ int vmware_platform(void)
101 102
102 return 0; 103 return 0;
103} 104}
105EXPORT_SYMBOL(vmware_platform);
104 106
105/* 107/*
106 * VMware hypervisor takes care of exporting a reliable TSC to the guest. 108 * VMware hypervisor takes care of exporting a reliable TSC to the guest.
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index cb27fd6136c9..8b862d5900fe 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -40,6 +40,7 @@
40#include <linux/cpu.h> 40#include <linux/cpu.h>
41#include <linux/notifier.h> 41#include <linux/notifier.h>
42#include <linux/uaccess.h> 42#include <linux/uaccess.h>
43#include <linux/gfp.h>
43 44
44#include <asm/processor.h> 45#include <asm/processor.h>
45#include <asm/msr.h> 46#include <asm/msr.h>
@@ -229,7 +230,7 @@ static void __exit cpuid_exit(void)
229 for_each_online_cpu(cpu) 230 for_each_online_cpu(cpu)
230 cpuid_device_destroy(cpu); 231 cpuid_device_destroy(cpu);
231 class_destroy(cpuid_class); 232 class_destroy(cpuid_class);
232 unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); 233 __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
233 unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); 234 unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
234} 235}
235 236
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index a4849c10a77e..ebd4c51d096a 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -27,7 +27,6 @@
27#include <asm/cpu.h> 27#include <asm/cpu.h>
28#include <asm/reboot.h> 28#include <asm/reboot.h>
29#include <asm/virtext.h> 29#include <asm/virtext.h>
30#include <asm/x86_init.h>
31 30
32#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) 31#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
33 32
@@ -103,10 +102,5 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
103#ifdef CONFIG_HPET_TIMER 102#ifdef CONFIG_HPET_TIMER
104 hpet_disable(); 103 hpet_disable();
105#endif 104#endif
106
107#ifdef CONFIG_X86_64
108 x86_platform.iommu_shutdown();
109#endif
110
111 crash_save_cpu(regs, safe_smp_processor_id()); 105 crash_save_cpu(regs, safe_smp_processor_id());
112} 106}
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index cd97ce18c29d..67414550c3cc 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -5,6 +5,7 @@
5 * Copyright (C) IBM Corporation, 2004. All rights reserved 5 * Copyright (C) IBM Corporation, 2004. All rights reserved
6 */ 6 */
7 7
8#include <linux/slab.h>
8#include <linux/errno.h> 9#include <linux/errno.h>
9#include <linux/highmem.h> 10#include <linux/highmem.h>
10#include <linux/crash_dump.h> 11#include <linux/crash_dump.h>
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index c56bc2873030..6d817554780a 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -123,13 +123,15 @@ print_context_stack_bp(struct thread_info *tinfo,
123 while (valid_stack_ptr(tinfo, ret_addr, sizeof(*ret_addr), end)) { 123 while (valid_stack_ptr(tinfo, ret_addr, sizeof(*ret_addr), end)) {
124 unsigned long addr = *ret_addr; 124 unsigned long addr = *ret_addr;
125 125
126 if (__kernel_text_address(addr)) { 126 if (!__kernel_text_address(addr))
127 ops->address(data, addr, 1); 127 break;
128 frame = frame->next_frame; 128
129 ret_addr = &frame->return_address; 129 ops->address(data, addr, 1);
130 print_ftrace_graph_addr(addr, data, ops, tinfo, graph); 130 frame = frame->next_frame;
131 } 131 ret_addr = &frame->return_address;
132 print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
132 } 133 }
134
133 return (unsigned long)frame; 135 return (unsigned long)frame;
134} 136}
135EXPORT_SYMBOL_GPL(print_context_stack_bp); 137EXPORT_SYMBOL_GPL(print_context_stack_bp);
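An editorial aside on the hunk above: in the old shape, a return address that
was not kernel text left ret_addr untouched, so the while condition appears to
re-test the same frame indefinitely; the new shape breaks out on the first bad
address. Reduced to a skeleton (types and helpers as in the surrounding code):

	while (valid_stack_ptr(tinfo, ret_addr, sizeof(*ret_addr), end)) {
		unsigned long addr = *ret_addr;

		if (!__kernel_text_address(addr))
			break;			/* stop at the first bad frame */

		ops->address(data, addr, 1);
		frame = frame->next_frame;	/* advance only past good frames */
		ret_addr = &frame->return_address;
	}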
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
index 4fd1420faffa..e1a93be4fd44 100644
--- a/arch/x86/kernel/dumpstack.h
+++ b/arch/x86/kernel/dumpstack.h
@@ -14,6 +14,8 @@
14#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) 14#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
15#endif 15#endif
16 16
17#include <linux/uaccess.h>
18
17extern void 19extern void
18show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 20show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
19 unsigned long *stack, unsigned long bp, char *log_lvl); 21 unsigned long *stack, unsigned long bp, char *log_lvl);
@@ -29,4 +31,26 @@ struct stack_frame {
29 struct stack_frame *next_frame; 31 struct stack_frame *next_frame;
30 unsigned long return_address; 32 unsigned long return_address;
31}; 33};
34
35struct stack_frame_ia32 {
36 u32 next_frame;
37 u32 return_address;
38};
39
40static inline unsigned long rewind_frame_pointer(int n)
41{
42 struct stack_frame *frame;
43
44 get_bp(frame);
45
46#ifdef CONFIG_FRAME_POINTER
47 while (n--) {
48 if (probe_kernel_address(&frame->next_frame, frame))
49 break;
50 }
32#endif 51#endif
52
53 return (unsigned long)frame;
54}
55
56#endif /* DUMPSTACK_H */
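A usage sketch for the new helper (the caller below is hypothetical, not part
of this patch): a function that wants the frame pointer from two frames up
would skip its own frame and its caller's. Without CONFIG_FRAME_POINTER the
helper degrades to returning the current frame pointer unchanged.

	/* hypothetical caller: skip this frame and our caller's frame */
	static unsigned long example_grandparent_bp(void)
	{
		return rewind_frame_pointer(2);
	}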
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index ae775ca47b25..11540a189d93 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -18,11 +18,6 @@
18 18
19#include "dumpstack.h" 19#include "dumpstack.h"
20 20
21/* Just a stub for now */
22int x86_is_stack_id(int id, char *name)
23{
24 return 0;
25}
26 21
27void dump_trace(struct task_struct *task, struct pt_regs *regs, 22void dump_trace(struct task_struct *task, struct pt_regs *regs,
28 unsigned long *stack, unsigned long bp, 23 unsigned long *stack, unsigned long bp,
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 0ad9597073f5..272c9f1f05f3 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -33,11 +33,6 @@ static char x86_stack_ids[][8] = {
33#endif 33#endif
34}; 34};
35 35
36int x86_is_stack_id(int id, char *name)
37{
38 return x86_stack_ids[id - 1] == name;
39}
40
41static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 36static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
42 unsigned *usedp, char **idp) 37 unsigned *usedp, char **idp)
43{ 38{
@@ -125,9 +120,15 @@ fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
125{ 120{
126#ifdef CONFIG_FRAME_POINTER 121#ifdef CONFIG_FRAME_POINTER
127 struct stack_frame *frame = (struct stack_frame *)bp; 122 struct stack_frame *frame = (struct stack_frame *)bp;
123 unsigned long next;
128 124
129 if (!in_irq_stack(stack, irq_stack, irq_stack_end)) 125 if (!in_irq_stack(stack, irq_stack, irq_stack_end)) {
130 return (unsigned long)frame->next_frame; 126 if (!probe_kernel_address(&frame->next_frame, next))
127 return next;
128 else
129 WARN_ONCE(1, "Perf: bad frame pointer = %p in "
130 "callchain\n", &frame->next_frame);
131 }
131#endif 132#endif
132 return bp; 133 return bp;
133} 134}
@@ -207,7 +208,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
207 if (in_irq_stack(stack, irq_stack, irq_stack_end)) { 208 if (in_irq_stack(stack, irq_stack, irq_stack_end)) {
208 if (ops->stack(data, "IRQ") < 0) 209 if (ops->stack(data, "IRQ") < 0)
209 break; 210 break;
210 bp = print_context_stack(tinfo, stack, bp, 211 bp = ops->walk_stack(tinfo, stack, bp,
211 ops, data, irq_stack_end, &graph); 212 ops, data, irq_stack_end, &graph);
212 /* 213 /*
213 * We link to the next stack (which would be 214 * We link to the next stack (which would be
@@ -228,7 +229,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
228 /* 229 /*
229 * This handles the process stack: 230 * This handles the process stack:
230 */ 231 */
231 bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph); 232 bp = ops->walk_stack(tinfo, stack, bp, ops, data, NULL, &graph);
232 put_cpu(); 233 put_cpu();
233} 234}
234EXPORT_SYMBOL(dump_trace); 235EXPORT_SYMBOL(dump_trace);
@@ -291,6 +292,7 @@ void show_registers(struct pt_regs *regs)
291 292
292 sp = regs->sp; 293 sp = regs->sp;
293 printk("CPU %d ", cpu); 294 printk("CPU %d ", cpu);
295 print_modules();
294 __show_regs(regs, 1); 296 __show_regs(regs, 1);
295 printk("Process %s (pid: %d, threadinfo %p, task %p)\n", 297 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
296 cur->comm, cur->pid, task_thread_info(cur), cur); 298 cur->comm, cur->pid, task_thread_info(cur), cur);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 05ed7ab2ca48..7bca3c6a02fb 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -12,21 +12,13 @@
12#include <linux/types.h> 12#include <linux/types.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/bootmem.h> 14#include <linux/bootmem.h>
15#include <linux/ioport.h>
16#include <linux/string.h>
17#include <linux/kexec.h>
18#include <linux/module.h>
19#include <linux/mm.h>
20#include <linux/pfn.h> 15#include <linux/pfn.h>
21#include <linux/suspend.h> 16#include <linux/suspend.h>
22#include <linux/firmware-map.h> 17#include <linux/firmware-map.h>
23 18
24#include <asm/pgtable.h>
25#include <asm/page.h>
26#include <asm/e820.h> 19#include <asm/e820.h>
27#include <asm/proto.h> 20#include <asm/proto.h>
28#include <asm/setup.h> 21#include <asm/setup.h>
29#include <asm/trampoline.h>
30 22
31/* 23/*
32 * The e820 map is the map that gets modified e.g. with command line parameters 24 * The e820 map is the map that gets modified e.g. with command line parameters
@@ -517,31 +509,55 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
517 int checktype) 509 int checktype)
518{ 510{
519 int i; 511 int i;
512 u64 end;
520 u64 real_removed_size = 0; 513 u64 real_removed_size = 0;
521 514
522 if (size > (ULLONG_MAX - start)) 515 if (size > (ULLONG_MAX - start))
523 size = ULLONG_MAX - start; 516 size = ULLONG_MAX - start;
524 517
518 end = start + size;
519 printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ",
520 (unsigned long long) start,
521 (unsigned long long) end);
522 if (checktype)
523 e820_print_type(old_type);
524 printk(KERN_CONT "\n");
525
525 for (i = 0; i < e820.nr_map; i++) { 526 for (i = 0; i < e820.nr_map; i++) {
526 struct e820entry *ei = &e820.map[i]; 527 struct e820entry *ei = &e820.map[i];
527 u64 final_start, final_end; 528 u64 final_start, final_end;
529 u64 ei_end;
528 530
529 if (checktype && ei->type != old_type) 531 if (checktype && ei->type != old_type)
530 continue; 532 continue;
533
534 ei_end = ei->addr + ei->size;
531 /* totally covered? */ 535 /* totally covered? */
532 if (ei->addr >= start && 536 if (ei->addr >= start && ei_end <= end) {
533 (ei->addr + ei->size) <= (start + size)) {
534 real_removed_size += ei->size; 537 real_removed_size += ei->size;
535 memset(ei, 0, sizeof(struct e820entry)); 538 memset(ei, 0, sizeof(struct e820entry));
536 continue; 539 continue;
537 } 540 }
541
542 /* new range is totally covered? */
543 if (ei->addr < start && ei_end > end) {
544 e820_add_region(end, ei_end - end, ei->type);
545 ei->size = start - ei->addr;
546 real_removed_size += size;
547 continue;
548 }
549
538 /* partially covered */ 550 /* partially covered */
539 final_start = max(start, ei->addr); 551 final_start = max(start, ei->addr);
540 final_end = min(start + size, ei->addr + ei->size); 552 final_end = min(end, ei_end);
541 if (final_start >= final_end) 553 if (final_start >= final_end)
542 continue; 554 continue;
543 real_removed_size += final_end - final_start; 555 real_removed_size += final_end - final_start;
544 556
557 /*
 558		 * The leftover range could be the head or the tail, so the
 559		 * size needs to be updated first.
560 */
545 ei->size -= final_end - final_start; 561 ei->size -= final_end - final_start;
546 if (ei->addr < final_start) 562 if (ei->addr < final_start)
547 continue; 563 continue;
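A worked example of the new "totally covered" branch (all values made up):
with an existing entry [0x1000, 0x9000) and a removal of [0x3000, 0x5000),
the tail is re-added as a fresh entry and the head shrinks in place.

	struct e820entry ei = { .addr = 0x1000, .size = 0x8000,
				.type = E820_RAM };	/* [0x1000, 0x9000) */
	u64 start = 0x3000, end = 0x5000;	/* range being removed */
	u64 ei_end = ei.addr + ei.size;

	/* ei.addr < start && ei_end > end, so the entry is split: */
	e820_add_region(end, ei_end - end, ei.type);	/* tail: [0x5000, 0x9000) */
	ei.size = start - ei.addr;			/* head: [0x1000, 0x3000) */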
@@ -722,319 +738,44 @@ core_initcall(e820_mark_nvs_memory);
722#endif 738#endif
723 739
724/* 740/*
725 * Early reserved memory areas. 741 * Find a free area with specified alignment in a specific range.
726 */
727#define MAX_EARLY_RES 32
728
729struct early_res {
730 u64 start, end;
731 char name[16];
732 char overlap_ok;
733};
734static struct early_res early_res[MAX_EARLY_RES] __initdata = {
735 { 0, PAGE_SIZE, "BIOS data page", 1 }, /* BIOS data page */
736#ifdef CONFIG_X86_32
737 /*
738 * But first pinch a few for the stack/trampoline stuff
739 * FIXME: Don't need the extra page at 4K, but need to fix
740 * trampoline before removing it. (see the GDT stuff)
741 */
742 { PAGE_SIZE, PAGE_SIZE, "EX TRAMPOLINE", 1 },
743#endif
744
745 {}
746};
747
748static int __init find_overlapped_early(u64 start, u64 end)
749{
750 int i;
751 struct early_res *r;
752
753 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
754 r = &early_res[i];
755 if (end > r->start && start < r->end)
756 break;
757 }
758
759 return i;
760}
761
762/*
763 * Drop the i-th range from the early reservation map,
764 * by copying any higher ranges down one over it, and
765 * clearing what had been the last slot.
766 */
767static void __init drop_range(int i)
768{
769 int j;
770
771 for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
772 ;
773
774 memmove(&early_res[i], &early_res[i + 1],
775 (j - 1 - i) * sizeof(struct early_res));
776
777 early_res[j - 1].end = 0;
778}
779
780/*
781 * Split any existing ranges that:
782 * 1) are marked 'overlap_ok', and
783 * 2) overlap with the stated range [start, end)
784 * into whatever portion (if any) of the existing range is entirely
785 * below or entirely above the stated range. Drop the portion
786 * of the existing range that overlaps with the stated range,
787 * which will allow the caller of this routine to then add that
788 * stated range without conflicting with any existing range.
789 */ 742 */
790static void __init drop_overlaps_that_are_ok(u64 start, u64 end) 743u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
791{ 744{
792 int i; 745 int i;
793 struct early_res *r;
794 u64 lower_start, lower_end;
795 u64 upper_start, upper_end;
796 char name[16];
797 746
798 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { 747 for (i = 0; i < e820.nr_map; i++) {
799 r = &early_res[i]; 748 struct e820entry *ei = &e820.map[i];
749 u64 addr;
750 u64 ei_start, ei_last;
800 751
801 /* Continue past non-overlapping ranges */ 752 if (ei->type != E820_RAM)
802 if (end <= r->start || start >= r->end)
803 continue; 753 continue;
804 754
805 /* 755 ei_last = ei->addr + ei->size;
806 * Leave non-ok overlaps as is; let caller 756 ei_start = ei->addr;
807 * panic "Overlapping early reservations" 757 addr = find_early_area(ei_start, ei_last, start, end,
808 * when it hits this overlap. 758 size, align);
809 */
810 if (!r->overlap_ok)
811 return;
812
813 /*
814 * We have an ok overlap. We will drop it from the early
815 * reservation map, and add back in any non-overlapping
816 * portions (lower or upper) as separate, overlap_ok,
817 * non-overlapping ranges.
818 */
819
820 /* 1. Note any non-overlapping (lower or upper) ranges. */
821 strncpy(name, r->name, sizeof(name) - 1);
822
823 lower_start = lower_end = 0;
824 upper_start = upper_end = 0;
825 if (r->start < start) {
826 lower_start = r->start;
827 lower_end = start;
828 }
829 if (r->end > end) {
830 upper_start = end;
831 upper_end = r->end;
832 }
833
834 /* 2. Drop the original ok overlapping range */
835 drop_range(i);
836
837 i--; /* resume for-loop on copied down entry */
838
839 /* 3. Add back in any non-overlapping ranges. */
840 if (lower_end)
841 reserve_early_overlap_ok(lower_start, lower_end, name);
842 if (upper_end)
843 reserve_early_overlap_ok(upper_start, upper_end, name);
844 }
845}
846
847static void __init __reserve_early(u64 start, u64 end, char *name,
848 int overlap_ok)
849{
850 int i;
851 struct early_res *r;
852
853 i = find_overlapped_early(start, end);
854 if (i >= MAX_EARLY_RES)
855 panic("Too many early reservations");
856 r = &early_res[i];
857 if (r->end)
858 panic("Overlapping early reservations "
859 "%llx-%llx %s to %llx-%llx %s\n",
860 start, end - 1, name?name:"", r->start,
861 r->end - 1, r->name);
862 r->start = start;
863 r->end = end;
864 r->overlap_ok = overlap_ok;
865 if (name)
866 strncpy(r->name, name, sizeof(r->name) - 1);
867}
868
869/*
870 * A few early reservations come here.
871 *
872 * The 'overlap_ok' in the name of this routine does -not- mean it
873 * is ok for these reservations to overlap an earlier reservation.
874 * Rather it means that it is ok for subsequent reservations to
875 * overlap this one.
876 *
877 * Use this entry point to reserve early ranges when you are doing
878 * so out of "Paranoia", reserving perhaps more memory than you need,
879 * just in case, and don't mind a subsequent overlapping reservation
880 * that is known to be needed.
881 *
882 * The drop_overlaps_that_are_ok() call here isn't really needed.
883 * It would be needed if we had two colliding 'overlap_ok'
884 * reservations, so that the second such would not panic on the
885 * overlap with the first. We don't have any such as of this
886 * writing, but might as well tolerate such if it happens in
887 * the future.
888 */
889void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
890{
891 drop_overlaps_that_are_ok(start, end);
892 __reserve_early(start, end, name, 1);
893}
894
895/*
896 * Most early reservations come here.
897 *
898 * We first have drop_overlaps_that_are_ok() drop any pre-existing
899 * 'overlap_ok' ranges, so that we can then reserve this memory
900 * range without risk of panic'ing on an overlapping overlap_ok
901 * early reservation.
902 */
903void __init reserve_early(u64 start, u64 end, char *name)
904{
905 if (start >= end)
906 return;
907
908 drop_overlaps_that_are_ok(start, end);
909 __reserve_early(start, end, name, 0);
910}
911
912void __init free_early(u64 start, u64 end)
913{
914 struct early_res *r;
915 int i;
916
917 i = find_overlapped_early(start, end);
918 r = &early_res[i];
919 if (i >= MAX_EARLY_RES || r->end != end || r->start != start)
920 panic("free_early on not reserved area: %llx-%llx!",
921 start, end - 1);
922
923 drop_range(i);
924}
925
926void __init early_res_to_bootmem(u64 start, u64 end)
927{
928 int i, count;
929 u64 final_start, final_end;
930
931 count = 0;
932 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
933 count++;
934
935 printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
936 count, start, end);
937 for (i = 0; i < count; i++) {
938 struct early_res *r = &early_res[i];
939 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
940 r->start, r->end, r->name);
941 final_start = max(start, r->start);
942 final_end = min(end, r->end);
943 if (final_start >= final_end) {
944 printk(KERN_CONT "\n");
945 continue;
946 }
947 printk(KERN_CONT " ==> [%010llx - %010llx]\n",
948 final_start, final_end);
949 reserve_bootmem_generic(final_start, final_end - final_start,
950 BOOTMEM_DEFAULT);
951 }
952}
953 759
954/* Check for already reserved areas */ 760 if (addr != -1ULL)
955static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) 761 return addr;
956{
957 int i;
958 u64 addr = *addrp;
959 int changed = 0;
960 struct early_res *r;
961again:
962 i = find_overlapped_early(addr, addr + size);
963 r = &early_res[i];
964 if (i < MAX_EARLY_RES && r->end) {
965 *addrp = addr = round_up(r->end, align);
966 changed = 1;
967 goto again;
968 } 762 }
969 return changed; 763 return -1ULL;
970} 764}
971 765
972/* Check for already reserved areas */ 766u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align)
973static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
974{ 767{
975 int i; 768 return find_e820_area(start, end, size, align);
976 u64 addr = *addrp, last;
977 u64 size = *sizep;
978 int changed = 0;
979again:
980 last = addr + size;
981 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
982 struct early_res *r = &early_res[i];
983 if (last > r->start && addr < r->start) {
984 size = r->start - addr;
985 changed = 1;
986 goto again;
987 }
988 if (last > r->end && addr < r->end) {
989 addr = round_up(r->end, align);
990 size = last - addr;
991 changed = 1;
992 goto again;
993 }
994 if (last <= r->end && addr >= r->start) {
995 (*sizep)++;
996 return 0;
997 }
998 }
999 if (changed) {
1000 *addrp = addr;
1001 *sizep = size;
1002 }
1003 return changed;
1004} 769}
1005 770
1006/* 771u64 __init get_max_mapped(void)
1007 * Find a free area with specified alignment in a specific range.
1008 */
1009u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
1010{ 772{
1011 int i; 773 u64 end = max_pfn_mapped;
1012 774
1013 for (i = 0; i < e820.nr_map; i++) { 775 end <<= PAGE_SHIFT;
1014 struct e820entry *ei = &e820.map[i];
1015 u64 addr, last;
1016 u64 ei_last;
1017 776
1018 if (ei->type != E820_RAM) 777 return end;
1019 continue;
1020 addr = round_up(ei->addr, align);
1021 ei_last = ei->addr + ei->size;
1022 if (addr < start)
1023 addr = round_up(start, align);
1024 if (addr >= ei_last)
1025 continue;
1026 while (bad_addr(&addr, size, align) && addr+size <= ei_last)
1027 ;
1028 last = addr + size;
1029 if (last > ei_last)
1030 continue;
1031 if (last > end)
1032 continue;
1033 return addr;
1034 }
1035 return -1ULL;
1036} 778}
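A sketch of a typical early-boot caller (the call site is an assumption, not
part of this patch): find a 1 MB block, 1 MB aligned, below 4 GB, and reserve
it if one exists.

	u64 addr = find_e820_area(1UL << 20, 1ULL << 32, 1 << 20, 1 << 20);

	if (addr != -1ULL)
		reserve_early(addr, addr + (1 << 20), "EXAMPLE");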
1037
1038/* 779/*
1039 * Find next free range after *start 780 * Find next free range after *start
1040 */ 781 */
@@ -1044,25 +785,19 @@ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
1044 785
1045 for (i = 0; i < e820.nr_map; i++) { 786 for (i = 0; i < e820.nr_map; i++) {
1046 struct e820entry *ei = &e820.map[i]; 787 struct e820entry *ei = &e820.map[i];
1047 u64 addr, last; 788 u64 addr;
1048 u64 ei_last; 789 u64 ei_start, ei_last;
1049 790
1050 if (ei->type != E820_RAM) 791 if (ei->type != E820_RAM)
1051 continue; 792 continue;
1052 addr = round_up(ei->addr, align); 793
1053 ei_last = ei->addr + ei->size; 794 ei_last = ei->addr + ei->size;
1054 if (addr < start) 795 ei_start = ei->addr;
1055 addr = round_up(start, align); 796 addr = find_early_area_size(ei_start, ei_last, start,
1056 if (addr >= ei_last) 797 sizep, align);
1057 continue; 798
1058 *sizep = ei_last - addr; 799 if (addr != -1ULL)
1059 while (bad_addr_size(&addr, sizep, align) && 800 return addr;
1060 addr + *sizep <= ei_last)
1061 ;
1062 last = addr + *sizep;
1063 if (last > ei_last)
1064 continue;
1065 return addr;
1066 } 801 }
1067 802
1068 return -1ULL; 803 return -1ULL;
@@ -1421,6 +1156,8 @@ void __init e820_reserve_resources_late(void)
1421 end = MAX_RESOURCE_SIZE; 1156 end = MAX_RESOURCE_SIZE;
1422 if (start >= end) 1157 if (start >= end)
1423 continue; 1158 continue;
1159 printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx ",
1160 start, end);
1424 reserve_region_with_split(&iomem_resource, start, end, 1161 reserve_region_with_split(&iomem_resource, start, end,
1425 "RAM buffer"); 1162 "RAM buffer");
1426 } 1163 }
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index cdcfb122f256..c2fa9b8b497e 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -362,7 +362,7 @@ void __init efi_init(void)
362 printk(KERN_ERR PFX "Could not map the firmware vendor!\n"); 362 printk(KERN_ERR PFX "Could not map the firmware vendor!\n");
363 early_iounmap(tmp, 2); 363 early_iounmap(tmp, 2);
364 364
365 printk(KERN_INFO "EFI v%u.%.02u by %s \n", 365 printk(KERN_INFO "EFI v%u.%.02u by %s\n",
366 efi.systab->hdr.revision >> 16, 366 efi.systab->hdr.revision >> 16,
367 efi.systab->hdr.revision & 0xffff, vendor); 367 efi.systab->hdr.revision & 0xffff, vendor);
368 368
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 309689245431..cd37469b54ee 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -30,14 +30,32 @@
30 30
31#ifdef CONFIG_DYNAMIC_FTRACE 31#ifdef CONFIG_DYNAMIC_FTRACE
32 32
33/*
34 * modifying_code is set to notify NMIs that they need to use
35 * memory barriers when entering or exiting. But we don't want
36 * to burden NMIs with unnecessary memory barriers when code
37 * modification is not being done (which is most of the time).
38 *
39 * A mutex is already held when ftrace_arch_code_modify_prepare
40 * and post_process are called. No locks need to be taken here.
41 *
42 * Stop machine will make sure currently running NMIs are done
43 * and new NMIs will see the updated variable before we need
44 * to worry about NMIs doing memory barriers.
45 */
46static int modifying_code __read_mostly;
47static DEFINE_PER_CPU(int, save_modifying_code);
48
33int ftrace_arch_code_modify_prepare(void) 49int ftrace_arch_code_modify_prepare(void)
34{ 50{
35 set_kernel_text_rw(); 51 set_kernel_text_rw();
52 modifying_code = 1;
36 return 0; 53 return 0;
37} 54}
38 55
39int ftrace_arch_code_modify_post_process(void) 56int ftrace_arch_code_modify_post_process(void)
40{ 57{
58 modifying_code = 0;
41 set_kernel_text_ro(); 59 set_kernel_text_ro();
42 return 0; 60 return 0;
43} 61}
@@ -149,6 +167,11 @@ static void ftrace_mod_code(void)
149 167
150void ftrace_nmi_enter(void) 168void ftrace_nmi_enter(void)
151{ 169{
170 __get_cpu_var(save_modifying_code) = modifying_code;
171
172 if (!__get_cpu_var(save_modifying_code))
173 return;
174
152 if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { 175 if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
153 smp_rmb(); 176 smp_rmb();
154 ftrace_mod_code(); 177 ftrace_mod_code();
@@ -160,6 +183,9 @@ void ftrace_nmi_enter(void)
160 183
161void ftrace_nmi_exit(void) 184void ftrace_nmi_exit(void)
162{ 185{
186 if (!__get_cpu_var(save_modifying_code))
187 return;
188
163 /* Finish all executions before clearing nmi_running */ 189 /* Finish all executions before clearing nmi_running */
164 smp_mb(); 190 smp_mb();
165 atomic_dec(&nmi_running); 191 atomic_dec(&nmi_running);
@@ -484,13 +510,3 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
484 } 510 }
485} 511}
486#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 512#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
487
488#ifdef CONFIG_FTRACE_SYSCALLS
489
490extern unsigned long *sys_call_table;
491
492unsigned long __init arch_syscall_addr(int nr)
493{
494 return (unsigned long)(&sys_call_table)[nr];
495}
496#endif
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 5051b94c9069..b2e246037392 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -7,6 +7,7 @@
7 7
8#include <linux/init.h> 8#include <linux/init.h>
9#include <linux/start_kernel.h> 9#include <linux/start_kernel.h>
10#include <linux/mm.h>
10 11
11#include <asm/setup.h> 12#include <asm/setup.h>
12#include <asm/sections.h> 13#include <asm/sections.h>
@@ -29,14 +30,25 @@ static void __init i386_default_early_setup(void)
29 30
30void __init i386_start_kernel(void) 31void __init i386_start_kernel(void)
31{ 32{
33#ifdef CONFIG_X86_TRAMPOLINE
34 /*
35 * But first pinch a few for the stack/trampoline stuff
36 * FIXME: Don't need the extra page at 4K, but need to fix
37 * trampoline before removing it. (see the GDT stuff)
38 */
39 reserve_early_overlap_ok(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE,
40 "EX TRAMPOLINE");
41#endif
42
32 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); 43 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
33 44
34#ifdef CONFIG_BLK_DEV_INITRD 45#ifdef CONFIG_BLK_DEV_INITRD
35 /* Reserve INITRD */ 46 /* Reserve INITRD */
36 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { 47 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
48 /* Assume only end is not page aligned */
37 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 49 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
38 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 50 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
39 u64 ramdisk_end = ramdisk_image + ramdisk_size; 51 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
40 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); 52 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
41 } 53 }
42#endif 54#endif
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index b5a9896ca1e7..7147143fd614 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -103,9 +103,10 @@ void __init x86_64_start_reservations(char *real_mode_data)
103#ifdef CONFIG_BLK_DEV_INITRD 103#ifdef CONFIG_BLK_DEV_INITRD
104 /* Reserve INITRD */ 104 /* Reserve INITRD */
105 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { 105 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
106 /* Assume only end is not page aligned */
106 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; 107 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
107 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; 108 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
108 unsigned long ramdisk_end = ramdisk_image + ramdisk_size; 109 unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
109 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); 110 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
110 } 111 }
111#endif 112#endif
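A numeric example covering both ramdisk hunks above (values made up): the
image start is assumed page aligned by the boot loader, so only the end needs
rounding up.

	u64 ramdisk_image = 0x1000000;	/* page aligned by assumption */
	u64 ramdisk_size  = 0x123456;
	u64 ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);	/* 0x1124000 */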
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 7fd318bac59c..37c3d4b17d85 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -442,8 +442,8 @@ is386: movl $2,%ecx # set MP
442 */ 442 */
443 cmpb $0,ready 443 cmpb $0,ready
444 jne 1f 444 jne 1f
445 movl $per_cpu__gdt_page,%eax 445 movl $gdt_page,%eax
446 movl $per_cpu__stack_canary,%ecx 446 movl $stack_canary,%ecx
447 movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) 447 movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
448 shrl $16, %ecx 448 shrl $16, %ecx
449 movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) 449 movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
@@ -706,7 +706,7 @@ idt_descr:
706 .word 0 # 32 bit align gdt_desc.address 706 .word 0 # 32 bit align gdt_desc.address
707ENTRY(early_gdt_descr) 707ENTRY(early_gdt_descr)
708 .word GDT_ENTRIES*8-1 708 .word GDT_ENTRIES*8-1
709 .long per_cpu__gdt_page /* Overwritten for secondary CPUs */ 709 .long gdt_page /* Overwritten for secondary CPUs */
710 710
711/* 711/*
712 * The boot_gdt must mirror the equivalent in setup.S and is 712 * The boot_gdt must mirror the equivalent in setup.S and is
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 2d8b5035371c..3d1e6f16b7a6 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -27,7 +27,7 @@
27#define GET_CR2_INTO_RCX movq %cr2, %rcx 27#define GET_CR2_INTO_RCX movq %cr2, %rcx
28#endif 28#endif
29 29
30/* we are not able to switch in one step to the final KERNEL ADRESS SPACE 30/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE
31 * because we need identity-mapped pages. 31 * because we need identity-mapped pages.
32 * 32 *
33 */ 33 */
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ba6e65884603..23b4ecdffa9b 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -4,6 +4,7 @@
4#include <linux/sysdev.h> 4#include <linux/sysdev.h>
5#include <linux/delay.h> 5#include <linux/delay.h>
6#include <linux/errno.h> 6#include <linux/errno.h>
7#include <linux/slab.h>
7#include <linux/hpet.h> 8#include <linux/hpet.h>
8#include <linux/init.h> 9#include <linux/init.h>
9#include <linux/cpu.h> 10#include <linux/cpu.h>
@@ -34,6 +35,8 @@
34 */ 35 */
35unsigned long hpet_address; 36unsigned long hpet_address;
36u8 hpet_blockid; /* OS timer block num */ 37u8 hpet_blockid; /* OS timer block num */
38u8 hpet_msi_disable;
39
37#ifdef CONFIG_PCI_MSI 40#ifdef CONFIG_PCI_MSI
38static unsigned long hpet_num_timers; 41static unsigned long hpet_num_timers;
39#endif 42#endif
@@ -264,7 +267,7 @@ static void hpet_resume_device(void)
264 force_hpet_resume(); 267 force_hpet_resume();
265} 268}
266 269
267static void hpet_resume_counter(void) 270static void hpet_resume_counter(struct clocksource *cs)
268{ 271{
269 hpet_resume_device(); 272 hpet_resume_device();
270 hpet_restart_counter(); 273 hpet_restart_counter();
@@ -397,9 +400,15 @@ static int hpet_next_event(unsigned long delta,
397 * then we might have a real hardware problem. We can not do 400 * then we might have a real hardware problem. We can not do
398 * much about it here, but at least alert the user/admin with 401 * much about it here, but at least alert the user/admin with
399 * a prominent warning. 402 * a prominent warning.
 403	 * An erratum on some chipsets (ICH9, ...) results in the comparator
 404	 * read immediately following a write returning the old value. The
 405	 * workaround is to read the register a second time when the first
 406	 * read returns the old value.
400 */ 407 */
401 WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt, 408 if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) {
409 WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt,
402 KERN_WARNING "hpet: compare register read back failed.\n"); 410 KERN_WARNING "hpet: compare register read back failed.\n");
411 }
403 412
404 return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; 413 return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
405} 414}
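The workaround unrolled for clarity (an illustrative restatement of the lines
above, not new behavior): warn only if a second read still disagrees, since a
single stale read right after the write is expected on the affected chipsets.

	u32 first = hpet_readl(HPET_Tn_CMP(timer));

	if (unlikely(first != cnt)) {
		u32 second = hpet_readl(HPET_Tn_CMP(timer));

		WARN_ONCE(second != cnt,
			  KERN_WARNING "hpet: compare register read back failed.\n");
	}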
@@ -596,6 +605,9 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
596 unsigned int num_timers_used = 0; 605 unsigned int num_timers_used = 0;
597 int i; 606 int i;
598 607
608 if (hpet_msi_disable)
609 return;
610
599 if (boot_cpu_has(X86_FEATURE_ARAT)) 611 if (boot_cpu_has(X86_FEATURE_ARAT))
600 return; 612 return;
601 id = hpet_readl(HPET_ID); 613 id = hpet_readl(HPET_ID);
@@ -928,6 +940,9 @@ static __init int hpet_late_init(void)
928 hpet_reserve_platform_timers(hpet_readl(HPET_ID)); 940 hpet_reserve_platform_timers(hpet_readl(HPET_ID));
929 hpet_print_config(); 941 hpet_print_config();
930 942
943 if (hpet_msi_disable)
944 return 0;
945
931 if (boot_cpu_has(X86_FEATURE_ARAT)) 946 if (boot_cpu_has(X86_FEATURE_ARAT))
932 return 0; 947 return 0;
933 948
@@ -1135,6 +1150,7 @@ int hpet_set_periodic_freq(unsigned long freq)
1135 do_div(clc, freq); 1150 do_div(clc, freq);
1136 clc >>= hpet_clockevent.shift; 1151 clc >>= hpet_clockevent.shift;
1137 hpet_pie_delta = clc; 1152 hpet_pie_delta = clc;
1153 hpet_pie_limit = 0;
1138 } 1154 }
1139 return 1; 1155 return 1;
1140} 1156}
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 05d5fec64a94..d6cc065f519f 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -212,25 +212,6 @@ static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
212 return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); 212 return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
213} 213}
214 214
215/*
216 * Store a breakpoint's encoded address, length, and type.
217 */
218static int arch_store_info(struct perf_event *bp)
219{
220 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
221 /*
222 * For kernel-addresses, either the address or symbol name can be
223 * specified.
224 */
225 if (info->name)
226 info->address = (unsigned long)
227 kallsyms_lookup_name(info->name);
228 if (info->address)
229 return 0;
230
231 return -EINVAL;
232}
233
234int arch_bp_generic_fields(int x86_len, int x86_type, 215int arch_bp_generic_fields(int x86_len, int x86_type,
235 int *gen_len, int *gen_type) 216 int *gen_len, int *gen_type)
236{ 217{
@@ -362,10 +343,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp,
362 return ret; 343 return ret;
363 } 344 }
364 345
365 ret = arch_store_info(bp);
366
367 if (ret < 0)
368 return ret;
369 /* 346 /*
370 * Check that the low-order bits of the address are appropriate 347 * Check that the low-order bits of the address are appropriate
371 * for the alignment implied by len. 348 * for the alignment implied by len.
@@ -502,8 +479,6 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
502 rcu_read_lock(); 479 rcu_read_lock();
503 480
504 bp = per_cpu(bp_per_reg[i], cpu); 481 bp = per_cpu(bp_per_reg[i], cpu);
505 if (bp)
506 rc = NOTIFY_DONE;
507 /* 482 /*
508 * Reset the 'i'th TRAP bit in dr6 to denote completion of 483 * Reset the 'i'th TRAP bit in dr6 to denote completion of
509 * exception handling 484 * exception handling
@@ -522,7 +497,13 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
522 497
523 rcu_read_unlock(); 498 rcu_read_unlock();
524 } 499 }
525 if (dr6 & (~DR_TRAP_BITS)) 500 /*
501 * Further processing in do_debug() is needed for a) user-space
502 * breakpoints (to generate signals) and b) when the system has
503 * taken exception due to multiple causes
504 */
505 if ((current->thread.debugreg6 & DR_TRAP_BITS) ||
506 (dr6 & (~DR_TRAP_BITS)))
526 rc = NOTIFY_DONE; 507 rc = NOTIFY_DONE;
527 508
528 set_debugreg(dr7, 7); 509 set_debugreg(dr7, 7);
@@ -547,8 +528,3 @@ void hw_breakpoint_pmu_read(struct perf_event *bp)
547{ 528{
548 /* TODO */ 529 /* TODO */
549} 530}
550
551void hw_breakpoint_pmu_unthrottle(struct perf_event *bp)
552{
553 /* TODO */
554}
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index f2f8540a7f3d..54c31c285488 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -8,6 +8,7 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/regset.h> 9#include <linux/regset.h>
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/slab.h>
11 12
12#include <asm/sigcontext.h> 13#include <asm/sigcontext.h>
13#include <asm/processor.h> 14#include <asm/processor.h>
@@ -164,6 +165,11 @@ int init_fpu(struct task_struct *tsk)
164 return 0; 165 return 0;
165} 166}
166 167
168/*
169 * The xstateregs_active() routine is the same as the fpregs_active() routine,
170 * as the "regset->n" for the xstate regset will be updated based on the feature
171 * capabilites supported by the xsave.
172 */
167int fpregs_active(struct task_struct *target, const struct user_regset *regset) 173int fpregs_active(struct task_struct *target, const struct user_regset *regset)
168{ 174{
169 return tsk_used_math(target) ? regset->n : 0; 175 return tsk_used_math(target) ? regset->n : 0;
@@ -204,8 +210,6 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
204 if (ret) 210 if (ret)
205 return ret; 211 return ret;
206 212
207 set_stopped_child_used_math(target);
208
209 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, 213 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
210 &target->thread.xstate->fxsave, 0, -1); 214 &target->thread.xstate->fxsave, 0, -1);
211 215
@@ -224,6 +228,68 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
224 return ret; 228 return ret;
225} 229}
226 230
231int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
232 unsigned int pos, unsigned int count,
233 void *kbuf, void __user *ubuf)
234{
235 int ret;
236
237 if (!cpu_has_xsave)
238 return -ENODEV;
239
240 ret = init_fpu(target);
241 if (ret)
242 return ret;
243
244 /*
 245	 * Copy the 48 bytes defined by the software first into the xstate
246 * memory layout in the thread struct, so that we can copy the entire
247 * xstateregs to the user using one user_regset_copyout().
248 */
249 memcpy(&target->thread.xstate->fxsave.sw_reserved,
250 xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
251
252 /*
253 * Copy the xstate memory layout.
254 */
255 ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
256 &target->thread.xstate->xsave, 0, -1);
257 return ret;
258}
259
260int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
261 unsigned int pos, unsigned int count,
262 const void *kbuf, const void __user *ubuf)
263{
264 int ret;
265 struct xsave_hdr_struct *xsave_hdr;
266
267 if (!cpu_has_xsave)
268 return -ENODEV;
269
270 ret = init_fpu(target);
271 if (ret)
272 return ret;
273
274 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
275 &target->thread.xstate->xsave, 0, -1);
276
277 /*
278 * mxcsr reserved bits must be masked to zero for security reasons.
279 */
280 target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
281
282 xsave_hdr = &target->thread.xstate->xsave.xsave_hdr;
283
284 xsave_hdr->xstate_bv &= pcntxt_mask;
285 /*
286 * These bits must be zero.
287 */
288 xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
289
290 return ret;
291}
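A user-space sketch of reaching the new regset (assumptions: a
PTRACE_GETREGSET-style request and an NT_X86_XSTATE note type expose it;
neither is defined by this patch):

	#include <sys/types.h>
	#include <sys/ptrace.h>
	#include <sys/uio.h>

	#ifndef NT_X86_XSTATE
	#define NT_X86_XSTATE 0x202	/* assumed note type for this regset */
	#endif

	/* read a stopped tracee's xstate image into buf */
	static long read_xstate(pid_t pid, void *buf, size_t len)
	{
		struct iovec iov = { .iov_base = buf, .iov_len = len };

		return ptrace(PTRACE_GETREGSET, pid, (void *)NT_X86_XSTATE, &iov);
	}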
292
227#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 293#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
228 294
229/* 295/*
@@ -404,8 +470,6 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
404 if (ret) 470 if (ret)
405 return ret; 471 return ret;
406 472
407 set_stopped_child_used_math(target);
408
409 if (!HAVE_HWFP) 473 if (!HAVE_HWFP)
410 return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); 474 return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
411 475
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index df89102bef80..7c9f02c130f3 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -5,7 +5,6 @@
5#include <linux/ioport.h> 5#include <linux/ioport.h>
6#include <linux/interrupt.h> 6#include <linux/interrupt.h>
7#include <linux/timex.h> 7#include <linux/timex.h>
8#include <linux/slab.h>
9#include <linux/random.h> 8#include <linux/random.h>
10#include <linux/init.h> 9#include <linux/init.h>
11#include <linux/kernel_stat.h> 10#include <linux/kernel_stat.h>
@@ -32,8 +31,14 @@
32 */ 31 */
33 32
34static int i8259A_auto_eoi; 33static int i8259A_auto_eoi;
35DEFINE_SPINLOCK(i8259A_lock); 34DEFINE_RAW_SPINLOCK(i8259A_lock);
36static void mask_and_ack_8259A(unsigned int); 35static void mask_and_ack_8259A(unsigned int);
36static void mask_8259A(void);
37static void unmask_8259A(void);
38static void disable_8259A_irq(unsigned int irq);
39static void enable_8259A_irq(unsigned int irq);
40static void init_8259A(int auto_eoi);
41static int i8259A_irq_pending(unsigned int irq);
37 42
38struct irq_chip i8259A_chip = { 43struct irq_chip i8259A_chip = {
39 .name = "XT-PIC", 44 .name = "XT-PIC",
@@ -63,51 +68,51 @@ unsigned int cached_irq_mask = 0xffff;
63 */ 68 */
64unsigned long io_apic_irqs; 69unsigned long io_apic_irqs;
65 70
66void disable_8259A_irq(unsigned int irq) 71static void disable_8259A_irq(unsigned int irq)
67{ 72{
68 unsigned int mask = 1 << irq; 73 unsigned int mask = 1 << irq;
69 unsigned long flags; 74 unsigned long flags;
70 75
71 spin_lock_irqsave(&i8259A_lock, flags); 76 raw_spin_lock_irqsave(&i8259A_lock, flags);
72 cached_irq_mask |= mask; 77 cached_irq_mask |= mask;
73 if (irq & 8) 78 if (irq & 8)
74 outb(cached_slave_mask, PIC_SLAVE_IMR); 79 outb(cached_slave_mask, PIC_SLAVE_IMR);
75 else 80 else
76 outb(cached_master_mask, PIC_MASTER_IMR); 81 outb(cached_master_mask, PIC_MASTER_IMR);
77 spin_unlock_irqrestore(&i8259A_lock, flags); 82 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
78} 83}
79 84
80void enable_8259A_irq(unsigned int irq) 85static void enable_8259A_irq(unsigned int irq)
81{ 86{
82 unsigned int mask = ~(1 << irq); 87 unsigned int mask = ~(1 << irq);
83 unsigned long flags; 88 unsigned long flags;
84 89
85 spin_lock_irqsave(&i8259A_lock, flags); 90 raw_spin_lock_irqsave(&i8259A_lock, flags);
86 cached_irq_mask &= mask; 91 cached_irq_mask &= mask;
87 if (irq & 8) 92 if (irq & 8)
88 outb(cached_slave_mask, PIC_SLAVE_IMR); 93 outb(cached_slave_mask, PIC_SLAVE_IMR);
89 else 94 else
90 outb(cached_master_mask, PIC_MASTER_IMR); 95 outb(cached_master_mask, PIC_MASTER_IMR);
91 spin_unlock_irqrestore(&i8259A_lock, flags); 96 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
92} 97}
93 98
94int i8259A_irq_pending(unsigned int irq) 99static int i8259A_irq_pending(unsigned int irq)
95{ 100{
96 unsigned int mask = 1<<irq; 101 unsigned int mask = 1<<irq;
97 unsigned long flags; 102 unsigned long flags;
98 int ret; 103 int ret;
99 104
100 spin_lock_irqsave(&i8259A_lock, flags); 105 raw_spin_lock_irqsave(&i8259A_lock, flags);
101 if (irq < 8) 106 if (irq < 8)
102 ret = inb(PIC_MASTER_CMD) & mask; 107 ret = inb(PIC_MASTER_CMD) & mask;
103 else 108 else
104 ret = inb(PIC_SLAVE_CMD) & (mask >> 8); 109 ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
105 spin_unlock_irqrestore(&i8259A_lock, flags); 110 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
106 111
107 return ret; 112 return ret;
108} 113}
109 114
110void make_8259A_irq(unsigned int irq) 115static void make_8259A_irq(unsigned int irq)
111{ 116{
112 disable_irq_nosync(irq); 117 disable_irq_nosync(irq);
113 io_apic_irqs &= ~(1<<irq); 118 io_apic_irqs &= ~(1<<irq);
@@ -150,7 +155,7 @@ static void mask_and_ack_8259A(unsigned int irq)
150 unsigned int irqmask = 1 << irq; 155 unsigned int irqmask = 1 << irq;
151 unsigned long flags; 156 unsigned long flags;
152 157
153 spin_lock_irqsave(&i8259A_lock, flags); 158 raw_spin_lock_irqsave(&i8259A_lock, flags);
154 /* 159 /*
155 * Lightweight spurious IRQ detection. We do not want 160 * Lightweight spurious IRQ detection. We do not want
156 * to overdo spurious IRQ handling - it's usually a sign 161 * to overdo spurious IRQ handling - it's usually a sign
@@ -183,7 +188,7 @@ handle_real_irq:
183 outb(cached_master_mask, PIC_MASTER_IMR); 188 outb(cached_master_mask, PIC_MASTER_IMR);
184 outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */ 189 outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */
185 } 190 }
186 spin_unlock_irqrestore(&i8259A_lock, flags); 191 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
187 return; 192 return;
188 193
189spurious_8259A_irq: 194spurious_8259A_irq:
@@ -281,37 +286,37 @@ static int __init i8259A_init_sysfs(void)
281 286
282device_initcall(i8259A_init_sysfs); 287device_initcall(i8259A_init_sysfs);
283 288
284void mask_8259A(void) 289static void mask_8259A(void)
285{ 290{
286 unsigned long flags; 291 unsigned long flags;
287 292
288 spin_lock_irqsave(&i8259A_lock, flags); 293 raw_spin_lock_irqsave(&i8259A_lock, flags);
289 294
290 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ 295 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
291 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ 296 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
292 297
293 spin_unlock_irqrestore(&i8259A_lock, flags); 298 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
294} 299}
295 300
296void unmask_8259A(void) 301static void unmask_8259A(void)
297{ 302{
298 unsigned long flags; 303 unsigned long flags;
299 304
300 spin_lock_irqsave(&i8259A_lock, flags); 305 raw_spin_lock_irqsave(&i8259A_lock, flags);
301 306
302 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ 307 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
303 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ 308 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
304 309
305 spin_unlock_irqrestore(&i8259A_lock, flags); 310 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
306} 311}
307 312
308void init_8259A(int auto_eoi) 313static void init_8259A(int auto_eoi)
309{ 314{
310 unsigned long flags; 315 unsigned long flags;
311 316
312 i8259A_auto_eoi = auto_eoi; 317 i8259A_auto_eoi = auto_eoi;
313 318
314 spin_lock_irqsave(&i8259A_lock, flags); 319 raw_spin_lock_irqsave(&i8259A_lock, flags);
315 320
316 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ 321 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
317 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ 322 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
@@ -356,5 +361,49 @@ void init_8259A(int auto_eoi)
356 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ 361 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
357 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ 362 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
358 363
359 spin_unlock_irqrestore(&i8259A_lock, flags); 364 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
360} 365}
366
367/*
368 * Make the i8259 a driver so that we can select PIC functions at run time.
369 * The goal is to make a single x86 binary work on both PC-compatible and
370 * non-PC-compatible platforms, such as x86 MID.
371 */
372
373static void legacy_pic_noop(void) { };
374static void legacy_pic_uint_noop(unsigned int unused) { };
375static void legacy_pic_int_noop(int unused) { };
376
377static struct irq_chip dummy_pic_chip = {
378 .name = "dummy pic",
379 .mask = legacy_pic_uint_noop,
380 .unmask = legacy_pic_uint_noop,
381 .disable = legacy_pic_uint_noop,
382 .mask_ack = legacy_pic_uint_noop,
383};
384static int legacy_pic_irq_pending_noop(unsigned int irq)
385{
386 return 0;
387}
388
389struct legacy_pic null_legacy_pic = {
390 .nr_legacy_irqs = 0,
391 .chip = &dummy_pic_chip,
392 .mask_all = legacy_pic_noop,
393 .restore_mask = legacy_pic_noop,
394 .init = legacy_pic_int_noop,
395 .irq_pending = legacy_pic_irq_pending_noop,
396 .make_irq = legacy_pic_uint_noop,
397};
398
399struct legacy_pic default_legacy_pic = {
400 .nr_legacy_irqs = NR_IRQS_LEGACY,
401 .chip = &i8259A_chip,
402 .mask_all = mask_8259A,
403 .restore_mask = unmask_8259A,
404 .init = init_8259A,
405 .irq_pending = i8259A_irq_pending,
406 .make_irq = make_8259A_irq,
407};
408
409struct legacy_pic *legacy_pic = &default_legacy_pic;
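A sketch of how a platform would opt out of the PIC under this scheme (the
setup hook is hypothetical; both legacy_pic objects are defined above):

	/* hypothetical early platform hook for a PIC-less x86 MID board */
	static void __init example_mid_platform_setup(void)
	{
		legacy_pic = &null_legacy_pic;	/* every PIC op becomes a no-op */
	}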
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index d5932226614f..0ed2d300cd46 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -5,7 +5,6 @@
5#include <linux/ioport.h> 5#include <linux/ioport.h>
6#include <linux/interrupt.h> 6#include <linux/interrupt.h>
7#include <linux/timex.h> 7#include <linux/timex.h>
8#include <linux/slab.h>
9#include <linux/random.h> 8#include <linux/random.h>
10#include <linux/kprobes.h> 9#include <linux/kprobes.h>
11#include <linux/init.h> 10#include <linux/init.h>
@@ -84,24 +83,7 @@ static struct irqaction irq2 = {
84}; 83};
85 84
86DEFINE_PER_CPU(vector_irq_t, vector_irq) = { 85DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
87 [0 ... IRQ0_VECTOR - 1] = -1, 86 [0 ... NR_VECTORS - 1] = -1,
88 [IRQ0_VECTOR] = 0,
89 [IRQ1_VECTOR] = 1,
90 [IRQ2_VECTOR] = 2,
91 [IRQ3_VECTOR] = 3,
92 [IRQ4_VECTOR] = 4,
93 [IRQ5_VECTOR] = 5,
94 [IRQ6_VECTOR] = 6,
95 [IRQ7_VECTOR] = 7,
96 [IRQ8_VECTOR] = 8,
97 [IRQ9_VECTOR] = 9,
98 [IRQ10_VECTOR] = 10,
99 [IRQ11_VECTOR] = 11,
100 [IRQ12_VECTOR] = 12,
101 [IRQ13_VECTOR] = 13,
102 [IRQ14_VECTOR] = 14,
103 [IRQ15_VECTOR] = 15,
104 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
105}; 87};
106 88
107int vector_used_by_percpu_irq(unsigned int vector) 89int vector_used_by_percpu_irq(unsigned int vector)
@@ -123,12 +105,12 @@ void __init init_ISA_irqs(void)
123#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) 105#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
124 init_bsp_APIC(); 106 init_bsp_APIC();
125#endif 107#endif
126 init_8259A(0); 108 legacy_pic->init(0);
127 109
128 /* 110 /*
129 * 16 old-style INTA-cycle interrupts: 111 * 16 old-style INTA-cycle interrupts:
130 */ 112 */
131 for (i = 0; i < NR_IRQS_LEGACY; i++) { 113 for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) {
132 struct irq_desc *desc = irq_to_desc(i); 114 struct irq_desc *desc = irq_to_desc(i);
133 115
134 desc->status = IRQ_DISABLED; 116 desc->status = IRQ_DISABLED;
@@ -142,9 +124,44 @@ void __init init_ISA_irqs(void)
142 124
143void __init init_IRQ(void) 125void __init init_IRQ(void)
144{ 126{
127 int i;
128
129 /*
 130	 * On cpu 0, assign IRQ0_VECTOR..IRQ15_VECTOR to IRQ 0..15.
 131	 * If these IRQs are handled by legacy interrupt controllers like the
 132	 * PIC, then this configuration will likely be static after boot. If
 133	 * these IRQs are handled by more modern controllers like the IO-APIC,
 134	 * then this vector space can be freed and re-used dynamically as the
 135	 * IRQs migrate etc.
136 */
137 for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
138 per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i;
139
145 x86_init.irqs.intr_init(); 140 x86_init.irqs.intr_init();
146} 141}
147 142
143/*
144 * Setup the vector to irq mappings.
145 */
146void setup_vector_irq(int cpu)
147{
148#ifndef CONFIG_X86_IO_APIC
149 int irq;
150
151 /*
 152	 * On most platforms, the legacy PIC delivers interrupts to the boot
 153	 * cpu. But there are certain platforms where PIC interrupts are
 154	 * delivered to multiple cpus. If the legacy IRQ is handled by the
 155	 * legacy PIC, set up the static legacy vector to irq mapping for
 156	 * the new cpu that is coming online:
157 */
158 for (irq = 0; irq < legacy_pic->nr_legacy_irqs; irq++)
159 per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq;
160#endif
161
162 __setup_vector_irq(cpu);
163}
164
148static void __init smp_intr_init(void) 165static void __init smp_intr_init(void)
149{ 166{
150#ifdef CONFIG_SMP 167#ifdef CONFIG_SMP
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c
index cbc4332a77b2..0f7bc20cfcde 100644
--- a/arch/x86/kernel/k8.c
+++ b/arch/x86/kernel/k8.c
@@ -2,8 +2,8 @@
2 * Shared support code for AMD K8 northbridges and derivates. 2 * Shared support code for AMD K8 northbridges and derivates.
3 * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. 3 * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
4 */ 4 */
5#include <linux/gfp.h>
6#include <linux/types.h> 5#include <linux/types.h>
6#include <linux/slab.h>
7#include <linux/init.h> 7#include <linux/init.h>
8#include <linux/errno.h> 8#include <linux/errno.h>
9#include <linux/module.h> 9#include <linux/module.h>
@@ -121,3 +121,17 @@ void k8_flush_garts(void)
121} 121}
122EXPORT_SYMBOL_GPL(k8_flush_garts); 122EXPORT_SYMBOL_GPL(k8_flush_garts);
123 123
124static __init int init_k8_nbs(void)
125{
126 int err = 0;
127
128 err = cache_k8_northbridges();
129
130 if (err < 0)
131 printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n");
132
133 return err;
134}
135
136/* This has to go after the PCI subsystem */
137fs_initcall(init_k8_nbs);
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index e444357375ce..8afd9f321f10 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -9,6 +9,7 @@
9#include <linux/debugfs.h> 9#include <linux/debugfs.h>
10#include <linux/uaccess.h> 10#include <linux/uaccess.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/slab.h>
12#include <linux/init.h> 13#include <linux/init.h>
13#include <linux/stat.h> 14#include <linux/stat.h>
14#include <linux/io.h> 15#include <linux/io.h>
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index dd74fe7273b1..b2258ca91003 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -42,6 +42,7 @@
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/smp.h> 43#include <linux/smp.h>
44#include <linux/nmi.h> 44#include <linux/nmi.h>
45#include <linux/hw_breakpoint.h>
45 46
46#include <asm/debugreg.h> 47#include <asm/debugreg.h>
47#include <asm/apicdef.h> 48#include <asm/apicdef.h>
@@ -204,40 +205,81 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
204 205
205static struct hw_breakpoint { 206static struct hw_breakpoint {
206 unsigned enabled; 207 unsigned enabled;
207 unsigned type;
208 unsigned len;
209 unsigned long addr; 208 unsigned long addr;
209 int len;
210 int type;
211 struct perf_event **pev;
210} breakinfo[4]; 212} breakinfo[4];
211 213
212static void kgdb_correct_hw_break(void) 214static void kgdb_correct_hw_break(void)
213{ 215{
214 unsigned long dr7;
215 int correctit = 0;
216 int breakbit;
217 int breakno; 216 int breakno;
218 217
219 get_debugreg(dr7, 7);
220 for (breakno = 0; breakno < 4; breakno++) { 218 for (breakno = 0; breakno < 4; breakno++) {
221 breakbit = 2 << (breakno << 1); 219 struct perf_event *bp;
222 if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { 220 struct arch_hw_breakpoint *info;
223 correctit = 1; 221 int val;
224 dr7 |= breakbit; 222 int cpu = raw_smp_processor_id();
225 dr7 &= ~(0xf0000 << (breakno << 2)); 223 if (!breakinfo[breakno].enabled)
226 dr7 |= ((breakinfo[breakno].len << 2) | 224 continue;
227 breakinfo[breakno].type) << 225 bp = *per_cpu_ptr(breakinfo[breakno].pev, cpu);
228 ((breakno << 2) + 16); 226 info = counter_arch_bp(bp);
229 set_debugreg(breakinfo[breakno].addr, breakno); 227 if (bp->attr.disabled != 1)
230 228 continue;
231 } else { 229 bp->attr.bp_addr = breakinfo[breakno].addr;
232 if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { 230 bp->attr.bp_len = breakinfo[breakno].len;
233 correctit = 1; 231 bp->attr.bp_type = breakinfo[breakno].type;
234 dr7 &= ~breakbit; 232 info->address = breakinfo[breakno].addr;
235 dr7 &= ~(0xf0000 << (breakno << 2)); 233 info->len = breakinfo[breakno].len;
236 } 234 info->type = breakinfo[breakno].type;
237 } 235 val = arch_install_hw_breakpoint(bp);
236 if (!val)
237 bp->attr.disabled = 0;
238 }
239 hw_breakpoint_restore();
240}
241
242static int hw_break_reserve_slot(int breakno)
243{
244 int cpu;
245 int cnt = 0;
246 struct perf_event **pevent;
247
248 for_each_online_cpu(cpu) {
249 cnt++;
250 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
251 if (dbg_reserve_bp_slot(*pevent))
252 goto fail;
253 }
254
255 return 0;
256
257fail:
258 for_each_online_cpu(cpu) {
259 cnt--;
260 if (!cnt)
261 break;
262 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
263 dbg_release_bp_slot(*pevent);
238 } 264 }
239 if (correctit) 265 return -1;
240 set_debugreg(dr7, 7); 266}
267
268static int hw_break_release_slot(int breakno)
269{
270 struct perf_event **pevent;
271 int cpu;
272
273 for_each_online_cpu(cpu) {
274 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
275 if (dbg_release_bp_slot(*pevent))
276 /*
 277	 * The debugger is responsible for handling the retry on
278 * remove failure.
279 */
280 return -1;
281 }
282 return 0;
241} 283}
242 284
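hw_break_reserve_slot() above is the count-and-unwind idiom: on failure, release exactly the slots that were successfully reserved. The same shape in a generic, stand-alone form, with reserve()/release() standing in for dbg_reserve_bp_slot()/dbg_release_bp_slot():

static int reserve_all(int n, int (*reserve)(int), void (*release)(int))
{
	int i;

	for (i = 0; i < n; i++) {
		if (reserve(i)) {
			while (--i >= 0)	/* unwind only what we took */
				release(i);
			return -1;
		}
	}
	return 0;
}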
243static int 285static int
@@ -251,6 +293,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
251 if (i == 4) 293 if (i == 4)
252 return -1; 294 return -1;
253 295
296 if (hw_break_release_slot(i)) {
297 printk(KERN_ERR "Cannot remove hw breakpoint at %lx\n", addr);
298 return -1;
299 }
254 breakinfo[i].enabled = 0; 300 breakinfo[i].enabled = 0;
255 301
256 return 0; 302 return 0;
@@ -259,15 +305,23 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
259static void kgdb_remove_all_hw_break(void) 305static void kgdb_remove_all_hw_break(void)
260{ 306{
261 int i; 307 int i;
308 int cpu = raw_smp_processor_id();
309 struct perf_event *bp;
262 310
263 for (i = 0; i < 4; i++) 311 for (i = 0; i < 4; i++) {
264 memset(&breakinfo[i], 0, sizeof(struct hw_breakpoint)); 312 if (!breakinfo[i].enabled)
313 continue;
314 bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
315 if (bp->attr.disabled == 1)
316 continue;
317 arch_uninstall_hw_breakpoint(bp);
318 bp->attr.disabled = 1;
319 }
265} 320}
266 321
267static int 322static int
268kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) 323kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
269{ 324{
270 unsigned type;
271 int i; 325 int i;
272 326
273 for (i = 0; i < 4; i++) 327 for (i = 0; i < 4; i++)
@@ -278,27 +332,42 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
278 332
279 switch (bptype) { 333 switch (bptype) {
280 case BP_HARDWARE_BREAKPOINT: 334 case BP_HARDWARE_BREAKPOINT:
281 type = 0; 335 len = 1;
282 len = 1; 336 breakinfo[i].type = X86_BREAKPOINT_EXECUTE;
283 break; 337 break;
284 case BP_WRITE_WATCHPOINT: 338 case BP_WRITE_WATCHPOINT:
285 type = 1; 339 breakinfo[i].type = X86_BREAKPOINT_WRITE;
286 break; 340 break;
287 case BP_ACCESS_WATCHPOINT: 341 case BP_ACCESS_WATCHPOINT:
288 type = 3; 342 breakinfo[i].type = X86_BREAKPOINT_RW;
289 break; 343 break;
290 default: 344 default:
291 return -1; 345 return -1;
292 } 346 }
293 347 switch (len) {
294 if (len == 1 || len == 2 || len == 4) 348 case 1:
295 breakinfo[i].len = len - 1; 349 breakinfo[i].len = X86_BREAKPOINT_LEN_1;
296 else 350 break;
351 case 2:
352 breakinfo[i].len = X86_BREAKPOINT_LEN_2;
353 break;
354 case 4:
355 breakinfo[i].len = X86_BREAKPOINT_LEN_4;
356 break;
357#ifdef CONFIG_X86_64
358 case 8:
359 breakinfo[i].len = X86_BREAKPOINT_LEN_8;
360 break;
361#endif
362 default:
297 return -1; 363 return -1;
298 364 }
299 breakinfo[i].enabled = 1;
300 breakinfo[i].addr = addr; 365 breakinfo[i].addr = addr;
301 breakinfo[i].type = type; 366 if (hw_break_reserve_slot(i)) {
367 breakinfo[i].addr = 0;
368 return -1;
369 }
370 breakinfo[i].enabled = 1;
302 371
303 return 0; 372 return 0;
304} 373}
@@ -313,8 +382,21 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
313 */ 382 */
314void kgdb_disable_hw_debug(struct pt_regs *regs) 383void kgdb_disable_hw_debug(struct pt_regs *regs)
315{ 384{
385 int i;
386 int cpu = raw_smp_processor_id();
387 struct perf_event *bp;
388
316 /* Disable hardware debugging while we are in kgdb: */ 389 /* Disable hardware debugging while we are in kgdb: */
317 set_debugreg(0UL, 7); 390 set_debugreg(0UL, 7);
391 for (i = 0; i < 4; i++) {
392 if (!breakinfo[i].enabled)
393 continue;
394 bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
395 if (bp->attr.disabled == 1)
396 continue;
397 arch_uninstall_hw_breakpoint(bp);
398 bp->attr.disabled = 1;
399 }
318} 400}
319 401
320/** 402/**
@@ -378,7 +460,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
378 struct pt_regs *linux_regs) 460 struct pt_regs *linux_regs)
379{ 461{
380 unsigned long addr; 462 unsigned long addr;
381 unsigned long dr6;
382 char *ptr; 463 char *ptr;
383 int newPC; 464 int newPC;
384 465
@@ -404,20 +485,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
404 raw_smp_processor_id()); 485 raw_smp_processor_id());
405 } 486 }
406 487
407 get_debugreg(dr6, 6);
408 if (!(dr6 & 0x4000)) {
409 int breakno;
410
411 for (breakno = 0; breakno < 4; breakno++) {
412 if (dr6 & (1 << breakno) &&
413 breakinfo[breakno].type == 0) {
414 /* Set restore flag: */
415 linux_regs->flags |= X86_EFLAGS_RF;
416 break;
417 }
418 }
419 }
420 set_debugreg(0UL, 6);
421 kgdb_correct_hw_break(); 488 kgdb_correct_hw_break();
422 489
423 return 0; 490 return 0;
@@ -485,8 +552,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
485 break; 552 break;
486 553
487 case DIE_DEBUG: 554 case DIE_DEBUG:
488 if (atomic_read(&kgdb_cpu_doing_single_step) == 555 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
489 raw_smp_processor_id()) {
490 if (user_mode(regs)) 556 if (user_mode(regs))
491 return single_step_cont(regs, args); 557 return single_step_cont(regs, args);
492 break; 558 break;
@@ -539,7 +605,42 @@ static struct notifier_block kgdb_notifier = {
539 */ 605 */
540int kgdb_arch_init(void) 606int kgdb_arch_init(void)
541{ 607{
542 return register_die_notifier(&kgdb_notifier); 608 int i, cpu;
609 int ret;
610 struct perf_event_attr attr;
611 struct perf_event **pevent;
612
613 ret = register_die_notifier(&kgdb_notifier);
614 if (ret != 0)
615 return ret;
616 /*
 617	 * Pre-allocate the hw breakpoint structures in the non-atomic
 618	 * portion of kgdb because this operation requires mutexes to
619 * complete.
620 */
621 hw_breakpoint_init(&attr);
622 attr.bp_addr = (unsigned long)kgdb_arch_init;
623 attr.bp_len = HW_BREAKPOINT_LEN_1;
624 attr.bp_type = HW_BREAKPOINT_W;
625 attr.disabled = 1;
626 for (i = 0; i < 4; i++) {
627 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
628 if (IS_ERR(breakinfo[i].pev)) {
629 printk(KERN_ERR "kgdb: Could not allocate hw breakpoints\n");
630 breakinfo[i].pev = NULL;
631 kgdb_arch_exit();
632 return -1;
633 }
634 for_each_online_cpu(cpu) {
635 pevent = per_cpu_ptr(breakinfo[i].pev, cpu);
636 pevent[0]->hw.sample_period = 1;
637 if (pevent[0]->destroy != NULL) {
638 pevent[0]->destroy = NULL;
639 release_bp_slot(*pevent);
640 }
641 }
642 }
643 return ret;
543} 644}
544 645
545/** 646/**
@@ -550,6 +651,13 @@ int kgdb_arch_init(void)
550 */ 651 */
551void kgdb_arch_exit(void) 652void kgdb_arch_exit(void)
552{ 653{
654 int i;
655 for (i = 0; i < 4; i++) {
656 if (breakinfo[i].pev) {
657 unregister_wide_hw_breakpoint(breakinfo[i].pev);
658 breakinfo[i].pev = NULL;
659 }
660 }
553 unregister_die_notifier(&kgdb_notifier); 661 unregister_die_notifier(&kgdb_notifier);
554} 662}
555 663
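The kgdb_arch_init() change above is easier to read next to a minimal, self-contained use of the same 2.6.33-era hw-breakpoint API. Only calls that appear in the patch are used here; the function names and the watched address are stand-ins:

#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>
#include <linux/err.h>

static struct perf_event **sample_bp;	/* one event per cpu */

static int sample_bp_setup(unsigned long addr)
{
	struct perf_event_attr attr;

	hw_breakpoint_init(&attr);
	attr.bp_addr = addr;
	attr.bp_len = HW_BREAKPOINT_LEN_1;
	attr.bp_type = HW_BREAKPOINT_W;
	attr.disabled = 1;		/* allocate slots now, arm later */

	sample_bp = register_wide_hw_breakpoint(&attr, NULL);
	if (IS_ERR(sample_bp))
		return PTR_ERR(sample_bp);
	return 0;
}

static void sample_bp_teardown(void)
{
	if (sample_bp)
		unregister_wide_hw_breakpoint(sample_bp);
}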
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 5b8c7505b3bc..b43bbaebe2c0 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -49,6 +49,7 @@
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/kdebug.h> 50#include <linux/kdebug.h>
51#include <linux/kallsyms.h> 51#include <linux/kallsyms.h>
52#include <linux/ftrace.h>
52 53
53#include <asm/cacheflush.h> 54#include <asm/cacheflush.h>
54#include <asm/desc.h> 55#include <asm/desc.h>
@@ -106,16 +107,22 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {
106}; 107};
107const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); 108const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
108 109
109/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ 110static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
110static void __kprobes set_jmp_op(void *from, void *to)
111{ 111{
112 struct __arch_jmp_op { 112 struct __arch_relative_insn {
113 char op; 113 u8 op;
114 s32 raddr; 114 s32 raddr;
115 } __attribute__((packed)) * jop; 115 } __attribute__((packed)) *insn;
116 jop = (struct __arch_jmp_op *)from; 116
117 jop->raddr = (s32)((long)(to) - ((long)(from) + 5)); 117 insn = (struct __arch_relative_insn *)from;
118 jop->op = RELATIVEJUMP_INSTRUCTION; 118 insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
119 insn->op = op;
120}
121
122/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
123static void __kprobes synthesize_reljump(void *from, void *to)
124{
125 __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
119} 126}
120 127
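The displacement arithmetic in __synthesize_relative_insn() is the standard x86 rel32 encoding: the target is measured from the end of the 5-byte instruction, hence the '+ 5'. A quick stand-alone check with made-up addresses:

#include <assert.h>

int main(void)
{
	long from = 0x1000, to = 0x1234;
	/* raddr = to - (from + 5), exactly as computed above */
	int raddr = (int)(to - (from + 5));

	assert(raddr == 0x22f);	/* 0x1234 - 0x1005 */
	return 0;
}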
121/* 128/*
@@ -202,7 +209,7 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
202 /* 209 /*
203 * Basically, kp->ainsn.insn has an original instruction. 210 * Basically, kp->ainsn.insn has an original instruction.
204 * However, RIP-relative instruction can not do single-stepping 211 * However, RIP-relative instruction can not do single-stepping
205 * at different place, fix_riprel() tweaks the displacement of 212 * at different place, __copy_instruction() tweaks the displacement of
206 * that instruction. In that case, we can't recover the instruction 213 * that instruction. In that case, we can't recover the instruction
207 * from the kp->ainsn.insn. 214 * from the kp->ainsn.insn.
208 * 215 *
@@ -284,21 +291,37 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
284} 291}
285 292
286/* 293/*
287 * Adjust the displacement if the instruction uses the %rip-relative 294 * Copy an instruction and adjust the displacement if the instruction
288 * addressing mode. 295 * uses the %rip-relative addressing mode.
289 * If it does, Return the address of the 32-bit displacement word. 296 * If it does, Return the address of the 32-bit displacement word.
290 * If not, return null. 297 * If not, return null.
291 * Only applicable to 64-bit x86. 298 * Only applicable to 64-bit x86.
292 */ 299 */
293static void __kprobes fix_riprel(struct kprobe *p) 300static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
294{ 301{
295#ifdef CONFIG_X86_64
296 struct insn insn; 302 struct insn insn;
297 kernel_insn_init(&insn, p->ainsn.insn); 303 int ret;
304 kprobe_opcode_t buf[MAX_INSN_SIZE];
298 305
306 kernel_insn_init(&insn, src);
307 if (recover) {
308 insn_get_opcode(&insn);
309 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
310 ret = recover_probed_instruction(buf,
311 (unsigned long)src);
312 if (ret)
313 return 0;
314 kernel_insn_init(&insn, buf);
315 }
316 }
317 insn_get_length(&insn);
318 memcpy(dest, insn.kaddr, insn.length);
319
320#ifdef CONFIG_X86_64
299 if (insn_rip_relative(&insn)) { 321 if (insn_rip_relative(&insn)) {
300 s64 newdisp; 322 s64 newdisp;
301 u8 *disp; 323 u8 *disp;
324 kernel_insn_init(&insn, dest);
302 insn_get_displacement(&insn); 325 insn_get_displacement(&insn);
303 /* 326 /*
304 * The copied instruction uses the %rip-relative addressing 327 * The copied instruction uses the %rip-relative addressing
@@ -312,20 +335,23 @@ static void __kprobes fix_riprel(struct kprobe *p)
312 * extension of the original signed 32-bit displacement would 335 * extension of the original signed 32-bit displacement would
313 * have given. 336 * have given.
314 */ 337 */
315 newdisp = (u8 *) p->addr + (s64) insn.displacement.value - 338 newdisp = (u8 *) src + (s64) insn.displacement.value -
316 (u8 *) p->ainsn.insn; 339 (u8 *) dest;
317 BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ 340 BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */
318 disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn); 341 disp = (u8 *) dest + insn_offset_displacement(&insn);
319 *(s32 *) disp = (s32) newdisp; 342 *(s32 *) disp = (s32) newdisp;
320 } 343 }
321#endif 344#endif
345 return insn.length;
322} 346}
323 347
324static void __kprobes arch_copy_kprobe(struct kprobe *p) 348static void __kprobes arch_copy_kprobe(struct kprobe *p)
325{ 349{
326 memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); 350 /*
327 351 * Copy an instruction without recovering int3, because it will be
328 fix_riprel(p); 352 * put by another subsystem.
353 */
354 __copy_instruction(p->ainsn.insn, p->addr, 0);
329 355
330 if (can_boost(p->addr)) 356 if (can_boost(p->addr))
331 p->ainsn.boostable = 0; 357 p->ainsn.boostable = 0;
@@ -337,6 +363,9 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p)
337 363
338int __kprobes arch_prepare_kprobe(struct kprobe *p) 364int __kprobes arch_prepare_kprobe(struct kprobe *p)
339{ 365{
366 if (alternatives_text_reserved(p->addr, p->addr))
367 return -EINVAL;
368
340 if (!can_probe((unsigned long)p->addr)) 369 if (!can_probe((unsigned long)p->addr))
341 return -EILSEQ; 370 return -EILSEQ;
342 /* insn: must be on special executable page on x86. */ 371 /* insn: must be on special executable page on x86. */
@@ -403,18 +432,6 @@ static void __kprobes restore_btf(void)
403 update_debugctlmsr(current->thread.debugctlmsr); 432 update_debugctlmsr(current->thread.debugctlmsr);
404} 433}
405 434
406static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
407{
408 clear_btf();
409 regs->flags |= X86_EFLAGS_TF;
410 regs->flags &= ~X86_EFLAGS_IF;
411 /* single step inline if the instruction is an int3 */
412 if (p->opcode == BREAKPOINT_INSTRUCTION)
413 regs->ip = (unsigned long)p->addr;
414 else
415 regs->ip = (unsigned long)p->ainsn.insn;
416}
417
418void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, 435void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
419 struct pt_regs *regs) 436 struct pt_regs *regs)
420{ 437{
@@ -426,20 +443,50 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
426 *sara = (unsigned long) &kretprobe_trampoline; 443 *sara = (unsigned long) &kretprobe_trampoline;
427} 444}
428 445
446#ifdef CONFIG_OPTPROBES
447static int __kprobes setup_detour_execution(struct kprobe *p,
448 struct pt_regs *regs,
449 int reenter);
450#else
451#define setup_detour_execution(p, regs, reenter) (0)
452#endif
453
429static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, 454static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
430 struct kprobe_ctlblk *kcb) 455 struct kprobe_ctlblk *kcb, int reenter)
431{ 456{
432#if !defined(CONFIG_PREEMPT) || defined(CONFIG_FREEZER) 457 if (setup_detour_execution(p, regs, reenter))
458 return;
459
460#if !defined(CONFIG_PREEMPT)
433 if (p->ainsn.boostable == 1 && !p->post_handler) { 461 if (p->ainsn.boostable == 1 && !p->post_handler) {
434 /* Boost up -- we can execute copied instructions directly */ 462 /* Boost up -- we can execute copied instructions directly */
435 reset_current_kprobe(); 463 if (!reenter)
464 reset_current_kprobe();
465 /*
466 * Reentering boosted probe doesn't reset current_kprobe,
467 * nor set current_kprobe, because it doesn't use single
468 * stepping.
469 */
436 regs->ip = (unsigned long)p->ainsn.insn; 470 regs->ip = (unsigned long)p->ainsn.insn;
437 preempt_enable_no_resched(); 471 preempt_enable_no_resched();
438 return; 472 return;
439 } 473 }
440#endif 474#endif
441 prepare_singlestep(p, regs); 475 if (reenter) {
442 kcb->kprobe_status = KPROBE_HIT_SS; 476 save_previous_kprobe(kcb);
477 set_current_kprobe(p, regs, kcb);
478 kcb->kprobe_status = KPROBE_REENTER;
479 } else
480 kcb->kprobe_status = KPROBE_HIT_SS;
481 /* Prepare real single stepping */
482 clear_btf();
483 regs->flags |= X86_EFLAGS_TF;
484 regs->flags &= ~X86_EFLAGS_IF;
485 /* single step inline if the instruction is an int3 */
486 if (p->opcode == BREAKPOINT_INSTRUCTION)
487 regs->ip = (unsigned long)p->addr;
488 else
489 regs->ip = (unsigned long)p->ainsn.insn;
443} 490}
444 491
445/* 492/*
@@ -453,11 +500,8 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
453 switch (kcb->kprobe_status) { 500 switch (kcb->kprobe_status) {
454 case KPROBE_HIT_SSDONE: 501 case KPROBE_HIT_SSDONE:
455 case KPROBE_HIT_ACTIVE: 502 case KPROBE_HIT_ACTIVE:
456 save_previous_kprobe(kcb);
457 set_current_kprobe(p, regs, kcb);
458 kprobes_inc_nmissed_count(p); 503 kprobes_inc_nmissed_count(p);
459 prepare_singlestep(p, regs); 504 setup_singlestep(p, regs, kcb, 1);
460 kcb->kprobe_status = KPROBE_REENTER;
461 break; 505 break;
462 case KPROBE_HIT_SS: 506 case KPROBE_HIT_SS:
463 /* A probe has been hit in the codepath leading up to, or just 507 /* A probe has been hit in the codepath leading up to, or just
@@ -532,13 +576,13 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
532 * more here. 576 * more here.
533 */ 577 */
534 if (!p->pre_handler || !p->pre_handler(p, regs)) 578 if (!p->pre_handler || !p->pre_handler(p, regs))
535 setup_singlestep(p, regs, kcb); 579 setup_singlestep(p, regs, kcb, 0);
536 return 1; 580 return 1;
537 } 581 }
538 } else if (kprobe_running()) { 582 } else if (kprobe_running()) {
539 p = __get_cpu_var(current_kprobe); 583 p = __get_cpu_var(current_kprobe);
540 if (p->break_handler && p->break_handler(p, regs)) { 584 if (p->break_handler && p->break_handler(p, regs)) {
541 setup_singlestep(p, regs, kcb); 585 setup_singlestep(p, regs, kcb, 0);
542 return 1; 586 return 1;
543 } 587 }
544 } /* else: not a kprobe fault; let the kernel handle it */ 588 } /* else: not a kprobe fault; let the kernel handle it */
@@ -547,6 +591,69 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
547 return 0; 591 return 0;
548} 592}
549 593
594#ifdef CONFIG_X86_64
595#define SAVE_REGS_STRING \
596 /* Skip cs, ip, orig_ax. */ \
597 " subq $24, %rsp\n" \
598 " pushq %rdi\n" \
599 " pushq %rsi\n" \
600 " pushq %rdx\n" \
601 " pushq %rcx\n" \
602 " pushq %rax\n" \
603 " pushq %r8\n" \
604 " pushq %r9\n" \
605 " pushq %r10\n" \
606 " pushq %r11\n" \
607 " pushq %rbx\n" \
608 " pushq %rbp\n" \
609 " pushq %r12\n" \
610 " pushq %r13\n" \
611 " pushq %r14\n" \
612 " pushq %r15\n"
613#define RESTORE_REGS_STRING \
614 " popq %r15\n" \
615 " popq %r14\n" \
616 " popq %r13\n" \
617 " popq %r12\n" \
618 " popq %rbp\n" \
619 " popq %rbx\n" \
620 " popq %r11\n" \
621 " popq %r10\n" \
622 " popq %r9\n" \
623 " popq %r8\n" \
624 " popq %rax\n" \
625 " popq %rcx\n" \
626 " popq %rdx\n" \
627 " popq %rsi\n" \
628 " popq %rdi\n" \
629 /* Skip orig_ax, ip, cs */ \
630 " addq $24, %rsp\n"
631#else
632#define SAVE_REGS_STRING \
633 /* Skip cs, ip, orig_ax and gs. */ \
634 " subl $16, %esp\n" \
635 " pushl %fs\n" \
636 " pushl %ds\n" \
637 " pushl %es\n" \
638 " pushl %eax\n" \
639 " pushl %ebp\n" \
640 " pushl %edi\n" \
641 " pushl %esi\n" \
642 " pushl %edx\n" \
643 " pushl %ecx\n" \
644 " pushl %ebx\n"
645#define RESTORE_REGS_STRING \
646 " popl %ebx\n" \
647 " popl %ecx\n" \
648 " popl %edx\n" \
649 " popl %esi\n" \
650 " popl %edi\n" \
651 " popl %ebp\n" \
652 " popl %eax\n" \
653 /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\
654 " addl $24, %esp\n"
655#endif
656
550/* 657/*
551 * When a retprobed function returns, this code saves registers and 658 * When a retprobed function returns, this code saves registers and
552 * calls trampoline_handler() runs, which calls the kretprobe's handler. 659 * calls trampoline_handler() runs, which calls the kretprobe's handler.
@@ -560,65 +667,16 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
560 /* We don't bother saving the ss register */ 667 /* We don't bother saving the ss register */
561 " pushq %rsp\n" 668 " pushq %rsp\n"
562 " pushfq\n" 669 " pushfq\n"
563 /* 670 SAVE_REGS_STRING
564 * Skip cs, ip, orig_ax.
565 * trampoline_handler() will plug in these values
566 */
567 " subq $24, %rsp\n"
568 " pushq %rdi\n"
569 " pushq %rsi\n"
570 " pushq %rdx\n"
571 " pushq %rcx\n"
572 " pushq %rax\n"
573 " pushq %r8\n"
574 " pushq %r9\n"
575 " pushq %r10\n"
576 " pushq %r11\n"
577 " pushq %rbx\n"
578 " pushq %rbp\n"
579 " pushq %r12\n"
580 " pushq %r13\n"
581 " pushq %r14\n"
582 " pushq %r15\n"
583 " movq %rsp, %rdi\n" 671 " movq %rsp, %rdi\n"
584 " call trampoline_handler\n" 672 " call trampoline_handler\n"
585 /* Replace saved sp with true return address. */ 673 /* Replace saved sp with true return address. */
586 " movq %rax, 152(%rsp)\n" 674 " movq %rax, 152(%rsp)\n"
587 " popq %r15\n" 675 RESTORE_REGS_STRING
588 " popq %r14\n"
589 " popq %r13\n"
590 " popq %r12\n"
591 " popq %rbp\n"
592 " popq %rbx\n"
593 " popq %r11\n"
594 " popq %r10\n"
595 " popq %r9\n"
596 " popq %r8\n"
597 " popq %rax\n"
598 " popq %rcx\n"
599 " popq %rdx\n"
600 " popq %rsi\n"
601 " popq %rdi\n"
602 /* Skip orig_ax, ip, cs */
603 " addq $24, %rsp\n"
604 " popfq\n" 676 " popfq\n"
605#else 677#else
606 " pushf\n" 678 " pushf\n"
607 /* 679 SAVE_REGS_STRING
608 * Skip cs, ip, orig_ax and gs.
609 * trampoline_handler() will plug in these values
610 */
611 " subl $16, %esp\n"
612 " pushl %fs\n"
613 " pushl %es\n"
614 " pushl %ds\n"
615 " pushl %eax\n"
616 " pushl %ebp\n"
617 " pushl %edi\n"
618 " pushl %esi\n"
619 " pushl %edx\n"
620 " pushl %ecx\n"
621 " pushl %ebx\n"
622 " movl %esp, %eax\n" 680 " movl %esp, %eax\n"
623 " call trampoline_handler\n" 681 " call trampoline_handler\n"
624 /* Move flags to cs */ 682 /* Move flags to cs */
@@ -626,15 +684,7 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
626 " movl %edx, 52(%esp)\n" 684 " movl %edx, 52(%esp)\n"
627 /* Replace saved flags with true return address. */ 685 /* Replace saved flags with true return address. */
628 " movl %eax, 56(%esp)\n" 686 " movl %eax, 56(%esp)\n"
629 " popl %ebx\n" 687 RESTORE_REGS_STRING
630 " popl %ecx\n"
631 " popl %edx\n"
632 " popl %esi\n"
633 " popl %edi\n"
634 " popl %ebp\n"
635 " popl %eax\n"
636 /* Skip ds, es, fs, gs, orig_ax and ip */
637 " addl $24, %esp\n"
638 " popf\n" 688 " popf\n"
639#endif 689#endif
640 " ret\n"); 690 " ret\n");
@@ -802,8 +852,8 @@ static void __kprobes resume_execution(struct kprobe *p,
802 * These instructions can be executed directly if it 852 * These instructions can be executed directly if it
803 * jumps back to correct address. 853 * jumps back to correct address.
804 */ 854 */
805 set_jmp_op((void *)regs->ip, 855 synthesize_reljump((void *)regs->ip,
806 (void *)orig_ip + (regs->ip - copy_ip)); 856 (void *)orig_ip + (regs->ip - copy_ip));
807 p->ainsn.boostable = 1; 857 p->ainsn.boostable = 1;
808 } else { 858 } else {
809 p->ainsn.boostable = -1; 859 p->ainsn.boostable = -1;
@@ -1030,6 +1080,358 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
1030 return 0; 1080 return 0;
1031} 1081}
1032 1082
1083
1084#ifdef CONFIG_OPTPROBES
1085
1086/* Insert a call instruction at address 'from', which calls address 'to'.*/
1087static void __kprobes synthesize_relcall(void *from, void *to)
1088{
1089 __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
1090}
1091
1092/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
1093static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
1094 unsigned long val)
1095{
1096#ifdef CONFIG_X86_64
1097 *addr++ = 0x48;
1098 *addr++ = 0xbf;
1099#else
1100 *addr++ = 0xb8;
1101#endif
1102 *(unsigned long *)addr = val;
1103}
1104
1105void __kprobes kprobes_optinsn_template_holder(void)
1106{
1107 asm volatile (
1108 ".global optprobe_template_entry\n"
1109 "optprobe_template_entry: \n"
1110#ifdef CONFIG_X86_64
1111 /* We don't bother saving the ss register */
1112 " pushq %rsp\n"
1113 " pushfq\n"
1114 SAVE_REGS_STRING
1115 " movq %rsp, %rsi\n"
1116 ".global optprobe_template_val\n"
1117 "optprobe_template_val: \n"
1118 ASM_NOP5
1119 ASM_NOP5
1120 ".global optprobe_template_call\n"
1121 "optprobe_template_call: \n"
1122 ASM_NOP5
1123 /* Move flags to rsp */
1124 " movq 144(%rsp), %rdx\n"
1125 " movq %rdx, 152(%rsp)\n"
1126 RESTORE_REGS_STRING
1127 /* Skip flags entry */
1128 " addq $8, %rsp\n"
1129 " popfq\n"
1130#else /* CONFIG_X86_32 */
1131 " pushf\n"
1132 SAVE_REGS_STRING
1133 " movl %esp, %edx\n"
1134 ".global optprobe_template_val\n"
1135 "optprobe_template_val: \n"
1136 ASM_NOP5
1137 ".global optprobe_template_call\n"
1138 "optprobe_template_call: \n"
1139 ASM_NOP5
1140 RESTORE_REGS_STRING
1141 " addl $4, %esp\n" /* skip cs */
1142 " popf\n"
1143#endif
1144 ".global optprobe_template_end\n"
1145 "optprobe_template_end: \n");
1146}
1147
1148#define TMPL_MOVE_IDX \
1149 ((long)&optprobe_template_val - (long)&optprobe_template_entry)
1150#define TMPL_CALL_IDX \
1151 ((long)&optprobe_template_call - (long)&optprobe_template_entry)
1152#define TMPL_END_IDX \
1153 ((long)&optprobe_template_end - (long)&optprobe_template_entry)
1154
1155#define INT3_SIZE sizeof(kprobe_opcode_t)
1156
1157/* Optimized kprobe call back function: called from optinsn */
1158static void __kprobes optimized_callback(struct optimized_kprobe *op,
1159 struct pt_regs *regs)
1160{
1161 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1162
1163 preempt_disable();
1164 if (kprobe_running()) {
1165 kprobes_inc_nmissed_count(&op->kp);
1166 } else {
1167 /* Save skipped registers */
1168#ifdef CONFIG_X86_64
1169 regs->cs = __KERNEL_CS;
1170#else
1171 regs->cs = __KERNEL_CS | get_kernel_rpl();
1172 regs->gs = 0;
1173#endif
1174 regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
1175 regs->orig_ax = ~0UL;
1176
1177 __get_cpu_var(current_kprobe) = &op->kp;
1178 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
1179 opt_pre_handler(&op->kp, regs);
1180 __get_cpu_var(current_kprobe) = NULL;
1181 }
1182 preempt_enable_no_resched();
1183}
1184
1185static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
1186{
1187 int len = 0, ret;
1188
1189 while (len < RELATIVEJUMP_SIZE) {
1190 ret = __copy_instruction(dest + len, src + len, 1);
1191 if (!ret || !can_boost(dest + len))
1192 return -EINVAL;
1193 len += ret;
1194 }
1195 /* Check whether the address range is reserved */
1196 if (ftrace_text_reserved(src, src + len - 1) ||
1197 alternatives_text_reserved(src, src + len - 1))
1198 return -EBUSY;
1199
1200 return len;
1201}
1202
1203/* Check whether insn is indirect jump */
1204static int __kprobes insn_is_indirect_jump(struct insn *insn)
1205{
1206 return ((insn->opcode.bytes[0] == 0xff &&
1207 (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
1208 insn->opcode.bytes[0] == 0xea); /* Segment based jump */
1209}
1210
1211/* Check whether insn jumps into specified address range */
1212static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
1213{
1214 unsigned long target = 0;
1215
1216 switch (insn->opcode.bytes[0]) {
1217 case 0xe0: /* loopne */
1218 case 0xe1: /* loope */
1219 case 0xe2: /* loop */
1220 case 0xe3: /* jcxz */
1221 case 0xe9: /* near relative jump */
1222 case 0xeb: /* short relative jump */
1223 break;
1224 case 0x0f:
1225 if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
1226 break;
1227 return 0;
1228 default:
1229 if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
1230 break;
1231 return 0;
1232 }
1233 target = (unsigned long)insn->next_byte + insn->immediate.value;
1234
1235 return (start <= target && target <= start + len);
1236}
1237
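insn_jump_into_range() resolves a relative branch the same way the CPU does: destination = address of the next instruction + sign-extended immediate. A tiny stand-alone check with invented numbers:

#include <assert.h>

int main(void)
{
	/* a 2-byte short jump at 0x100 whose imm8 is 4 */
	unsigned long next_byte = 0x102;
	long imm = 4;

	assert(next_byte + imm == 0x106);	/* lands 4 bytes past the jump */
	return 0;
}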
1238/* Decode the whole function to ensure no instruction jumps into the target */
1239static int __kprobes can_optimize(unsigned long paddr)
1240{
1241 int ret;
1242 unsigned long addr, size = 0, offset = 0;
1243 struct insn insn;
1244 kprobe_opcode_t buf[MAX_INSN_SIZE];
1245 /* Dummy buffers for lookup_symbol_attrs */
1246 static char __dummy_buf[KSYM_NAME_LEN];
1247
1248 /* Lookup symbol including addr */
1249 if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf))
1250 return 0;
1251
1252 /* Check there is enough space for a relative jump. */
1253 if (size - offset < RELATIVEJUMP_SIZE)
1254 return 0;
1255
1256 /* Decode instructions */
1257 addr = paddr - offset;
1258 while (addr < paddr - offset + size) { /* Decode until function end */
1259 if (search_exception_tables(addr))
1260 /*
 1261			 * Since some fixup code will jump into this function,
 1262			 * we can't optimize a kprobe in this function.
1263 */
1264 return 0;
1265 kernel_insn_init(&insn, (void *)addr);
1266 insn_get_opcode(&insn);
1267 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
1268 ret = recover_probed_instruction(buf, addr);
1269 if (ret)
1270 return 0;
1271 kernel_insn_init(&insn, buf);
1272 }
1273 insn_get_length(&insn);
1274 /* Recover address */
1275 insn.kaddr = (void *)addr;
1276 insn.next_byte = (void *)(addr + insn.length);
 1277		/* Check that this instruction doesn't jump into the target */
1278 if (insn_is_indirect_jump(&insn) ||
1279 insn_jump_into_range(&insn, paddr + INT3_SIZE,
1280 RELATIVE_ADDR_SIZE))
1281 return 0;
1282 addr += insn.length;
1283 }
1284
1285 return 1;
1286}
1287
1288/* Check optimized_kprobe can actually be optimized. */
1289int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
1290{
1291 int i;
1292 struct kprobe *p;
1293
1294 for (i = 1; i < op->optinsn.size; i++) {
1295 p = get_kprobe(op->kp.addr + i);
1296 if (p && !kprobe_disabled(p))
1297 return -EEXIST;
1298 }
1299
1300 return 0;
1301}
1302
1303/* Check the addr is within the optimized instructions. */
1304int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op,
1305 unsigned long addr)
1306{
1307 return ((unsigned long)op->kp.addr <= addr &&
1308 (unsigned long)op->kp.addr + op->optinsn.size > addr);
1309}
1310
1311/* Free optimized instruction slot */
1312static __kprobes
1313void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
1314{
1315 if (op->optinsn.insn) {
1316 free_optinsn_slot(op->optinsn.insn, dirty);
1317 op->optinsn.insn = NULL;
1318 op->optinsn.size = 0;
1319 }
1320}
1321
1322void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
1323{
1324 __arch_remove_optimized_kprobe(op, 1);
1325}
1326
1327/*
 1328 * Copy the instructions that will be replaced by the jump.
1329 * Target instructions MUST be relocatable (checked inside)
1330 */
1331int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
1332{
1333 u8 *buf;
1334 int ret;
1335 long rel;
1336
1337 if (!can_optimize((unsigned long)op->kp.addr))
1338 return -EILSEQ;
1339
1340 op->optinsn.insn = get_optinsn_slot();
1341 if (!op->optinsn.insn)
1342 return -ENOMEM;
1343
1344 /*
 1345	 * Verify that the address gap is within the 2GB range that
 1346	 * a relative jump can cover.
1347 */
1348 rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
1349 if (abs(rel) > 0x7fffffff)
1350 return -ERANGE;
1351
1352 buf = (u8 *)op->optinsn.insn;
1353
1354 /* Copy instructions into the out-of-line buffer */
1355 ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
1356 if (ret < 0) {
1357 __arch_remove_optimized_kprobe(op, 0);
1358 return ret;
1359 }
1360 op->optinsn.size = ret;
1361
1362 /* Copy arch-dep-instance from template */
1363 memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
1364
1365 /* Set probe information */
1366 synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
1367
1368 /* Set probe function call */
1369 synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
1370
1371 /* Set returning jmp instruction at the tail of out-of-line buffer */
1372 synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
1373 (u8 *)op->kp.addr + op->optinsn.size);
1374
1375 flush_icache_range((unsigned long) buf,
1376 (unsigned long) buf + TMPL_END_IDX +
1377 op->optinsn.size + RELATIVEJUMP_SIZE);
1378 return 0;
1379}
1380
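The out-of-line buffer assembled above has three regions: the register-saving template, the relocated original instructions, and a jump back to the instruction following the patched area. A stand-alone sketch of the offset map, with invented sizes (TMPL_END_IDX and optinsn.size are only known at run time):

#include <stdio.h>

int main(void)
{
	int tmpl_end = 96;	/* assumed template size */
	int copied = 7;		/* assumed bytes of relocated instructions */

	printf("template:       [0, %d)\n", tmpl_end);
	printf("copied insns:   [%d, %d)\n", tmpl_end, tmpl_end + copied);
	printf("return reljump: at offset %d\n", tmpl_end + copied);
	return 0;
}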
1381/* Replace a breakpoint (int3) with a relative jump. */
1382int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op)
1383{
1384 unsigned char jmp_code[RELATIVEJUMP_SIZE];
1385 s32 rel = (s32)((long)op->optinsn.insn -
1386 ((long)op->kp.addr + RELATIVEJUMP_SIZE));
1387
1388 /* Backup instructions which will be replaced by jump address */
1389 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
1390 RELATIVE_ADDR_SIZE);
1391
1392 jmp_code[0] = RELATIVEJUMP_OPCODE;
1393 *(s32 *)(&jmp_code[1]) = rel;
1394
1395 /*
 1396	 * text_poke_smp doesn't support modifying code in NMI/MCE handlers.
1397 * However, since kprobes itself also doesn't support NMI/MCE
1398 * code probing, it's not a problem.
1399 */
1400 text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE);
1401 return 0;
1402}
1403
1404/* Replace a relative jump with a breakpoint (int3). */
1405void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
1406{
1407 u8 buf[RELATIVEJUMP_SIZE];
1408
1409 /* Set int3 to first byte for kprobes */
1410 buf[0] = BREAKPOINT_INSTRUCTION;
1411 memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
1412 text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE);
1413}
1414
1415static int __kprobes setup_detour_execution(struct kprobe *p,
1416 struct pt_regs *regs,
1417 int reenter)
1418{
1419 struct optimized_kprobe *op;
1420
1421 if (p->flags & KPROBE_FLAG_OPTIMIZED) {
1422 /* This kprobe is really able to run optimized path. */
1423 op = container_of(p, struct optimized_kprobe, kp);
1424 /* Detour through copied instructions */
1425 regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
1426 if (!reenter)
1427 reset_current_kprobe();
1428 preempt_enable_no_resched();
1429 return 1;
1430 }
1431 return 0;
1432}
1433#endif
1434
1033int __init arch_init_kprobes(void) 1435int __init arch_init_kprobes(void)
1034{ 1436{
1035 return 0; 1437 return 0;
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index ec6ef60cbd17..ea697263b373 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/errno.h> 9#include <linux/errno.h>
10#include <linux/gfp.h>
10#include <linux/sched.h> 11#include <linux/sched.h>
11#include <linux/string.h> 12#include <linux/string.h>
12#include <linux/mm.h> 13#include <linux/mm.h>
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 4a8bb82248ae..035c8c529181 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -9,6 +9,7 @@
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/kexec.h> 10#include <linux/kexec.h>
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/gfp.h>
12#include <linux/reboot.h> 13#include <linux/reboot.h>
13#include <linux/numa.h> 14#include <linux/numa.h>
14#include <linux/ftrace.h> 15#include <linux/ftrace.h>
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
index 845d80ce1ef1..63eaf6596233 100644
--- a/arch/x86/kernel/mca_32.c
+++ b/arch/x86/kernel/mca_32.c
@@ -42,6 +42,7 @@
42#include <linux/kernel.h> 42#include <linux/kernel.h>
43#include <linux/mca.h> 43#include <linux/mca.h>
44#include <linux/kprobes.h> 44#include <linux/kprobes.h>
45#include <linux/slab.h>
45#include <asm/system.h> 46#include <asm/system.h>
46#include <asm/io.h> 47#include <asm/io.h>
47#include <linux/proc_fs.h> 48#include <linux/proc_fs.h>
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 37542b67c57e..e1af7c055c7d 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -36,9 +36,6 @@ MODULE_LICENSE("GPL v2");
36#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 36#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
37#define UCODE_UCODE_TYPE 0x00000001 37#define UCODE_UCODE_TYPE 0x00000001
38 38
39const struct firmware *firmware;
40static int supported_cpu;
41
42struct equiv_cpu_entry { 39struct equiv_cpu_entry {
43 u32 installed_cpu; 40 u32 installed_cpu;
44 u32 fixed_errata_mask; 41 u32 fixed_errata_mask;
@@ -77,12 +74,15 @@ static struct equiv_cpu_entry *equiv_cpu_table;
77 74
78static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) 75static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
79{ 76{
77 struct cpuinfo_x86 *c = &cpu_data(cpu);
80 u32 dummy; 78 u32 dummy;
81 79
82 if (!supported_cpu)
83 return -1;
84
85 memset(csig, 0, sizeof(*csig)); 80 memset(csig, 0, sizeof(*csig));
81 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
82 pr_warning("microcode: CPU%d: AMD CPU family 0x%x not "
83 "supported\n", cpu, c->x86);
84 return -1;
85 }
86 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); 86 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
87 pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev); 87 pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev);
88 return 0; 88 return 0;
@@ -294,10 +294,14 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
294 294
295static enum ucode_state request_microcode_fw(int cpu, struct device *device) 295static enum ucode_state request_microcode_fw(int cpu, struct device *device)
296{ 296{
297 const char *fw_name = "amd-ucode/microcode_amd.bin";
298 const struct firmware *firmware;
297 enum ucode_state ret; 299 enum ucode_state ret;
298 300
299 if (firmware == NULL) 301 if (request_firmware(&firmware, fw_name, device)) {
302 printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
300 return UCODE_NFOUND; 303 return UCODE_NFOUND;
304 }
301 305
302 if (*(u32 *)firmware->data != UCODE_MAGIC) { 306 if (*(u32 *)firmware->data != UCODE_MAGIC) {
303 pr_err("invalid UCODE_MAGIC (0x%08x)\n", 307 pr_err("invalid UCODE_MAGIC (0x%08x)\n",
@@ -307,6 +311,8 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device)
307 311
308 ret = generic_load_microcode(cpu, firmware->data, firmware->size); 312 ret = generic_load_microcode(cpu, firmware->data, firmware->size);
309 313
314 release_firmware(firmware);
315
310 return ret; 316 return ret;
311} 317}
312 318
@@ -325,31 +331,7 @@ static void microcode_fini_cpu_amd(int cpu)
325 uci->mc = NULL; 331 uci->mc = NULL;
326} 332}
327 333
328void init_microcode_amd(struct device *device)
329{
330 const char *fw_name = "amd-ucode/microcode_amd.bin";
331 struct cpuinfo_x86 *c = &boot_cpu_data;
332
333 WARN_ON(c->x86_vendor != X86_VENDOR_AMD);
334
335 if (c->x86 < 0x10) {
336 pr_warning("AMD CPU family 0x%x not supported\n", c->x86);
337 return;
338 }
339 supported_cpu = 1;
340
341 if (request_firmware(&firmware, fw_name, device))
342 pr_err("failed to load file %s\n", fw_name);
343}
344
345void fini_microcode_amd(void)
346{
347 release_firmware(firmware);
348}
349
350static struct microcode_ops microcode_amd_ops = { 334static struct microcode_ops microcode_amd_ops = {
351 .init = init_microcode_amd,
352 .fini = fini_microcode_amd,
353 .request_microcode_user = request_microcode_user, 335 .request_microcode_user = request_microcode_user,
354 .request_microcode_fw = request_microcode_fw, 336 .request_microcode_fw = request_microcode_fw,
355 .collect_cpu_info = collect_cpu_info_amd, 337 .collect_cpu_info = collect_cpu_info_amd,
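request_microcode_fw() now follows the usual firmware-loading contract: every successful request_firmware() is balanced by release_firmware() once the blob has been consumed. A minimal sketch of that pattern; the file name is illustrative:

#include <linux/firmware.h>
#include <linux/device.h>
#include <linux/kernel.h>

static int load_blob(struct device *dev)
{
	const struct firmware *fw;
	int ret;

	ret = request_firmware(&fw, "vendor/blob.bin", dev);
	if (ret)
		return ret;		/* file missing or loader failure */

	pr_info("blob is %zu bytes\n", fw->size);
	/* ... validate and consume fw->data here ... */

	release_firmware(fw);		/* always balance the request */
	return 0;
}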
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 0c8632433090..cceb5bc3c3c2 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -521,9 +521,6 @@ static int __init microcode_init(void)
521 return PTR_ERR(microcode_pdev); 521 return PTR_ERR(microcode_pdev);
522 } 522 }
523 523
524 if (microcode_ops->init)
525 microcode_ops->init(&microcode_pdev->dev);
526
527 get_online_cpus(); 524 get_online_cpus();
528 mutex_lock(&microcode_mutex); 525 mutex_lock(&microcode_mutex);
529 526
@@ -566,9 +563,6 @@ static void __exit microcode_exit(void)
566 563
567 platform_device_unregister(microcode_pdev); 564 platform_device_unregister(microcode_pdev);
568 565
569 if (microcode_ops->fini)
570 microcode_ops->fini();
571
572 microcode_ops = NULL; 566 microcode_ops = NULL;
573 567
574 pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); 568 pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index ebd193e476ca..85a343e28937 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -328,7 +328,7 @@ static int apply_microcode(int cpu)
328 cpu_num, mc_intel->hdr.rev); 328 cpu_num, mc_intel->hdr.rev);
329 return -1; 329 return -1;
330 } 330 }
331 pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x \n", 331 pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n",
332 cpu_num, val[1], 332 cpu_num, val[1],
333 mc_intel->hdr.date & 0xffff, 333 mc_intel->hdr.date & 0xffff,
334 mc_intel->hdr.date >> 24, 334 mc_intel->hdr.date >> 24,
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index 712d15fdc416..71825806cd44 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -7,6 +7,8 @@
7#include <linux/string.h> 7#include <linux/string.h>
8#include <linux/pci.h> 8#include <linux/pci.h>
9#include <linux/dmi.h> 9#include <linux/dmi.h>
10#include <linux/range.h>
11
10#include <asm/pci-direct.h> 12#include <asm/pci-direct.h>
11#include <linux/sort.h> 13#include <linux/sort.h>
12#include <asm/io.h> 14#include <asm/io.h>
@@ -30,11 +32,6 @@ static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = {
30 { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 }, 32 { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 },
31}; 33};
32 34
33struct range {
34 u64 start;
35 u64 end;
36};
37
38static int __cpuinit cmp_range(const void *x1, const void *x2) 35static int __cpuinit cmp_range(const void *x1, const void *x2)
39{ 36{
40 const struct range *r1 = x1; 37 const struct range *r1 = x1;
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 89f386f044e4..e0bc186d7501 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -23,6 +23,7 @@
23#include <linux/kernel.h> 23#include <linux/kernel.h>
24#include <linux/bug.h> 24#include <linux/bug.h>
25#include <linux/mm.h> 25#include <linux/mm.h>
26#include <linux/gfp.h>
26 27
27#include <asm/system.h> 28#include <asm/system.h>
28#include <asm/page.h> 29#include <asm/page.h>
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 40b54ceb68b5..e81030f71a8f 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -359,13 +359,6 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
359 x86_init.mpparse.mpc_record(1); 359 x86_init.mpparse.mpc_record(1);
360 } 360 }
361 361
362#ifdef CONFIG_X86_BIGSMP
363 generic_bigsmp_probe();
364#endif
365
366 if (apic->setup_apic_routing)
367 apic->setup_apic_routing();
368
369 if (!num_processors) 362 if (!num_processors)
370 printk(KERN_ERR "MPTABLE: no processors registered!\n"); 363 printk(KERN_ERR "MPTABLE: no processors registered!\n");
371 return num_processors; 364 return num_processors;
@@ -671,7 +664,7 @@ static void __init smp_reserve_memory(struct mpf_intel *mpf)
671{ 664{
672 unsigned long size = get_mpc_size(mpf->physptr); 665 unsigned long size = get_mpc_size(mpf->physptr);
673 666
674 reserve_early(mpf->physptr, mpf->physptr+size, "MP-table mpc"); 667 reserve_early_overlap_ok(mpf->physptr, mpf->physptr+size, "MP-table mpc");
675} 668}
676 669
677static int __init smp_scan_config(unsigned long base, unsigned long length) 670static int __init smp_scan_config(unsigned long base, unsigned long length)
@@ -700,7 +693,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
700 mpf, (u64)virt_to_phys(mpf)); 693 mpf, (u64)virt_to_phys(mpf));
701 694
702 mem = virt_to_phys(mpf); 695 mem = virt_to_phys(mpf);
703 reserve_early(mem, mem + sizeof(*mpf), "MP-table mpf"); 696 reserve_early_overlap_ok(mem, mem + sizeof(*mpf), "MP-table mpf");
704 if (mpf->physptr) 697 if (mpf->physptr)
705 smp_reserve_memory(mpf); 698 smp_reserve_memory(mpf);
706 699
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
index 3b7078abc871..0aad8670858e 100644
--- a/arch/x86/kernel/mrst.c
+++ b/arch/x86/kernel/mrst.c
@@ -10,8 +10,211 @@
10 * of the License. 10 * of the License.
11 */ 11 */
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/kernel.h>
14#include <linux/sfi.h>
15#include <linux/irq.h>
16#include <linux/module.h>
13 17
14#include <asm/setup.h> 18#include <asm/setup.h>
19#include <asm/mpspec_def.h>
20#include <asm/hw_irq.h>
21#include <asm/apic.h>
22#include <asm/io_apic.h>
23#include <asm/mrst.h>
24#include <asm/io.h>
25#include <asm/i8259.h>
26#include <asm/apb_timer.h>
27
28static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM];
29static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM];
30int sfi_mtimer_num;
31
32struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX];
33EXPORT_SYMBOL_GPL(sfi_mrtc_array);
34int sfi_mrtc_num;
35
36static inline void assign_to_mp_irq(struct mpc_intsrc *m,
37 struct mpc_intsrc *mp_irq)
38{
39 memcpy(mp_irq, m, sizeof(struct mpc_intsrc));
40}
41
42static inline int mp_irq_cmp(struct mpc_intsrc *mp_irq,
43 struct mpc_intsrc *m)
44{
45 return memcmp(mp_irq, m, sizeof(struct mpc_intsrc));
46}
47
48static void save_mp_irq(struct mpc_intsrc *m)
49{
50 int i;
51
52 for (i = 0; i < mp_irq_entries; i++) {
53 if (!mp_irq_cmp(&mp_irqs[i], m))
54 return;
55 }
56
57 assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
58 if (++mp_irq_entries == MAX_IRQ_SOURCES)
59 panic("Max # of irq sources exceeded!!\n");
60}
61
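save_mp_irq() is a small dedupe-then-append: scan for a byte-identical entry and append only when none exists (the kernel version panics instead of failing when the table is full). The same shape in generic, stand-alone form:

#include <string.h>

/* Returns 1 if appended, 0 if a duplicate existed, -1 if the table is full. */
static int table_add_unique(char *base, int *nr, int max,
			    const void *ent, size_t sz)
{
	int i;

	for (i = 0; i < *nr; i++)
		if (!memcmp(base + (size_t)i * sz, ent, sz))
			return 0;
	if (*nr >= max)
		return -1;
	memcpy(base + (size_t)*nr * sz, ent, sz);
	(*nr)++;
	return 1;
}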
62/* parse all the mtimer info to a static mtimer array */
63static int __init sfi_parse_mtmr(struct sfi_table_header *table)
64{
65 struct sfi_table_simple *sb;
66 struct sfi_timer_table_entry *pentry;
67 struct mpc_intsrc mp_irq;
68 int totallen;
69
70 sb = (struct sfi_table_simple *)table;
71 if (!sfi_mtimer_num) {
72 sfi_mtimer_num = SFI_GET_NUM_ENTRIES(sb,
73 struct sfi_timer_table_entry);
74 pentry = (struct sfi_timer_table_entry *) sb->pentry;
75 totallen = sfi_mtimer_num * sizeof(*pentry);
76 memcpy(sfi_mtimer_array, pentry, totallen);
77 }
78
79 printk(KERN_INFO "SFI: MTIMER info (num = %d):\n", sfi_mtimer_num);
80 pentry = sfi_mtimer_array;
81 for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) {
82 printk(KERN_INFO "timer[%d]: paddr = 0x%08x, freq = %dHz,"
83 " irq = %d\n", totallen, (u32)pentry->phys_addr,
84 pentry->freq_hz, pentry->irq);
85 if (!pentry->irq)
86 continue;
87 mp_irq.type = MP_IOAPIC;
88 mp_irq.irqtype = mp_INT;
89/* triggering mode edge bit 2-3, active high polarity bit 0-1 */
90 mp_irq.irqflag = 5;
91 mp_irq.srcbus = 0;
92 mp_irq.srcbusirq = pentry->irq; /* IRQ */
93 mp_irq.dstapic = MP_APIC_ALL;
94 mp_irq.dstirq = pentry->irq;
95 save_mp_irq(&mp_irq);
96 }
97
98 return 0;
99}
100
101struct sfi_timer_table_entry *sfi_get_mtmr(int hint)
102{
103 int i;
104 if (hint < sfi_mtimer_num) {
105 if (!sfi_mtimer_usage[hint]) {
106 pr_debug("hint taken for timer %d irq %d\n",\
107 hint, sfi_mtimer_array[hint].irq);
108 sfi_mtimer_usage[hint] = 1;
109 return &sfi_mtimer_array[hint];
110 }
111 }
112 /* take the first timer available */
113 for (i = 0; i < sfi_mtimer_num;) {
114 if (!sfi_mtimer_usage[i]) {
115 sfi_mtimer_usage[i] = 1;
116 return &sfi_mtimer_array[i];
117 }
118 i++;
119 }
120 return NULL;
121}
122
123void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr)
124{
125 int i;
126 for (i = 0; i < sfi_mtimer_num;) {
127 if (mtmr->irq == sfi_mtimer_array[i].irq) {
128 sfi_mtimer_usage[i] = 0;
129 return;
130 }
131 i++;
132 }
133}
134
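sfi_get_mtmr()/sfi_free_mtmr() above amount to a hint-based first-fit allocator over a usage array. The same logic in a generic, stand-alone form:

/* Returns the allocated slot index, or -1 when all n slots are taken. */
static int alloc_slot(unsigned char *used, int n, int hint)
{
	int i;

	if (hint >= 0 && hint < n && !used[hint]) {
		used[hint] = 1;		/* fast path: the caller's hint is free */
		return hint;
	}
	for (i = 0; i < n; i++) {	/* otherwise, first fit */
		if (!used[i]) {
			used[i] = 1;
			return i;
		}
	}
	return -1;
}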
135/* parse all the mrtc info to a global mrtc array */
136int __init sfi_parse_mrtc(struct sfi_table_header *table)
137{
138 struct sfi_table_simple *sb;
139 struct sfi_rtc_table_entry *pentry;
140 struct mpc_intsrc mp_irq;
141
142 int totallen;
143
144 sb = (struct sfi_table_simple *)table;
145 if (!sfi_mrtc_num) {
146 sfi_mrtc_num = SFI_GET_NUM_ENTRIES(sb,
147 struct sfi_rtc_table_entry);
148 pentry = (struct sfi_rtc_table_entry *)sb->pentry;
149 totallen = sfi_mrtc_num * sizeof(*pentry);
150 memcpy(sfi_mrtc_array, pentry, totallen);
151 }
152
153 printk(KERN_INFO "SFI: RTC info (num = %d):\n", sfi_mrtc_num);
154 pentry = sfi_mrtc_array;
155 for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) {
156 printk(KERN_INFO "RTC[%d]: paddr = 0x%08x, irq = %d\n",
157 totallen, (u32)pentry->phys_addr, pentry->irq);
158 mp_irq.type = MP_IOAPIC;
159 mp_irq.irqtype = mp_INT;
160 mp_irq.irqflag = 0;
161 mp_irq.srcbus = 0;
162 mp_irq.srcbusirq = pentry->irq; /* IRQ */
163 mp_irq.dstapic = MP_APIC_ALL;
164 mp_irq.dstirq = pentry->irq;
165 save_mp_irq(&mp_irq);
166 }
167 return 0;
168}
169
170/*
 171 * The secondary clock in Moorestown can be the APBT or the LAPIC clock;
 172 * it defaults to the APBT, but a cmdline option can override that.
173 */
174static void __cpuinit mrst_setup_secondary_clock(void)
175{
176 /* restore default lapic clock if disabled by cmdline */
177 if (disable_apbt_percpu)
178 return setup_secondary_APIC_clock();
179 apbt_setup_secondary_clock();
180}
181
182static unsigned long __init mrst_calibrate_tsc(void)
183{
184 unsigned long flags, fast_calibrate;
185
186 local_irq_save(flags);
187 fast_calibrate = apbt_quick_calibrate();
188 local_irq_restore(flags);
189
190 if (fast_calibrate)
191 return fast_calibrate;
192
193 return 0;
194}
195
196void __init mrst_time_init(void)
197{
198 sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
199 pre_init_apic_IRQ0();
200 apbt_time_init();
201}
202
203void __init mrst_rtc_init(void)
204{
205 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
206}
207
208/*
 209 * If we use the per-cpu apb timer, the boot clock is already set up. If we use
 210 * the lapic timer and one apbt timer for broadcast, set up the lapic boot clock.
211 */
212static void __init mrst_setup_boot_clock(void)
213{
214 pr_info("%s: per cpu apbt flag %d \n", __func__, disable_apbt_percpu);
215 if (disable_apbt_percpu)
216 setup_boot_APIC_clock();
217};
15 218
16/* 219/*
17 * Moorestown specific x86_init function overrides and early setup 220 * Moorestown specific x86_init function overrides and early setup
@@ -21,4 +224,17 @@ void __init x86_mrst_early_setup(void)
21{ 224{
22 x86_init.resources.probe_roms = x86_init_noop; 225 x86_init.resources.probe_roms = x86_init_noop;
23 x86_init.resources.reserve_resources = x86_init_noop; 226 x86_init.resources.reserve_resources = x86_init_noop;
227
228 x86_init.timers.timer_init = mrst_time_init;
229 x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock;
230
231 x86_init.irqs.pre_vector_init = x86_init_noop;
232
233 x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock;
234
235 x86_platform.calibrate_tsc = mrst_calibrate_tsc;
236 x86_init.pci.init = pci_mrst_init;
237 x86_init.pci.fixup_irqs = x86_init_noop;
238
239 legacy_pic = &null_legacy_pic;
24} 240}
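x86_mrst_early_setup() is the x86_init override pattern in action: a table of function pointers ships with generic defaults, and the platform swaps in only the hooks it needs at early boot. A generic sketch of the idea; the struct and names below are illustrative, not the kernel's:

struct platform_ops {
	void (*timer_init)(void);
	void (*pre_vector_init)(void);
	unsigned long (*calibrate_tsc)(void);
};

static void generic_timer_init(void) { }
static void generic_pre_vector_init(void) { }
static unsigned long generic_calibrate_tsc(void) { return 0; }

static void noop(void) { }
static unsigned long soc_calibrate_tsc(void) { return 100000; /* platform specific */ }

/* Generic defaults, set up at build time... */
static struct platform_ops ops = {
	.timer_init	 = generic_timer_init,
	.pre_vector_init = generic_pre_vector_init,
	.calibrate_tsc	 = generic_calibrate_tsc,
};

/* ...and the SoC's early setup overrides only what differs. */
static void soc_early_setup(void)
{
	ops.pre_vector_init = noop;		/* no legacy PIC to initialize */
	ops.calibrate_tsc = soc_calibrate_tsc;
}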
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 4bd93c9b2b27..4d4468e9f47c 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -37,6 +37,7 @@
37#include <linux/cpu.h> 37#include <linux/cpu.h>
38#include <linux/notifier.h> 38#include <linux/notifier.h>
39#include <linux/uaccess.h> 39#include <linux/uaccess.h>
40#include <linux/gfp.h>
40 41
41#include <asm/processor.h> 42#include <asm/processor.h>
42#include <asm/msr.h> 43#include <asm/msr.h>
@@ -285,7 +286,7 @@ static void __exit msr_exit(void)
285 for_each_online_cpu(cpu) 286 for_each_online_cpu(cpu)
286 msr_device_destroy(cpu); 287 msr_device_destroy(cpu);
287 class_destroy(msr_class); 288 class_destroy(msr_class);
288 unregister_chrdev(MSR_MAJOR, "cpu/msr"); 289 __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
289 unregister_hotcpu_notifier(&msr_class_cpu_notifier); 290 unregister_hotcpu_notifier(&msr_class_cpu_notifier);
290} 291}
291 292
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 9d1d263f786f..8297160c41b3 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -17,7 +17,9 @@
17#include <linux/spinlock.h> 17#include <linux/spinlock.h>
18#include <linux/io.h> 18#include <linux/io.h>
19#include <linux/string.h> 19#include <linux/string.h>
20
20#include <asm/geode.h> 21#include <asm/geode.h>
22#include <asm/setup.h>
21#include <asm/olpc.h> 23#include <asm/olpc.h>
22 24
23#ifdef CONFIG_OPEN_FIRMWARE 25#ifdef CONFIG_OPEN_FIRMWARE
@@ -243,9 +245,11 @@ static int __init olpc_init(void)
243 olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0, 245 olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0,
244 (unsigned char *) &olpc_platform_info.ecver, 1); 246 (unsigned char *) &olpc_platform_info.ecver, 1);
245 247
246 /* check to see if the VSA exists */ 248#ifdef CONFIG_PCI_OLPC
247 if (cs5535_has_vsa2()) 249 /* If the VSA exists let it emulate PCI, if not emulate in kernel */
248 olpc_platform_info.flags |= OLPC_F_VSA; 250 if (!cs5535_has_vsa2())
251 x86_init.pci.arch_init = pci_olpc_init;
252#endif
249 253
250 printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n", 254 printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n",
251 ((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "", 255 ((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "",
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1b1739d16310..1db183ed7c01 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -428,10 +428,6 @@ struct pv_mmu_ops pv_mmu_ops = {
428 .ptep_modify_prot_start = __ptep_modify_prot_start, 428 .ptep_modify_prot_start = __ptep_modify_prot_start,
429 .ptep_modify_prot_commit = __ptep_modify_prot_commit, 429 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
430 430
431#ifdef CONFIG_HIGHPTE
432 .kmap_atomic_pte = kmap_atomic,
433#endif
434
435#if PAGETABLE_LEVELS >= 3 431#if PAGETABLE_LEVELS >= 3
436#ifdef CONFIG_X86_PAE 432#ifdef CONFIG_X86_PAE
437 .set_pte_atomic = native_set_pte_atomic, 433 .set_pte_atomic = native_set_pte_atomic,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 2bbde6078143..fb99f7edb341 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -1309,7 +1309,7 @@ static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
1309/* 1309/*
1310 * get_tce_space_from_tar(): 1310 * get_tce_space_from_tar():
1311 * Function for kdump case. Get the tce tables from first kernel 1311 * Function for kdump case. Get the tce tables from first kernel
1312 * by reading the contents of the base adress register of calgary iommu 1312 * by reading the contents of the base address register of calgary iommu
1313 */ 1313 */
1314static void __init get_tce_space_from_tar(void) 1314static void __init get_tce_space_from_tar(void)
1315{ 1315{
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 75e14e21f61a..4b7e3d8b01dd 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -2,6 +2,7 @@
2#include <linux/dma-debug.h> 2#include <linux/dma-debug.h>
3#include <linux/dmar.h> 3#include <linux/dmar.h>
4#include <linux/bootmem.h> 4#include <linux/bootmem.h>
5#include <linux/gfp.h>
5#include <linux/pci.h> 6#include <linux/pci.h>
6#include <linux/kmemleak.h> 7#include <linux/kmemleak.h>
7 8
@@ -38,7 +39,7 @@ int iommu_detected __read_mostly = 0;
38 * This variable becomes 1 if iommu=pt is passed on the kernel command line. 39 * This variable becomes 1 if iommu=pt is passed on the kernel command line.
39 * If this variable is 1, IOMMU implementations do no DMA translation for 40 * If this variable is 1, IOMMU implementations do no DMA translation for
40 * devices and allow every device to access to whole physical memory. This is 41 * devices and allow every device to access to whole physical memory. This is
41 * useful if a user want to use an IOMMU only for KVM device assignment to 42 * useful if a user wants to use an IOMMU only for KVM device assignment to
42 * guests and not for driver dma translation. 43 * guests and not for driver dma translation.
43 */ 44 */
44int iommu_pass_through __read_mostly; 45int iommu_pass_through __read_mostly;
@@ -65,7 +66,7 @@ int dma_set_mask(struct device *dev, u64 mask)
65} 66}
66EXPORT_SYMBOL(dma_set_mask); 67EXPORT_SYMBOL(dma_set_mask);
67 68
68#ifdef CONFIG_X86_64 69#if defined(CONFIG_X86_64) && !defined(CONFIG_NUMA)
69static __initdata void *dma32_bootmem_ptr; 70static __initdata void *dma32_bootmem_ptr;
70static unsigned long dma32_bootmem_size __initdata = (128ULL<<20); 71static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
71 72
@@ -116,14 +117,21 @@ static void __init dma32_free_bootmem(void)
116 dma32_bootmem_ptr = NULL; 117 dma32_bootmem_ptr = NULL;
117 dma32_bootmem_size = 0; 118 dma32_bootmem_size = 0;
118} 119}
120#else
121void __init dma32_reserve_bootmem(void)
122{
123}
124static void __init dma32_free_bootmem(void)
125{
126}
127
119#endif 128#endif
120 129
121void __init pci_iommu_alloc(void) 130void __init pci_iommu_alloc(void)
122{ 131{
123#ifdef CONFIG_X86_64
124 /* free the range so iommu could get some range less than 4G */ 132 /* free the range so iommu could get some range less than 4G */
125 dma32_free_bootmem(); 133 dma32_free_bootmem();
126#endif 134
127 if (pci_swiotlb_detect()) 135 if (pci_swiotlb_detect())
128 goto out; 136 goto out;
129 137
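Note the shape of this cleanup: rather than keeping #ifdef CONFIG_X86_64 around the call in pci_iommu_alloc(), the patch supplies empty stubs for the other configuration, so the caller stays ifdef-free. The idiom in isolation (CONFIG_DMA32_BOOTMEM is a made-up stand-in for the real CONFIG_X86_64 && !CONFIG_NUMA condition):

	#include <stdio.h>

	#ifdef CONFIG_DMA32_BOOTMEM
	static void dma32_free_bootmem(void)
	{
		printf("freeing the 128MB low-memory reservation\n");
	}
	#else
	static void dma32_free_bootmem(void) { }	/* stub: nothing reserved */
	#endif

	int main(void)
	{
		dma32_free_bootmem();	/* call site needs no #ifdef */
		return 0;
	}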
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 34de53b46f87..0f7f130caa67 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -29,6 +29,7 @@
29#include <linux/iommu-helper.h> 29#include <linux/iommu-helper.h>
30#include <linux/sysdev.h> 30#include <linux/sysdev.h>
31#include <linux/io.h> 31#include <linux/io.h>
32#include <linux/gfp.h>
32#include <asm/atomic.h> 33#include <asm/atomic.h>
33#include <asm/mtrr.h> 34#include <asm/mtrr.h>
34#include <asm/pgtable.h> 35#include <asm/pgtable.h>
@@ -564,6 +565,9 @@ static void enable_gart_translations(void)
564 565
565 enable_gart_translation(dev, __pa(agp_gatt_table)); 566 enable_gart_translation(dev, __pa(agp_gatt_table));
566 } 567 }
568
569 /* Flush the GART-TLB to remove stale entries */
570 k8_flush_garts();
567} 571}
568 572
569/* 573/*
@@ -735,7 +739,7 @@ int __init gart_iommu_init(void)
735 unsigned long scratch; 739 unsigned long scratch;
736 long i; 740 long i;
737 741
738 if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) 742 if (num_k8_northbridges == 0)
739 return 0; 743 return 0;
740 744
741#ifndef CONFIG_AGP_AMD64 745#ifndef CONFIG_AGP_AMD64
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 22be12b60a8f..3af4af810c07 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -4,6 +4,7 @@
4#include <linux/scatterlist.h> 4#include <linux/scatterlist.h>
5#include <linux/string.h> 5#include <linux/string.h>
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/gfp.h>
7#include <linux/pci.h> 8#include <linux/pci.h>
8#include <linux/mm.h> 9#include <linux/mm.h>
9 10
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index cf1e04b2ad65..28ad9f4d8b94 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -110,8 +110,8 @@ void show_regs_common(void)
110 if (!product) 110 if (!product)
111 product = ""; 111 product = "";
112 112
113 printk("\n"); 113 printk(KERN_CONT "\n");
114 printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n", 114 printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n",
115 current->pid, current->comm, print_tainted(), 115 current->pid, current->comm, print_tainted(),
116 init_utsname()->release, 116 init_utsname()->release,
117 (int)strcspn(init_utsname()->version, " "), 117 (int)strcspn(init_utsname()->version, " "),
@@ -122,18 +122,6 @@ void flush_thread(void)
122{ 122{
123 struct task_struct *tsk = current; 123 struct task_struct *tsk = current;
124 124
125#ifdef CONFIG_X86_64
126 if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
127 clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
128 if (test_tsk_thread_flag(tsk, TIF_IA32)) {
129 clear_tsk_thread_flag(tsk, TIF_IA32);
130 } else {
131 set_tsk_thread_flag(tsk, TIF_IA32);
132 current_thread_info()->status |= TS_COMPAT;
133 }
134 }
135#endif
136
137 flush_ptrace_hw_breakpoint(tsk); 125 flush_ptrace_hw_breakpoint(tsk);
138 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); 126 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
139 /* 127 /*
@@ -295,6 +283,8 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
295 regs.es = __USER_DS; 283 regs.es = __USER_DS;
296 regs.fs = __KERNEL_PERCPU; 284 regs.fs = __KERNEL_PERCPU;
297 regs.gs = __KERNEL_STACK_CANARY; 285 regs.gs = __KERNEL_STACK_CANARY;
286#else
287 regs.ss = __KERNEL_DS;
298#endif 288#endif
299 289
300 regs.orig_ax = -1; 290 regs.orig_ax = -1;
@@ -536,21 +526,37 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
536} 526}
537 527
538/* 528/*
539 * Check for AMD CPUs, which have potentially C1E support 529 * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e.
530 * For more information see
531 * - Erratum #400 for NPT family 0xf and family 0x10 CPUs
532 * - Erratum #365 for family 0x11 (not affected because C1e not in use)
540 */ 533 */
541static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) 534static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
542{ 535{
536 u64 val;
543 if (c->x86_vendor != X86_VENDOR_AMD) 537 if (c->x86_vendor != X86_VENDOR_AMD)
544 return 0; 538 goto no_c1e_idle;
545
546 if (c->x86 < 0x0F)
547 return 0;
548 539
549 /* Family 0x0f models < rev F do not have C1E */ 540 /* Family 0x0f models < rev F do not have C1E */
550 if (c->x86 == 0x0f && c->x86_model < 0x40) 541 if (c->x86 == 0x0F && c->x86_model >= 0x40)
551 return 0; 542 return 1;
552 543
553 return 1; 544 if (c->x86 == 0x10) {
545 /*
546 * check OSVW bit for CPUs that are not affected
547 * by erratum #400
548 */
549 rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
550 if (val >= 2) {
551 rdmsrl(MSR_AMD64_OSVW_STATUS, val);
552 if (!(val & BIT(1)))
553 goto no_c1e_idle;
554 }
555 return 1;
556 }
557
558no_c1e_idle:
559 return 0;
554} 560}
555 561
556static cpumask_var_t c1e_mask; 562static cpumask_var_t c1e_mask;
@@ -617,7 +623,7 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
617{ 623{
618#ifdef CONFIG_SMP 624#ifdef CONFIG_SMP
619 if (pm_idle == poll_idle && smp_num_siblings > 1) { 625 if (pm_idle == poll_idle && smp_num_siblings > 1) {
620 printk(KERN_WARNING "WARNING: polling idle and HT enabled," 626 printk_once(KERN_WARNING "WARNING: polling idle and HT enabled,"
621 " performance may degrade.\n"); 627 " performance may degrade.\n");
622 } 628 }
623#endif 629#endif
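The rewritten check_c1e_idle() consults AMD's OSVW ("OS Visible Workaround") registers on family 0x10: if the OSVW ID length covers entry 1, bit 1 of the status register says whether erratum #400 actually applies to the part, which is more precise than matching on family and model alone. The same registers can be inspected from user space via the msr driver; a rough sketch (MSR numbers as in msr-index.h; needs root and msr.ko loaded):

	#include <stdio.h>
	#include <stdint.h>
	#include <fcntl.h>
	#include <unistd.h>

	#define MSR_AMD64_OSVW_ID_LENGTH	0xC0010140
	#define MSR_AMD64_OSVW_STATUS		0xC0010141

	static int rdmsr(int fd, uint32_t msr, uint64_t *val)
	{
		/* /dev/cpu/N/msr reads 8 bytes at offset == MSR index */
		return pread(fd, val, sizeof(*val), msr) == sizeof(*val) ? 0 : -1;
	}

	int main(void)
	{
		uint64_t len, status;
		int fd = open("/dev/cpu/0/msr", O_RDONLY);

		if (fd < 0)
			return 1;
		if (rdmsr(fd, MSR_AMD64_OSVW_ID_LENGTH, &len) == 0 && len >= 2 &&
		    rdmsr(fd, MSR_AMD64_OSVW_STATUS, &status) == 0)
			printf("erratum #400: %s\n",
			       (status & 2) ? "applies" : "not present");
		close(fd);
		return 0;
	}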
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index fe6a34e42bde..f6c62667e30c 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -139,16 +139,16 @@ void __show_regs(struct pt_regs *regs, int all)
139 139
140 show_regs_common(); 140 show_regs_common();
141 141
142 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", 142 printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
143 (u16)regs->cs, regs->ip, regs->flags, 143 (u16)regs->cs, regs->ip, regs->flags,
144 smp_processor_id()); 144 smp_processor_id());
145 print_symbol("EIP is at %s\n", regs->ip); 145 print_symbol("EIP is at %s\n", regs->ip);
146 146
147 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", 147 printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
148 regs->ax, regs->bx, regs->cx, regs->dx); 148 regs->ax, regs->bx, regs->cx, regs->dx);
149 printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", 149 printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
150 regs->si, regs->di, regs->bp, sp); 150 regs->si, regs->di, regs->bp, sp);
151 printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", 151 printk(KERN_DEFAULT " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
152 (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss); 152 (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss);
153 153
154 if (!all) 154 if (!all)
@@ -158,19 +158,19 @@ void __show_regs(struct pt_regs *regs, int all)
158 cr2 = read_cr2(); 158 cr2 = read_cr2();
159 cr3 = read_cr3(); 159 cr3 = read_cr3();
160 cr4 = read_cr4_safe(); 160 cr4 = read_cr4_safe();
161 printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", 161 printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
162 cr0, cr2, cr3, cr4); 162 cr0, cr2, cr3, cr4);
163 163
164 get_debugreg(d0, 0); 164 get_debugreg(d0, 0);
165 get_debugreg(d1, 1); 165 get_debugreg(d1, 1);
166 get_debugreg(d2, 2); 166 get_debugreg(d2, 2);
167 get_debugreg(d3, 3); 167 get_debugreg(d3, 3);
168 printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", 168 printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
169 d0, d1, d2, d3); 169 d0, d1, d2, d3);
170 170
171 get_debugreg(d6, 6); 171 get_debugreg(d6, 6);
172 get_debugreg(d7, 7); 172 get_debugreg(d7, 7);
173 printk("DR6: %08lx DR7: %08lx\n", 173 printk(KERN_DEFAULT "DR6: %08lx DR7: %08lx\n",
174 d6, d7); 174 d6, d7);
175} 175}
176 176
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 418f860880a2..dc9690b4c4cc 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -161,19 +161,19 @@ void __show_regs(struct pt_regs *regs, int all)
161 unsigned int ds, cs, es; 161 unsigned int ds, cs, es;
162 162
163 show_regs_common(); 163 show_regs_common();
164 printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); 164 printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
165 printk_address(regs->ip, 1); 165 printk_address(regs->ip, 1);
166 printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, 166 printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
167 regs->sp, regs->flags); 167 regs->sp, regs->flags);
168 printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n", 168 printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
169 regs->ax, regs->bx, regs->cx); 169 regs->ax, regs->bx, regs->cx);
170 printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n", 170 printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
171 regs->dx, regs->si, regs->di); 171 regs->dx, regs->si, regs->di);
172 printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n", 172 printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
173 regs->bp, regs->r8, regs->r9); 173 regs->bp, regs->r8, regs->r9);
174 printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n", 174 printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
175 regs->r10, regs->r11, regs->r12); 175 regs->r10, regs->r11, regs->r12);
176 printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n", 176 printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
177 regs->r13, regs->r14, regs->r15); 177 regs->r13, regs->r14, regs->r15);
178 178
179 asm("movl %%ds,%0" : "=r" (ds)); 179 asm("movl %%ds,%0" : "=r" (ds));
@@ -194,21 +194,21 @@ void __show_regs(struct pt_regs *regs, int all)
194 cr3 = read_cr3(); 194 cr3 = read_cr3();
195 cr4 = read_cr4(); 195 cr4 = read_cr4();
196 196
197 printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 197 printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
198 fs, fsindex, gs, gsindex, shadowgs); 198 fs, fsindex, gs, gsindex, shadowgs);
199 printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, 199 printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
200 es, cr0); 200 es, cr0);
201 printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, 201 printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
202 cr4); 202 cr4);
203 203
204 get_debugreg(d0, 0); 204 get_debugreg(d0, 0);
205 get_debugreg(d1, 1); 205 get_debugreg(d1, 1);
206 get_debugreg(d2, 2); 206 get_debugreg(d2, 2);
207 printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); 207 printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
208 get_debugreg(d3, 3); 208 get_debugreg(d3, 3);
209 get_debugreg(d6, 6); 209 get_debugreg(d6, 6);
210 get_debugreg(d7, 7); 210 get_debugreg(d7, 7);
211 printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); 211 printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
212} 212}
213 213
214void release_thread(struct task_struct *dead_task) 214void release_thread(struct task_struct *dead_task)
@@ -515,6 +515,18 @@ void set_personality_64bit(void)
515 current->personality &= ~READ_IMPLIES_EXEC; 515 current->personality &= ~READ_IMPLIES_EXEC;
516} 516}
517 517
518void set_personality_ia32(void)
519{
520 /* inherit personality from parent */
521
522 /* Make sure to be in 32bit mode */
523 set_thread_flag(TIF_IA32);
524 current->personality |= force_personality32;
525
526 /* Prepare the first "return" to user space */
527 current_thread_info()->status |= TS_COMPAT;
528}
529
518unsigned long get_wchan(struct task_struct *p) 530unsigned long get_wchan(struct task_struct *p)
519{ 531{
520 unsigned long stack; 532 unsigned long stack;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 017d937639fe..2e9b55027b7e 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -12,6 +12,7 @@
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/smp.h> 13#include <linux/smp.h>
14#include <linux/errno.h> 14#include <linux/errno.h>
15#include <linux/slab.h>
15#include <linux/ptrace.h> 16#include <linux/ptrace.h>
16#include <linux/regset.h> 17#include <linux/regset.h>
17#include <linux/tracehook.h> 18#include <linux/tracehook.h>
@@ -48,6 +49,7 @@ enum x86_regset {
48 REGSET_FP, 49 REGSET_FP,
49 REGSET_XFP, 50 REGSET_XFP,
50 REGSET_IOPERM64 = REGSET_XFP, 51 REGSET_IOPERM64 = REGSET_XFP,
52 REGSET_XSTATE,
51 REGSET_TLS, 53 REGSET_TLS,
52 REGSET_IOPERM32, 54 REGSET_IOPERM32,
53}; 55};
@@ -140,30 +142,6 @@ static const int arg_offs_table[] = {
140#endif 142#endif
141}; 143};
142 144
143/**
144 * regs_get_argument_nth() - get Nth argument at function call
145 * @regs: pt_regs which contains registers at function entry.
146 * @n: argument number.
147 *
148 * regs_get_argument_nth() returns @n th argument of a function call.
149 * Since usually the kernel stack will be changed right after function entry,
150 * you must use this at function entry. If the @n th entry is NOT in the
151 * kernel stack or pt_regs, this returns 0.
152 */
153unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned int n)
154{
155 if (n < ARRAY_SIZE(arg_offs_table))
156 return *(unsigned long *)((char *)regs + arg_offs_table[n]);
157 else {
158 /*
159 * The typical case: arg n is on the stack.
160 * (Note: stack[0] = return address, so skip it)
161 */
162 n -= ARRAY_SIZE(arg_offs_table);
163 return regs_get_kernel_stack_nth(regs, 1 + n);
164 }
165}
166
167/* 145/*
168 * does not yet catch signals sent when the child dies. 146 * does not yet catch signals sent when the child dies.
169 * in exit.c or in signal.c. 147 * in exit.c or in signal.c.
@@ -604,7 +582,7 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
604 struct perf_event_attr attr; 582 struct perf_event_attr attr;
605 583
606 /* 584 /*
607 * We shoud have at least an inactive breakpoint at this 585 * We should have at least an inactive breakpoint at this
608 * slot. It means the user is writing dr7 without having 586 * slot. It means the user is writing dr7 without having
609 * written the address register first 587 * written the address register first
610 */ 588 */
@@ -702,7 +680,7 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
702 } else if (n == 6) { 680 } else if (n == 6) {
703 val = thread->debugreg6; 681 val = thread->debugreg6;
704 } else if (n == 7) { 682 } else if (n == 7) {
705 val = ptrace_get_dr7(thread->ptrace_bps); 683 val = thread->ptrace_dr7;
706 } 684 }
707 return val; 685 return val;
708} 686}
@@ -778,8 +756,11 @@ int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
778 return rc; 756 return rc;
779 } 757 }
780 /* All that's left is DR7 */ 758 /* All that's left is DR7 */
781 if (n == 7) 759 if (n == 7) {
782 rc = ptrace_write_dr7(tsk, val); 760 rc = ptrace_write_dr7(tsk, val);
761 if (!rc)
762 thread->ptrace_dr7 = val;
763 }
783 764
784ret_path: 765ret_path:
785 return rc; 766 return rc;
@@ -1584,7 +1565,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
1584 1565
1585#ifdef CONFIG_X86_64 1566#ifdef CONFIG_X86_64
1586 1567
1587static const struct user_regset x86_64_regsets[] = { 1568static struct user_regset x86_64_regsets[] __read_mostly = {
1588 [REGSET_GENERAL] = { 1569 [REGSET_GENERAL] = {
1589 .core_note_type = NT_PRSTATUS, 1570 .core_note_type = NT_PRSTATUS,
1590 .n = sizeof(struct user_regs_struct) / sizeof(long), 1571 .n = sizeof(struct user_regs_struct) / sizeof(long),
@@ -1597,6 +1578,12 @@ static const struct user_regset x86_64_regsets[] = {
1597 .size = sizeof(long), .align = sizeof(long), 1578 .size = sizeof(long), .align = sizeof(long),
1598 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set 1579 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
1599 }, 1580 },
1581 [REGSET_XSTATE] = {
1582 .core_note_type = NT_X86_XSTATE,
1583 .size = sizeof(u64), .align = sizeof(u64),
1584 .active = xstateregs_active, .get = xstateregs_get,
1585 .set = xstateregs_set
1586 },
1600 [REGSET_IOPERM64] = { 1587 [REGSET_IOPERM64] = {
1601 .core_note_type = NT_386_IOPERM, 1588 .core_note_type = NT_386_IOPERM,
1602 .n = IO_BITMAP_LONGS, 1589 .n = IO_BITMAP_LONGS,
@@ -1622,7 +1609,7 @@ static const struct user_regset_view user_x86_64_view = {
1622#endif /* CONFIG_X86_64 */ 1609#endif /* CONFIG_X86_64 */
1623 1610
1624#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 1611#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
1625static const struct user_regset x86_32_regsets[] = { 1612static struct user_regset x86_32_regsets[] __read_mostly = {
1626 [REGSET_GENERAL] = { 1613 [REGSET_GENERAL] = {
1627 .core_note_type = NT_PRSTATUS, 1614 .core_note_type = NT_PRSTATUS,
1628 .n = sizeof(struct user_regs_struct32) / sizeof(u32), 1615 .n = sizeof(struct user_regs_struct32) / sizeof(u32),
@@ -1641,6 +1628,12 @@ static const struct user_regset x86_32_regsets[] = {
1641 .size = sizeof(u32), .align = sizeof(u32), 1628 .size = sizeof(u32), .align = sizeof(u32),
1642 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set 1629 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
1643 }, 1630 },
1631 [REGSET_XSTATE] = {
1632 .core_note_type = NT_X86_XSTATE,
1633 .size = sizeof(u64), .align = sizeof(u64),
1634 .active = xstateregs_active, .get = xstateregs_get,
1635 .set = xstateregs_set
1636 },
1644 [REGSET_TLS] = { 1637 [REGSET_TLS] = {
1645 .core_note_type = NT_386_TLS, 1638 .core_note_type = NT_386_TLS,
1646 .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN, 1639 .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN,
@@ -1663,6 +1656,23 @@ static const struct user_regset_view user_x86_32_view = {
1663}; 1656};
1664#endif 1657#endif
1665 1658
1659/*
1660 * This represents bytes 464..511 in the memory layout exported through
1661 * the REGSET_XSTATE interface.
1662 */
1663u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
1664
1665void update_regset_xstate_info(unsigned int size, u64 xstate_mask)
1666{
1667#ifdef CONFIG_X86_64
1668 x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64);
1669#endif
1670#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
1671 x86_32_regsets[REGSET_XSTATE].n = size / sizeof(u64);
1672#endif
1673 xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask;
1674}
1675
1666const struct user_regset_view *task_user_regset_view(struct task_struct *task) 1676const struct user_regset_view *task_user_regset_view(struct task_struct *task)
1667{ 1677{
1668#ifdef CONFIG_IA32_EMULATION 1678#ifdef CONFIG_IA32_EMULATION
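The new REGSET_XSTATE entries export the complete xsave image to ptrace and core dumps as an NT_X86_XSTATE note; since the xstate size is CPU-dependent, update_regset_xstate_info() patches .n at boot, which is why the regset arrays lose their const. A debugger fetches the regset roughly as below (a sketch: PTRACE_GETREGSET is assumed available, and the 4K buffer is a guess that the kernel trims via iov_len):

	#include <stdio.h>
	#include <stdint.h>
	#include <signal.h>
	#include <unistd.h>
	#include <elf.h>		/* NT_X86_XSTATE */
	#include <sys/types.h>
	#include <sys/ptrace.h>
	#include <sys/uio.h>
	#include <sys/wait.h>

	int main(void)
	{
		pid_t pid = fork();

		if (pid == 0) {		/* tracee: stop and wait */
			ptrace(PTRACE_TRACEME, 0, NULL, NULL);
			raise(SIGSTOP);
			_exit(0);
		}
		waitpid(pid, NULL, 0);

		uint8_t xstate[4096];
		struct iovec iov = { .iov_base = xstate, .iov_len = sizeof(xstate) };

		if (ptrace(PTRACE_GETREGSET, pid, (void *)(long)NT_X86_XSTATE, &iov) == 0)
			printf("xstate regset: %zu bytes\n", iov.iov_len);
		else
			perror("PTRACE_GETREGSET");

		kill(pid, SIGKILL);
		return 0;
	}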
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 18093d7498f0..12e9feaa2f7a 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -491,6 +491,19 @@ void force_hpet_resume(void)
491 break; 491 break;
492 } 492 }
493} 493}
494
495/*
496 * HPET MSI on some boards (ATI SB700/SB800) has side effect on
497 * floppy DMA. Disable HPET MSI on such platforms.
498 */
499static void force_disable_hpet_msi(struct pci_dev *unused)
500{
501 hpet_msi_disable = 1;
502}
503
504DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
505 force_disable_hpet_msi);
506
494#endif 507#endif
495 508
496#if defined(CONFIG_PCI) && defined(CONFIG_NUMA) 509#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
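The quirk keys off the PCI function rather than DMI: any box exposing the SB700/SB800 SMBus device gets HPET MSI turned off before the HPET driver probes. Whether a given system would be hit can be checked from user space by scanning sysfs for vendor:device 1002:4385 (the values behind PCI_VENDOR_ID_ATI and PCI_DEVICE_ID_ATI_SBX00_SMBUS; a sketch):

	#include <stdio.h>
	#include <string.h>
	#include <dirent.h>

	static unsigned read_hex(const char *path)
	{
		unsigned v = 0;
		FILE *f = fopen(path, "r");

		if (f) {
			fscanf(f, "%x", &v);	/* files contain e.g. "0x1002" */
			fclose(f);
		}
		return v;
	}

	int main(void)
	{
		DIR *d = opendir("/sys/bus/pci/devices");
		struct dirent *e;
		char path[300];

		while (d && (e = readdir(d)) != NULL) {
			if (e->d_name[0] == '.')
				continue;
			snprintf(path, sizeof(path),
				 "/sys/bus/pci/devices/%s/vendor", e->d_name);
			if (read_hex(path) != 0x1002)
				continue;
			snprintf(path, sizeof(path),
				 "/sys/bus/pci/devices/%s/device", e->d_name);
			if (read_hex(path) == 0x4385)
				printf("%s: HPET MSI would be disabled\n",
				       e->d_name);
		}
		if (d)
			closedir(d);
		return 0;
	}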
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 1545bc0c9845..8e1aac86b50c 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -203,6 +203,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
203 DMI_MATCH(DMI_BOARD_NAME, "0T656F"), 203 DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
204 }, 204 },
205 }, 205 },
206 { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G*/
207 .callback = set_bios_reboot,
208 .ident = "Dell OptiPlex 760",
209 .matches = {
210 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
211 DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"),
212 DMI_MATCH(DMI_BOARD_NAME, "0G919G"),
213 },
214 },
206 { /* Handle problems with rebooting on Dell 2400's */ 215 { /* Handle problems with rebooting on Dell 2400's */
207 .callback = set_bios_reboot, 216 .callback = set_bios_reboot,
208 .ident = "Dell PowerEdge 2400", 217 .ident = "Dell PowerEdge 2400",
@@ -452,6 +461,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
452 DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"), 461 DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"),
453 }, 462 },
454 }, 463 },
464 { /* Handle problems with rebooting on the iMac9,1. */
465 .callback = set_pci_reboot,
466 .ident = "Apple iMac9,1",
467 .matches = {
468 DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
469 DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
470 },
471 },
455 { } 472 { }
456}; 473};
457 474
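Both additions follow the standard DMI quirk recipe: the callback fires only if every DMI_MATCH() hits, and each match is a substring test against the firmware identification strings. Those strings are also exported under /sys/class/dmi/id/, so an entry can be sanity-checked from user space (a sketch against the OptiPlex 760 entry above):

	#include <stdio.h>
	#include <string.h>

	static void read_id(const char *name, char *buf, size_t len)
	{
		char path[128];
		FILE *f;

		snprintf(path, sizeof(path), "/sys/class/dmi/id/%s", name);
		buf[0] = '\0';
		f = fopen(path, "r");
		if (f) {
			if (fgets(buf, len, f))
				buf[strcspn(buf, "\n")] = '\0';
			fclose(f);
		}
	}

	int main(void)
	{
		char vendor[64], product[64], board[64];

		read_id("sys_vendor", vendor, sizeof(vendor));
		read_id("product_name", product, sizeof(product));
		read_id("board_name", board, sizeof(board));

		/* DMI_MATCH() semantics: substring, not exact match */
		if (strstr(vendor, "Dell Inc.") &&
		    strstr(product, "OptiPlex 760") &&
		    strstr(board, "0G919G"))
			printf("reboot=bios quirk would apply here\n");
		return 0;
	}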
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index f7b8b9894b22..c4851eff57b3 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -55,7 +55,6 @@
55#include <linux/stddef.h> 55#include <linux/stddef.h>
56#include <linux/unistd.h> 56#include <linux/unistd.h>
57#include <linux/ptrace.h> 57#include <linux/ptrace.h>
58#include <linux/slab.h>
59#include <linux/user.h> 58#include <linux/user.h>
60#include <linux/delay.h> 59#include <linux/delay.h>
61 60
@@ -121,7 +120,9 @@
121unsigned long max_low_pfn_mapped; 120unsigned long max_low_pfn_mapped;
122unsigned long max_pfn_mapped; 121unsigned long max_pfn_mapped;
123 122
123#ifdef CONFIG_DMI
124RESERVE_BRK(dmi_alloc, 65536); 124RESERVE_BRK(dmi_alloc, 65536);
125#endif
125 126
126unsigned int boot_cpu_id __read_mostly; 127unsigned int boot_cpu_id __read_mostly;
127 128
@@ -312,16 +313,17 @@ static void __init reserve_brk(void)
312#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) 313#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
313static void __init relocate_initrd(void) 314static void __init relocate_initrd(void)
314{ 315{
315 316 /* Assume only end is not page aligned */
316 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 317 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
317 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 318 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
319 u64 area_size = PAGE_ALIGN(ramdisk_size);
318 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; 320 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
319 u64 ramdisk_here; 321 u64 ramdisk_here;
320 unsigned long slop, clen, mapaddr; 322 unsigned long slop, clen, mapaddr;
321 char *p, *q; 323 char *p, *q;
322 324
323 /* We need to move the initrd down into lowmem */ 325 /* We need to move the initrd down into lowmem */
324 ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size, 326 ramdisk_here = find_e820_area(0, end_of_lowmem, area_size,
325 PAGE_SIZE); 327 PAGE_SIZE);
326 328
327 if (ramdisk_here == -1ULL) 329 if (ramdisk_here == -1ULL)
@@ -330,7 +332,7 @@ static void __init relocate_initrd(void)
330 332
331 /* Note: this includes all the lowmem currently occupied by 333 /* Note: this includes all the lowmem currently occupied by
332 the initrd, we rely on that fact to keep the data intact. */ 334 the initrd, we rely on that fact to keep the data intact. */
333 reserve_early(ramdisk_here, ramdisk_here + ramdisk_size, 335 reserve_early(ramdisk_here, ramdisk_here + area_size,
334 "NEW RAMDISK"); 336 "NEW RAMDISK");
335 initrd_start = ramdisk_here + PAGE_OFFSET; 337 initrd_start = ramdisk_here + PAGE_OFFSET;
336 initrd_end = initrd_start + ramdisk_size; 338 initrd_end = initrd_start + ramdisk_size;
@@ -374,9 +376,10 @@ static void __init relocate_initrd(void)
374 376
375static void __init reserve_initrd(void) 377static void __init reserve_initrd(void)
376{ 378{
379 /* Assume only end is not page aligned */
377 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 380 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
378 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 381 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
379 u64 ramdisk_end = ramdisk_image + ramdisk_size; 382 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
380 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; 383 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
381 384
382 if (!boot_params.hdr.type_of_loader || 385 if (!boot_params.hdr.type_of_loader ||
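Both hunks encode the comment's assumption, that the bootloader page-aligns the start of the initrd but not necessarily its end, by rounding the reserved area up with PAGE_ALIGN() so the trailing partial page is covered too. The arithmetic in isolation (addresses invented for illustration):

	#include <stdio.h>
	#include <stdint.h>

	#define PAGE_SIZE	4096ULL
	#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

	int main(void)
	{
		uint64_t image = 0x37e00000ULL;	/* start: page aligned */
		uint64_t size  = 0x00a5131cULL;	/* size: ends mid-page */

		printf("unpadded end: %#llx\n",
		       (unsigned long long)(image + size));
		printf("reserved end: %#llx\n",
		       (unsigned long long)(image + PAGE_ALIGN(size)));
		return 0;
	}

Without the padding, the last few hundred bytes of the ramdisk would sit in a page the early allocator still considers free.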
@@ -604,6 +607,16 @@ static int __init setup_elfcorehdr(char *arg)
604early_param("elfcorehdr", setup_elfcorehdr); 607early_param("elfcorehdr", setup_elfcorehdr);
605#endif 608#endif
606 609
610static __init void reserve_ibft_region(void)
611{
612 unsigned long addr, size = 0;
613
614 addr = find_ibft_region(&size);
615
616 if (size)
617 reserve_early_overlap_ok(addr, addr + size, "ibft");
618}
619
607#ifdef CONFIG_X86_RESERVE_LOW_64K 620#ifdef CONFIG_X86_RESERVE_LOW_64K
608static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) 621static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
609{ 622{
@@ -642,23 +655,48 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
642 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"), 655 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"),
643 }, 656 },
644 }, 657 },
645 {
646 /* 658 /*
647 * AMI BIOS with low memory corruption was found on Intel DG45ID board. 659 * AMI BIOS with low memory corruption was found on Intel DG45ID and
648 * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will 660 * DG45FC boards.
661 * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
649 * match only DMI_BOARD_NAME and see if there is more bad products 662 * match only DMI_BOARD_NAME and see if there is more bad products
650 * with this vendor. 663 * with this vendor.
651 */ 664 */
665 {
652 .callback = dmi_low_memory_corruption, 666 .callback = dmi_low_memory_corruption,
653 .ident = "AMI BIOS", 667 .ident = "AMI BIOS",
654 .matches = { 668 .matches = {
655 DMI_MATCH(DMI_BOARD_NAME, "DG45ID"), 669 DMI_MATCH(DMI_BOARD_NAME, "DG45ID"),
656 }, 670 },
657 }, 671 },
672 {
673 .callback = dmi_low_memory_corruption,
674 .ident = "AMI BIOS",
675 .matches = {
676 DMI_MATCH(DMI_BOARD_NAME, "DG45FC"),
677 },
678 },
658#endif 679#endif
659 {} 680 {}
660}; 681};
661 682
683static void __init trim_bios_range(void)
684{
685 /*
686 * A special case is the first 4Kb of memory;
687 * This is a BIOS owned area, not kernel ram, but generally
688 * not listed as such in the E820 table.
689 */
690 e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED);
691 /*
692 * special case: Some BIOSen report the PC BIOS
693 * area (640->1Mb) as ram even though it is not.
694 * take them out.
695 */
696 e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1);
697 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
698}
699
662/* 700/*
663 * Determine if we were loaded by an EFI loader. If so, then we have also been 701 * Determine if we were loaded by an EFI loader. If so, then we have also been
664 * passed the efi memmap, systab, etc., so we should use these data structures 702 * passed the efi memmap, systab, etc., so we should use these data structures
@@ -822,7 +860,7 @@ void __init setup_arch(char **cmdline_p)
822 insert_resource(&iomem_resource, &data_resource); 860 insert_resource(&iomem_resource, &data_resource);
823 insert_resource(&iomem_resource, &bss_resource); 861 insert_resource(&iomem_resource, &bss_resource);
824 862
825 863 trim_bios_range();
826#ifdef CONFIG_X86_32 864#ifdef CONFIG_X86_32
827 if (ppro_with_ram_bug()) { 865 if (ppro_with_ram_bug()) {
828 e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM, 866 e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
@@ -881,6 +919,8 @@ void __init setup_arch(char **cmdline_p)
881 */ 919 */
882 find_smp_config(); 920 find_smp_config();
883 921
922 reserve_ibft_region();
923
884 reserve_trampoline_memory(); 924 reserve_trampoline_memory();
885 925
886#ifdef CONFIG_ACPI_SLEEP 926#ifdef CONFIG_ACPI_SLEEP
@@ -942,17 +982,11 @@ void __init setup_arch(char **cmdline_p)
942#endif 982#endif
943 983
944 initmem_init(0, max_pfn, acpi, k8); 984 initmem_init(0, max_pfn, acpi, k8);
945 985#ifndef CONFIG_NO_BOOTMEM
946#ifdef CONFIG_X86_64 986 early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
947 /*
948 * dma32_reserve_bootmem() allocates bootmem which may conflict
949 * with the crashkernel command line, so do that after
950 * reserve_crashkernel()
951 */
952 dma32_reserve_bootmem();
953#endif 987#endif
954 988
955 reserve_ibft_region(); 989 dma32_reserve_bootmem();
956 990
957#ifdef CONFIG_KVM_CLOCK 991#ifdef CONFIG_KVM_CLOCK
958 kvmclock_init(); 992 kvmclock_init();
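The new trim_bios_range() applies two corrections to the firmware map before it is used: the first 4K page is downgraded from RAM to reserved even when E820 claims it, and any RAM reported inside the 640K-1M BIOS window is removed outright. A toy model of the second step (the real e820_update_range()/e820_remove_range() also split ranges that only partially overlap, which this sketch ignores):

	#include <stdio.h>

	#define E820_RAM	1
	#define E820_RESERVED	2

	struct range { unsigned long long start, end; int type; };

	int main(void)
	{
		/* A map that wrongly reports the BIOS window as RAM. */
		struct range map[] = {
			{ 0x00001000, 0x0009f000, E820_RAM },
			{ 0x000f0000, 0x00100000, E820_RAM },	/* bogus */
			{ 0x00100000, 0x7f000000, E820_RAM },
		};
		int n = sizeof(map) / sizeof(map[0]);

		for (int i = 0; i < n; i++)
			if (map[i].type == E820_RAM &&
			    map[i].start >= 0xa0000 && map[i].end <= 0x100000)
				map[i].type = E820_RESERVED;	/* take it out */

		for (int i = 0; i < n; i++)
			printf("%#010llx-%#010llx type %d\n",
			       map[i].start, map[i].end, map[i].type);
		return 0;
	}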
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 35abcb8b00e9..ef6370b00e70 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -137,7 +137,13 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
137 137
138static void __init pcpu_fc_free(void *ptr, size_t size) 138static void __init pcpu_fc_free(void *ptr, size_t size)
139{ 139{
140#ifdef CONFIG_NO_BOOTMEM
141 u64 start = __pa(ptr);
142 u64 end = start + size;
143 free_early_partial(start, end);
144#else
140 free_bootmem(__pa(ptr), size); 145 free_bootmem(__pa(ptr), size);
146#endif
141} 147}
142 148
143static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) 149static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index ec1de97600e7..d801210945d6 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -21,6 +21,7 @@
21#include <linux/cache.h> 21#include <linux/cache.h>
22#include <linux/interrupt.h> 22#include <linux/interrupt.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/gfp.h>
24 25
25#include <asm/mtrr.h> 26#include <asm/mtrr.h>
26#include <asm/tlbflush.h> 27#include <asm/tlbflush.h>
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 678d0b8c26f3..763d815e27a0 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -48,6 +48,8 @@
48#include <linux/err.h> 48#include <linux/err.h>
49#include <linux/nmi.h> 49#include <linux/nmi.h>
50#include <linux/tboot.h> 50#include <linux/tboot.h>
51#include <linux/stackprotector.h>
52#include <linux/gfp.h>
51 53
52#include <asm/acpi.h> 54#include <asm/acpi.h>
53#include <asm/desc.h> 55#include <asm/desc.h>
@@ -67,6 +69,7 @@
67#include <linux/mc146818rtc.h> 69#include <linux/mc146818rtc.h>
68 70
69#include <asm/smpboot_hooks.h> 71#include <asm/smpboot_hooks.h>
72#include <asm/i8259.h>
70 73
71#ifdef CONFIG_X86_32 74#ifdef CONFIG_X86_32
72u8 apicid_2_node[MAX_APICID]; 75u8 apicid_2_node[MAX_APICID];
@@ -240,7 +243,10 @@ static void __cpuinit smp_callin(void)
240 end_local_APIC_setup(); 243 end_local_APIC_setup();
241 map_cpu_to_logical_apicid(); 244 map_cpu_to_logical_apicid();
242 245
243 notify_cpu_starting(cpuid); 246 /*
247	 * Need to set up vector mappings before we enable interrupts.
248 */
249 setup_vector_irq(smp_processor_id());
244 /* 250 /*
245 * Get our bogomips. 251 * Get our bogomips.
246 * 252 *
@@ -257,6 +263,8 @@ static void __cpuinit smp_callin(void)
257 */ 263 */
258 smp_store_cpu_info(cpuid); 264 smp_store_cpu_info(cpuid);
259 265
266 notify_cpu_starting(cpuid);
267
260 /* 268 /*
261 * Allow the master to continue. 269 * Allow the master to continue.
262 */ 270 */
@@ -286,9 +294,9 @@ notrace static void __cpuinit start_secondary(void *unused)
286 check_tsc_sync_target(); 294 check_tsc_sync_target();
287 295
288 if (nmi_watchdog == NMI_IO_APIC) { 296 if (nmi_watchdog == NMI_IO_APIC) {
289 disable_8259A_irq(0); 297 legacy_pic->chip->mask(0);
290 enable_NMI_through_LVT0(); 298 enable_NMI_through_LVT0();
291 enable_8259A_irq(0); 299 legacy_pic->chip->unmask(0);
292 } 300 }
293 301
294#ifdef CONFIG_X86_32 302#ifdef CONFIG_X86_32
@@ -315,15 +323,18 @@ notrace static void __cpuinit start_secondary(void *unused)
315 */ 323 */
316 ipi_call_lock(); 324 ipi_call_lock();
317 lock_vector_lock(); 325 lock_vector_lock();
318 __setup_vector_irq(smp_processor_id());
319 set_cpu_online(smp_processor_id(), true); 326 set_cpu_online(smp_processor_id(), true);
320 unlock_vector_lock(); 327 unlock_vector_lock();
321 ipi_call_unlock(); 328 ipi_call_unlock();
322 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 329 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
330 x86_platform.nmi_init();
323 331
324 /* enable local interrupts */ 332 /* enable local interrupts */
325 local_irq_enable(); 333 local_irq_enable();
326 334
335 /* to prevent fake stack check failure in clock setup */
336 boot_init_stack_canary();
337
327 x86_cpuinit.setup_percpu_clockev(); 338 x86_cpuinit.setup_percpu_clockev();
328 339
329 wmb(); 340 wmb();
@@ -1083,9 +1094,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1083 set_cpu_sibling_map(0); 1094 set_cpu_sibling_map(0);
1084 1095
1085 enable_IR_x2apic(); 1096 enable_IR_x2apic();
1086#ifdef CONFIG_X86_64
1087 default_setup_apic_routing(); 1097 default_setup_apic_routing();
1088#endif
1089 1098
1090 if (smp_sanity_check(max_cpus) < 0) { 1099 if (smp_sanity_check(max_cpus) < 0) {
1091 printk(KERN_INFO "SMP disabled\n"); 1100 printk(KERN_INFO "SMP disabled\n");
@@ -1213,11 +1222,12 @@ __init void prefill_possible_map(void)
1213 1222
1214 total_cpus = max_t(int, possible, num_processors + disabled_cpus); 1223 total_cpus = max_t(int, possible, num_processors + disabled_cpus);
1215 1224
1216 if (possible > CONFIG_NR_CPUS) { 1225 /* nr_cpu_ids could be reduced via nr_cpus= */
1226 if (possible > nr_cpu_ids) {
1217 printk(KERN_WARNING 1227 printk(KERN_WARNING
1218 "%d Processors exceeds NR_CPUS limit of %d\n", 1228 "%d Processors exceeds NR_CPUS limit of %d\n",
1219 possible, CONFIG_NR_CPUS); 1229 possible, nr_cpu_ids);
1220 possible = CONFIG_NR_CPUS; 1230 possible = nr_cpu_ids;
1221 } 1231 }
1222 1232
1223 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", 1233 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
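The mask/unmask calls above go through legacy_pic->chip instead of the 8259-specific helpers, so platforms without an i8259 (Moorestown is the motivating case) can plug in a no-op PIC and the callers never notice. Stripped of the irq_chip details, the indirection is just this (a user-space sketch with invented names):

	#include <stdio.h>

	struct pic_ops {
		void (*mask)(unsigned int irq);
		void (*unmask)(unsigned int irq);
	};

	static void i8259_mask(unsigned int irq)   { printf("mask %u\n", irq); }
	static void i8259_unmask(unsigned int irq) { printf("unmask %u\n", irq); }
	static void null_op(unsigned int irq)      { (void)irq; }

	static struct pic_ops i8259_pic = { i8259_mask, i8259_unmask };
	static struct pic_ops null_pic  = { null_op, null_op };

	static struct pic_ops *legacy_pic = &i8259_pic;

	int main(void)
	{
		legacy_pic->mask(0);	/* was: disable_8259A_irq(0) */
		legacy_pic->unmask(0);	/* was: enable_8259A_irq(0)  */

		legacy_pic = &null_pic;	/* platform without an 8259  */
		legacy_pic->mask(0);	/* harmless no-op            */
		return 0;
	}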
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index dee1ff7cba58..196552bb412c 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -25,191 +25,6 @@
25#include <asm/syscalls.h> 25#include <asm/syscalls.h>
26 26
27/* 27/*
28 * Perform the select(nd, in, out, ex, tv) and mmap() system
29 * calls. Linux/i386 didn't use to be able to handle more than
30 * 4 system call parameters, so these system calls used a memory
31 * block for parameter passing..
32 */
33
34struct mmap_arg_struct {
35 unsigned long addr;
36 unsigned long len;
37 unsigned long prot;
38 unsigned long flags;
39 unsigned long fd;
40 unsigned long offset;
41};
42
43asmlinkage int old_mmap(struct mmap_arg_struct __user *arg)
44{
45 struct mmap_arg_struct a;
46 int err = -EFAULT;
47
48 if (copy_from_user(&a, arg, sizeof(a)))
49 goto out;
50
51 err = -EINVAL;
52 if (a.offset & ~PAGE_MASK)
53 goto out;
54
55 err = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags,
56 a.fd, a.offset >> PAGE_SHIFT);
57out:
58 return err;
59}
60
61
62struct sel_arg_struct {
63 unsigned long n;
64 fd_set __user *inp, *outp, *exp;
65 struct timeval __user *tvp;
66};
67
68asmlinkage int old_select(struct sel_arg_struct __user *arg)
69{
70 struct sel_arg_struct a;
71
72 if (copy_from_user(&a, arg, sizeof(a)))
73 return -EFAULT;
74 /* sys_select() does the appropriate kernel locking */
75 return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
76}
77
78/*
79 * sys_ipc() is the de-multiplexer for the SysV IPC calls..
80 *
81 * This is really horribly ugly.
82 */
83asmlinkage int sys_ipc(uint call, int first, int second,
84 int third, void __user *ptr, long fifth)
85{
86 int version, ret;
87
88 version = call >> 16; /* hack for backward compatibility */
89 call &= 0xffff;
90
91 switch (call) {
92 case SEMOP:
93 return sys_semtimedop(first, (struct sembuf __user *)ptr, second, NULL);
94 case SEMTIMEDOP:
95 return sys_semtimedop(first, (struct sembuf __user *)ptr, second,
96 (const struct timespec __user *)fifth);
97
98 case SEMGET:
99 return sys_semget(first, second, third);
100 case SEMCTL: {
101 union semun fourth;
102 if (!ptr)
103 return -EINVAL;
104 if (get_user(fourth.__pad, (void __user * __user *) ptr))
105 return -EFAULT;
106 return sys_semctl(first, second, third, fourth);
107 }
108
109 case MSGSND:
110 return sys_msgsnd(first, (struct msgbuf __user *) ptr,
111 second, third);
112 case MSGRCV:
113 switch (version) {
114 case 0: {
115 struct ipc_kludge tmp;
116 if (!ptr)
117 return -EINVAL;
118
119 if (copy_from_user(&tmp,
120 (struct ipc_kludge __user *) ptr,
121 sizeof(tmp)))
122 return -EFAULT;
123 return sys_msgrcv(first, tmp.msgp, second,
124 tmp.msgtyp, third);
125 }
126 default:
127 return sys_msgrcv(first,
128 (struct msgbuf __user *) ptr,
129 second, fifth, third);
130 }
131 case MSGGET:
132 return sys_msgget((key_t) first, second);
133 case MSGCTL:
134 return sys_msgctl(first, second, (struct msqid_ds __user *) ptr);
135
136 case SHMAT:
137 switch (version) {
138 default: {
139 ulong raddr;
140 ret = do_shmat(first, (char __user *) ptr, second, &raddr);
141 if (ret)
142 return ret;
143 return put_user(raddr, (ulong __user *) third);
144 }
145 case 1: /* iBCS2 emulator entry point */
146 if (!segment_eq(get_fs(), get_ds()))
147 return -EINVAL;
148 /* The "(ulong *) third" is valid _only_ because of the kernel segment thing */
149 return do_shmat(first, (char __user *) ptr, second, (ulong *) third);
150 }
151 case SHMDT:
152 return sys_shmdt((char __user *)ptr);
153 case SHMGET:
154 return sys_shmget(first, second, third);
155 case SHMCTL:
156 return sys_shmctl(first, second,
157 (struct shmid_ds __user *) ptr);
158 default:
159 return -ENOSYS;
160 }
161}
162
163/*
164 * Old cruft
165 */
166asmlinkage int sys_uname(struct old_utsname __user *name)
167{
168 int err;
169 if (!name)
170 return -EFAULT;
171 down_read(&uts_sem);
172 err = copy_to_user(name, utsname(), sizeof(*name));
173 up_read(&uts_sem);
174 return err? -EFAULT:0;
175}
176
177asmlinkage int sys_olduname(struct oldold_utsname __user *name)
178{
179 int error;
180
181 if (!name)
182 return -EFAULT;
183 if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
184 return -EFAULT;
185
186 down_read(&uts_sem);
187
188 error = __copy_to_user(&name->sysname, &utsname()->sysname,
189 __OLD_UTS_LEN);
190 error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
191 error |= __copy_to_user(&name->nodename, &utsname()->nodename,
192 __OLD_UTS_LEN);
193 error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
194 error |= __copy_to_user(&name->release, &utsname()->release,
195 __OLD_UTS_LEN);
196 error |= __put_user(0, name->release + __OLD_UTS_LEN);
197 error |= __copy_to_user(&name->version, &utsname()->version,
198 __OLD_UTS_LEN);
199 error |= __put_user(0, name->version + __OLD_UTS_LEN);
200 error |= __copy_to_user(&name->machine, &utsname()->machine,
201 __OLD_UTS_LEN);
202 error |= __put_user(0, name->machine + __OLD_UTS_LEN);
203
204 up_read(&uts_sem);
205
206 error = error ? -EFAULT : 0;
207
208 return error;
209}
210
211
212/*
213 * Do a system call from kernel instead of calling sys_execve so we 28 * Do a system call from kernel instead of calling sys_execve so we
214 * end up with proper pt_regs. 29 * end up with proper pt_regs.
215 */ 30 */
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 8aa2057efd12..ff14a5044ce6 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -209,15 +209,3 @@ bottomup:
209 209
210 return addr; 210 return addr;
211} 211}
212
213
214SYSCALL_DEFINE1(uname, struct new_utsname __user *, name)
215{
216 int err;
217 down_read(&uts_sem);
218 err = copy_to_user(name, utsname(), sizeof(*name));
219 up_read(&uts_sem);
220 if (personality(current->personality) == PER_LINUX32)
221 err |= copy_to_user(&name->machine, "i686", 5);
222 return err ? -EFAULT : 0;
223}
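The removed sys_uname() moves to generic code, PER_LINUX32 special case included: a 64-bit kernel reports an i686 machine string to any process that selected a 32-bit personality. The behaviour is easy to observe from user space on an x86-64 kernel:

	#include <stdio.h>
	#include <sys/personality.h>
	#include <sys/utsname.h>

	int main(void)
	{
		struct utsname u;

		uname(&u);
		printf("machine: %s\n", u.machine);	/* x86_64 */

		personality(PER_LINUX32);
		uname(&u);
		printf("machine: %s\n", u.machine);	/* i686   */
		return 0;
	}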
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 15228b5d3eb7..8b3729341216 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -81,7 +81,7 @@ ENTRY(sys_call_table)
81 .long sys_settimeofday 81 .long sys_settimeofday
82 .long sys_getgroups16 /* 80 */ 82 .long sys_getgroups16 /* 80 */
83 .long sys_setgroups16 83 .long sys_setgroups16
84 .long old_select 84 .long sys_old_select
85 .long sys_symlink 85 .long sys_symlink
86 .long sys_lstat 86 .long sys_lstat
87 .long sys_readlink /* 85 */ 87 .long sys_readlink /* 85 */
@@ -89,7 +89,7 @@ ENTRY(sys_call_table)
89 .long sys_swapon 89 .long sys_swapon
90 .long sys_reboot 90 .long sys_reboot
91 .long sys_old_readdir 91 .long sys_old_readdir
92 .long old_mmap /* 90 */ 92 .long sys_old_mmap /* 90 */
93 .long sys_munmap 93 .long sys_munmap
94 .long sys_truncate 94 .long sys_truncate
95 .long sys_ftruncate 95 .long sys_ftruncate
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index be2573448ed9..fb5cc5e14cfa 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -70,11 +70,11 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
70 * manually to deassert NMI lines for the watchdog if run 70 * manually to deassert NMI lines for the watchdog if run
71 * on an 82489DX-based system. 71 * on an 82489DX-based system.
72 */ 72 */
73 spin_lock(&i8259A_lock); 73 raw_spin_lock(&i8259A_lock);
74 outb(0x0c, PIC_MASTER_OCW3); 74 outb(0x0c, PIC_MASTER_OCW3);
75 /* Ack the IRQ; AEOI will end it automatically. */ 75 /* Ack the IRQ; AEOI will end it automatically. */
76 inb(PIC_MASTER_POLL); 76 inb(PIC_MASTER_POLL);
77 spin_unlock(&i8259A_lock); 77 raw_spin_unlock(&i8259A_lock);
78 } 78 }
79 79
80 global_clock_event->event_handler(global_clock_event); 80 global_clock_event->event_handler(global_clock_event);
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 364d015efebc..17b03dd3a6b5 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -9,6 +9,7 @@
9#include <linux/seq_file.h> 9#include <linux/seq_file.h>
10#include <linux/proc_fs.h> 10#include <linux/proc_fs.h>
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/slab.h>
12 13
13#include <asm/mmu_context.h> 14#include <asm/mmu_context.h>
14#include <asm/uv/uv.h> 15#include <asm/uv/uv.h>
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 33399176512a..1168e4454188 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -534,6 +534,9 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
534 534
535 get_debugreg(dr6, 6); 535 get_debugreg(dr6, 6);
536 536
537 /* Filter out all the reserved bits which are preset to 1 */
538 dr6 &= ~DR6_RESERVED;
539
537 /* Catch kmemcheck conditions first of all! */ 540 /* Catch kmemcheck conditions first of all! */
538 if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) 541 if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
539 return; 542 return;
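DR6's reserved bits read back as ones, so any consumer that compares or forwards the raw register sees stale-looking bits on every #DB; masking them once at the entry point keeps the later tests (DR_STEP, the breakpoint bits) honest. The constant and its effect in isolation (0xFFFF0FF0 matches the kernel's DR6_RESERVED):

	#include <stdio.h>
	#include <stdint.h>

	#define DR6_RESERVED	0xFFFF0FF0u	/* bits 4-11 and 16-31 */
	#define DR_STEP		0x4000u		/* single-step trap    */

	int main(void)
	{
		uint32_t dr6 = 0xFFFF4FF1;	/* as read: B0 + step + junk */

		dr6 &= ~DR6_RESERVED;
		printf("dr6 = %#x, step = %d\n", dr6, !!(dr6 & DR_STEP));
		/* prints: dr6 = 0x4001, step = 1 */
		return 0;
	}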
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 597683aa5ba0..9faf91ae1841 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -50,7 +50,7 @@ u64 native_sched_clock(void)
50 * unstable. We do this because unlike Time Of Day, 50 * unstable. We do this because unlike Time Of Day,
51 * the scheduler clock tolerates small errors and it's 51 * the scheduler clock tolerates small errors and it's
52 * very important for it to be as fast as the platform 52 * very important for it to be as fast as the platform
53 * can achive it. ) 53 * can achieve it. )
54 */ 54 */
55 if (unlikely(tsc_disabled)) { 55 if (unlikely(tsc_disabled)) {
56 /* No locking but a rare wrong value is not a big deal: */ 56 /* No locking but a rare wrong value is not a big deal: */
@@ -740,7 +740,7 @@ static cycle_t __vsyscall_fn vread_tsc(void)
740} 740}
741#endif 741#endif
742 742
743static void resume_tsc(void) 743static void resume_tsc(struct clocksource *cs)
744{ 744{
745 clocksource_tsc.cycle_last = 0; 745 clocksource_tsc.cycle_last = 0;
746} 746}
@@ -806,7 +806,7 @@ static void __init check_system_tsc_reliable(void)
806 unsigned long res_low, res_high; 806 unsigned long res_low, res_high;
807 807
808 rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); 808 rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
809 /* Geode_LX - the OLPC CPU has a possibly a very reliable TSC */ 809 /* Geode_LX - the OLPC CPU has a very reliable TSC */
810 if (res_low & RTSC_SUSP) 810 if (res_low & RTSC_SUSP)
811 tsc_clocksource_reliable = 1; 811 tsc_clocksource_reliable = 1;
812#endif 812#endif
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
index ece73d8e3240..1d40336b030a 100644
--- a/arch/x86/kernel/uv_irq.c
+++ b/arch/x86/kernel/uv_irq.c
@@ -10,6 +10,7 @@
10 10
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/rbtree.h> 12#include <linux/rbtree.h>
13#include <linux/slab.h>
13#include <linux/irq.h> 14#include <linux/irq.h>
14 15
15#include <asm/apic.h> 16#include <asm/apic.h>
diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c
index 36afb98675a4..309c70fb7759 100644
--- a/arch/x86/kernel/uv_sysfs.c
+++ b/arch/x86/kernel/uv_sysfs.c
@@ -54,19 +54,19 @@ static int __init sgi_uv_sysfs_init(void)
54 if (!sgi_uv_kobj) 54 if (!sgi_uv_kobj)
55 sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj); 55 sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj);
56 if (!sgi_uv_kobj) { 56 if (!sgi_uv_kobj) {
57 printk(KERN_WARNING "kobject_create_and_add sgi_uv failed \n"); 57 printk(KERN_WARNING "kobject_create_and_add sgi_uv failed\n");
58 return -EINVAL; 58 return -EINVAL;
59 } 59 }
60 60
61 ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr); 61 ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr);
62 if (ret) { 62 if (ret) {
63 printk(KERN_WARNING "sysfs_create_file partition_id failed \n"); 63 printk(KERN_WARNING "sysfs_create_file partition_id failed\n");
64 return ret; 64 return ret;
65 } 65 }
66 66
67 ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr); 67 ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr);
68 if (ret) { 68 if (ret) {
69 printk(KERN_WARNING "sysfs_create_file coherence_id failed \n"); 69 printk(KERN_WARNING "sysfs_create_file coherence_id failed\n");
70 return ret; 70 return ret;
71 } 71 }
72 72
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
index 3c84aa001c11..56e421bc379b 100644
--- a/arch/x86/kernel/uv_time.c
+++ b/arch/x86/kernel/uv_time.c
@@ -19,6 +19,7 @@
19 * Copyright (c) Dimitri Sivanich 19 * Copyright (c) Dimitri Sivanich
20 */ 20 */
21#include <linux/clockchips.h> 21#include <linux/clockchips.h>
22#include <linux/slab.h>
22 23
23#include <asm/uv/uv_mmrs.h> 24#include <asm/uv/uv_mmrs.h>
24#include <asm/uv/uv_hub.h> 25#include <asm/uv/uv_hub.h>
@@ -282,10 +283,21 @@ static int uv_rtc_unset_timer(int cpu, int force)
282 283
283/* 284/*
284 * Read the RTC. 285 * Read the RTC.
286 *
287 * Starting with HUB rev 2.0, the UV RTC register is replicated across all
288 * cachelines of its own page. This allows faster simultaneous reads
289 * from a given socket.
285 */ 290 */
286static cycle_t uv_read_rtc(struct clocksource *cs) 291static cycle_t uv_read_rtc(struct clocksource *cs)
287{ 292{
288 return (cycle_t)uv_read_local_mmr(UVH_RTC); 293 unsigned long offset;
294
295 if (uv_get_min_hub_revision_id() == 1)
296 offset = 0;
297 else
298 offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
299
300 return (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
289} 301}
290 302
291/* 303/*
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 34a279a7471d..e680ea52db9b 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -49,11 +49,6 @@ extern int no_broadcast;
49char visws_board_type = -1; 49char visws_board_type = -1;
50char visws_board_rev = -1; 50char visws_board_rev = -1;
51 51
52int is_visws_box(void)
53{
54 return visws_board_type >= 0;
55}
56
57static void __init visws_time_init(void) 52static void __init visws_time_init(void)
58{ 53{
59 printk(KERN_INFO "Starting Cobalt Timer system clock\n"); 54 printk(KERN_INFO "Starting Cobalt Timer system clock\n");
@@ -242,6 +237,8 @@ void __init visws_early_detect(void)
242 x86_init.irqs.pre_vector_init = visws_pre_intr_init; 237 x86_init.irqs.pre_vector_init = visws_pre_intr_init;
243 x86_init.irqs.trap_init = visws_trap_init; 238 x86_init.irqs.trap_init = visws_trap_init;
244 x86_init.timers.timer_init = visws_time_init; 239 x86_init.timers.timer_init = visws_time_init;
240 x86_init.pci.init = pci_visws_init;
241 x86_init.pci.init_irq = x86_init_noop;
245 242
246 /* 243 /*
247 * Install reboot quirks: 244 * Install reboot quirks:
@@ -508,7 +505,7 @@ static struct irq_chip cobalt_irq_type = {
508 */ 505 */
509static unsigned int startup_piix4_master_irq(unsigned int irq) 506static unsigned int startup_piix4_master_irq(unsigned int irq)
510{ 507{
511 init_8259A(0); 508 legacy_pic->init(0);
512 509
513 return startup_cobalt_irq(irq); 510 return startup_cobalt_irq(irq);
514} 511}
@@ -532,9 +529,6 @@ static struct irq_chip piix4_master_irq_type = {
532 529
533static struct irq_chip piix4_virtual_irq_type = { 530static struct irq_chip piix4_virtual_irq_type = {
534 .name = "PIIX4-virtual", 531 .name = "PIIX4-virtual",
535 .shutdown = disable_8259A_irq,
536 .enable = enable_8259A_irq,
537 .disable = disable_8259A_irq,
538}; 532};
539 533
540 534
@@ -559,7 +553,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
559 struct irq_desc *desc; 553 struct irq_desc *desc;
560 unsigned long flags; 554 unsigned long flags;
561 555
562 spin_lock_irqsave(&i8259A_lock, flags); 556 raw_spin_lock_irqsave(&i8259A_lock, flags);
563 557
564 /* Find out what's interrupting in the PIIX4 master 8259 */ 558 /* Find out what's interrupting in the PIIX4 master 8259 */
565 outb(0x0c, 0x20); /* OCW3 Poll command */ 559 outb(0x0c, 0x20); /* OCW3 Poll command */
@@ -596,7 +590,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
596 outb(0x60 + realirq, 0x20); 590 outb(0x60 + realirq, 0x20);
597 } 591 }
598 592
599 spin_unlock_irqrestore(&i8259A_lock, flags); 593 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
600 594
601 desc = irq_to_desc(realirq); 595 desc = irq_to_desc(realirq);
602 596
@@ -609,12 +603,12 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
609 handle_IRQ_event(realirq, desc->action); 603 handle_IRQ_event(realirq, desc->action);
610 604
611 if (!(desc->status & IRQ_DISABLED)) 605 if (!(desc->status & IRQ_DISABLED))
612 enable_8259A_irq(realirq); 606 legacy_pic->chip->unmask(realirq);
613 607
614 return IRQ_HANDLED; 608 return IRQ_HANDLED;
615 609
616out_unlock: 610out_unlock:
617 spin_unlock_irqrestore(&i8259A_lock, flags); 611 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
618 return IRQ_NONE; 612 return IRQ_NONE;
619} 613}
620 614
@@ -628,6 +622,12 @@ static struct irqaction cascade_action = {
628 .name = "cascade", 622 .name = "cascade",
629}; 623};
630 624
625static inline void set_piix4_virtual_irq_type(void)
626{
627 piix4_virtual_irq_type.shutdown = i8259A_chip.mask;
628 piix4_virtual_irq_type.enable = i8259A_chip.unmask;
629 piix4_virtual_irq_type.disable = i8259A_chip.mask;
630}
631 631
632void init_VISWS_APIC_irqs(void) 632void init_VISWS_APIC_irqs(void)
633{ 633{
@@ -653,6 +653,7 @@ void init_VISWS_APIC_irqs(void)
653 desc->chip = &piix4_master_irq_type; 653 desc->chip = &piix4_master_irq_type;
654 } 654 }
655 else if (i < CO_IRQ_APIC0) { 655 else if (i < CO_IRQ_APIC0) {
656 set_piix4_virtual_irq_type();
656 desc->chip = &piix4_virtual_irq_type; 657 desc->chip = &piix4_virtual_irq_type;
657 } 658 }
658 else if (IS_CO_APIC(i)) { 659 else if (IS_CO_APIC(i)) {
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index d430e4c30193..ce9fbacb7526 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -28,11 +28,13 @@
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/sched.h> 30#include <linux/sched.h>
31#include <linux/gfp.h>
31#include <asm/vmi.h> 32#include <asm/vmi.h>
32#include <asm/io.h> 33#include <asm/io.h>
33#include <asm/fixmap.h> 34#include <asm/fixmap.h>
34#include <asm/apicdef.h> 35#include <asm/apicdef.h>
35#include <asm/apic.h> 36#include <asm/apic.h>
37#include <asm/pgalloc.h>
36#include <asm/processor.h> 38#include <asm/processor.h>
37#include <asm/timer.h> 39#include <asm/timer.h>
38#include <asm/vmi_time.h> 40#include <asm/vmi_time.h>
@@ -266,30 +268,6 @@ static void vmi_nop(void)
266{ 268{
267} 269}
268 270
269#ifdef CONFIG_HIGHPTE
270static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
271{
272 void *va = kmap_atomic(page, type);
273
274 /*
275 * Internally, the VMI ROM must map virtual addresses to physical
276 * addresses for processing MMU updates. By the time MMU updates
277 * are issued, this information is typically already lost.
278 * Fortunately, the VMI provides a cache of mapping slots for active
279 * page tables.
280 *
281 * We use slot zero for the linear mapping of physical memory, and
282 * in HIGHPTE kernels, slot 1 and 2 for KM_PTE0 and KM_PTE1.
283 *
284 * args: SLOT VA COUNT PFN
285 */
286 BUG_ON(type != KM_PTE0 && type != KM_PTE1);
287 vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page));
288
289 return va;
290}
291#endif
292
293static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn) 271static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
294{ 272{
295 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); 273 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
@@ -640,6 +618,12 @@ static inline int __init activate_vmi(void)
640 u64 reloc; 618 u64 reloc;
641 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc; 619 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
642 620
621 /*
622 * Prevent page tables from being allocated in highmem, even if
623 * CONFIG_HIGHPTE is enabled.
624 */
625 __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
626
643 if (call_vrom_func(vmi_rom, vmi_init) != 0) { 627 if (call_vrom_func(vmi_rom, vmi_init) != 0) {
644 printk(KERN_ERR "VMI ROM failed to initialize!"); 628 printk(KERN_ERR "VMI ROM failed to initialize!");
645 return 0; 629 return 0;
@@ -778,10 +762,6 @@ static inline int __init activate_vmi(void)
 
 	/* Set linear is needed in all cases */
 	vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
-#ifdef CONFIG_HIGHPTE
-	if (vmi_ops.set_linear_mapping)
-		pv_mmu_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
-#endif
 
 	/*
 	 * These MUST always be patched. Don't support indirect jumps
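
Clearing __GFP_HIGHMEM in activate_vmi() works because the x86 PTE allocator consults this mask on every user page-table allocation. A minimal sketch, assuming the arch/x86/mm/pgtable.c allocator of this era (simplified; not part of this diff):

#include <linux/mm.h>
#include <asm/pgalloc.h>

/* With __GFP_HIGHMEM masked out, user PTE pages always come from
 * lowmem and stay directly mapped, so the kmap_atomic_pte slot
 * dance deleted above is never needed. HIGHMEM is only set here
 * when CONFIG_HIGHPTE is on; shown unconditionally for brevity. */
gfp_t __userpte_alloc_gfp = GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT |
			    __GFP_ZERO | __GFP_HIGHMEM;

pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
	struct page *pte = alloc_pages(__userpte_alloc_gfp, 0);

	if (pte)
		pgtable_page_ctor(pte);	/* page-table page bookkeeping */
	return pte;
}
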
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 74c92bb194df..5e1ff66ecd73 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -79,11 +79,7 @@ unsigned long vmi_tsc_khz(void)
 
 static inline unsigned int vmi_get_timer_vector(void)
 {
-#ifdef CONFIG_X86_IO_APIC
-	return FIRST_DEVICE_VECTOR;
-#else
-	return FIRST_EXTERNAL_VECTOR;
-#endif
+	return IRQ0_VECTOR;
 }
 
 /** vmi clockchip */
@@ -171,7 +167,7 @@ static int vmi_timer_next_event(unsigned long delta,
 {
 	/* Unfortunately, set_next_event interface only passes relative
 	 * expiry, but we want absolute expiry.  It'd be better if were
-	 * were passed an aboslute expiry, since a bunch of time may
+	 * were passed an absolute expiry, since a bunch of time may
 	 * have been stolen between the time the delta is computed and
 	 * when we set the alarm below. */
 	cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));
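
The comment being fixed explains why the counter is sampled immediately before arming the alarm. A simplified sketch of the rest of vmi_timer_next_event() under that reading; the set_alarm flags argument is an assumption:

static int vmi_timer_next_event(unsigned long delta,
				struct clock_event_device *evt)
{
	/* Read the counter as late as possible: time stolen after
	 * 'delta' was computed still lands before 'now', so the
	 * absolute expiry 'now + delta' stays in the future. */
	cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));

	vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0); /* flags simplified */
	return 0;
}
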
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index f92a0da608cb..2cc249718c46 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -291,8 +291,8 @@ SECTIONS
 	.smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
 		__smp_locks = .;
 		*(.smp_locks)
-		__smp_locks_end = .;
 		. = ALIGN(PAGE_SIZE);
+		__smp_locks_end = .;
 	}
 
 #ifdef CONFIG_X86_64
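
The reordering matters because this section is handed back to the page allocator once SMP alternative patching is finished; with __smp_locks_end on a page boundary, the free covers exactly the section. A sketch of the consumer, based on the free path in alternative.c of this era (simplified):

	/* In alternative_instructions(), roughly: */
	if (smp_alt_once)
		free_init_pages("SMP alternatives",
				(unsigned long)__smp_locks,
				(unsigned long)__smp_locks_end);

free_init_pages() works in whole pages, so an unaligned end symbol would either leave the last partial page pinned or free bytes belonging to whatever follows in the image.
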
@@ -341,7 +341,7 @@ SECTIONS
  * Per-cpu symbols which need to be offset from __per_cpu_load
  * for the boot processor.
  */
-#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
+#define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load
 INIT_PER_CPU(gdt_page);
 INIT_PER_CPU(irq_stack_union);
 
@@ -352,7 +352,7 @@ INIT_PER_CPU(irq_stack_union);
 	   "kernel image bigger than KERNEL_IMAGE_SIZE");
 
 #ifdef CONFIG_SMP
-. = ASSERT((per_cpu__irq_stack_union == 0),
+. = ASSERT((irq_stack_union == 0),
            "irq_stack_union is not at start of per-cpu area");
 #endif
 
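
Both linker-script changes track the contemporaneous percpu rework that dropped the per_cpu__ symbol prefix: a per-cpu variable is now linked under its plain C name, so the script references gdt_page and irq_stack_union directly. A minimal sketch with a made-up variable name:

#include <linux/percpu.h>

DEFINE_PER_CPU(int, demo_counter);	/* emits the symbol 'demo_counter',
					 * formerly 'per_cpu__demo_counter' */

static void bump_demo_counter(void)
{
	this_cpu_inc(demo_counter);	/* plain symbol + per-cpu base offset */
}
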
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 9055e5872ff0..1c0c6ab9c60f 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -301,7 +301,8 @@ static int __init vsyscall_init(void)
 	register_sysctl_table(kernel_root_table2);
 #endif
 	on_each_cpu(cpu_vsyscall_init, NULL, 1);
-	hotcpu_notifier(cpu_vsyscall_notifier, 0);
+	/* notifier priority > KVM */
+	hotcpu_notifier(cpu_vsyscall_notifier, 30);
 	return 0;
 }
 
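
The bumped priority makes the vsyscall notifier run ahead of lower-priority CPU-hotplug callbacks such as KVM's, per the added comment. A self-contained sketch of the mechanism; the example_* names are hypothetical:

#include <linux/cpu.h>
#include <linux/kernel.h>
#include <linux/notifier.h>

static int example_cpu_notify(struct notifier_block *nb,
			      unsigned long action, void *hcpu)
{
	if (action == CPU_ONLINE)
		printk(KERN_INFO "cpu %ld came online\n", (long)hcpu);
	return NOTIFY_OK;
}

static int __init example_init(void)
{
	/* The second argument is the notifier priority;
	 * higher values are called first. */
	hotcpu_notifier(example_cpu_notify, 30);
	return 0;
}
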
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index ccd179dec36e..61a1e8c7e19f 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -4,9 +4,11 @@
  *  For licencing details see kernel-base/COPYING
  */
 #include <linux/init.h>
+#include <linux/ioport.h>
 
 #include <asm/bios_ebda.h>
 #include <asm/paravirt.h>
+#include <asm/pci_x86.h>
 #include <asm/mpspec.h>
 #include <asm/setup.h>
 #include <asm/apic.h>
@@ -70,16 +72,25 @@ struct x86_init_ops x86_init __initdata = {
 	.iommu = {
 		.iommu_init		= iommu_init_noop,
 	},
+
+	.pci = {
+		.init			= x86_default_pci_init,
+		.init_irq		= x86_default_pci_init_irq,
+		.fixup_irqs		= x86_default_pci_fixup_irqs,
+	},
 };
 
 struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
 	.setup_percpu_clockev		= setup_secondary_APIC_clock,
 };
 
+static void default_nmi_init(void) { };
+
 struct x86_platform_ops x86_platform = {
 	.calibrate_tsc			= native_calibrate_tsc,
 	.get_wallclock			= mach_get_cmos_time,
 	.set_wallclock			= mach_set_rtc_mmss,
 	.iommu_shutdown			= iommu_shutdown_noop,
 	.is_untracked_pat_range		= is_ISA_range,
+	.nmi_init			= default_nmi_init
 };
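
The new .pci slots and .nmi_init follow the usual x86_init override pattern: platform code replaces a default before the generic path consults it. A hypothetical example (the my_platform_* names are invented; the defaults are the ones installed above):

static int __init my_platform_pci_init(void)
{
	/* platform-specific root-bus discovery would go here */
	return 0;
}

void __init my_platform_setup(void)
{
	/* Must run before the generic PCI init path calls
	 * x86_init.pci.init(), i.e. early in platform setup. */
	x86_init.pci.init = my_platform_pci_init;
}
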
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index c5ee17e8c6d9..782c3a362ec6 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -337,6 +337,7 @@ void __ref xsave_cntxt_init(void)
 	cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
 	xstate_size = ebx;
 
+	update_regset_xstate_info(xstate_size, pcntxt_mask);
 	prepare_fx_sw_frame();
 
 	setup_xstate_init();
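
For reference, the context lines above size the XSAVE area from CPUID leaf 0xd before the new update_regset_xstate_info() call propagates the size and feature mask to the ptrace regsets. A sketch of just the query; probe_xstate_size is a made-up name:

#include <asm/processor.h>

static unsigned int probe_xstate_size(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Sub-leaf 0 of CPUID 0xd: EBX reports the bytes needed for
	 * the XSAVE area covering the currently enabled features. */
	cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
	return ebx;
}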