path: root/arch/x86/kernel

Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- arch/x86/kernel/Makefile | 1
-rw-r--r-- arch/x86/kernel/acpi/boot.c | 114
-rw-r--r-- arch/x86/kernel/alternative.c | 61
-rw-r--r-- arch/x86/kernel/amd_iommu.c | 26
-rw-r--r-- arch/x86/kernel/amd_iommu_init.c | 56
-rw-r--r-- arch/x86/kernel/apb_timer.c | 785
-rw-r--r-- arch/x86/kernel/aperture_64.c | 16
-rw-r--r-- arch/x86/kernel/apic/apic.c | 10
-rw-r--r-- arch/x86/kernel/apic/apic_flat_64.c | 2
-rw-r--r-- arch/x86/kernel/apic/es7000_32.c | 1
-rw-r--r-- arch/x86/kernel/apic/io_apic.c | 352
-rw-r--r-- arch/x86/kernel/apic/nmi.c | 15
-rw-r--r-- arch/x86/kernel/apic/numaq_32.c | 1
-rw-r--r-- arch/x86/kernel/apic/x2apic_uv_x.c | 5
-rw-r--r-- arch/x86/kernel/bootflag.c | 1
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/Kconfig | 14
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/Makefile | 1
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 1
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/elanfreq.c | 1
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/gx-suspmod.c | 1
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/longrun.c | 1
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 1
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 621
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/powernow-k6.c | 1
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 9
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 1
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 1
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/speedstep-lib.c | 1
-rw-r--r-- arch/x86/kernel/cpu/cpufreq/speedstep-smi.c | 1
-rw-r--r-- arch/x86/kernel/cpu/intel.c | 24
-rw-r--r-- arch/x86/kernel/cpu/intel_cacheinfo.c | 2
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce-inject.c | 1
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce.c | 17
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce_amd.c | 3
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce_intel.c | 5
-rw-r--r-- arch/x86/kernel/cpu/mtrr/cleanup.c | 208
-rw-r--r-- arch/x86/kernel/cpu/mtrr/generic.c | 1
-rw-r--r-- arch/x86/kernel/cpu/mtrr/if.c | 1
-rw-r--r-- arch/x86/kernel/cpu/mtrr/main.c | 1
-rw-r--r-- arch/x86/kernel/cpu/perf_event.c | 269
-rw-r--r-- arch/x86/kernel/cpu/perf_event_amd.c | 142
-rw-r--r-- arch/x86/kernel/cpu/perf_event_intel.c | 97
-rw-r--r-- arch/x86/kernel/cpu/perf_event_p6.c | 18
-rw-r--r-- arch/x86/kernel/cpu/perfctr-watchdog.c | 2
-rw-r--r-- arch/x86/kernel/cpu/vmware.c | 2
-rw-r--r-- arch/x86/kernel/cpuid.c | 1
-rw-r--r-- arch/x86/kernel/crash.c | 6
-rw-r--r-- arch/x86/kernel/crash_dump_32.c | 1
-rw-r--r-- arch/x86/kernel/dumpstack.h | 24
-rw-r--r-- arch/x86/kernel/dumpstack_64.c | 14
-rw-r--r-- arch/x86/kernel/e820.c | 373
-rw-r--r-- arch/x86/kernel/head32.c | 14
-rw-r--r-- arch/x86/kernel/head64.c | 3
-rw-r--r-- arch/x86/kernel/head_32.S | 6
-rw-r--r-- arch/x86/kernel/head_64.S | 2
-rw-r--r-- arch/x86/kernel/hpet.c | 12
-rw-r--r-- arch/x86/kernel/hw_breakpoint.c | 12
-rw-r--r-- arch/x86/kernel/i387.c | 1
-rw-r--r-- arch/x86/kernel/i8259.c | 95
-rw-r--r-- arch/x86/kernel/irqinit.c | 59
-rw-r--r-- arch/x86/kernel/k8.c | 16
-rw-r--r-- arch/x86/kernel/kdebugfs.c | 1
-rw-r--r-- arch/x86/kernel/kgdb.c | 2
-rw-r--r-- arch/x86/kernel/kprobes.c | 609
-rw-r--r-- arch/x86/kernel/ldt.c | 1
-rw-r--r-- arch/x86/kernel/machine_kexec_64.c | 1
-rw-r--r-- arch/x86/kernel/mca_32.c | 1
-rw-r--r-- arch/x86/kernel/mmconf-fam10h_64.c | 7
-rw-r--r-- arch/x86/kernel/module.c | 1
-rw-r--r-- arch/x86/kernel/mpparse.c | 4
-rw-r--r-- arch/x86/kernel/mrst.c | 216
-rw-r--r-- arch/x86/kernel/msr.c | 1
-rw-r--r-- arch/x86/kernel/olpc.c | 10
-rw-r--r-- arch/x86/kernel/paravirt.c | 4
-rw-r--r-- arch/x86/kernel/pci-calgary_64.c | 2
-rw-r--r-- arch/x86/kernel/pci-dma.c | 16
-rw-r--r-- arch/x86/kernel/pci-gart_64.c | 6
-rw-r--r-- arch/x86/kernel/pci-nommu.c | 1
-rw-r--r-- arch/x86/kernel/process.c | 34
-rw-r--r-- arch/x86/kernel/process_64.c | 4
-rw-r--r-- arch/x86/kernel/ptrace.c | 3
-rw-r--r-- arch/x86/kernel/reboot.c | 8
-rw-r--r-- arch/x86/kernel/setup.c | 35
-rw-r--r-- arch/x86/kernel/setup_percpu.c | 6
-rw-r--r-- arch/x86/kernel/smp.c | 1
-rw-r--r-- arch/x86/kernel/smpboot.c | 25
-rw-r--r-- arch/x86/kernel/sys_i386_32.c | 185
-rw-r--r-- arch/x86/kernel/sys_x86_64.c | 12
-rw-r--r-- arch/x86/kernel/syscall_table_32.S | 4
-rw-r--r-- arch/x86/kernel/time.c | 4
-rw-r--r-- arch/x86/kernel/tlb_uv.c | 1
-rw-r--r-- arch/x86/kernel/tsc.c | 4
-rw-r--r-- arch/x86/kernel/uv_irq.c | 1
-rw-r--r-- arch/x86/kernel/uv_time.c | 1
-rw-r--r-- arch/x86/kernel/visws_quirks.c | 27
-rw-r--r-- arch/x86/kernel/vmi_32.c | 36
-rw-r--r-- arch/x86/kernel/vmiclock_32.c | 8
-rw-r--r-- arch/x86/kernel/vmlinux.lds.S | 6
-rw-r--r-- arch/x86/kernel/x86_init.c | 8
99 files changed, 3315 insertions(+), 1514 deletions(-)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d87f09bc5a5..4c58352209e 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -87,6 +87,7 @@ obj-$(CONFIG_VM86) += vm86_32.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o

 obj-$(CONFIG_HPET_TIMER)	+= hpet.o
+obj-$(CONFIG_APB_TIMER)	+= apb_timer.o

 obj-$(CONFIG_K8_NB)		+= k8.o
 obj-$(CONFIG_DEBUG_RODATA_TEST)	+= test_rodata.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index f95703098f8..cd40aba6aa9 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -31,10 +31,12 @@
 #include <linux/module.h>
 #include <linux/dmi.h>
 #include <linux/irq.h>
+#include <linux/slab.h>
 #include <linux/bootmem.h>
 #include <linux/ioport.h>
 #include <linux/pci.h>

+#include <asm/pci_x86.h>
 #include <asm/pgtable.h>
 #include <asm/io_apic.h>
 #include <asm/apic.h>
@@ -447,6 +449,12 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
 int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
 {
 	*irq = gsi;
+
+#ifdef CONFIG_X86_IO_APIC
+	if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC)
+		setup_IO_APIC_irq_extra(gsi);
+#endif
+
 	return 0;
 }

@@ -474,7 +482,8 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 		plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
 	}
 #endif
-	acpi_gsi_to_irq(plat_gsi, &irq);
+	irq = plat_gsi;
+
 	return irq;
 }

@@ -482,6 +491,7 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
  * ACPI based hotplug support for CPU
  */
 #ifdef CONFIG_ACPI_HOTPLUG_CPU
+#include <acpi/processor.h>

 static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
 {
@@ -559,6 +569,8 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
 		goto free_new_map;
 	}

+	acpi_processor_set_pdc(handle);
+
 	cpu = cpumask_first(new_map);
 	acpi_map_cpu2node(handle, cpu, physid);

@@ -1285,23 +1297,6 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d)
 }

 /*
- * Limit ACPI to CPU enumeration for HT
- */
-static int __init force_acpi_ht(const struct dmi_system_id *d)
-{
-	if (!acpi_force) {
-		printk(KERN_NOTICE "%s detected: force use of acpi=ht\n",
-		       d->ident);
-		disable_acpi();
-		acpi_ht = 1;
-	} else {
-		printk(KERN_NOTICE
-		       "Warning: acpi=force overrules DMI blacklist: acpi=ht\n");
-	}
-	return 0;
-}
-
-/*
  * Force ignoring BIOS IRQ0 pin2 override
  */
 static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
@@ -1337,82 +1332,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
 	},

 	/*
-	 * Boxes that need acpi=ht
-	 */
-	{
-	 .callback = force_acpi_ht,
-	 .ident = "FSC Primergy T850",
-	 .matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
-		DMI_MATCH(DMI_PRODUCT_NAME, "PRIMERGY T850"),
-		},
-	 },
-	{
-	 .callback = force_acpi_ht,
-	 .ident = "HP VISUALIZE NT Workstation",
-	 .matches = {
-		DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"),
-		DMI_MATCH(DMI_PRODUCT_NAME, "HP VISUALIZE NT Workstation"),
-		},
-	 },
-	{
-	 .callback = force_acpi_ht,
-	 .ident = "Compaq Workstation W8000",
-	 .matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
-		DMI_MATCH(DMI_PRODUCT_NAME, "Workstation W8000"),
-		},
-	 },
-	{
-	 .callback = force_acpi_ht,
-	 .ident = "ASUS CUR-DLS",
-	 .matches = {
-		DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
-		DMI_MATCH(DMI_BOARD_NAME, "CUR-DLS"),
-		},
-	 },
-	{
-	 .callback = force_acpi_ht,
-	 .ident = "ABIT i440BX-W83977",
-	 .matches = {
-		DMI_MATCH(DMI_BOARD_VENDOR, "ABIT <http://www.abit.com>"),
-		DMI_MATCH(DMI_BOARD_NAME, "i440BX-W83977 (BP6)"),
-		},
-	 },
-	{
-	 .callback = force_acpi_ht,
-	 .ident = "IBM Bladecenter",
-	 .matches = {
-		DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
-		DMI_MATCH(DMI_BOARD_NAME, "IBM eServer BladeCenter HS20"),
-		},
-	 },
-	{
-	 .callback = force_acpi_ht,
-	 .ident = "IBM eServer xSeries 360",
-	 .matches = {
-		DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
-		DMI_MATCH(DMI_BOARD_NAME, "eServer xSeries 360"),
-		},
-	 },
-	{
-	 .callback = force_acpi_ht,
-	 .ident = "IBM eserver xSeries 330",
-	 .matches = {
-		DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
-		DMI_MATCH(DMI_BOARD_NAME, "eserver xSeries 330"),
-		},
-	 },
-	{
-	 .callback = force_acpi_ht,
-	 .ident = "IBM eserver xSeries 440",
-	 .matches = {
-		DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
-		DMI_MATCH(DMI_PRODUCT_NAME, "eserver xSeries 440"),
-		},
-	 },
-
-	/*
 	 * Boxes that need ACPI PCI IRQ routing disabled
 	 */
 	{
@@ -1617,6 +1536,9 @@ int __init acpi_boot_init(void)

 	acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet);

+	if (!acpi_noirq)
+		x86_init.pci.init = pci_acpi_init;
+
 	return 0;
 }

@@ -1641,8 +1563,10 @@ static int __init parse_acpi(char *arg)
 	}
 	/* Limit ACPI just to boot-time to enable HT */
 	else if (strcmp(arg, "ht") == 0) {
-		if (!acpi_force)
+		if (!acpi_force) {
+			printk(KERN_WARNING "acpi=ht will be removed in Linux-2.6.35\n");
 			disable_acpi();
+		}
 		acpi_ht = 1;
 	}
 	/* acpi=rsdt use RSDT instead of XSDT */
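
Worth noting about the acpi_boot_init() hunk above: x86_init is a table of boot-time function pointers, so assigning x86_init.pci.init = pci_acpi_init swaps in the ACPI PCI init path without an #ifdef. A reduced sketch of the pattern, with the surrounding structure abridged (only the pci.init hook is taken from the hunk; the rest of the layout is illustrative):

/* Reduced sketch of the x86_init hook pattern (abridged/illustrative). */
struct x86_init_pci {
	int (*init)(void);			/* platform PCI initialization */
};

struct x86_init_ops {
	struct x86_init_pci pci;
	/* ... more boot-time hooks elided ... */
};

extern struct x86_init_ops x86_init;		/* pre-filled with safe defaults */
extern int acpi_noirq;				/* set by "acpi=noirq" and friends */
int pci_acpi_init(void);			/* ACPI-driven PCI IRQ setup */

static int __init example_acpi_boot_init(void)
{
	if (!acpi_noirq)			/* as in the hunk above */
		x86_init.pci.init = pci_acpi_init;
	return 0;
}
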
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index e6ea0342c8f..1a160d5d44d 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -7,6 +7,8 @@
 #include <linux/mm.h>
 #include <linux/vmalloc.h>
 #include <linux/memory.h>
+#include <linux/stop_machine.h>
+#include <linux/slab.h>
 #include <asm/alternative.h>
 #include <asm/sections.h>
 #include <asm/pgtable.h>
@@ -572,3 +574,62 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
 	local_irq_restore(flags);
 	return addr;
 }
+
+/*
+ * Cross-modifying kernel text with stop_machine().
+ * This code originally comes from immediate value.
+ */
+static atomic_t stop_machine_first;
+static int wrote_text;
+
+struct text_poke_params {
+	void *addr;
+	const void *opcode;
+	size_t len;
+};
+
+static int __kprobes stop_machine_text_poke(void *data)
+{
+	struct text_poke_params *tpp = data;
+
+	if (atomic_dec_and_test(&stop_machine_first)) {
+		text_poke(tpp->addr, tpp->opcode, tpp->len);
+		smp_wmb();	/* Make sure other cpus see that this has run */
+		wrote_text = 1;
+	} else {
+		while (!wrote_text)
+			cpu_relax();
+		smp_mb();	/* Load wrote_text before following execution */
+	}
+
+	flush_icache_range((unsigned long)tpp->addr,
+			   (unsigned long)tpp->addr + tpp->len);
+	return 0;
+}
+
+/**
+ * text_poke_smp - Update instructions on a live kernel on SMP
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy
+ *
+ * Modify multi-byte instruction by using stop_machine() on SMP. This allows
+ * user to poke/set multi-byte text on SMP. Only non-NMI/MCE code modifying
+ * should be allowed, since stop_machine() does _not_ protect code against
+ * NMI and MCE.
+ *
+ * Note: Must be called under get_online_cpus() and text_mutex.
+ */
+void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
+{
+	struct text_poke_params tpp;
+
+	tpp.addr = addr;
+	tpp.opcode = opcode;
+	tpp.len = len;
+	atomic_set(&stop_machine_first, 1);
+	wrote_text = 0;
+	stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
+	return addr;
+}
+
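
As the kerneldoc block above spells out, text_poke_smp() must run under get_online_cpus() and text_mutex. A hedged sketch of a caller honoring those rules (the target address and replacement bytes are placeholders, not from this patch):

/* Hypothetical caller of text_poke_smp() (placeholder names). */
static unsigned char newcode[5];	/* e.g. a 5-byte relative jump built elsewhere */

static void example_patch(void *target_addr)
{
	get_online_cpus();		/* required by text_poke_smp()... */
	mutex_lock(&text_mutex);	/* ...as is holding text_mutex */

	text_poke_smp(target_addr, newcode, sizeof(newcode));

	mutex_unlock(&text_mutex);
	put_online_cpus();
}
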
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index d8da9988edd..fa5a1474cd1 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -18,8 +18,8 @@
  */

 #include <linux/pci.h>
-#include <linux/gfp.h>
 #include <linux/bitmap.h>
+#include <linux/slab.h>
 #include <linux/debugfs.h>
 #include <linux/scatterlist.h>
 #include <linux/dma-mapping.h>
@@ -118,7 +118,7 @@ static bool check_device(struct device *dev)
 		return false;

 	/* No device or no PCI device */
-	if (!dev || dev->bus != &pci_bus_type)
+	if (dev->bus != &pci_bus_type)
 		return false;

 	devid = get_device_id(dev);
@@ -392,6 +392,7 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
 	u32 tail, head;
 	u8 *target;

+	WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
 	tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
 	target = iommu->cmd_buf + tail;
 	memcpy_toio(target, cmd, sizeof(*cmd));
@@ -2253,7 +2254,7 @@ static void prealloc_protection_domains(void)
 	struct dma_ops_domain *dma_dom;
 	u16 devid;

-	while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
+	for_each_pci_dev(dev) {

 		/* Do we handle this device? */
 		if (!check_device(&dev->dev))
@@ -2365,7 +2366,7 @@ static void cleanup_domain(struct protection_domain *domain)
 	list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) {
 		struct device *dev = dev_data->dev;

-		do_detach(dev);
+		__detach_device(dev);
 		atomic_set(&dev_data->bind, 0);
 	}

@@ -2394,6 +2395,7 @@ static struct protection_domain *protection_domain_alloc(void)
 		return NULL;

 	spin_lock_init(&domain->lock);
+	mutex_init(&domain->api_lock);
 	domain->id = domain_id_alloc();
 	if (!domain->id)
 		goto out_err;
@@ -2446,9 +2448,7 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom)

 	free_pagetable(domain);

-	domain_id_free(domain->id);
-
-	kfree(domain);
+	protection_domain_free(domain);

 	dom->priv = NULL;
 }
@@ -2512,13 +2512,18 @@ static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
 	unsigned long page_size = 0x1000UL << gfp_order;
 	struct protection_domain *domain = dom->priv;
 	int prot = 0;
+	int ret;

 	if (iommu_prot & IOMMU_READ)
 		prot |= IOMMU_PROT_IR;
 	if (iommu_prot & IOMMU_WRITE)
 		prot |= IOMMU_PROT_IW;

-	return iommu_map_page(domain, iova, paddr, prot, page_size);
+	mutex_lock(&domain->api_lock);
+	ret = iommu_map_page(domain, iova, paddr, prot, page_size);
+	mutex_unlock(&domain->api_lock);
+
+	return ret;
 }

 static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
@@ -2528,7 +2533,12 @@ static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
 	unsigned long page_size, unmap_size;

 	page_size = 0x1000UL << gfp_order;
+
+	mutex_lock(&domain->api_lock);
 	unmap_size = iommu_unmap_page(domain, iova, page_size);
+	mutex_unlock(&domain->api_lock);
+
+	iommu_flush_tlb_pde(domain);

 	return get_order(unmap_size);
 }
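
The new per-domain api_lock above makes each iommu map/unmap call on a domain a single critical section, so concurrent callers cannot interleave page-table updates. A minimal standalone sketch of that locking shape (the domain type is simplified and illustrative):

/* Simplified stand-in for struct protection_domain (illustrative). */
struct demo_domain {
	struct mutex api_lock;		/* serializes whole map/unmap calls */
};

static int demo_map(struct demo_domain *d)
{
	int ret;

	mutex_lock(&d->api_lock);	/* map/unmap may sleep, so a mutex fits */
	ret = 0;			/* iommu_map_page() would run here */
	mutex_unlock(&d->api_lock);

	return ret;
}
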
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 9dc91b43147..3bacb4d0844 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -19,8 +19,8 @@

 #include <linux/pci.h>
 #include <linux/acpi.h>
-#include <linux/gfp.h>
 #include <linux/list.h>
+#include <linux/slab.h>
 #include <linux/sysdev.h>
 #include <linux/interrupt.h>
 #include <linux/msi.h>
@@ -120,6 +120,7 @@ struct ivmd_header {
 bool amd_iommu_dump;

 static int __initdata amd_iommu_detected;
+static bool __initdata amd_iommu_disabled;

 u16 amd_iommu_last_bdf;			/* largest PCI device id we have
 					   to handle */
@@ -138,9 +139,9 @@ int amd_iommus_present;
 bool amd_iommu_np_cache __read_mostly;

 /*
- * Set to true if ACPI table parsing and hardware intialization went properly
+ * The ACPI table parsing functions set this variable on an error
  */
-static bool amd_iommu_initialized;
+static int __initdata amd_iommu_init_err;

 /*
  * List of protection domains - used during resume
@@ -391,9 +392,11 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table)
 	 */
 	for (i = 0; i < table->length; ++i)
 		checksum += p[i];
-	if (checksum != 0)
+	if (checksum != 0) {
 		/* ACPI table corrupt */
-		return -ENODEV;
+		amd_iommu_init_err = -ENODEV;
+		return 0;
+	}

 	p += IVRS_HEADER_LENGTH;

@@ -436,7 +439,7 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
 	if (cmd_buf == NULL)
 		return NULL;

-	iommu->cmd_buf_size = CMD_BUFFER_SIZE;
+	iommu->cmd_buf_size = CMD_BUFFER_SIZE | CMD_BUFFER_UNINITIALIZED;

 	return cmd_buf;
 }
@@ -472,12 +475,13 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu)
 		    &entry, sizeof(entry));

 	amd_iommu_reset_cmd_buffer(iommu);
+	iommu->cmd_buf_size &= ~(CMD_BUFFER_UNINITIALIZED);
 }

 static void __init free_command_buffer(struct amd_iommu *iommu)
 {
 	free_pages((unsigned long)iommu->cmd_buf,
-		   get_order(iommu->cmd_buf_size));
+		   get_order(iommu->cmd_buf_size & ~(CMD_BUFFER_UNINITIALIZED)));
 }

 /* allocates the memory where the IOMMU will log its events to */
@@ -920,11 +924,16 @@ static int __init init_iommu_all(struct acpi_table_header *table)
 				    h->mmio_phys);

 			iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
-			if (iommu == NULL)
-				return -ENOMEM;
+			if (iommu == NULL) {
+				amd_iommu_init_err = -ENOMEM;
+				return 0;
+			}
+
 			ret = init_iommu_one(iommu, h);
-			if (ret)
-				return ret;
+			if (ret) {
+				amd_iommu_init_err = ret;
+				return 0;
+			}
 			break;
 		default:
 			break;
@@ -934,8 +943,6 @@ static int __init init_iommu_all(struct acpi_table_header *table)
 	}
 	WARN_ON(p != end);

-	amd_iommu_initialized = true;
-
 	return 0;
 }

@@ -1211,6 +1218,10 @@ static int __init amd_iommu_init(void)
 	if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
 		return -ENODEV;

+	ret = amd_iommu_init_err;
+	if (ret)
+		goto out;
+
 	dev_table_size     = tbl_size(DEV_TABLE_ENTRY_SIZE);
 	alias_table_size   = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
 	rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
@@ -1270,12 +1281,19 @@ static int __init amd_iommu_init(void)
 	if (acpi_table_parse("IVRS", init_iommu_all) != 0)
 		goto free;

-	if (!amd_iommu_initialized)
+	if (amd_iommu_init_err) {
+		ret = amd_iommu_init_err;
 		goto free;
+	}

 	if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
 		goto free;

+	if (amd_iommu_init_err) {
+		ret = amd_iommu_init_err;
+		goto free;
+	}
+
 	ret = sysdev_class_register(&amd_iommu_sysdev_class);
 	if (ret)
 		goto free;
@@ -1288,6 +1306,8 @@ static int __init amd_iommu_init(void)
 	if (ret)
 		goto free;

+	enable_iommus();
+
 	if (iommu_pass_through)
 		ret = amd_iommu_init_passthrough();
 	else
@@ -1300,8 +1320,6 @@ static int __init amd_iommu_init(void)

 	amd_iommu_init_notifier();

-	enable_iommus();
-
 	if (iommu_pass_through)
 		goto out;

@@ -1315,6 +1333,7 @@ out:
 	return ret;

 free:
+	disable_iommus();

 	amd_iommu_uninit_devices();

@@ -1354,6 +1373,9 @@ void __init amd_iommu_detect(void)
 	if (no_iommu || (iommu_detected && !gart_iommu_aperture))
 		return;

+	if (amd_iommu_disabled)
+		return;
+
 	if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
 		iommu_detected = 1;
 		amd_iommu_detected = 1;
@@ -1383,6 +1405,8 @@ static int __init parse_amd_iommu_options(char *str)
 	for (; *str; ++str) {
 		if (strncmp(str, "fullflush", 9) == 0)
 			amd_iommu_unmap_flush = true;
+		if (strncmp(str, "off", 3) == 0)
+			amd_iommu_disabled = true;
 	}

 	return 1;
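
The amd_iommu_init_err variable exists because acpi_table_parse() callbacks have no useful way to propagate a failure code back to amd_iommu_init(): the callbacks latch the error and return 0, and the caller checks the latch afterwards. A minimal sketch of the idiom (callback and init names are illustrative):

static int __initdata demo_init_err;	/* latched by the table callback */

static int __init demo_table_cb(struct acpi_table_header *table)
{
	if (table->length < 4) {		/* illustrative failure condition */
		demo_init_err = -ENODEV;	/* remember the error... */
		return 0;			/* ...but don't return it here */
	}
	return 0;
}

static int __init demo_init(void)
{
	if (acpi_table_parse("IVRS", demo_table_cb) != 0)
		return -ENODEV;			/* table missing entirely */
	if (demo_init_err)			/* callback hit a problem */
		return demo_init_err;
	return 0;
}
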
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
new file mode 100644
index 00000000000..a35347501d3
--- /dev/null
+++ b/arch/x86/kernel/apb_timer.c
@@ -0,0 +1,785 @@
+/*
+ * apb_timer.c: Driver for Langwell APB timers
+ *
+ * (C) Copyright 2009 Intel Corporation
+ * Author: Jacob Pan (jacob.jun.pan@intel.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * Note:
+ * Langwell is the south complex of Intel Moorestown MID platform. There are
+ * eight external timers in total that can be used by the operating system.
+ * The timer information, such as frequency and addresses, is provided to the
+ * OS via SFI tables.
+ * Timer interrupts are routed via FW/HW emulated IOAPIC independently via
+ * individual redirection table entries (RTE).
+ * Unlike HPET, there is no master counter, therefore one of the timers are
+ * used as clocksource. The overall allocation looks like:
+ *  - timer 0 - NR_CPUs for per cpu timer
+ *  - one timer for clocksource
+ *  - one timer for watchdog driver.
+ * It is also worth notice that APB timer does not support true one-shot mode,
+ * free-running mode will be used here to emulate one-shot mode.
+ * APB timer can also be used as broadcast timer along with per cpu local APIC
+ * timer, but by default APB timer has higher rating than local APIC timers.
+ */
+
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/sysdev.h>
+#include <linux/slab.h>
+#include <linux/pm.h>
+#include <linux/pci.h>
+#include <linux/sfi.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+#include <linux/irq.h>
+
+#include <asm/fixmap.h>
+#include <asm/apb_timer.h>
+
+#define APBT_MASK			CLOCKSOURCE_MASK(32)
+#define APBT_SHIFT			22
+#define APBT_CLOCKEVENT_RATING		150
+#define APBT_CLOCKSOURCE_RATING		250
+#define APBT_MIN_DELTA_USEC		200
+
+#define EVT_TO_APBT_DEV(evt) container_of(evt, struct apbt_dev, evt)
+#define APBT_CLOCKEVENT0_NUM	(0)
+#define APBT_CLOCKEVENT1_NUM	(1)
+#define APBT_CLOCKSOURCE_NUM	(2)
+
+static unsigned long apbt_address;
+static int apb_timer_block_enabled;
+static void __iomem *apbt_virt_address;
+static int phy_cs_timer_id;
+
+/*
+ * Common DW APB timer info
+ */
+static uint64_t apbt_freq;
+
+static void apbt_set_mode(enum clock_event_mode mode,
+			  struct clock_event_device *evt);
+static int apbt_next_event(unsigned long delta,
+			   struct clock_event_device *evt);
+static cycle_t apbt_read_clocksource(struct clocksource *cs);
+static void apbt_restart_clocksource(struct clocksource *cs);
+
+struct apbt_dev {
+	struct clock_event_device evt;
+	unsigned int num;
+	int cpu;
+	unsigned int irq;
+	unsigned int tick;
+	unsigned int count;
+	unsigned int flags;
+	char name[10];
+};
+
+int disable_apbt_percpu __cpuinitdata;
+
+static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
+
+#ifdef CONFIG_SMP
+static unsigned int apbt_num_timers_used;
+static struct apbt_dev *apbt_devs;
+#endif
+
+static inline unsigned long apbt_readl_reg(unsigned long a)
+{
+	return readl(apbt_virt_address + a);
+}
+
+static inline void apbt_writel_reg(unsigned long d, unsigned long a)
+{
+	writel(d, apbt_virt_address + a);
+}
+
+static inline unsigned long apbt_readl(int n, unsigned long a)
+{
+	return readl(apbt_virt_address + a + n * APBTMRS_REG_SIZE);
+}
+
+static inline void apbt_writel(int n, unsigned long d, unsigned long a)
+{
+	writel(d, apbt_virt_address + a + n * APBTMRS_REG_SIZE);
+}
+
+static inline void apbt_set_mapping(void)
+{
+	struct sfi_timer_table_entry *mtmr;
+
+	if (apbt_virt_address) {
+		pr_debug("APBT base already mapped\n");
+		return;
+	}
+	mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM);
+	if (mtmr == NULL) {
+		printk(KERN_ERR "Failed to get MTMR %d from SFI\n",
+		       APBT_CLOCKEVENT0_NUM);
+		return;
+	}
+	apbt_address = (unsigned long)mtmr->phys_addr;
+	if (!apbt_address) {
+		printk(KERN_WARNING "No timer base from SFI, use default\n");
+		apbt_address = APBT_DEFAULT_BASE;
+	}
+	apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE);
+	if (apbt_virt_address) {
+		pr_debug("Mapped APBT physical addr %p at virtual addr %p\n",
+			 (void *)apbt_address, (void *)apbt_virt_address);
+	} else {
+		pr_debug("Failed mapping APBT phy address at %p\n",
+			 (void *)apbt_address);
+		goto panic_noapbt;
+	}
+	apbt_freq = mtmr->freq_hz / USEC_PER_SEC;
+	sfi_free_mtmr(mtmr);
+
+	/* Now figure out the physical timer id for clocksource device */
+	mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM);
+	if (mtmr == NULL)
+		goto panic_noapbt;
+
+	/* Now figure out the physical timer id */
+	phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff)
+		/ APBTMRS_REG_SIZE;
+	pr_debug("Use timer %d for clocksource\n", phy_cs_timer_id);
+	return;
+
+panic_noapbt:
+	panic("Failed to setup APB system timer\n");
+
+}
+
+static inline void apbt_clear_mapping(void)
+{
+	iounmap(apbt_virt_address);
+	apbt_virt_address = NULL;
+}
+
+/*
+ * APBT timer interrupt enable / disable
+ */
+static inline int is_apbt_capable(void)
+{
+	return apbt_virt_address ? 1 : 0;
+}
+
+static struct clocksource clocksource_apbt = {
+	.name		= "apbt",
+	.rating		= APBT_CLOCKSOURCE_RATING,
+	.read		= apbt_read_clocksource,
+	.mask		= APBT_MASK,
+	.shift		= APBT_SHIFT,
+	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
+	.resume		= apbt_restart_clocksource,
+};
+
+/* boot APB clock event device */
+static struct clock_event_device apbt_clockevent = {
+	.name		= "apbt0",
+	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
+	.set_mode	= apbt_set_mode,
+	.set_next_event = apbt_next_event,
+	.shift		= APBT_SHIFT,
+	.irq		= 0,
+	.rating		= APBT_CLOCKEVENT_RATING,
+};
+
+/*
+ * if user does not want to use per CPU apb timer, just give it a lower rating
+ * than local apic timer and skip the late per cpu timer init.
+ */
+static inline int __init setup_x86_mrst_timer(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
+
+	if (strcmp("apbt_only", arg) == 0)
+		disable_apbt_percpu = 0;
+	else if (strcmp("lapic_and_apbt", arg) == 0)
+		disable_apbt_percpu = 1;
+	else {
+		pr_warning("X86 MRST timer option %s not recognised"
+			   " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
+			   arg);
+		return -EINVAL;
+	}
+	return 0;
+}
+__setup("x86_mrst_timer=", setup_x86_mrst_timer);
+
+/*
+ * start count down from 0xffff_ffff. this is done by toggling the enable bit
+ * then load initial load count to ~0.
+ */
+static void apbt_start_counter(int n)
+{
+	unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
+
+	ctrl &= ~APBTMR_CONTROL_ENABLE;
+	apbt_writel(n, ctrl, APBTMR_N_CONTROL);
+	apbt_writel(n, ~0, APBTMR_N_LOAD_COUNT);
+	/* enable, mask interrupt */
+	ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
+	ctrl |= (APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_INT);
+	apbt_writel(n, ctrl, APBTMR_N_CONTROL);
+	/* read it once to get cached counter value initialized */
+	apbt_read_clocksource(&clocksource_apbt);
+}
+
+static irqreturn_t apbt_interrupt_handler(int irq, void *data)
+{
+	struct apbt_dev *dev = (struct apbt_dev *)data;
+	struct clock_event_device *aevt = &dev->evt;
+
+	if (!aevt->event_handler) {
+		printk(KERN_INFO "Spurious APBT timer interrupt on %d\n",
+		       dev->num);
+		return IRQ_NONE;
+	}
+	aevt->event_handler(aevt);
+	return IRQ_HANDLED;
+}
+
+static void apbt_restart_clocksource(struct clocksource *cs)
+{
+	apbt_start_counter(phy_cs_timer_id);
+}
+
+/* Setup IRQ routing via IOAPIC */
+#ifdef CONFIG_SMP
+static void apbt_setup_irq(struct apbt_dev *adev)
+{
+	struct irq_chip *chip;
+	struct irq_desc *desc;
+
+	/* timer0 irq has been setup early */
+	if (adev->irq == 0)
+		return;
+	desc = irq_to_desc(adev->irq);
+	chip = get_irq_chip(adev->irq);
+	disable_irq(adev->irq);
+	desc->status |= IRQ_MOVE_PCNTXT;
+	irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
+	/* APB timer irqs are set up as mp_irqs, timer is edge triggerred */
+	set_irq_chip_and_handler_name(adev->irq, chip, handle_edge_irq, "edge");
+	enable_irq(adev->irq);
+	if (system_state == SYSTEM_BOOTING)
+		if (request_irq(adev->irq, apbt_interrupt_handler,
+				IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
+				adev->name, adev)) {
+			printk(KERN_ERR "Failed request IRQ for APBT%d\n",
+			       adev->num);
+		}
+}
+#endif
+
+static void apbt_enable_int(int n)
+{
+	unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
+	/* clear pending intr */
+	apbt_readl(n, APBTMR_N_EOI);
+	ctrl &= ~APBTMR_CONTROL_INT;
+	apbt_writel(n, ctrl, APBTMR_N_CONTROL);
+}
+
+static void apbt_disable_int(int n)
+{
+	unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
+
+	ctrl |= APBTMR_CONTROL_INT;
+	apbt_writel(n, ctrl, APBTMR_N_CONTROL);
+}
+
+
+static int __init apbt_clockevent_register(void)
+{
+	struct sfi_timer_table_entry *mtmr;
+	struct apbt_dev *adev = &__get_cpu_var(cpu_apbt_dev);
+
+	mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM);
+	if (mtmr == NULL) {
+		printk(KERN_ERR "Failed to get MTMR %d from SFI\n",
+		       APBT_CLOCKEVENT0_NUM);
+		return -ENODEV;
+	}
+
+	/*
+	 * We need to calculate the scaled math multiplication factor for
+	 * nanosecond to apbt tick conversion.
+	 * mult = (nsec/cycle)*2^APBT_SHIFT
+	 */
+	apbt_clockevent.mult = div_sc((unsigned long) mtmr->freq_hz
+				      , NSEC_PER_SEC, APBT_SHIFT);
+
+	/* Calculate the min / max delta */
+	apbt_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
+							   &apbt_clockevent);
+	apbt_clockevent.min_delta_ns = clockevent_delta2ns(
+		APBT_MIN_DELTA_USEC*apbt_freq,
+		&apbt_clockevent);
+	/*
+	 * Start apbt with the boot cpu mask and make it
+	 * global if not used for per cpu timer.
+	 */
+	apbt_clockevent.cpumask = cpumask_of(smp_processor_id());
+	adev->num = smp_processor_id();
+	memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));
+
+	if (disable_apbt_percpu) {
+		apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100;
+		global_clock_event = &adev->evt;
+		printk(KERN_DEBUG "%s clockevent registered as global\n",
+		       global_clock_event->name);
+	}
+
+	if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler,
+			IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
+			apbt_clockevent.name, adev)) {
+		printk(KERN_ERR "Failed request IRQ for APBT%d\n",
+		       apbt_clockevent.irq);
+	}
+
+	clockevents_register_device(&adev->evt);
+	/* Start APBT 0 interrupts */
+	apbt_enable_int(APBT_CLOCKEVENT0_NUM);
+
+	sfi_free_mtmr(mtmr);
+	return 0;
+}
+
+#ifdef CONFIG_SMP
+/* Should be called with per cpu */
+void apbt_setup_secondary_clock(void)
+{
+	struct apbt_dev *adev;
+	struct clock_event_device *aevt;
+	int cpu;
+
+	/* Don't register boot CPU clockevent */
+	cpu = smp_processor_id();
+	if (cpu == boot_cpu_id)
+		return;
+	/*
+	 * We need to calculate the scaled math multiplication factor for
+	 * nanosecond to apbt tick conversion.
+	 * mult = (nsec/cycle)*2^APBT_SHIFT
+	 */
+	printk(KERN_INFO "Init per CPU clockevent %d\n", cpu);
+	adev = &per_cpu(cpu_apbt_dev, cpu);
+	aevt = &adev->evt;
+
+	memcpy(aevt, &apbt_clockevent, sizeof(*aevt));
+	aevt->cpumask = cpumask_of(cpu);
+	aevt->name = adev->name;
+	aevt->mode = CLOCK_EVT_MODE_UNUSED;
+
+	printk(KERN_INFO "Registering CPU %d clockevent device %s, mask %08x\n",
+	       cpu, aevt->name, *(u32 *)aevt->cpumask);
+
+	apbt_setup_irq(adev);
+
+	clockevents_register_device(aevt);
+
+	apbt_enable_int(cpu);
+
+	return;
+}
+
+/*
+ * this notify handler process CPU hotplug events. in case of S0i3, nonboot
+ * cpus are disabled/enabled frequently, for performance reasons, we keep the
+ * per cpu timer irq registered so that we do need to do free_irq/request_irq.
+ *
+ * TODO: it might be more reliable to directly disable percpu clockevent device
+ * without the notifier chain. currently, cpu 0 may get interrupts from other
+ * cpu timers during the offline process due to the ordering of notification.
+ * the extra interrupt is harmless.
+ */
+static int apbt_cpuhp_notify(struct notifier_block *n,
+			     unsigned long action, void *hcpu)
+{
+	unsigned long cpu = (unsigned long)hcpu;
+	struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu);
+
+	switch (action & 0xf) {
+	case CPU_DEAD:
+		apbt_disable_int(cpu);
+		if (system_state == SYSTEM_RUNNING)
+			pr_debug("skipping APBT CPU %lu offline\n", cpu);
+		else if (adev) {
+			pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
+			free_irq(adev->irq, adev);
+		}
+		break;
+	default:
+		pr_debug(KERN_INFO "APBT notified %lu, no action\n", action);
+	}
+	return NOTIFY_OK;
+}
+
+static __init int apbt_late_init(void)
+{
+	if (disable_apbt_percpu || !apb_timer_block_enabled)
+		return 0;
+	/* This notifier should be called after workqueue is ready */
+	hotcpu_notifier(apbt_cpuhp_notify, -20);
+	return 0;
+}
+fs_initcall(apbt_late_init);
+#else
+
+void apbt_setup_secondary_clock(void) {}
+
+#endif /* CONFIG_SMP */
+
+static void apbt_set_mode(enum clock_event_mode mode,
+			  struct clock_event_device *evt)
+{
+	unsigned long ctrl;
+	uint64_t delta;
+	int timer_num;
+	struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
+
+	timer_num = adev->num;
+	pr_debug("%s CPU %d timer %d mode=%d\n",
+		 __func__, first_cpu(*evt->cpumask), timer_num, mode);
+
+	switch (mode) {
+	case CLOCK_EVT_MODE_PERIODIC:
+		delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * apbt_clockevent.mult;
+		delta >>= apbt_clockevent.shift;
+		ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
+		ctrl |= APBTMR_CONTROL_MODE_PERIODIC;
+		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+		/*
+		 * DW APB p. 46, have to disable timer before load counter,
+		 * may cause sync problem.
+		 */
+		ctrl &= ~APBTMR_CONTROL_ENABLE;
+		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+		udelay(1);
+		pr_debug("Setting clock period %d for HZ %d\n", (int)delta, HZ);
+		apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
+		ctrl |= APBTMR_CONTROL_ENABLE;
+		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+		break;
+		/* APB timer does not have one-shot mode, use free running mode */
+	case CLOCK_EVT_MODE_ONESHOT:
+		ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
+		/*
+		 * set free running mode, this mode will let timer reload max
+		 * timeout which will give time (3min on 25MHz clock) to rearm
+		 * the next event, therefore emulate the one-shot mode.
+		 */
+		ctrl &= ~APBTMR_CONTROL_ENABLE;
+		ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
+
+		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+		/* write again to set free running mode */
+		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+
+		/*
+		 * DW APB p. 46, load counter with all 1s before starting free
+		 * running mode.
+		 */
+		apbt_writel(timer_num, ~0, APBTMR_N_LOAD_COUNT);
+		ctrl &= ~APBTMR_CONTROL_INT;
+		ctrl |= APBTMR_CONTROL_ENABLE;
+		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+		break;
+
+	case CLOCK_EVT_MODE_UNUSED:
+	case CLOCK_EVT_MODE_SHUTDOWN:
+		apbt_disable_int(timer_num);
+		ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
+		ctrl &= ~APBTMR_CONTROL_ENABLE;
+		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+		break;
+
+	case CLOCK_EVT_MODE_RESUME:
+		apbt_enable_int(timer_num);
+		break;
+	}
+}
+
+static int apbt_next_event(unsigned long delta,
+			   struct clock_event_device *evt)
+{
+	unsigned long ctrl;
+	int timer_num;
+
+	struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
+
+	timer_num = adev->num;
+	/* Disable timer */
+	ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
+	ctrl &= ~APBTMR_CONTROL_ENABLE;
+	apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+	/* write new count */
+	apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
+	ctrl |= APBTMR_CONTROL_ENABLE;
+	apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
+	return 0;
+}
+
+/*
+ * APB timer clock is not in sync with pclk on Langwell, which translates to
+ * unreliable read value caused by sampling error. the error does not add up
+ * overtime and only happens when sampling a 0 as a 1 by mistake. so the time
+ * would go backwards. the following code is trying to prevent time traveling
+ * backwards. little bit paranoid.
+ */
+static cycle_t apbt_read_clocksource(struct clocksource *cs)
+{
+	unsigned long t0, t1, t2;
+	static unsigned long last_read;
+
+bad_count:
+	t1 = apbt_readl(phy_cs_timer_id,
+			APBTMR_N_CURRENT_VALUE);
+	t2 = apbt_readl(phy_cs_timer_id,
+			APBTMR_N_CURRENT_VALUE);
+	if (unlikely(t1 < t2)) {
+		pr_debug("APBT: read current count error %lx:%lx:%lx\n",
+			 t1, t2, t2 - t1);
+		goto bad_count;
+	}
+	/*
+	 * check against cached last read, makes sure time does not go back.
+	 * it could be a normal rollover but we will do tripple check anyway
+	 */
+	if (unlikely(t2 > last_read)) {
+		/* check if we have a normal rollover */
+		unsigned long raw_intr_status =
+			apbt_readl_reg(APBTMRS_RAW_INT_STATUS);
+		/*
+		 * cs timer interrupt is masked but raw intr bit is set if
+		 * rollover occurs. then we read EOI reg to clear it.
+		 */
+		if (raw_intr_status & (1 << phy_cs_timer_id)) {
+			apbt_readl(phy_cs_timer_id, APBTMR_N_EOI);
+			goto out;
+		}
+		pr_debug("APB CS going back %lx:%lx:%lx ",
+			 t2, last_read, t2 - last_read);
+bad_count_x3:
+		pr_debug(KERN_INFO "tripple check enforced\n");
+		t0 = apbt_readl(phy_cs_timer_id,
+				APBTMR_N_CURRENT_VALUE);
+		udelay(1);
+		t1 = apbt_readl(phy_cs_timer_id,
+				APBTMR_N_CURRENT_VALUE);
+		udelay(1);
+		t2 = apbt_readl(phy_cs_timer_id,
+				APBTMR_N_CURRENT_VALUE);
+		if ((t2 > t1) || (t1 > t0)) {
+			printk(KERN_ERR "Error: APB CS tripple check failed\n");
+			goto bad_count_x3;
+		}
+	}
+out:
+	last_read = t2;
+	return (cycle_t)~t2;
+}
+
+static int apbt_clocksource_register(void)
+{
+	u64 start, now;
+	cycle_t t1;
+
+	/* Start the counter, use timer 2 as source, timer 0/1 for event */
+	apbt_start_counter(phy_cs_timer_id);
+
+	/* Verify whether apbt counter works */
+	t1 = apbt_read_clocksource(&clocksource_apbt);
+	rdtscll(start);
+
+	/*
+	 * We don't know the TSC frequency yet, but waiting for
+	 * 200000 TSC cycles is safe:
+	 * 4 GHz == 50us
+	 * 1 GHz == 200us
+	 */
+	do {
+		rep_nop();
+		rdtscll(now);
+	} while ((now - start) < 200000UL);
+
+	/* APBT is the only always on clocksource, it has to work! */
+	if (t1 == apbt_read_clocksource(&clocksource_apbt))
+		panic("APBT counter not counting. APBT disabled\n");
+
+	/*
+	 * initialize and register APBT clocksource
+	 * convert that to ns/clock cycle
+	 * mult = (ns/c) * 2^APBT_SHIFT
+	 */
+	clocksource_apbt.mult = div_sc(MSEC_PER_SEC,
+				       (unsigned long) apbt_freq, APBT_SHIFT);
+	clocksource_register(&clocksource_apbt);
+
+	return 0;
+}
+
+/*
+ * Early setup the APBT timer, only use timer 0 for booting then switch to
+ * per CPU timer if possible.
+ * returns 1 if per cpu apbt is setup
+ * returns 0 if no per cpu apbt is chosen
+ * panic if set up failed, this is the only platform timer on Moorestown.
+ */
+void __init apbt_time_init(void)
+{
+#ifdef CONFIG_SMP
+	int i;
+	struct sfi_timer_table_entry *p_mtmr;
+	unsigned int percpu_timer;
+	struct apbt_dev *adev;
+#endif
+
+	if (apb_timer_block_enabled)
+		return;
+	apbt_set_mapping();
+	if (apbt_virt_address) {
+		pr_debug("Found APBT version 0x%lx\n",
+			 apbt_readl_reg(APBTMRS_COMP_VERSION));
+	} else
+		goto out_noapbt;
+	/*
+	 * Read the frequency and check for a sane value, for ESL model
+	 * we extend the possible clock range to allow time scaling.
+	 */
+
+	if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) {
+		pr_debug("APBT has invalid freq 0x%llx\n", apbt_freq);
+		goto out_noapbt;
+	}
+	if (apbt_clocksource_register()) {
+		pr_debug("APBT has failed to register clocksource\n");
+		goto out_noapbt;
+	}
+	if (!apbt_clockevent_register())
+		apb_timer_block_enabled = 1;
+	else {
+		pr_debug("APBT has failed to register clockevent\n");
+		goto out_noapbt;
+	}
+#ifdef CONFIG_SMP
+	/* kernel cmdline disable apb timer, so we will use lapic timers */
+	if (disable_apbt_percpu) {
+		printk(KERN_INFO "apbt: disabled per cpu timer\n");
+		return;
+	}
+	pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus());
+	if (num_possible_cpus() <= sfi_mtimer_num) {
+		percpu_timer = 1;
+		apbt_num_timers_used = num_possible_cpus();
+	} else {
+		percpu_timer = 0;
+		apbt_num_timers_used = 1;
+		adev = &per_cpu(cpu_apbt_dev, 0);
+		adev->flags &= ~APBT_DEV_USED;
+	}
+	pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used);
+
+	/* here we set up per CPU timer data structure */
+	apbt_devs = kzalloc(sizeof(struct apbt_dev) * apbt_num_timers_used,
+			    GFP_KERNEL);
+	if (!apbt_devs) {
+		printk(KERN_ERR "Failed to allocate APB timer devices\n");
+		return;
+	}
+	for (i = 0; i < apbt_num_timers_used; i++) {
+		adev = &per_cpu(cpu_apbt_dev, i);
+		adev->num = i;
+		adev->cpu = i;
+		p_mtmr = sfi_get_mtmr(i);
+		if (p_mtmr) {
+			adev->tick = p_mtmr->freq_hz;
+			adev->irq = p_mtmr->irq;
+		} else
+			printk(KERN_ERR "Failed to get timer for cpu %d\n", i);
+		adev->count = 0;
+		sprintf(adev->name, "apbt%d", i);
+	}
+#endif
+
+	return;
+
+out_noapbt:
+	apbt_clear_mapping();
+	apb_timer_block_enabled = 0;
+	panic("failed to enable APB timer\n");
+}
+
+static inline void apbt_disable(int n)
+{
+	if (is_apbt_capable()) {
+		unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
+		ctrl &= ~APBTMR_CONTROL_ENABLE;
+		apbt_writel(n, ctrl, APBTMR_N_CONTROL);
+	}
+}
+
+/* called before apb_timer_enable, use early map */
+unsigned long apbt_quick_calibrate()
+{
+	int i, scale;
+	u64 old, new;
+	cycle_t t1, t2;
+	unsigned long khz = 0;
+	u32 loop, shift;
+
+	apbt_set_mapping();
+	apbt_start_counter(phy_cs_timer_id);
+
+	/* check if the timer can count down, otherwise return */
+	old = apbt_read_clocksource(&clocksource_apbt);
+	i = 10000;
+	while (--i) {
+		if (old != apbt_read_clocksource(&clocksource_apbt))
+			break;
+	}
+	if (!i)
+		goto failed;
+
+	/* count 16 ms */
+	loop = (apbt_freq * 1000) << 4;
+
+	/* restart the timer to ensure it won't get to 0 in the calibration */
+	apbt_start_counter(phy_cs_timer_id);
+
+	old = apbt_read_clocksource(&clocksource_apbt);
+	old += loop;
+
+	t1 = __native_read_tsc();
+
+	do {
+		new = apbt_read_clocksource(&clocksource_apbt);
+	} while (new < old);
+
+	t2 = __native_read_tsc();
+
+	shift = 5;
+	if (unlikely(loop >> shift == 0)) {
+		printk(KERN_INFO
+		       "APBT TSC calibration failed, not enough resolution\n");
+		return 0;
+	}
+	scale = (int)div_u64((t2 - t1), loop >> shift);
+	khz = (scale * apbt_freq * 1000) >> shift;
+	printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz);
+	return khz;
+failed:
+	return 0;
+}
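
A side note on the mult/shift arithmetic that apb_timer.c leans on: div_sc(freq, NSEC_PER_SEC, APBT_SHIFT) computes a fixed-point nanosecond-to-tick factor, and clockevent deltas are then (ns * mult) >> shift. A standalone illustration with an assumed 1 MHz timer (userspace C; values illustrative):

#include <stdio.h>
#include <stdint.h>

/* Fixed-point ns -> tick factor, mirroring what
 * div_sc(freq_hz, NSEC_PER_SEC, APBT_SHIFT) computes above:
 * mult = (freq_hz << shift) / NSEC_PER_SEC. */
int main(void)
{
	const uint64_t freq_hz = 1000000;	/* assumed 1 MHz timer */
	const unsigned int shift = 22;		/* APBT_SHIFT */
	uint64_t mult = (freq_hz << shift) / 1000000000ULL;

	/* Convert a 10 ms interval to timer ticks: (ns * mult) >> shift. */
	uint64_t ns = 10 * 1000 * 1000;
	uint64_t ticks = (ns * mult) >> shift;

	printf("mult=%llu, 10ms -> %llu ticks (expect ~10000)\n",
	       (unsigned long long)mult, (unsigned long long)ticks);
	return 0;
}
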
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index f147a95fd84..b5d8b0bcf23 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -31,7 +31,6 @@
 #include <asm/x86_init.h>

 int gart_iommu_aperture;
-EXPORT_SYMBOL_GPL(gart_iommu_aperture);
 int gart_iommu_aperture_disabled __initdata;
 int gart_iommu_aperture_allowed __initdata;

@@ -394,6 +393,7 @@ void __init gart_iommu_hole_init(void)
 	for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
 		int bus;
 		int dev_base, dev_limit;
+		u32 ctl;

 		bus = bus_dev_ranges[i].bus;
 		dev_base = bus_dev_ranges[i].dev_base;
@@ -407,7 +407,19 @@ void __init gart_iommu_hole_init(void)
 		gart_iommu_aperture = 1;
 		x86_init.iommu.iommu_init = gart_iommu_init;

-		aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7;
+		ctl = read_pci_config(bus, slot, 3,
+				      AMD64_GARTAPERTURECTL);
+
+		/*
+		 * Before we do anything else disable the GART. It may
+		 * still be enabled if we boot into a crash-kernel here.
+		 * Reconfiguring the GART while it is enabled could have
+		 * unknown side-effects.
+		 */
+		ctl &= ~GARTEN;
+		write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
+
+		aper_order = (ctl >> 1) & 7;
 		aper_size = (32 * 1024 * 1024) << aper_order;
 		aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
 		aper_base <<= 25;
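
The aperture fields decoded above are plain bitfields in PCI config space: the order lives in bits 3:1 of AMD64_GARTAPERTURECTL, and the base register counts 32 MB units (hence the shift by 25). A standalone illustration of the same arithmetic (userspace C; the register values are assumed for the example):

#include <stdio.h>

int main(void)
{
	unsigned int ctl = 0x0005;		/* assumed GARTAPERTURECTL value */
	unsigned int order = (ctl >> 1) & 7;	/* bits 3:1 */
	unsigned long size = (32UL * 1024 * 1024) << order;

	unsigned long base_reg = 0x0040;	/* assumed GARTAPERTUREBASE */
	unsigned long base = (base_reg & 0x7fff) << 25;	/* 32 MB units */

	printf("order=%u size=%lu MB base=%#lx\n", order, size >> 20, base);
	return 0;	/* prints: order=2 size=128 MB base=0x80000000 */
}
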
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 6e29b2a77aa..e5a4a1e0161 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1390,7 +1390,7 @@ void __init enable_IR_x2apic(void)
 	}

 	local_irq_save(flags);
-	mask_8259A();
+	legacy_pic->mask_all();
 	mask_IO_APIC_setup(ioapic_entries);

 	if (dmar_table_init_ret)
@@ -1422,7 +1422,7 @@ void __init enable_IR_x2apic(void)
 nox2apic:
 	if (!ret) /* IR enabling failed */
 		restore_IO_APIC_setup(ioapic_entries);
-	unmask_8259A();
+	legacy_pic->restore_mask();
 	local_irq_restore(flags);

 out:
@@ -1640,8 +1640,10 @@ int __init APIC_init_uniprocessor(void)
 	}
 #endif

+#ifndef CONFIG_SMP
 	enable_IR_x2apic();
 	default_setup_apic_routing();
+#endif

 	verify_local_APIC();
 	connect_bsp_APIC();
@@ -2018,7 +2020,7 @@ static int lapic_resume(struct sys_device *dev)
 		}

 		mask_IO_APIC_setup(ioapic_entries);
-		mask_8259A();
+		legacy_pic->mask_all();
 	}

 	if (x2apic_mode)
@@ -2062,7 +2064,7 @@ static int lapic_resume(struct sys_device *dev)

 	if (intr_remapping_enabled) {
 		reenable_intr_remapping(x2apic_mode);
-		unmask_8259A();
+		legacy_pic->restore_mask();
 		restore_IO_APIC_setup(ioapic_entries);
 		free_ioapic_entries(ioapic_entries);
 	}
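
The mask_8259A()/unmask_8259A() calls above become indirect calls through a legacy_pic descriptor, which lets PIC-less platforms (such as Moorestown, added elsewhere in this series) substitute no-ops. A rough, abridged sketch of the abstraction's shape (fields beyond the ones used in these hunks are illustrative):

/* Abridged sketch of the legacy_pic indirection (illustrative fields). */
struct legacy_pic {
	int nr_legacy_irqs;		/* 16 on a PC, 0 on PIC-less SoCs */
	void (*mask_all)(void);		/* replaces mask_8259A() callers */
	void (*restore_mask)(void);	/* replaces unmask_8259A() callers */
	/* ... init/probe hooks elided ... */
};

static void legacy_pic_noop(void) { }

/* A PIC-less platform installs no-ops and zero legacy IRQs: */
static struct legacy_pic null_legacy_pic = {
	.nr_legacy_irqs	= 0,
	.mask_all	= legacy_pic_noop,
	.restore_mask	= legacy_pic_noop,
};

struct legacy_pic *legacy_pic = &null_legacy_pic;
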
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index e3c3d820c32..09d3b17ce0c 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -223,7 +223,7 @@ struct apic apic_flat = {
 };

 /*
- * Physflat mode is used when there are more than 8 CPUs on a AMD system.
+ * Physflat mode is used when there are more than 8 CPUs on a system.
  * We cannot use logical delivery in this case because the mask
  * overflows, so use physical mode.
  */
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index dd2b5f26464..03ba1b895f5 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -42,6 +42,7 @@
 #include <linux/errno.h>
 #include <linux/acpi.h>
 #include <linux/init.h>
+#include <linux/gfp.h>
 #include <linux/nmi.h>
 #include <linux/smp.h>
 #include <linux/io.h>
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 6bdd2c7ead7..eb2789c3f72 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -36,6 +36,7 @@
 #include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <linux/jiffies.h>	/* time_after() */
+#include <linux/slab.h>
 #ifdef CONFIG_ACPI
 #include <acpi/acpi_bus.h>
 #endif
@@ -73,8 +74,8 @@
  */
 int sis_apic_bug = -1;

-static DEFINE_SPINLOCK(ioapic_lock);
-static DEFINE_SPINLOCK(vector_lock);
+static DEFINE_RAW_SPINLOCK(ioapic_lock);
+static DEFINE_RAW_SPINLOCK(vector_lock);

 /*
  * # of IRQ routing registers
@@ -94,8 +95,6 @@ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
 /* # of MP IRQ source entries */
 int mp_irq_entries;

-/* Number of legacy interrupts */
-static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY;
 /* GSI interrupts */
 static int nr_irqs_gsi = NR_IRQS_LEGACY;

@@ -140,33 +139,10 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node)

 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
 #ifdef CONFIG_SPARSE_IRQ
-static struct irq_cfg irq_cfgx[] = {
+static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY];
 #else
-static struct irq_cfg irq_cfgx[NR_IRQS] = {
+static struct irq_cfg irq_cfgx[NR_IRQS];
 #endif
-	[0]  = { .vector = IRQ0_VECTOR,  },
-	[1]  = { .vector = IRQ1_VECTOR,  },
-	[2]  = { .vector = IRQ2_VECTOR,  },
-	[3]  = { .vector = IRQ3_VECTOR,  },
-	[4]  = { .vector = IRQ4_VECTOR,  },
-	[5]  = { .vector = IRQ5_VECTOR,  },
-	[6]  = { .vector = IRQ6_VECTOR,  },
-	[7]  = { .vector = IRQ7_VECTOR,  },
-	[8]  = { .vector = IRQ8_VECTOR,  },
-	[9]  = { .vector = IRQ9_VECTOR,  },
-	[10] = { .vector = IRQ10_VECTOR, },
-	[11] = { .vector = IRQ11_VECTOR, },
-	[12] = { .vector = IRQ12_VECTOR, },
-	[13] = { .vector = IRQ13_VECTOR, },
-	[14] = { .vector = IRQ14_VECTOR, },
-	[15] = { .vector = IRQ15_VECTOR, },
-};
-
-void __init io_apic_disable_legacy(void)
-{
-	nr_legacy_irqs = 0;
-	nr_irqs_gsi = 0;
-}

 int __init arch_early_irq_init(void)
 {
@@ -176,6 +152,11 @@ int __init arch_early_irq_init(void)
176 int node; 152 int node;
177 int i; 153 int i;
178 154
155 if (!legacy_pic->nr_legacy_irqs) {
156 nr_irqs_gsi = 0;
157 io_apic_irqs = ~0UL;
158 }
159
179 cfg = irq_cfgx; 160 cfg = irq_cfgx;
180 count = ARRAY_SIZE(irq_cfgx); 161 count = ARRAY_SIZE(irq_cfgx);
181 node= cpu_to_node(boot_cpu_id); 162 node= cpu_to_node(boot_cpu_id);
@@ -185,8 +166,14 @@ int __init arch_early_irq_init(void)
185 desc->chip_data = &cfg[i]; 166 desc->chip_data = &cfg[i];
186 zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); 167 zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
187 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); 168 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
188 if (i < nr_legacy_irqs) 169 /*
189 cpumask_setall(cfg[i].domain); 170 * For legacy IRQ's, start with assigning irq0 to irq15 to
171 * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0.
172 */
173 if (i < legacy_pic->nr_legacy_irqs) {
174 cfg[i].vector = IRQ0_VECTOR + i;
175 cpumask_set_cpu(0, cfg[i].domain);
176 }
190 } 177 }
191 178
192 return 0; 179 return 0;
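The statically initialized vector table and the file-local nr_legacy_irqs
are replaced by a legacy_pic abstraction, so systems without an 8259 (or
with a different legacy controller) can plug in their own operations. A
sketch of the interface implied by the call sites in this patch; the real
declaration lives with the i8259 code elsewhere in this series:

	struct legacy_pic {
		int nr_legacy_irqs;	/* 0 when no legacy PIC exists */
		struct irq_chip *chip;	/* ->mask()/->unmask() per IRQ */
		void (*init)(int auto_eoi);
		int (*irq_pending)(int irq);
		void (*make_irq)(int irq);
	};

	extern struct legacy_pic *legacy_pic;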
@@ -406,7 +393,7 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
406 struct irq_pin_list *entry; 393 struct irq_pin_list *entry;
407 unsigned long flags; 394 unsigned long flags;
408 395
409 spin_lock_irqsave(&ioapic_lock, flags); 396 raw_spin_lock_irqsave(&ioapic_lock, flags);
410 for_each_irq_pin(entry, cfg->irq_2_pin) { 397 for_each_irq_pin(entry, cfg->irq_2_pin) {
411 unsigned int reg; 398 unsigned int reg;
412 int pin; 399 int pin;
@@ -415,11 +402,11 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
415 reg = io_apic_read(entry->apic, 0x10 + pin*2); 402 reg = io_apic_read(entry->apic, 0x10 + pin*2);
416 /* Is the remote IRR bit set? */ 403 /* Is the remote IRR bit set? */
417 if (reg & IO_APIC_REDIR_REMOTE_IRR) { 404 if (reg & IO_APIC_REDIR_REMOTE_IRR) {
418 spin_unlock_irqrestore(&ioapic_lock, flags); 405 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
419 return true; 406 return true;
420 } 407 }
421 } 408 }
422 spin_unlock_irqrestore(&ioapic_lock, flags); 409 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
423 410
424 return false; 411 return false;
425} 412}
@@ -433,10 +420,10 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
433{ 420{
434 union entry_union eu; 421 union entry_union eu;
435 unsigned long flags; 422 unsigned long flags;
436 spin_lock_irqsave(&ioapic_lock, flags); 423 raw_spin_lock_irqsave(&ioapic_lock, flags);
437 eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); 424 eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
438 eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); 425 eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
439 spin_unlock_irqrestore(&ioapic_lock, flags); 426 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
440 return eu.entry; 427 return eu.entry;
441} 428}
442 429
@@ -459,9 +446,9 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
459void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) 446void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
460{ 447{
461 unsigned long flags; 448 unsigned long flags;
462 spin_lock_irqsave(&ioapic_lock, flags); 449 raw_spin_lock_irqsave(&ioapic_lock, flags);
463 __ioapic_write_entry(apic, pin, e); 450 __ioapic_write_entry(apic, pin, e);
464 spin_unlock_irqrestore(&ioapic_lock, flags); 451 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
465} 452}
466 453
467/* 454/*
@@ -474,10 +461,10 @@ static void ioapic_mask_entry(int apic, int pin)
474 unsigned long flags; 461 unsigned long flags;
475 union entry_union eu = { .entry.mask = 1 }; 462 union entry_union eu = { .entry.mask = 1 };
476 463
477 spin_lock_irqsave(&ioapic_lock, flags); 464 raw_spin_lock_irqsave(&ioapic_lock, flags);
478 io_apic_write(apic, 0x10 + 2*pin, eu.w1); 465 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
479 io_apic_write(apic, 0x11 + 2*pin, eu.w2); 466 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
480 spin_unlock_irqrestore(&ioapic_lock, flags); 467 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
481} 468}
482 469
483/* 470/*
@@ -604,9 +591,9 @@ static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
604 591
605 BUG_ON(!cfg); 592 BUG_ON(!cfg);
606 593
607 spin_lock_irqsave(&ioapic_lock, flags); 594 raw_spin_lock_irqsave(&ioapic_lock, flags);
608 __mask_IO_APIC_irq(cfg); 595 __mask_IO_APIC_irq(cfg);
609 spin_unlock_irqrestore(&ioapic_lock, flags); 596 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
610} 597}
611 598
612static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) 599static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
@@ -614,9 +601,9 @@ static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
614 struct irq_cfg *cfg = desc->chip_data; 601 struct irq_cfg *cfg = desc->chip_data;
615 unsigned long flags; 602 unsigned long flags;
616 603
617 spin_lock_irqsave(&ioapic_lock, flags); 604 raw_spin_lock_irqsave(&ioapic_lock, flags);
618 __unmask_IO_APIC_irq(cfg); 605 __unmask_IO_APIC_irq(cfg);
619 spin_unlock_irqrestore(&ioapic_lock, flags); 606 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
620} 607}
621 608
622static void mask_IO_APIC_irq(unsigned int irq) 609static void mask_IO_APIC_irq(unsigned int irq)
@@ -865,7 +852,7 @@ static int __init find_isa_irq_apic(int irq, int type)
865 */ 852 */
866static int EISA_ELCR(unsigned int irq) 853static int EISA_ELCR(unsigned int irq)
867{ 854{
868 if (irq < nr_legacy_irqs) { 855 if (irq < legacy_pic->nr_legacy_irqs) {
869 unsigned int port = 0x4d0 + (irq >> 3); 856 unsigned int port = 0x4d0 + (irq >> 3);
870 return (inb(port) >> (irq & 7)) & 1; 857 return (inb(port) >> (irq & 7)) & 1;
871 } 858 }
@@ -1140,12 +1127,12 @@ void lock_vector_lock(void)
1140 /* Used so that the online set of cpus does not change 1127 /* Used so that the online set of cpus does not change
1141 * during assign_irq_vector. 1128 * during assign_irq_vector.
1142 */ 1129 */
1143 spin_lock(&vector_lock); 1130 raw_spin_lock(&vector_lock);
1144} 1131}
1145 1132
1146void unlock_vector_lock(void) 1133void unlock_vector_lock(void)
1147{ 1134{
1148 spin_unlock(&vector_lock); 1135 raw_spin_unlock(&vector_lock);
1149} 1136}
1150 1137
1151static int 1138static int
@@ -1162,7 +1149,8 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1162 * Also, we've got to be careful not to trash gate 1149 * Also, we've got to be careful not to trash gate
1163 * 0x80, because int 0x80 is hm, kind of importantish. ;) 1150 * 0x80, because int 0x80 is hm, kind of importantish. ;)
1164 */ 1151 */
1165 static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; 1152 static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
1153 static int current_offset = VECTOR_OFFSET_START % 8;
1166 unsigned int old_vector; 1154 unsigned int old_vector;
1167 int cpu, err; 1155 int cpu, err;
1168 cpumask_var_t tmp_mask; 1156 cpumask_var_t tmp_mask;
@@ -1198,7 +1186,7 @@ next:
1198 if (vector >= first_system_vector) { 1186 if (vector >= first_system_vector) {
1199 /* If out of vectors on large boxen, must share them. */ 1187 /* If out of vectors on large boxen, must share them. */
1200 offset = (offset + 1) % 8; 1188 offset = (offset + 1) % 8;
1201 vector = FIRST_DEVICE_VECTOR + offset; 1189 vector = FIRST_EXTERNAL_VECTOR + offset;
1202 } 1190 }
1203 if (unlikely(current_vector == vector)) 1191 if (unlikely(current_vector == vector))
1204 continue; 1192 continue;
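The allocator still hands out vectors in strides of 8 within each priority
offset; the change is only where the round-robin starts and wraps. The
stepping, mirrored as a standalone helper (stand-in constants for
FIRST_EXTERNAL_VECTOR and first_system_vector):

	enum { FIRST_EXT = 0x20, FIRST_SYS = 0xef };

	static int next_vector(int vector, int *offset)
	{
		vector += 8;			/* same offset, next group */
		if (vector >= FIRST_SYS) {	/* wrapped around */
			*offset = (*offset + 1) % 8;
			vector = FIRST_EXT + *offset;
		}
		return vector;
	}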
@@ -1232,9 +1220,9 @@ int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1232 int err; 1220 int err;
1233 unsigned long flags; 1221 unsigned long flags;
1234 1222
1235 spin_lock_irqsave(&vector_lock, flags); 1223 raw_spin_lock_irqsave(&vector_lock, flags);
1236 err = __assign_irq_vector(irq, cfg, mask); 1224 err = __assign_irq_vector(irq, cfg, mask);
1237 spin_unlock_irqrestore(&vector_lock, flags); 1225 raw_spin_unlock_irqrestore(&vector_lock, flags);
1238 return err; 1226 return err;
1239} 1227}
1240 1228
@@ -1268,14 +1256,27 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
1268void __setup_vector_irq(int cpu) 1256void __setup_vector_irq(int cpu)
1269{ 1257{
1270 /* Initialize vector_irq on a new cpu */ 1258 /* Initialize vector_irq on a new cpu */
1271 /* This function must be called with vector_lock held */
1272 int irq, vector; 1259 int irq, vector;
1273 struct irq_cfg *cfg; 1260 struct irq_cfg *cfg;
1274 struct irq_desc *desc; 1261 struct irq_desc *desc;
1275 1262
1263 /*
1264 * vector_lock will make sure that we don't run into irq vector
1265 * assignments that might be happening on another cpu in parallel,
1266 * while we setup our initial vector to irq mappings.
1267 */
1268 raw_spin_lock(&vector_lock);
1276 /* Mark the inuse vectors */ 1269 /* Mark the inuse vectors */
1277 for_each_irq_desc(irq, desc) { 1270 for_each_irq_desc(irq, desc) {
1278 cfg = desc->chip_data; 1271 cfg = desc->chip_data;
1272
1273 /*
1274 * If it is a legacy IRQ handled by the legacy PIC, this cpu
1275 * will be part of the irq_cfg's domain.
1276 */
1277 if (irq < legacy_pic->nr_legacy_irqs && !IO_APIC_IRQ(irq))
1278 cpumask_set_cpu(cpu, cfg->domain);
1279
1279 if (!cpumask_test_cpu(cpu, cfg->domain)) 1280 if (!cpumask_test_cpu(cpu, cfg->domain))
1280 continue; 1281 continue;
1281 vector = cfg->vector; 1282 vector = cfg->vector;
@@ -1291,6 +1292,7 @@ void __setup_vector_irq(int cpu)
1291 if (!cpumask_test_cpu(cpu, cfg->domain)) 1292 if (!cpumask_test_cpu(cpu, cfg->domain))
1292 per_cpu(vector_irq, cpu)[vector] = -1; 1293 per_cpu(vector_irq, cpu)[vector] = -1;
1293 } 1294 }
1295 raw_spin_unlock(&vector_lock);
1294} 1296}
1295 1297
1296static struct irq_chip ioapic_chip; 1298static struct irq_chip ioapic_chip;
@@ -1440,6 +1442,14 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
1440 1442
1441 cfg = desc->chip_data; 1443 cfg = desc->chip_data;
1442 1444
1445 /*
1446 * For legacy irqs, cfg->domain starts with cpu 0 for legacy
1447 * controllers like 8259. Now that IO-APIC can handle this irq, update
1448 * the cfg->domain.
1449 */
1450 if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain))
1451 apic->vector_allocation_domain(0, cfg->domain);
1452
1443 if (assign_irq_vector(irq, cfg, apic->target_cpus())) 1453 if (assign_irq_vector(irq, cfg, apic->target_cpus()))
1444 return; 1454 return;
1445 1455
@@ -1461,8 +1471,8 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
1461 } 1471 }
1462 1472
1463 ioapic_register_intr(irq, desc, trigger); 1473 ioapic_register_intr(irq, desc, trigger);
1464 if (irq < nr_legacy_irqs) 1474 if (irq < legacy_pic->nr_legacy_irqs)
1465 disable_8259A_irq(irq); 1475 legacy_pic->chip->mask(irq);
1466 1476
1467 ioapic_write_entry(apic_id, pin, entry); 1477 ioapic_write_entry(apic_id, pin, entry);
1468} 1478}
@@ -1473,7 +1483,7 @@ static struct {
1473 1483
1474static void __init setup_IO_APIC_irqs(void) 1484static void __init setup_IO_APIC_irqs(void)
1475{ 1485{
1476 int apic_id = 0, pin, idx, irq; 1486 int apic_id, pin, idx, irq;
1477 int notcon = 0; 1487 int notcon = 0;
1478 struct irq_desc *desc; 1488 struct irq_desc *desc;
1479 struct irq_cfg *cfg; 1489 struct irq_cfg *cfg;
@@ -1481,14 +1491,7 @@ static void __init setup_IO_APIC_irqs(void)
1481 1491
1482 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); 1492 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1483 1493
1484#ifdef CONFIG_ACPI 1494 for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
1485 if (!acpi_disabled && acpi_ioapic) {
1486 apic_id = mp_find_ioapic(0);
1487 if (apic_id < 0)
1488 apic_id = 0;
1489 }
1490#endif
1491
1492 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { 1495 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
1493 idx = find_irq_entry(apic_id, pin, mp_INT); 1496 idx = find_irq_entry(apic_id, pin, mp_INT);
1494 if (idx == -1) { 1497 if (idx == -1) {
@@ -1510,6 +1513,9 @@ static void __init setup_IO_APIC_irqs(void)
1510 1513
1511 irq = pin_2_irq(idx, apic_id, pin); 1514 irq = pin_2_irq(idx, apic_id, pin);
1512 1515
1516 if ((apic_id > 0) && (irq > 16))
1517 continue;
1518
1513 /* 1519 /*
1514 * Skip the timer IRQ if there's a quirk handler 1520 * Skip the timer IRQ if there's a quirk handler
1515 * installed and if it returns 1: 1521 * installed and if it returns 1:
@@ -1539,6 +1545,56 @@ static void __init setup_IO_APIC_irqs(void)
1539} 1545}
1540 1546
1541/* 1547/*
1548 * for the GSI that is not in the first ioapic
1549 * but could not use acpi_register_gsi(),
1550 * like some special SCI in the IBM x3330
1551 */
1552void setup_IO_APIC_irq_extra(u32 gsi)
1553{
1554 int apic_id = 0, pin, idx, irq;
1555 int node = cpu_to_node(boot_cpu_id);
1556 struct irq_desc *desc;
1557 struct irq_cfg *cfg;
1558
1559 /*
1560 * Convert 'gsi' to 'ioapic.pin'.
1561 */
1562 apic_id = mp_find_ioapic(gsi);
1563 if (apic_id < 0)
1564 return;
1565
1566 pin = mp_find_ioapic_pin(apic_id, gsi);
1567 idx = find_irq_entry(apic_id, pin, mp_INT);
1568 if (idx == -1)
1569 return;
1570
1571 irq = pin_2_irq(idx, apic_id, pin);
1572#ifdef CONFIG_SPARSE_IRQ
1573 desc = irq_to_desc(irq);
1574 if (desc)
1575 return;
1576#endif
1577 desc = irq_to_desc_alloc_node(irq, node);
1578 if (!desc) {
1579 printk(KERN_INFO "can not get irq_desc for %d\n", irq);
1580 return;
1581 }
1582
1583 cfg = desc->chip_data;
1584 add_pin_to_irq_node(cfg, node, apic_id, pin);
1585
1586 if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) {
1587 pr_debug("Pin %d-%d already programmed\n",
1588 mp_ioapics[apic_id].apicid, pin);
1589 return;
1590 }
1591 set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed);
1592
1593 setup_IO_APIC_irq(apic_id, pin, irq, desc,
1594 irq_trigger(idx), irq_polarity(idx));
1595}
1596
1597/*
1542 * Set up the timer pin, possibly with the 8259A-master behind. 1598 * Set up the timer pin, possibly with the 8259A-master behind.
1543 */ 1599 */
1544static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin, 1600static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
@@ -1601,14 +1657,14 @@ __apicdebuginit(void) print_IO_APIC(void)
1601 1657
1602 for (apic = 0; apic < nr_ioapics; apic++) { 1658 for (apic = 0; apic < nr_ioapics; apic++) {
1603 1659
1604 spin_lock_irqsave(&ioapic_lock, flags); 1660 raw_spin_lock_irqsave(&ioapic_lock, flags);
1605 reg_00.raw = io_apic_read(apic, 0); 1661 reg_00.raw = io_apic_read(apic, 0);
1606 reg_01.raw = io_apic_read(apic, 1); 1662 reg_01.raw = io_apic_read(apic, 1);
1607 if (reg_01.bits.version >= 0x10) 1663 if (reg_01.bits.version >= 0x10)
1608 reg_02.raw = io_apic_read(apic, 2); 1664 reg_02.raw = io_apic_read(apic, 2);
1609 if (reg_01.bits.version >= 0x20) 1665 if (reg_01.bits.version >= 0x20)
1610 reg_03.raw = io_apic_read(apic, 3); 1666 reg_03.raw = io_apic_read(apic, 3);
1611 spin_unlock_irqrestore(&ioapic_lock, flags); 1667 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
1612 1668
1613 printk("\n"); 1669 printk("\n");
1614 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid); 1670 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid);
@@ -1825,12 +1881,12 @@ __apicdebuginit(void) print_PIC(void)
1825 unsigned int v; 1881 unsigned int v;
1826 unsigned long flags; 1882 unsigned long flags;
1827 1883
1828 if (!nr_legacy_irqs) 1884 if (!legacy_pic->nr_legacy_irqs)
1829 return; 1885 return;
1830 1886
1831 printk(KERN_DEBUG "\nprinting PIC contents\n"); 1887 printk(KERN_DEBUG "\nprinting PIC contents\n");
1832 1888
1833 spin_lock_irqsave(&i8259A_lock, flags); 1889 raw_spin_lock_irqsave(&i8259A_lock, flags);
1834 1890
1835 v = inb(0xa1) << 8 | inb(0x21); 1891 v = inb(0xa1) << 8 | inb(0x21);
1836 printk(KERN_DEBUG "... PIC IMR: %04x\n", v); 1892 printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
@@ -1844,7 +1900,7 @@ __apicdebuginit(void) print_PIC(void)
1844 outb(0x0a,0xa0); 1900 outb(0x0a,0xa0);
1845 outb(0x0a,0x20); 1901 outb(0x0a,0x20);
1846 1902
1847 spin_unlock_irqrestore(&i8259A_lock, flags); 1903 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
1848 1904
1849 printk(KERN_DEBUG "... PIC ISR: %04x\n", v); 1905 printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
1850 1906
@@ -1903,13 +1959,13 @@ void __init enable_IO_APIC(void)
1903 * The number of IO-APIC IRQ registers (== #pins): 1959 * The number of IO-APIC IRQ registers (== #pins):
1904 */ 1960 */
1905 for (apic = 0; apic < nr_ioapics; apic++) { 1961 for (apic = 0; apic < nr_ioapics; apic++) {
1906 spin_lock_irqsave(&ioapic_lock, flags); 1962 raw_spin_lock_irqsave(&ioapic_lock, flags);
1907 reg_01.raw = io_apic_read(apic, 1); 1963 reg_01.raw = io_apic_read(apic, 1);
1908 spin_unlock_irqrestore(&ioapic_lock, flags); 1964 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
1909 nr_ioapic_registers[apic] = reg_01.bits.entries+1; 1965 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
1910 } 1966 }
1911 1967
1912 if (!nr_legacy_irqs) 1968 if (!legacy_pic->nr_legacy_irqs)
1913 return; 1969 return;
1914 1970
1915 for(apic = 0; apic < nr_ioapics; apic++) { 1971 for(apic = 0; apic < nr_ioapics; apic++) {
@@ -1966,7 +2022,7 @@ void disable_IO_APIC(void)
1966 */ 2022 */
1967 clear_IO_APIC(); 2023 clear_IO_APIC();
1968 2024
1969 if (!nr_legacy_irqs) 2025 if (!legacy_pic->nr_legacy_irqs)
1970 return; 2026 return;
1971 2027
1972 /* 2028 /*
@@ -2045,9 +2101,9 @@ void __init setup_ioapic_ids_from_mpc(void)
2045 for (apic_id = 0; apic_id < nr_ioapics; apic_id++) { 2101 for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
2046 2102
2047 /* Read the register 0 value */ 2103 /* Read the register 0 value */
2048 spin_lock_irqsave(&ioapic_lock, flags); 2104 raw_spin_lock_irqsave(&ioapic_lock, flags);
2049 reg_00.raw = io_apic_read(apic_id, 0); 2105 reg_00.raw = io_apic_read(apic_id, 0);
2050 spin_unlock_irqrestore(&ioapic_lock, flags); 2106 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2051 2107
2052 old_id = mp_ioapics[apic_id].apicid; 2108 old_id = mp_ioapics[apic_id].apicid;
2053 2109
@@ -2106,16 +2162,16 @@ void __init setup_ioapic_ids_from_mpc(void)
2106 mp_ioapics[apic_id].apicid); 2162 mp_ioapics[apic_id].apicid);
2107 2163
2108 reg_00.bits.ID = mp_ioapics[apic_id].apicid; 2164 reg_00.bits.ID = mp_ioapics[apic_id].apicid;
2109 spin_lock_irqsave(&ioapic_lock, flags); 2165 raw_spin_lock_irqsave(&ioapic_lock, flags);
2110 io_apic_write(apic_id, 0, reg_00.raw); 2166 io_apic_write(apic_id, 0, reg_00.raw);
2111 spin_unlock_irqrestore(&ioapic_lock, flags); 2167 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2112 2168
2113 /* 2169 /*
2114 * Sanity check 2170 * Sanity check
2115 */ 2171 */
2116 spin_lock_irqsave(&ioapic_lock, flags); 2172 raw_spin_lock_irqsave(&ioapic_lock, flags);
2117 reg_00.raw = io_apic_read(apic_id, 0); 2173 reg_00.raw = io_apic_read(apic_id, 0);
2118 spin_unlock_irqrestore(&ioapic_lock, flags); 2174 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2119 if (reg_00.bits.ID != mp_ioapics[apic_id].apicid) 2175 if (reg_00.bits.ID != mp_ioapics[apic_id].apicid)
2120 printk("could not set ID!\n"); 2176 printk("could not set ID!\n");
2121 else 2177 else
@@ -2198,15 +2254,15 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
2198 unsigned long flags; 2254 unsigned long flags;
2199 struct irq_cfg *cfg; 2255 struct irq_cfg *cfg;
2200 2256
2201 spin_lock_irqsave(&ioapic_lock, flags); 2257 raw_spin_lock_irqsave(&ioapic_lock, flags);
2202 if (irq < nr_legacy_irqs) { 2258 if (irq < legacy_pic->nr_legacy_irqs) {
2203 disable_8259A_irq(irq); 2259 legacy_pic->chip->mask(irq);
2204 if (i8259A_irq_pending(irq)) 2260 if (legacy_pic->irq_pending(irq))
2205 was_pending = 1; 2261 was_pending = 1;
2206 } 2262 }
2207 cfg = irq_cfg(irq); 2263 cfg = irq_cfg(irq);
2208 __unmask_IO_APIC_irq(cfg); 2264 __unmask_IO_APIC_irq(cfg);
2209 spin_unlock_irqrestore(&ioapic_lock, flags); 2265 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2210 2266
2211 return was_pending; 2267 return was_pending;
2212} 2268}
@@ -2217,9 +2273,9 @@ static int ioapic_retrigger_irq(unsigned int irq)
2217 struct irq_cfg *cfg = irq_cfg(irq); 2273 struct irq_cfg *cfg = irq_cfg(irq);
2218 unsigned long flags; 2274 unsigned long flags;
2219 2275
2220 spin_lock_irqsave(&vector_lock, flags); 2276 raw_spin_lock_irqsave(&vector_lock, flags);
2221 apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); 2277 apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
2222 spin_unlock_irqrestore(&vector_lock, flags); 2278 raw_spin_unlock_irqrestore(&vector_lock, flags);
2223 2279
2224 return 1; 2280 return 1;
2225} 2281}
@@ -2312,14 +2368,14 @@ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2312 irq = desc->irq; 2368 irq = desc->irq;
2313 cfg = desc->chip_data; 2369 cfg = desc->chip_data;
2314 2370
2315 spin_lock_irqsave(&ioapic_lock, flags); 2371 raw_spin_lock_irqsave(&ioapic_lock, flags);
2316 ret = set_desc_affinity(desc, mask, &dest); 2372 ret = set_desc_affinity(desc, mask, &dest);
2317 if (!ret) { 2373 if (!ret) {
2318 /* Only the high 8 bits are valid. */ 2374 /* Only the high 8 bits are valid. */
2319 dest = SET_APIC_LOGICAL_ID(dest); 2375 dest = SET_APIC_LOGICAL_ID(dest);
2320 __target_IO_APIC_irq(irq, dest, cfg); 2376 __target_IO_APIC_irq(irq, dest, cfg);
2321 } 2377 }
2322 spin_unlock_irqrestore(&ioapic_lock, flags); 2378 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2323 2379
2324 return ret; 2380 return ret;
2325} 2381}
@@ -2489,6 +2545,9 @@ void irq_force_complete_move(int irq)
2489 struct irq_desc *desc = irq_to_desc(irq); 2545 struct irq_desc *desc = irq_to_desc(irq);
2490 struct irq_cfg *cfg = desc->chip_data; 2546 struct irq_cfg *cfg = desc->chip_data;
2491 2547
2548 if (!cfg)
2549 return;
2550
2492 __irq_complete_move(&desc, cfg->vector); 2551 __irq_complete_move(&desc, cfg->vector);
2493} 2552}
2494#else 2553#else
@@ -2554,9 +2613,9 @@ static void eoi_ioapic_irq(struct irq_desc *desc)
2554 irq = desc->irq; 2613 irq = desc->irq;
2555 cfg = desc->chip_data; 2614 cfg = desc->chip_data;
2556 2615
2557 spin_lock_irqsave(&ioapic_lock, flags); 2616 raw_spin_lock_irqsave(&ioapic_lock, flags);
2558 __eoi_ioapic_irq(irq, cfg); 2617 __eoi_ioapic_irq(irq, cfg);
2559 spin_unlock_irqrestore(&ioapic_lock, flags); 2618 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2560} 2619}
2561 2620
2562static void ack_apic_level(unsigned int irq) 2621static void ack_apic_level(unsigned int irq)
@@ -2734,8 +2793,8 @@ static inline void init_IO_APIC_traps(void)
2734 * so default to an old-fashioned 8259 2793 * so default to an old-fashioned 8259
2735 * interrupt if we can.. 2794 * interrupt if we can..
2736 */ 2795 */
2737 if (irq < nr_legacy_irqs) 2796 if (irq < legacy_pic->nr_legacy_irqs)
2738 make_8259A_irq(irq); 2797 legacy_pic->make_irq(irq);
2739 else 2798 else
2740 /* Strange. Oh, well.. */ 2799 /* Strange. Oh, well.. */
2741 desc->chip = &no_irq_chip; 2800 desc->chip = &no_irq_chip;
@@ -2892,7 +2951,7 @@ static inline void __init check_timer(void)
2892 /* 2951 /*
2893 * get/set the timer IRQ vector: 2952 * get/set the timer IRQ vector:
2894 */ 2953 */
2895 disable_8259A_irq(0); 2954 legacy_pic->chip->mask(0);
2896 assign_irq_vector(0, cfg, apic->target_cpus()); 2955 assign_irq_vector(0, cfg, apic->target_cpus());
2897 2956
2898 /* 2957 /*
@@ -2905,7 +2964,7 @@ static inline void __init check_timer(void)
2905 * automatically. 2964 * automatically.
2906 */ 2965 */
2907 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); 2966 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2908 init_8259A(1); 2967 legacy_pic->init(1);
2909#ifdef CONFIG_X86_32 2968#ifdef CONFIG_X86_32
2910 { 2969 {
2911 unsigned int ver; 2970 unsigned int ver;
@@ -2964,7 +3023,7 @@ static inline void __init check_timer(void)
2964 if (timer_irq_works()) { 3023 if (timer_irq_works()) {
2965 if (nmi_watchdog == NMI_IO_APIC) { 3024 if (nmi_watchdog == NMI_IO_APIC) {
2966 setup_nmi(); 3025 setup_nmi();
2967 enable_8259A_irq(0); 3026 legacy_pic->chip->unmask(0);
2968 } 3027 }
2969 if (disable_timer_pin_1 > 0) 3028 if (disable_timer_pin_1 > 0)
2970 clear_IO_APIC_pin(0, pin1); 3029 clear_IO_APIC_pin(0, pin1);
@@ -2987,14 +3046,14 @@ static inline void __init check_timer(void)
2987 */ 3046 */
2988 replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); 3047 replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
2989 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); 3048 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
2990 enable_8259A_irq(0); 3049 legacy_pic->chip->unmask(0);
2991 if (timer_irq_works()) { 3050 if (timer_irq_works()) {
2992 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); 3051 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
2993 timer_through_8259 = 1; 3052 timer_through_8259 = 1;
2994 if (nmi_watchdog == NMI_IO_APIC) { 3053 if (nmi_watchdog == NMI_IO_APIC) {
2995 disable_8259A_irq(0); 3054 legacy_pic->chip->mask(0);
2996 setup_nmi(); 3055 setup_nmi();
2997 enable_8259A_irq(0); 3056 legacy_pic->chip->unmask(0);
2998 } 3057 }
2999 goto out; 3058 goto out;
3000 } 3059 }
@@ -3002,7 +3061,7 @@ static inline void __init check_timer(void)
3002 * Cleanup, just in case ... 3061 * Cleanup, just in case ...
3003 */ 3062 */
3004 local_irq_disable(); 3063 local_irq_disable();
3005 disable_8259A_irq(0); 3064 legacy_pic->chip->mask(0);
3006 clear_IO_APIC_pin(apic2, pin2); 3065 clear_IO_APIC_pin(apic2, pin2);
3007 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); 3066 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
3008 } 3067 }
@@ -3021,22 +3080,22 @@ static inline void __init check_timer(void)
3021 3080
3022 lapic_register_intr(0, desc); 3081 lapic_register_intr(0, desc);
3023 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ 3082 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
3024 enable_8259A_irq(0); 3083 legacy_pic->chip->unmask(0);
3025 3084
3026 if (timer_irq_works()) { 3085 if (timer_irq_works()) {
3027 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); 3086 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
3028 goto out; 3087 goto out;
3029 } 3088 }
3030 local_irq_disable(); 3089 local_irq_disable();
3031 disable_8259A_irq(0); 3090 legacy_pic->chip->mask(0);
3032 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); 3091 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
3033 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); 3092 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
3034 3093
3035 apic_printk(APIC_QUIET, KERN_INFO 3094 apic_printk(APIC_QUIET, KERN_INFO
3036 "...trying to set up timer as ExtINT IRQ...\n"); 3095 "...trying to set up timer as ExtINT IRQ...\n");
3037 3096
3038 init_8259A(0); 3097 legacy_pic->init(0);
3039 make_8259A_irq(0); 3098 legacy_pic->make_irq(0);
3040 apic_write(APIC_LVT0, APIC_DM_EXTINT); 3099 apic_write(APIC_LVT0, APIC_DM_EXTINT);
3041 3100
3042 unlock_ExtINT_logic(); 3101 unlock_ExtINT_logic();
@@ -3078,7 +3137,7 @@ void __init setup_IO_APIC(void)
3078 /* 3137 /*
3079 * calling enable_IO_APIC() is moved to setup_local_APIC for BP 3138 * calling enable_IO_APIC() is moved to setup_local_APIC for BP
3080 */ 3139 */
3081 io_apic_irqs = nr_legacy_irqs ? ~PIC_IRQS : ~0UL; 3140 io_apic_irqs = legacy_pic->nr_legacy_irqs ? ~PIC_IRQS : ~0UL;
3082 3141
3083 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); 3142 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
3084 /* 3143 /*
@@ -3089,7 +3148,7 @@ void __init setup_IO_APIC(void)
3089 sync_Arb_IDs(); 3148 sync_Arb_IDs();
3090 setup_IO_APIC_irqs(); 3149 setup_IO_APIC_irqs();
3091 init_IO_APIC_traps(); 3150 init_IO_APIC_traps();
3092 if (nr_legacy_irqs) 3151 if (legacy_pic->nr_legacy_irqs)
3093 check_timer(); 3152 check_timer();
3094} 3153}
3095 3154
@@ -3138,13 +3197,13 @@ static int ioapic_resume(struct sys_device *dev)
3138 data = container_of(dev, struct sysfs_ioapic_data, dev); 3197 data = container_of(dev, struct sysfs_ioapic_data, dev);
3139 entry = data->entry; 3198 entry = data->entry;
3140 3199
3141 spin_lock_irqsave(&ioapic_lock, flags); 3200 raw_spin_lock_irqsave(&ioapic_lock, flags);
3142 reg_00.raw = io_apic_read(dev->id, 0); 3201 reg_00.raw = io_apic_read(dev->id, 0);
3143 if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) { 3202 if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) {
3144 reg_00.bits.ID = mp_ioapics[dev->id].apicid; 3203 reg_00.bits.ID = mp_ioapics[dev->id].apicid;
3145 io_apic_write(dev->id, 0, reg_00.raw); 3204 io_apic_write(dev->id, 0, reg_00.raw);
3146 } 3205 }
3147 spin_unlock_irqrestore(&ioapic_lock, flags); 3206 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
3148 for (i = 0; i < nr_ioapic_registers[dev->id]; i++) 3207 for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
3149 ioapic_write_entry(dev->id, i, entry[i]); 3208 ioapic_write_entry(dev->id, i, entry[i]);
3150 3209
@@ -3207,7 +3266,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
3207 if (irq_want < nr_irqs_gsi) 3266 if (irq_want < nr_irqs_gsi)
3208 irq_want = nr_irqs_gsi; 3267 irq_want = nr_irqs_gsi;
3209 3268
3210 spin_lock_irqsave(&vector_lock, flags); 3269 raw_spin_lock_irqsave(&vector_lock, flags);
3211 for (new = irq_want; new < nr_irqs; new++) { 3270 for (new = irq_want; new < nr_irqs; new++) {
3212 desc_new = irq_to_desc_alloc_node(new, node); 3271 desc_new = irq_to_desc_alloc_node(new, node);
3213 if (!desc_new) { 3272 if (!desc_new) {
@@ -3226,14 +3285,11 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
3226 irq = new; 3285 irq = new;
3227 break; 3286 break;
3228 } 3287 }
3229 spin_unlock_irqrestore(&vector_lock, flags); 3288 raw_spin_unlock_irqrestore(&vector_lock, flags);
3289
3290 if (irq > 0)
3291 dynamic_irq_init_keep_chip_data(irq);
3230 3292
3231 if (irq > 0) {
3232 dynamic_irq_init(irq);
3233 /* restore it, in case dynamic_irq_init clear it */
3234 if (desc_new)
3235 desc_new->chip_data = cfg_new;
3236 }
3237 return irq; 3293 return irq;
3238} 3294}
3239 3295
@@ -3255,20 +3311,13 @@ int create_irq(void)
3255void destroy_irq(unsigned int irq) 3311void destroy_irq(unsigned int irq)
3256{ 3312{
3257 unsigned long flags; 3313 unsigned long flags;
3258 struct irq_cfg *cfg;
3259 struct irq_desc *desc;
3260 3314
3261 /* store it, in case dynamic_irq_cleanup clear it */ 3315 dynamic_irq_cleanup_keep_chip_data(irq);
3262 desc = irq_to_desc(irq);
3263 cfg = desc->chip_data;
3264 dynamic_irq_cleanup(irq);
3265 /* connect back irq_cfg */
3266 desc->chip_data = cfg;
3267 3316
3268 free_irte(irq); 3317 free_irte(irq);
3269 spin_lock_irqsave(&vector_lock, flags); 3318 raw_spin_lock_irqsave(&vector_lock, flags);
3270 __clear_irq_vector(irq, cfg); 3319 __clear_irq_vector(irq, get_irq_chip_data(irq));
3271 spin_unlock_irqrestore(&vector_lock, flags); 3320 raw_spin_unlock_irqrestore(&vector_lock, flags);
3272} 3321}
3273 3322
3274/* 3323/*
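Both allocation paths above now rely on dynamic_irq_init_keep_chip_data()
and dynamic_irq_cleanup_keep_chip_data(), presumably added to the generic
irq code elsewhere in this series. Their contract, illustrated with the
save/restore dance they replace:

	static void cleanup_keep_chip_data(unsigned int irq)	/* sketch */
	{
		struct irq_desc *desc = irq_to_desc(irq);
		void *chip_data = desc->chip_data;	/* x86: the irq_cfg */

		dynamic_irq_cleanup(irq);	/* would clear chip_data */
		desc->chip_data = chip_data;	/* the helpers keep it */
	}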
@@ -3805,9 +3854,9 @@ int __init io_apic_get_redir_entries (int ioapic)
3805 union IO_APIC_reg_01 reg_01; 3854 union IO_APIC_reg_01 reg_01;
3806 unsigned long flags; 3855 unsigned long flags;
3807 3856
3808 spin_lock_irqsave(&ioapic_lock, flags); 3857 raw_spin_lock_irqsave(&ioapic_lock, flags);
3809 reg_01.raw = io_apic_read(ioapic, 1); 3858 reg_01.raw = io_apic_read(ioapic, 1);
3810 spin_unlock_irqrestore(&ioapic_lock, flags); 3859 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
3811 3860
3812 return reg_01.bits.entries; 3861 return reg_01.bits.entries;
3813} 3862}
@@ -3890,7 +3939,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq,
3890 /* 3939 /*
3891 * IRQs < 16 are already in the irq_2_pin[] map 3940 * IRQs < 16 are already in the irq_2_pin[] map
3892 */ 3941 */
3893 if (irq >= nr_legacy_irqs) { 3942 if (irq >= legacy_pic->nr_legacy_irqs) {
3894 cfg = desc->chip_data; 3943 cfg = desc->chip_data;
3895 if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) { 3944 if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) {
3896 printk(KERN_INFO "cannot add pin %d for irq %d\n", 3945 printk(KERN_INFO "cannot add pin %d for irq %d\n",
@@ -3969,9 +4018,9 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3969 if (physids_empty(apic_id_map)) 4018 if (physids_empty(apic_id_map))
3970 apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map); 4019 apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map);
3971 4020
3972 spin_lock_irqsave(&ioapic_lock, flags); 4021 raw_spin_lock_irqsave(&ioapic_lock, flags);
3973 reg_00.raw = io_apic_read(ioapic, 0); 4022 reg_00.raw = io_apic_read(ioapic, 0);
3974 spin_unlock_irqrestore(&ioapic_lock, flags); 4023 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
3975 4024
3976 if (apic_id >= get_physical_broadcast()) { 4025 if (apic_id >= get_physical_broadcast()) {
3977 printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " 4026 printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
@@ -4005,10 +4054,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
4005 if (reg_00.bits.ID != apic_id) { 4054 if (reg_00.bits.ID != apic_id) {
4006 reg_00.bits.ID = apic_id; 4055 reg_00.bits.ID = apic_id;
4007 4056
4008 spin_lock_irqsave(&ioapic_lock, flags); 4057 raw_spin_lock_irqsave(&ioapic_lock, flags);
4009 io_apic_write(ioapic, 0, reg_00.raw); 4058 io_apic_write(ioapic, 0, reg_00.raw);
4010 reg_00.raw = io_apic_read(ioapic, 0); 4059 reg_00.raw = io_apic_read(ioapic, 0);
4011 spin_unlock_irqrestore(&ioapic_lock, flags); 4060 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
4012 4061
4013 /* Sanity check */ 4062 /* Sanity check */
4014 if (reg_00.bits.ID != apic_id) { 4063 if (reg_00.bits.ID != apic_id) {
@@ -4029,9 +4078,9 @@ int __init io_apic_get_version(int ioapic)
4029 union IO_APIC_reg_01 reg_01; 4078 union IO_APIC_reg_01 reg_01;
4030 unsigned long flags; 4079 unsigned long flags;
4031 4080
4032 spin_lock_irqsave(&ioapic_lock, flags); 4081 raw_spin_lock_irqsave(&ioapic_lock, flags);
4033 reg_01.raw = io_apic_read(ioapic, 1); 4082 reg_01.raw = io_apic_read(ioapic, 1);
4034 spin_unlock_irqrestore(&ioapic_lock, flags); 4083 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
4035 4084
4036 return reg_01.bits.version; 4085 return reg_01.bits.version;
4037} 4086}
@@ -4063,27 +4112,23 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
4063#ifdef CONFIG_SMP 4112#ifdef CONFIG_SMP
4064void __init setup_ioapic_dest(void) 4113void __init setup_ioapic_dest(void)
4065{ 4114{
4066 int pin, ioapic = 0, irq, irq_entry; 4115 int pin, ioapic, irq, irq_entry;
4067 struct irq_desc *desc; 4116 struct irq_desc *desc;
4068 const struct cpumask *mask; 4117 const struct cpumask *mask;
4069 4118
4070 if (skip_ioapic_setup == 1) 4119 if (skip_ioapic_setup == 1)
4071 return; 4120 return;
4072 4121
4073#ifdef CONFIG_ACPI 4122 for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
4074 if (!acpi_disabled && acpi_ioapic) {
4075 ioapic = mp_find_ioapic(0);
4076 if (ioapic < 0)
4077 ioapic = 0;
4078 }
4079#endif
4080
4081 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { 4123 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
4082 irq_entry = find_irq_entry(ioapic, pin, mp_INT); 4124 irq_entry = find_irq_entry(ioapic, pin, mp_INT);
4083 if (irq_entry == -1) 4125 if (irq_entry == -1)
4084 continue; 4126 continue;
4085 irq = pin_2_irq(irq_entry, ioapic, pin); 4127 irq = pin_2_irq(irq_entry, ioapic, pin);
4086 4128
4129 if ((ioapic > 0) && (irq > 16))
4130 continue;
4131
4087 desc = irq_to_desc(irq); 4132 desc = irq_to_desc(irq);
4088 4133
4089 /* 4134 /*
@@ -4268,3 +4313,24 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
4268 4313
4269 nr_ioapics++; 4314 nr_ioapics++;
4270} 4315}
4316
4317/* Enable IOAPIC early just for system timer */
4318void __init pre_init_apic_IRQ0(void)
4319{
4320 struct irq_cfg *cfg;
4321 struct irq_desc *desc;
4322
4323 printk(KERN_INFO "Early APIC setup for system timer0\n");
4324#ifndef CONFIG_SMP
4325 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
4326#endif
4327 desc = irq_to_desc_alloc_node(0, 0);
4328
4329 setup_local_APIC();
4330
4331 cfg = irq_cfg(0);
4332 add_pin_to_irq_node(cfg, 0, 0, 0);
4333 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
4334
4335 setup_IO_APIC_irq(0, 0, 0, desc, 0, 0);
4336}
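pre_init_apic_IRQ0() lets a platform route IO-APIC pin 0 for the timer
before the normal interrupt bring-up. The likely consumer is the new
Moorestown support (arch/x86/kernel/mrst.c in this series), which boots
without a legacy PIC; a hedged sketch of such a caller:

	static void __init example_platform_time_init(void)	/* sketch */
	{
		pre_init_apic_IRQ0();	/* IRQ0 via IO-APIC, edge mode */
		/* ...then program the platform timer to fire on IRQ0... */
	}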
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 0159a69396c..1edaf15c0b8 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -18,6 +18,7 @@
18#include <linux/delay.h> 18#include <linux/delay.h>
19#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/slab.h>
21#include <linux/sysdev.h> 22#include <linux/sysdev.h>
22#include <linux/sysctl.h> 23#include <linux/sysctl.h>
23#include <linux/percpu.h> 24#include <linux/percpu.h>
@@ -177,7 +178,7 @@ int __init check_nmi_watchdog(void)
177error: 178error:
178 if (nmi_watchdog == NMI_IO_APIC) { 179 if (nmi_watchdog == NMI_IO_APIC) {
179 if (!timer_through_8259) 180 if (!timer_through_8259)
180 disable_8259A_irq(0); 181 legacy_pic->chip->mask(0);
181 on_each_cpu(__acpi_nmi_disable, NULL, 1); 182 on_each_cpu(__acpi_nmi_disable, NULL, 1);
182 } 183 }
183 184
@@ -416,13 +417,13 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
416 417
417 /* We can be called before check_nmi_watchdog, hence NULL check. */ 418 /* We can be called before check_nmi_watchdog, hence NULL check. */
418 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { 419 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
419 static DEFINE_SPINLOCK(lock); /* Serialise the printks */ 420 static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */
420 421
421 spin_lock(&lock); 422 raw_spin_lock(&lock);
422 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); 423 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
423 show_regs(regs); 424 show_regs(regs);
424 dump_stack(); 425 dump_stack();
425 spin_unlock(&lock); 426 raw_spin_unlock(&lock);
426 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); 427 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
427 428
428 rc = 1; 429 rc = 1;
@@ -438,8 +439,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
438 * Ayiee, looks like this CPU is stuck ... 439 * Ayiee, looks like this CPU is stuck ...
439 * wait a few IRQs (5 seconds) before doing the oops ... 440 * wait a few IRQs (5 seconds) before doing the oops ...
440 */ 441 */
441 __this_cpu_inc(per_cpu_var(alert_counter)); 442 __this_cpu_inc(alert_counter);
442 if (__this_cpu_read(per_cpu_var(alert_counter)) == 5 * nmi_hz) 443 if (__this_cpu_read(alert_counter) == 5 * nmi_hz)
443 /* 444 /*
444 * die_nmi will return ONLY if NOTIFY_STOP happens.. 445 * die_nmi will return ONLY if NOTIFY_STOP happens..
445 */ 446 */
@@ -447,7 +448,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
447 regs, panic_on_timeout); 448 regs, panic_on_timeout);
448 } else { 449 } else {
449 __get_cpu_var(last_irq_sum) = sum; 450 __get_cpu_var(last_irq_sum) = sum;
450 __this_cpu_write(per_cpu_var(alert_counter), 0); 451 __this_cpu_write(alert_counter, 0);
451 } 452 }
452 453
453 /* see if the nmi watchdog went off */ 454 /* see if the nmi watchdog went off */
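The alert_counter accesses drop the per_cpu_var() wrapper because the
this_cpu operations now take the per-cpu variable directly. Minimal sketch
of the updated style (bump_or_reset is an illustrative helper):

	static DEFINE_PER_CPU(unsigned int, alert_counter);

	static void bump_or_reset(int stuck)
	{
		if (stuck)
			__this_cpu_inc(alert_counter);	/* no wrapper */
		else
			__this_cpu_write(alert_counter, 0);
	}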
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 47dd856708e..3e28401f161 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -277,6 +277,7 @@ static __init void early_check_numaq(void)
277 x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus; 277 x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus;
278 x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info; 278 x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info;
279 x86_init.timers.tsc_pre_init = numaq_tsc_init; 279 x86_init.timers.tsc_pre_init = numaq_tsc_init;
280 x86_init.pci.init = pci_numaq_init;
280 } 281 }
281} 282}
282 283
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 3740c8a4eae..c085d52dbaf 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -17,6 +17,7 @@
17#include <linux/ctype.h> 17#include <linux/ctype.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/timer.h> 19#include <linux/timer.h>
20#include <linux/slab.h>
20#include <linux/cpu.h> 21#include <linux/cpu.h>
21#include <linux/init.h> 22#include <linux/init.h>
22#include <linux/io.h> 23#include <linux/io.h>
@@ -120,11 +121,9 @@ EXPORT_SYMBOL_GPL(uv_possible_blades);
120unsigned long sn_rtc_cycles_per_second; 121unsigned long sn_rtc_cycles_per_second;
121EXPORT_SYMBOL(sn_rtc_cycles_per_second); 122EXPORT_SYMBOL(sn_rtc_cycles_per_second);
122 123
123/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
124
125static const struct cpumask *uv_target_cpus(void) 124static const struct cpumask *uv_target_cpus(void)
126{ 125{
127 return cpumask_of(0); 126 return cpu_online_mask;
128} 127}
129 128
130static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) 129static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c
index 30f25a75fe2..5de7f4c5697 100644
--- a/arch/x86/kernel/bootflag.c
+++ b/arch/x86/kernel/bootflag.c
@@ -5,7 +5,6 @@
5#include <linux/kernel.h> 5#include <linux/kernel.h>
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/string.h> 7#include <linux/string.h>
8#include <linux/slab.h>
9#include <linux/spinlock.h> 8#include <linux/spinlock.h>
10#include <linux/acpi.h> 9#include <linux/acpi.h>
11#include <asm/io.h> 10#include <asm/io.h>
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index f138c6c389b..870e6cc6ad2 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -10,6 +10,20 @@ if CPU_FREQ
10 10
11comment "CPUFreq processor drivers" 11comment "CPUFreq processor drivers"
12 12
13config X86_PCC_CPUFREQ
14 tristate "Processor Clocking Control interface driver"
15 depends on ACPI && ACPI_PROCESSOR
16 help
17 This driver adds support for the Processor Clocking Control (PCC) firmware interface.
18
19 For details, take a look at:
20 <file:Documentation/cpu-freq/pcc-cpufreq.txt>.
21
22 To compile this driver as a module, choose M here: the
23 module will be called pcc-cpufreq.
24
25 If in doubt, say N.
26
13config X86_ACPI_CPUFREQ 27config X86_ACPI_CPUFREQ
14 tristate "ACPI Processor P-States driver" 28 tristate "ACPI Processor P-States driver"
15 select CPU_FREQ_TABLE 29 select CPU_FREQ_TABLE
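Assuming the entry lands as shown, enabling the driver follows the usual
cpufreq pattern:

	# .config fragment: build Processor Clocking Control as a module
	CONFIG_X86_PCC_CPUFREQ=m

	# at runtime:
	#   modprobe pcc-cpufreq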
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
index 509296df294..1840c0a5170 100644
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ b/arch/x86/kernel/cpu/cpufreq/Makefile
@@ -4,6 +4,7 @@
4 4
5obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o 5obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o
6obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o 6obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o
7obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o
7obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o 8obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
8obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o 9obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
9obj-$(CONFIG_X86_LONGHAUL) += longhaul.o 10obj-$(CONFIG_X86_LONGHAUL) += longhaul.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 1b1920fa7c8..459168083b7 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -33,6 +33,7 @@
33#include <linux/cpufreq.h> 33#include <linux/cpufreq.h>
34#include <linux/compiler.h> 34#include <linux/compiler.h>
35#include <linux/dmi.h> 35#include <linux/dmi.h>
36#include <linux/slab.h>
36#include <trace/events/power.h> 37#include <trace/events/power.h>
37 38
38#include <linux/acpi.h> 39#include <linux/acpi.h>
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
index 006b278b0d5..c587db472a7 100644
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
@@ -20,7 +20,6 @@
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/init.h> 21#include <linux/init.h>
22 22
23#include <linux/slab.h>
24#include <linux/delay.h> 23#include <linux/delay.h>
25#include <linux/cpufreq.h> 24#include <linux/cpufreq.h>
26 25
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
index ac27ec2264d..16e3483be9e 100644
--- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
@@ -80,6 +80,7 @@
80#include <linux/cpufreq.h> 80#include <linux/cpufreq.h>
81#include <linux/pci.h> 81#include <linux/pci.h>
82#include <linux/errno.h> 82#include <linux/errno.h>
83#include <linux/slab.h>
83 84
84#include <asm/processor-cyrix.h> 85#include <asm/processor-cyrix.h>
85 86
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
index da5f70fcb76..e7b559d74c5 100644
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ b/arch/x86/kernel/cpu/cpufreq/longrun.c
@@ -9,7 +9,6 @@
9#include <linux/kernel.h> 9#include <linux/kernel.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/slab.h>
13#include <linux/cpufreq.h> 12#include <linux/cpufreq.h>
14#include <linux/timex.h> 13#include <linux/timex.h>
15 14
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index 86961519372..7b8a8ba67b0 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -25,7 +25,6 @@
25#include <linux/init.h> 25#include <linux/init.h>
26#include <linux/smp.h> 26#include <linux/smp.h>
27#include <linux/cpufreq.h> 27#include <linux/cpufreq.h>
28#include <linux/slab.h>
29#include <linux/cpumask.h> 28#include <linux/cpumask.h>
30#include <linux/timex.h> 29#include <linux/timex.h>
31 30
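In the new driver below, pcc_cmd() rings an ACPI doorbell register and
polls the shared-memory status word for CMD_COMPLETE; frequencies cross
that channel as a percentage of the nominal frequency (MHz), with the get
path reading the percentage from the low output byte and the set path
packing it into bits 8..15 of the input buffer. A standalone worked
example of that arithmetic (values are ours, not from the patch):

	#include <stdio.h>

	int main(void)
	{
		unsigned int nominal = 2000;	/* pcch_hdr->nominal, MHz */
		unsigned int output = 0x50;	/* low byte: 80% of nominal */
		unsigned int khz = nominal * (output & 0xff) / 100 * 1000;

		unsigned int target = 1600000;	/* kHz, for the set path */
		unsigned int input = 0x1 |
			((target * 100 / (nominal * 1000)) << 8);

		printf("get: %u kHz, set: input=0x%x\n", khz, input);
		return 0;	/* get: 1600000 kHz, set: input=0x5001 */
	}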
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
new file mode 100644
index 00000000000..ce7cde713e7
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
@@ -0,0 +1,621 @@
1/*
2 * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface
3 *
4 * Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com>
5 * Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
6 * Nagananda Chumbalkar <nagananda.chumbalkar@hp.com>
7 *
8 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; version 2 of the License.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON
17 * INFRINGEMENT. See the GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write to the Free Software Foundation, Inc.,
21 * 675 Mass Ave, Cambridge, MA 02139, USA.
22 *
23 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24 */
25
26#include <linux/kernel.h>
27#include <linux/module.h>
28#include <linux/init.h>
29#include <linux/smp.h>
30#include <linux/sched.h>
31#include <linux/cpufreq.h>
32#include <linux/compiler.h>
33#include <linux/slab.h>
34
35#include <linux/acpi.h>
36#include <linux/io.h>
37#include <linux/spinlock.h>
38#include <linux/uaccess.h>
39
40#include <acpi/processor.h>
41
42#define PCC_VERSION "1.00.00"
43#define POLL_LOOPS 300
44
45#define CMD_COMPLETE 0x1
46#define CMD_GET_FREQ 0x0
47#define CMD_SET_FREQ 0x1
48
49#define BUF_SZ 4
50
51#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
52 "pcc-cpufreq", msg)
53
54struct pcc_register_resource {
55 u8 descriptor;
56 u16 length;
57 u8 space_id;
58 u8 bit_width;
59 u8 bit_offset;
60 u8 access_size;
61 u64 address;
62} __attribute__ ((packed));
63
64struct pcc_memory_resource {
65 u8 descriptor;
66 u16 length;
67 u8 space_id;
68 u8 resource_usage;
69 u8 type_specific;
70 u64 granularity;
71 u64 minimum;
72 u64 maximum;
73 u64 translation_offset;
74 u64 address_length;
75} __attribute__ ((packed));
76
77static struct cpufreq_driver pcc_cpufreq_driver;
78
79struct pcc_header {
80 u32 signature;
81 u16 length;
82 u8 major;
83 u8 minor;
84 u32 features;
85 u16 command;
86 u16 status;
87 u32 latency;
88 u32 minimum_time;
89 u32 maximum_time;
90 u32 nominal;
91 u32 throttled_frequency;
92 u32 minimum_frequency;
93};
94
95static void __iomem *pcch_virt_addr;
96static struct pcc_header __iomem *pcch_hdr;
97
98static DEFINE_SPINLOCK(pcc_lock);
99
100static struct acpi_generic_address doorbell;
101
102static u64 doorbell_preserve;
103static u64 doorbell_write;
104
105static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f,
106 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46};
107
108struct pcc_cpu {
109 u32 input_offset;
110 u32 output_offset;
111};
112
113static struct pcc_cpu *pcc_cpu_info;
114
115static int pcc_cpufreq_verify(struct cpufreq_policy *policy)
116{
117 cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
118 policy->cpuinfo.max_freq);
119 return 0;
120}
121
122static inline void pcc_cmd(void)
123{
124 u64 doorbell_value;
125 int i;
126
127 acpi_read(&doorbell_value, &doorbell);
128 acpi_write((doorbell_value & doorbell_preserve) | doorbell_write,
129 &doorbell);
130
131 for (i = 0; i < POLL_LOOPS; i++) {
132 if (ioread16(&pcch_hdr->status) & CMD_COMPLETE)
133 break;
134 }
135}
136
137static inline void pcc_clear_mapping(void)
138{
139 if (pcch_virt_addr)
140 iounmap(pcch_virt_addr);
141 pcch_virt_addr = NULL;
142}
143
144static unsigned int pcc_get_freq(unsigned int cpu)
145{
146 struct pcc_cpu *pcc_cpu_data;
147 unsigned int curr_freq;
148 unsigned int freq_limit;
149 u16 status;
150 u32 input_buffer;
151 u32 output_buffer;
152
153 spin_lock(&pcc_lock);
154
155 dprintk("get: get_freq for CPU %d\n", cpu);
156 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
157
158 input_buffer = 0x1;
159 iowrite32(input_buffer,
160 (pcch_virt_addr + pcc_cpu_data->input_offset));
161 iowrite16(CMD_GET_FREQ, &pcch_hdr->command);
162
163 pcc_cmd();
164
165 output_buffer =
166 ioread32(pcch_virt_addr + pcc_cpu_data->output_offset);
167
168 /* Clear the input buffer - we are done with the current command */
169 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
170
171 status = ioread16(&pcch_hdr->status);
172 if (status != CMD_COMPLETE) {
173 dprintk("get: FAILED: for CPU %d, status is %d\n",
174 cpu, status);
175 goto cmd_incomplete;
176 }
177 iowrite16(0, &pcch_hdr->status);
178 curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff))
179 / 100) * 1000);
180
181 dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is "
182 "0x%x, contains a value of: 0x%x. Speed is: %d MHz\n",
183 cpu, (pcch_virt_addr + pcc_cpu_data->output_offset),
184 output_buffer, curr_freq);
185
186 freq_limit = (output_buffer >> 8) & 0xff;
187 if (freq_limit != 0xff) {
188 dprintk("get: frequency for cpu %d is being temporarily"
189 " capped at %d\n", cpu, curr_freq);
190 }
191
192 spin_unlock(&pcc_lock);
193 return curr_freq;
194
195cmd_incomplete:
196 iowrite16(0, &pcch_hdr->status);
197 spin_unlock(&pcc_lock);
198 return 0;	/* ->get() reports unsigned kHz; 0 signals failure */
199}
200
201static int pcc_cpufreq_target(struct cpufreq_policy *policy,
202 unsigned int target_freq,
203 unsigned int relation)
204{
205 struct pcc_cpu *pcc_cpu_data;
206 struct cpufreq_freqs freqs;
207 u16 status;
208 u32 input_buffer;
209 int cpu;
210
211 spin_lock(&pcc_lock);
212 cpu = policy->cpu;
213 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
214
215 dprintk("target: CPU %d should go to target freq: %d "
216 "(virtual) input_offset is 0x%x\n",
217 cpu, target_freq,
218 (pcch_virt_addr + pcc_cpu_data->input_offset));
219
220 freqs.new = target_freq;
221 freqs.cpu = cpu;
222 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
223
224 input_buffer = 0x1 | (((target_freq * 100)
225 / (ioread32(&pcch_hdr->nominal) * 1000)) << 8);
226 iowrite32(input_buffer,
227 (pcch_virt_addr + pcc_cpu_data->input_offset));
228 iowrite16(CMD_SET_FREQ, &pcch_hdr->command);
229
230 pcc_cmd();
231
232 /* Clear the input buffer - we are done with the current command */
233 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
234
235 status = ioread16(&pcch_hdr->status);
236 if (status != CMD_COMPLETE) {
237 dprintk("target: FAILED for cpu %d, with status: 0x%x\n",
238 cpu, status);
239 goto cmd_incomplete;
240 }
241 iowrite16(0, &pcch_hdr->status);
242
243 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
244 dprintk("target: was SUCCESSFUL for cpu %d\n", cpu);
245 spin_unlock(&pcc_lock);
246
247 return 0;
248
249cmd_incomplete:
250 iowrite16(0, &pcch_hdr->status);
251 spin_unlock(&pcc_lock);
252 return -EINVAL;
253}
254
255static int pcc_get_offset(int cpu)
256{
257 acpi_status status;
258 struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
259 union acpi_object *pccp, *offset;
260 struct pcc_cpu *pcc_cpu_data;
261 struct acpi_processor *pr;
262 int ret = 0;
263
264 pr = per_cpu(processors, cpu);
265 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
266
267 status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer);
268 if (ACPI_FAILURE(status))
269 return -ENODEV;
270
271 pccp = buffer.pointer;
272 if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) {
273 ret = -ENODEV;
274 goto out_free;
275 }
276
277 offset = &(pccp->package.elements[0]);
278 if (!offset || offset->type != ACPI_TYPE_INTEGER) {
279 ret = -ENODEV;
280 goto out_free;
281 }
282
283 pcc_cpu_data->input_offset = offset->integer.value;
284
285 offset = &(pccp->package.elements[1]);
286 if (!offset || offset->type != ACPI_TYPE_INTEGER) {
287 ret = -ENODEV;
288 goto out_free;
289 }
290
291 pcc_cpu_data->output_offset = offset->integer.value;
292
293 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
294 memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ);
295
296 dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data "
297 "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n",
298 cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset);
299out_free:
300 kfree(buffer.pointer);
301 return ret;
302}
303
304static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
305{
306 acpi_status status;
307 struct acpi_object_list input;
308 struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
309 union acpi_object in_params[4];
310 union acpi_object *out_obj;
311 u32 capabilities[2];
312 u32 errors;
313 u32 supported;
314 int ret = 0;
315
316 input.count = 4;
317 input.pointer = in_params;
320 in_params[0].type = ACPI_TYPE_BUFFER;
321 in_params[0].buffer.length = 16;
322 in_params[0].buffer.pointer = OSC_UUID;
323 in_params[1].type = ACPI_TYPE_INTEGER;
324 in_params[1].integer.value = 1;
325 in_params[2].type = ACPI_TYPE_INTEGER;
326 in_params[2].integer.value = 2;
327 in_params[3].type = ACPI_TYPE_BUFFER;
328 in_params[3].buffer.length = 8;
329 in_params[3].buffer.pointer = (u8 *)&capabilities;
330
331 capabilities[0] = OSC_QUERY_ENABLE;
332 capabilities[1] = 0x1;
333
334 status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
335 if (ACPI_FAILURE(status))
336 return -ENODEV;
337
338 if (!output.length)
339 return -ENODEV;
340
341 out_obj = output.pointer;
342 if (out_obj->type != ACPI_TYPE_BUFFER) {
343 ret = -ENODEV;
344 goto out_free;
345 }
346
347 errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
348 if (errors) {
349 ret = -ENODEV;
350 goto out_free;
351 }
352
353 supported = *((u32 *)(out_obj->buffer.pointer + 4));
354 if (!(supported & 0x1)) {
355 ret = -ENODEV;
356 goto out_free;
357 }
358
359 kfree(output.pointer);
360 capabilities[0] = 0x0;
361 capabilities[1] = 0x1;
362
363 status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
364 if (ACPI_FAILURE(status))
365 return -ENODEV;
366
367 if (!output.length)
368 return -ENODEV;
369
370 out_obj = output.pointer;
371 if (out_obj->type != ACPI_TYPE_BUFFER) {
372 ret = -ENODEV;
373 goto out_free;
374 }
375
376 errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
377 if (errors) {
378 ret = -ENODEV;
379 goto out_free;
380 }
381
382 supported = *((u32 *)(out_obj->buffer.pointer + 4));
383 if (!(supported & 0x1)) {
384 ret = -ENODEV;
385 goto out_free;
386 }
387
388out_free:
389 kfree(output.pointer);
390 return ret;
391}
392
393static int __init pcc_cpufreq_probe(void)
394{
395 acpi_status status;
396 struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
397 struct pcc_memory_resource *mem_resource;
398 struct pcc_register_resource *reg_resource;
399 union acpi_object *out_obj, *member;
400 acpi_handle handle, osc_handle;
401 int ret = 0;
402
403 status = acpi_get_handle(NULL, "\\_SB", &handle);
404 if (ACPI_FAILURE(status))
405 return -ENODEV;
406
407 status = acpi_get_handle(handle, "_OSC", &osc_handle);
408 if (ACPI_SUCCESS(status)) {
409 ret = pcc_cpufreq_do_osc(&osc_handle);
410 if (ret)
411 dprintk("probe: _OSC evaluation did not succeed\n");
412 /* Firmware's use of _OSC is optional */
413 ret = 0;
414 }
415
416 status = acpi_evaluate_object(handle, "PCCH", NULL, &output);
417 if (ACPI_FAILURE(status))
418 return -ENODEV;
419
420 out_obj = output.pointer;
421 if (out_obj->type != ACPI_TYPE_PACKAGE) {
422 ret = -ENODEV;
423 goto out_free;
424 }
425
426 member = &out_obj->package.elements[0];
427 if (member->type != ACPI_TYPE_BUFFER) {
428 ret = -ENODEV;
429 goto out_free;
430 }
431
432 mem_resource = (struct pcc_memory_resource *)member->buffer.pointer;
433
434 dprintk("probe: mem_resource descriptor: 0x%x,"
435 " length: %d, space_id: %d, resource_usage: %d,"
436 " type_specific: %d, granularity: 0x%llx,"
437 " minimum: 0x%llx, maximum: 0x%llx,"
438 " translation_offset: 0x%llx, address_length: 0x%llx\n",
439 mem_resource->descriptor, mem_resource->length,
440 mem_resource->space_id, mem_resource->resource_usage,
441 mem_resource->type_specific, mem_resource->granularity,
442 mem_resource->minimum, mem_resource->maximum,
443 mem_resource->translation_offset,
444 mem_resource->address_length);
445
446 if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) {
447 ret = -ENODEV;
448 goto out_free;
449 }
450
451 pcch_virt_addr = ioremap_nocache(mem_resource->minimum,
452 mem_resource->address_length);
453 if (pcch_virt_addr == NULL) {
454 dprintk("probe: could not map shared mem region\n");
	ret = -ENODEV;
455 goto out_free;
456 }
457 pcch_hdr = pcch_virt_addr;
458
459 dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr);
460 dprintk("probe: PCCH header is at physical address: 0x%llx,"
461 " signature: 0x%x, length: %d bytes, major: %d, minor: %d,"
462 " supported features: 0x%x, command field: 0x%x,"
463 " status field: 0x%x, nominal latency: %d us\n",
464 mem_resource->minimum, ioread32(&pcch_hdr->signature),
465 ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major),
466 ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features),
467 ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status),
468 ioread32(&pcch_hdr->latency));
469
470 dprintk("probe: min time between commands: %d us,"
471 " max time between commands: %d us,"
472 " nominal CPU frequency: %d MHz,"
473 " minimum CPU frequency: %d MHz,"
474 " minimum CPU frequency without throttling: %d MHz\n",
475 ioread32(&pcch_hdr->minimum_time),
476 ioread32(&pcch_hdr->maximum_time),
477 ioread32(&pcch_hdr->nominal),
478 ioread32(&pcch_hdr->throttled_frequency),
479 ioread32(&pcch_hdr->minimum_frequency));
480
481 member = &out_obj->package.elements[1];
482 if (member->type != ACPI_TYPE_BUFFER) {
483 ret = -ENODEV;
484 goto pcch_free;
485 }
486
487 reg_resource = (struct pcc_register_resource *)member->buffer.pointer;
488
489 doorbell.space_id = reg_resource->space_id;
490 doorbell.bit_width = reg_resource->bit_width;
491 doorbell.bit_offset = reg_resource->bit_offset;
492 doorbell.access_width = 64;
493 doorbell.address = reg_resource->address;
494
495 dprintk("probe: doorbell: space_id is %d, bit_width is %d, "
496 "bit_offset is %d, access_width is %d, address is 0x%llx\n",
497 doorbell.space_id, doorbell.bit_width, doorbell.bit_offset,
498 doorbell.access_width, reg_resource->address);
499
500 member = &out_obj->package.elements[2];
501 if (member->type != ACPI_TYPE_INTEGER) {
502 ret = -ENODEV;
503 goto pcch_free;
504 }
505
506 doorbell_preserve = member->integer.value;
507
508 member = &out_obj->package.elements[3];
509 if (member->type != ACPI_TYPE_INTEGER) {
510 ret = -ENODEV;
511 goto pcch_free;
512 }
513
514 doorbell_write = member->integer.value;
515
516 dprintk("probe: doorbell_preserve: 0x%llx,"
517 " doorbell_write: 0x%llx\n",
518 doorbell_preserve, doorbell_write);
519
520 pcc_cpu_info = alloc_percpu(struct pcc_cpu);
521 if (!pcc_cpu_info) {
522 ret = -ENOMEM;
523 goto pcch_free;
524 }
525
526 printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency"
527 " limits: %d MHz, %d MHz\n", PCC_VERSION,
528 ioread32(&pcch_hdr->minimum_frequency),
529 ioread32(&pcch_hdr->nominal));
530 kfree(output.pointer);
531 return ret;
532pcch_free:
533 pcc_clear_mapping();
534out_free:
535 kfree(output.pointer);
536 return ret;
537}
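
/*
 * For reference, the PCCH package unpacked above carries four
 * elements: [0] a buffer describing the shared memory region, [1] a
 * buffer describing the doorbell register, [2] an integer mask of
 * doorbell bits to preserve, and [3] the integer value written to
 * ring the doorbell.
 */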
538
539static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy)
540{
541 unsigned int cpu = policy->cpu;
542 int result = 0;
543
544 if (!pcch_virt_addr) {
545 result = -ENODEV;
546 goto pcch_null;
547 }
548
549 result = pcc_get_offset(cpu);
550 if (result) {
551 dprintk("init: PCCP evaluation failed\n");
552 goto pcch_null;
553 }
554
555 policy->max = policy->cpuinfo.max_freq =
556 ioread32(&pcch_hdr->nominal) * 1000;
557 policy->min = policy->cpuinfo.min_freq =
558 ioread32(&pcch_hdr->minimum_frequency) * 1000;
559 policy->cur = pcc_get_freq(cpu);
560
561 dprintk("init: policy->max is %d, policy->min is %d\n",
562 policy->max, policy->min);
563
564 return 0;
568pcch_null:
569 return result;
570}
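
/*
 * The "* 1000" scaling above converts the MHz values reported in
 * the PCCH header into the kHz units the cpufreq core expects for
 * policy and cpuinfo limits.
 */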
571
572static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy)
573{
574 return 0;
575}
576
577static struct cpufreq_driver pcc_cpufreq_driver = {
578 .flags = CPUFREQ_CONST_LOOPS,
579 .get = pcc_get_freq,
580 .verify = pcc_cpufreq_verify,
581 .target = pcc_cpufreq_target,
582 .init = pcc_cpufreq_cpu_init,
583 .exit = pcc_cpufreq_cpu_exit,
584 .name = "pcc-cpufreq",
585 .owner = THIS_MODULE,
586};
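
/*
 * CPUFREQ_CONST_LOOPS tells the cpufreq core that loops_per_jiffy
 * is not affected by the frequency transitions this driver performs,
 * so it need not be rescaled after a P-state change.
 */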
587
588static int __init pcc_cpufreq_init(void)
589{
590 int ret;
591
592 if (acpi_disabled)
593 return -ENODEV;
594
595 ret = pcc_cpufreq_probe();
596 if (ret) {
597 dprintk("pcc_cpufreq_init: PCCH evaluation failed\n");
598 return ret;
599 }
600
601 ret = cpufreq_register_driver(&pcc_cpufreq_driver);
602
603 return ret;
604}
605
606static void __exit pcc_cpufreq_exit(void)
607{
608 cpufreq_unregister_driver(&pcc_cpufreq_driver);
609
610 pcc_clear_mapping();
611
612 free_percpu(pcc_cpu_info);
613}
614
615MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar");
616MODULE_VERSION(PCC_VERSION);
617MODULE_DESCRIPTION("Processor Clocking Control interface driver");
618MODULE_LICENSE("GPL");
619
620late_initcall(pcc_cpufreq_init);
621module_exit(pcc_cpufreq_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
index cb01dac267d..b3379d6a5c5 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -13,7 +13,6 @@
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/cpufreq.h> 14#include <linux/cpufreq.h>
15#include <linux/ioport.h> 15#include <linux/ioport.h>
16#include <linux/slab.h>
17#include <linux/timex.h> 16#include <linux/timex.h>
18#include <linux/io.h> 17#include <linux/io.h>
19 18
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 6e44519960c..b6215b9798e 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -806,7 +806,7 @@ static int find_psb_table(struct powernow_k8_data *data)
806static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, 806static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data,
807 unsigned int index) 807 unsigned int index)
808{ 808{
809 acpi_integer control; 809 u64 control;
810 810
811 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) 811 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
812 return; 812 return;
@@ -824,7 +824,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
824{ 824{
825 struct cpufreq_frequency_table *powernow_table; 825 struct cpufreq_frequency_table *powernow_table;
826 int ret_val = -ENODEV; 826 int ret_val = -ENODEV;
827 acpi_integer control, status; 827 u64 control, status;
828 828
829 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { 829 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
830 dprintk("register performance failed: bad ACPI data\n"); 830 dprintk("register performance failed: bad ACPI data\n");
@@ -929,7 +929,8 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data,
929 powernow_table[i].index = index; 929 powernow_table[i].index = index;
930 930
931 /* Frequency may be rounded for these */ 931 /* Frequency may be rounded for these */
932 if (boot_cpu_data.x86 == 0x10 || boot_cpu_data.x86 == 0x11) { 932 if ((boot_cpu_data.x86 == 0x10 && boot_cpu_data.x86_model < 10)
933 || boot_cpu_data.x86 == 0x11) {
933 powernow_table[i].frequency = 934 powernow_table[i].frequency =
934 freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7); 935 freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7);
935 } else 936 } else
@@ -948,7 +949,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
948 u32 fid; 949 u32 fid;
949 u32 vid; 950 u32 vid;
950 u32 freq, index; 951 u32 freq, index;
951 acpi_integer status, control; 952 u64 status, control;
952 953
953 if (data->exttype) { 954 if (data->exttype) {
954 status = data->acpi_data.states[i].status; 955 status = data->acpi_data.states[i].status;
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index 8d672ef162c..9b1ff37de46 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -20,6 +20,7 @@
20#include <linux/sched.h> /* current */ 20#include <linux/sched.h> /* current */
21#include <linux/delay.h> 21#include <linux/delay.h>
22#include <linux/compiler.h> 22#include <linux/compiler.h>
23#include <linux/gfp.h>
23 24
24#include <asm/msr.h> 25#include <asm/msr.h>
25#include <asm/processor.h> 26#include <asm/processor.h>
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 2ce8e0b5cc5..561758e9518 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -23,7 +23,6 @@
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/cpufreq.h> 24#include <linux/cpufreq.h>
25#include <linux/pci.h> 25#include <linux/pci.h>
26#include <linux/slab.h>
27#include <linux/sched.h> 26#include <linux/sched.h>
28 27
29#include "speedstep-lib.h" 28#include "speedstep-lib.h"
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
index ad0083abfa2..a94ec6be69f 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
@@ -13,7 +13,6 @@
13#include <linux/moduleparam.h> 13#include <linux/moduleparam.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/cpufreq.h> 15#include <linux/cpufreq.h>
16#include <linux/slab.h>
17 16
18#include <asm/msr.h> 17#include <asm/msr.h>
19#include <asm/tsc.h> 18#include <asm/tsc.h>
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
index 04d73c114e4..8abd869baab 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
@@ -17,7 +17,6 @@
17#include <linux/moduleparam.h> 17#include <linux/moduleparam.h>
18#include <linux/init.h> 18#include <linux/init.h>
19#include <linux/cpufreq.h> 19#include <linux/cpufreq.h>
20#include <linux/slab.h>
21#include <linux/delay.h> 20#include <linux/delay.h>
22#include <linux/io.h> 21#include <linux/io.h>
23#include <asm/ist.h> 22#include <asm/ist.h>
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 879666f4d87..1366c7cfd48 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -47,6 +47,27 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
47 (c->x86 == 0x6 && c->x86_model >= 0x0e)) 47 (c->x86 == 0x6 && c->x86_model >= 0x0e))
48 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 48 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
49 49
50 /*
51 * Atom erratum AAE44/AAF40/AAG38/AAH41:
52 *
53 * A race condition between speculative fetches and invalidating
54 * a large page. This is worked around in microcode, but we
55 * need the microcode to have already been loaded... so if it is
56 * not, recommend a BIOS update and disable large pages.
57 */
58 if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2) {
59 u32 ucode, junk;
60
61 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
62 sync_core();
63 rdmsr(MSR_IA32_UCODE_REV, junk, ucode);
64
65 if (ucode < 0x20e) {
66 printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n");
67 clear_cpu_cap(c, X86_FEATURE_PSE);
68 }
69 }
70
50#ifdef CONFIG_X86_64 71#ifdef CONFIG_X86_64
51 set_cpu_cap(c, X86_FEATURE_SYSENTER32); 72 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
52#else 73#else
@@ -70,7 +91,8 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
70 if (c->x86_power & (1 << 8)) { 91 if (c->x86_power & (1 << 8)) {
71 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 92 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
72 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); 93 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
73 sched_clock_stable = 1; 94 if (!check_tsc_unstable())
95 sched_clock_stable = 1;
74 } 96 }
75 97
76 /* 98 /*
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index eddb1bdd1b8..b3eeb66c0a5 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -903,7 +903,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
903 return ret; 903 return ret;
904} 904}
905 905
906static struct sysfs_ops sysfs_ops = { 906static const struct sysfs_ops sysfs_ops = {
907 .show = show, 907 .show = show,
908 .store = store, 908 .store = store,
909}; 909};
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index 73734baa50f..e7dbde7bfed 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -22,6 +22,7 @@
22#include <linux/kdebug.h> 22#include <linux/kdebug.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/gfp.h>
25#include <asm/mce.h> 26#include <asm/mce.h>
26#include <asm/apic.h> 27#include <asm/apic.h>
27 28
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index a8aacd4b513..8a6f0afa767 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -26,6 +26,7 @@
26#include <linux/sched.h> 26#include <linux/sched.h>
27#include <linux/sysfs.h> 27#include <linux/sysfs.h>
28#include <linux/types.h> 28#include <linux/types.h>
29#include <linux/slab.h>
29#include <linux/init.h> 30#include <linux/init.h>
30#include <linux/kmod.h> 31#include <linux/kmod.h>
31#include <linux/poll.h> 32#include <linux/poll.h>
@@ -46,6 +47,13 @@
46 47
47#include "mce-internal.h" 48#include "mce-internal.h"
48 49
50static DEFINE_MUTEX(mce_read_mutex);
51
52#define rcu_dereference_check_mce(p) \
53 rcu_dereference_check((p), \
54 rcu_read_lock_sched_held() || \
55 lockdep_is_held(&mce_read_mutex))
56
49#define CREATE_TRACE_POINTS 57#define CREATE_TRACE_POINTS
50#include <trace/events/mce.h> 58#include <trace/events/mce.h>
51 59
@@ -158,7 +166,7 @@ void mce_log(struct mce *mce)
158 mce->finished = 0; 166 mce->finished = 0;
159 wmb(); 167 wmb();
160 for (;;) { 168 for (;;) {
161 entry = rcu_dereference(mcelog.next); 169 entry = rcu_dereference_check_mce(mcelog.next);
162 for (;;) { 170 for (;;) {
163 /* 171 /*
164 * When the buffer fills up discard new entries. 172 * When the buffer fills up discard new entries.
@@ -1485,8 +1493,6 @@ static void collect_tscs(void *data)
1485 rdtscll(cpu_tsc[smp_processor_id()]); 1493 rdtscll(cpu_tsc[smp_processor_id()]);
1486} 1494}
1487 1495
1488static DEFINE_MUTEX(mce_read_mutex);
1489
1490static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, 1496static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1491 loff_t *off) 1497 loff_t *off)
1492{ 1498{
@@ -1500,7 +1506,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1500 return -ENOMEM; 1506 return -ENOMEM;
1501 1507
1502 mutex_lock(&mce_read_mutex); 1508 mutex_lock(&mce_read_mutex);
1503 next = rcu_dereference(mcelog.next); 1509 next = rcu_dereference_check_mce(mcelog.next);
1504 1510
1505 /* Only supports full reads right now */ 1511 /* Only supports full reads right now */
1506 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { 1512 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
@@ -1565,7 +1571,7 @@ timeout:
1565static unsigned int mce_poll(struct file *file, poll_table *wait) 1571static unsigned int mce_poll(struct file *file, poll_table *wait)
1566{ 1572{
1567 poll_wait(file, &mce_wait, wait); 1573 poll_wait(file, &mce_wait, wait);
1568 if (rcu_dereference(mcelog.next)) 1574 if (rcu_dereference_check_mce(mcelog.next))
1569 return POLLIN | POLLRDNORM; 1575 return POLLIN | POLLRDNORM;
1570 return 0; 1576 return 0;
1571} 1577}
@@ -2044,6 +2050,7 @@ static __init void mce_init_banks(void)
2044 struct mce_bank *b = &mce_banks[i]; 2050 struct mce_bank *b = &mce_banks[i];
2045 struct sysdev_attribute *a = &b->attr; 2051 struct sysdev_attribute *a = &b->attr;
2046 2052
2053 sysfs_attr_init(&a->attr);
2047 a->attr.name = b->attrname; 2054 a->attr.name = b->attrname;
2048 snprintf(b->attrname, ATTR_LEN, "bank%d", i); 2055 snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2049 2056
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 83a3d1f4efc..224392d8fe8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -21,6 +21,7 @@
21#include <linux/errno.h> 21#include <linux/errno.h>
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/sysfs.h> 23#include <linux/sysfs.h>
24#include <linux/slab.h>
24#include <linux/init.h> 25#include <linux/init.h>
25#include <linux/cpu.h> 26#include <linux/cpu.h>
26#include <linux/smp.h> 27#include <linux/smp.h>
@@ -388,7 +389,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
388 return ret; 389 return ret;
389} 390}
390 391
391static struct sysfs_ops threshold_ops = { 392static const struct sysfs_ops threshold_ops = {
392 .show = show, 393 .show = show,
393 .store = store, 394 .store = store,
394}; 395};
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 7c785634af2..62b48e40920 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -5,6 +5,7 @@
5 * Author: Andi Kleen 5 * Author: Andi Kleen
6 */ 6 */
7 7
8#include <linux/gfp.h>
8#include <linux/init.h> 9#include <linux/init.h>
9#include <linux/interrupt.h> 10#include <linux/interrupt.h>
10#include <linux/percpu.h> 11#include <linux/percpu.h>
@@ -95,7 +96,7 @@ static void cmci_discover(int banks, int boot)
95 96
96 /* Already owned by someone else? */ 97 /* Already owned by someone else? */
97 if (val & CMCI_EN) { 98 if (val & CMCI_EN) {
98 if (test_and_clear_bit(i, owned) || boot) 99 if (test_and_clear_bit(i, owned) && !boot)
99 print_update("SHD", &hdr, i); 100 print_update("SHD", &hdr, i);
100 __clear_bit(i, __get_cpu_var(mce_poll_banks)); 101 __clear_bit(i, __get_cpu_var(mce_poll_banks));
101 continue; 102 continue;
@@ -107,7 +108,7 @@ static void cmci_discover(int banks, int boot)
107 108
108 /* Did the enable bit stick? -- the bank supports CMCI */ 109 /* Did the enable bit stick? -- the bank supports CMCI */
109 if (val & CMCI_EN) { 110 if (val & CMCI_EN) {
110 if (!test_and_set_bit(i, owned) || boot) 111 if (!test_and_set_bit(i, owned) && !boot)
111 print_update("CMCI", &hdr, i); 112 print_update("CMCI", &hdr, i);
112 __clear_bit(i, __get_cpu_var(mce_poll_banks)); 113 __clear_bit(i, __get_cpu_var(mce_poll_banks));
113 } else { 114 } else {
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 09b1698e046..06130b52f01 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -22,10 +22,10 @@
22#include <linux/pci.h> 22#include <linux/pci.h>
23#include <linux/smp.h> 23#include <linux/smp.h>
24#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/sort.h>
26#include <linux/mutex.h> 25#include <linux/mutex.h>
27#include <linux/uaccess.h> 26#include <linux/uaccess.h>
28#include <linux/kvm_para.h> 27#include <linux/kvm_para.h>
28#include <linux/range.h>
29 29
30#include <asm/processor.h> 30#include <asm/processor.h>
31#include <asm/e820.h> 31#include <asm/e820.h>
@@ -34,11 +34,6 @@
34 34
35#include "mtrr.h" 35#include "mtrr.h"
36 36
37struct res_range {
38 unsigned long start;
39 unsigned long end;
40};
41
42struct var_mtrr_range_state { 37struct var_mtrr_range_state {
43 unsigned long base_pfn; 38 unsigned long base_pfn;
44 unsigned long size_pfn; 39 unsigned long size_pfn;
@@ -56,7 +51,7 @@ struct var_mtrr_state {
56/* Should be related to MTRR_VAR_RANGES nums */ 51/* Should be related to MTRR_VAR_RANGES nums */
57#define RANGE_NUM 256 52#define RANGE_NUM 256
58 53
59static struct res_range __initdata range[RANGE_NUM]; 54static struct range __initdata range[RANGE_NUM];
60static int __initdata nr_range; 55static int __initdata nr_range;
61 56
62static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; 57static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
@@ -64,152 +59,11 @@ static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
64static int __initdata debug_print; 59static int __initdata debug_print;
65#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0) 60#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0)
66 61
67
68static int __init
69add_range(struct res_range *range, int nr_range,
70 unsigned long start, unsigned long end)
71{
72 /* Out of slots: */
73 if (nr_range >= RANGE_NUM)
74 return nr_range;
75
76 range[nr_range].start = start;
77 range[nr_range].end = end;
78
79 nr_range++;
80
81 return nr_range;
82}
83
84static int __init
85add_range_with_merge(struct res_range *range, int nr_range,
86 unsigned long start, unsigned long end)
87{
88 int i;
89
90 /* Try to merge it with old one: */
91 for (i = 0; i < nr_range; i++) {
92 unsigned long final_start, final_end;
93 unsigned long common_start, common_end;
94
95 if (!range[i].end)
96 continue;
97
98 common_start = max(range[i].start, start);
99 common_end = min(range[i].end, end);
100 if (common_start > common_end + 1)
101 continue;
102
103 final_start = min(range[i].start, start);
104 final_end = max(range[i].end, end);
105
106 range[i].start = final_start;
107 range[i].end = final_end;
108 return nr_range;
109 }
110
111 /* Need to add it: */
112 return add_range(range, nr_range, start, end);
113}
114
115static void __init
116subtract_range(struct res_range *range, unsigned long start, unsigned long end)
117{
118 int i, j;
119
120 for (j = 0; j < RANGE_NUM; j++) {
121 if (!range[j].end)
122 continue;
123
124 if (start <= range[j].start && end >= range[j].end) {
125 range[j].start = 0;
126 range[j].end = 0;
127 continue;
128 }
129
130 if (start <= range[j].start && end < range[j].end &&
131 range[j].start < end + 1) {
132 range[j].start = end + 1;
133 continue;
134 }
135
136
137 if (start > range[j].start && end >= range[j].end &&
138 range[j].end > start - 1) {
139 range[j].end = start - 1;
140 continue;
141 }
142
143 if (start > range[j].start && end < range[j].end) {
144 /* Find the new spare: */
145 for (i = 0; i < RANGE_NUM; i++) {
146 if (range[i].end == 0)
147 break;
148 }
149 if (i < RANGE_NUM) {
150 range[i].end = range[j].end;
151 range[i].start = end + 1;
152 } else {
153 printk(KERN_ERR "run of slot in ranges\n");
154 }
155 range[j].end = start - 1;
156 continue;
157 }
158 }
159}
160
161static int __init cmp_range(const void *x1, const void *x2)
162{
163 const struct res_range *r1 = x1;
164 const struct res_range *r2 = x2;
165 long start1, start2;
166
167 start1 = r1->start;
168 start2 = r2->start;
169
170 return start1 - start2;
171}
172
173static int __init clean_sort_range(struct res_range *range, int az)
174{
175 int i, j, k = az - 1, nr_range = 0;
176
177 for (i = 0; i < k; i++) {
178 if (range[i].end)
179 continue;
180 for (j = k; j > i; j--) {
181 if (range[j].end) {
182 k = j;
183 break;
184 }
185 }
186 if (j == i)
187 break;
188 range[i].start = range[k].start;
189 range[i].end = range[k].end;
190 range[k].start = 0;
191 range[k].end = 0;
192 k--;
193 }
194 /* count it */
195 for (i = 0; i < az; i++) {
196 if (!range[i].end) {
197 nr_range = i;
198 break;
199 }
200 }
201
202 /* sort them */
203 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
204
205 return nr_range;
206}
207
208#define BIOS_BUG_MSG KERN_WARNING \ 62#define BIOS_BUG_MSG KERN_WARNING \
209 "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" 63 "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
210 64
211static int __init 65static int __init
212x86_get_mtrr_mem_range(struct res_range *range, int nr_range, 66x86_get_mtrr_mem_range(struct range *range, int nr_range,
213 unsigned long extra_remove_base, 67 unsigned long extra_remove_base,
214 unsigned long extra_remove_size) 68 unsigned long extra_remove_size)
215{ 69{
@@ -223,14 +77,14 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
223 continue; 77 continue;
224 base = range_state[i].base_pfn; 78 base = range_state[i].base_pfn;
225 size = range_state[i].size_pfn; 79 size = range_state[i].size_pfn;
226 nr_range = add_range_with_merge(range, nr_range, base, 80 nr_range = add_range_with_merge(range, RANGE_NUM, nr_range,
227 base + size - 1); 81 base, base + size);
228 } 82 }
229 if (debug_print) { 83 if (debug_print) {
230 printk(KERN_DEBUG "After WB checking\n"); 84 printk(KERN_DEBUG "After WB checking\n");
231 for (i = 0; i < nr_range; i++) 85 for (i = 0; i < nr_range; i++)
232 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", 86 printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
233 range[i].start, range[i].end + 1); 87 range[i].start, range[i].end);
234 } 88 }
235 89
236 /* Take out UC ranges: */ 90 /* Take out UC ranges: */
@@ -252,19 +106,19 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
252 size -= (1<<(20-PAGE_SHIFT)) - base; 106 size -= (1<<(20-PAGE_SHIFT)) - base;
253 base = 1<<(20-PAGE_SHIFT); 107 base = 1<<(20-PAGE_SHIFT);
254 } 108 }
255 subtract_range(range, base, base + size - 1); 109 subtract_range(range, RANGE_NUM, base, base + size);
256 } 110 }
257 if (extra_remove_size) 111 if (extra_remove_size)
258 subtract_range(range, extra_remove_base, 112 subtract_range(range, RANGE_NUM, extra_remove_base,
259 extra_remove_base + extra_remove_size - 1); 113 extra_remove_base + extra_remove_size);
260 114
261 if (debug_print) { 115 if (debug_print) {
262 printk(KERN_DEBUG "After UC checking\n"); 116 printk(KERN_DEBUG "After UC checking\n");
263 for (i = 0; i < RANGE_NUM; i++) { 117 for (i = 0; i < RANGE_NUM; i++) {
264 if (!range[i].end) 118 if (!range[i].end)
265 continue; 119 continue;
266 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", 120 printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
267 range[i].start, range[i].end + 1); 121 range[i].start, range[i].end);
268 } 122 }
269 } 123 }
270 124
@@ -273,26 +127,22 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
273 if (debug_print) { 127 if (debug_print) {
274 printk(KERN_DEBUG "After sorting\n"); 128 printk(KERN_DEBUG "After sorting\n");
275 for (i = 0; i < nr_range; i++) 129 for (i = 0; i < nr_range; i++)
276 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", 130 printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
277 range[i].start, range[i].end + 1); 131 range[i].start, range[i].end);
278 } 132 }
279 133
280 /* clear those is not used */
281 for (i = nr_range; i < RANGE_NUM; i++)
282 memset(&range[i], 0, sizeof(range[i]));
283
284 return nr_range; 134 return nr_range;
285} 135}
286 136
287#ifdef CONFIG_MTRR_SANITIZER 137#ifdef CONFIG_MTRR_SANITIZER
288 138
289static unsigned long __init sum_ranges(struct res_range *range, int nr_range) 139static unsigned long __init sum_ranges(struct range *range, int nr_range)
290{ 140{
291 unsigned long sum = 0; 141 unsigned long sum = 0;
292 int i; 142 int i;
293 143
294 for (i = 0; i < nr_range; i++) 144 for (i = 0; i < nr_range; i++)
295 sum += range[i].end + 1 - range[i].start; 145 sum += range[i].end - range[i].start;
296 146
297 return sum; 147 return sum;
298} 148}
@@ -621,7 +471,7 @@ static int __init parse_mtrr_spare_reg(char *arg)
621early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); 471early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
622 472
623static int __init 473static int __init
624x86_setup_var_mtrrs(struct res_range *range, int nr_range, 474x86_setup_var_mtrrs(struct range *range, int nr_range,
625 u64 chunk_size, u64 gran_size) 475 u64 chunk_size, u64 gran_size)
626{ 476{
627 struct var_mtrr_state var_state; 477 struct var_mtrr_state var_state;
@@ -639,7 +489,7 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range,
639 /* Write the range: */ 489 /* Write the range: */
640 for (i = 0; i < nr_range; i++) { 490 for (i = 0; i < nr_range; i++) {
641 set_var_mtrr_range(&var_state, range[i].start, 491 set_var_mtrr_range(&var_state, range[i].start,
642 range[i].end - range[i].start + 1); 492 range[i].end - range[i].start);
643 } 493 }
644 494
645 /* Write the last range: */ 495 /* Write the last range: */
@@ -742,7 +592,7 @@ mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
742 unsigned long x_remove_base, 592 unsigned long x_remove_base,
743 unsigned long x_remove_size, int i) 593 unsigned long x_remove_size, int i)
744{ 594{
745 static struct res_range range_new[RANGE_NUM]; 595 static struct range range_new[RANGE_NUM];
746 unsigned long range_sums_new; 596 unsigned long range_sums_new;
747 static int nr_range_new; 597 static int nr_range_new;
748 int num_reg; 598 int num_reg;
@@ -869,10 +719,10 @@ int __init mtrr_cleanup(unsigned address_bits)
869 * [0, 1M) should always be covered by var mtrr with WB 719 * [0, 1M) should always be covered by var mtrr with WB
870 * and fixed mtrrs should take effect before var mtrr for it: 720 * and fixed mtrrs should take effect before var mtrr for it:
871 */ 721 */
872 nr_range = add_range_with_merge(range, nr_range, 0, 722 nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, 0,
873 (1ULL<<(20 - PAGE_SHIFT)) - 1); 723 1ULL<<(20 - PAGE_SHIFT));
874 /* Sort the ranges: */ 724 /* Sort the ranges: */
875 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); 725 sort_range(range, nr_range);
876 726
877 range_sums = sum_ranges(range, nr_range); 727 range_sums = sum_ranges(range, nr_range);
878 printk(KERN_INFO "total RAM covered: %ldM\n", 728 printk(KERN_INFO "total RAM covered: %ldM\n",
@@ -1089,9 +939,9 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1089 nr_range = 0; 939 nr_range = 0;
1090 if (mtrr_tom2) { 940 if (mtrr_tom2) {
1091 range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT)); 941 range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
1092 range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1; 942 range[nr_range].end = mtrr_tom2 >> PAGE_SHIFT;
1093 if (highest_pfn < range[nr_range].end + 1) 943 if (highest_pfn < range[nr_range].end)
1094 highest_pfn = range[nr_range].end + 1; 944 highest_pfn = range[nr_range].end;
1095 nr_range++; 945 nr_range++;
1096 } 946 }
1097 nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); 947 nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
@@ -1103,15 +953,15 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1103 953
1104 /* Check the holes: */ 954 /* Check the holes: */
1105 for (i = 0; i < nr_range - 1; i++) { 955 for (i = 0; i < nr_range - 1; i++) {
1106 if (range[i].end + 1 < range[i+1].start) 956 if (range[i].end < range[i+1].start)
1107 total_trim_size += real_trim_memory(range[i].end + 1, 957 total_trim_size += real_trim_memory(range[i].end,
1108 range[i+1].start); 958 range[i+1].start);
1109 } 959 }
1110 960
1111 /* Check the top: */ 961 /* Check the top: */
1112 i = nr_range - 1; 962 i = nr_range - 1;
1113 if (range[i].end + 1 < end_pfn) 963 if (range[i].end < end_pfn)
1114 total_trim_size += real_trim_memory(range[i].end + 1, 964 total_trim_size += real_trim_memory(range[i].end,
1115 end_pfn); 965 end_pfn);
1116 966
1117 if (total_trim_size) { 967 if (total_trim_size) {
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 9aa5dc76ff4..fd31a441c61 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -6,7 +6,6 @@
6 6
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/init.h> 8#include <linux/init.h>
9#include <linux/slab.h>
10#include <linux/io.h> 9#include <linux/io.h>
11#include <linux/mm.h> 10#include <linux/mm.h>
12 11
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index e006e56f699..79289632cb2 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -5,6 +5,7 @@
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/ctype.h> 6#include <linux/ctype.h>
7#include <linux/string.h> 7#include <linux/string.h>
8#include <linux/slab.h>
8#include <linux/init.h> 9#include <linux/init.h>
9 10
10#define LINE_SIZE 80 11#define LINE_SIZE 80
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index fe4622e8c83..79556bd9b60 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -145,6 +145,7 @@ struct set_mtrr_data {
145 145
146/** 146/**
147 * ipi_handler - Synchronisation handler. Executed by "other" CPUs. 147 * ipi_handler - Synchronisation handler. Executed by "other" CPUs.
148 * @info: pointer to mtrr configuration data
148 * 149 *
149 * Returns nothing. 150 * Returns nothing.
150 */ 151 */
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 641ccb9dddb..db5bdc8addf 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -21,6 +21,7 @@
21#include <linux/kdebug.h> 21#include <linux/kdebug.h>
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/slab.h>
24#include <linux/highmem.h> 25#include <linux/highmem.h>
25#include <linux/cpu.h> 26#include <linux/cpu.h>
26#include <linux/bitops.h> 27#include <linux/bitops.h>
@@ -28,6 +29,7 @@
28#include <asm/apic.h> 29#include <asm/apic.h>
29#include <asm/stacktrace.h> 30#include <asm/stacktrace.h>
30#include <asm/nmi.h> 31#include <asm/nmi.h>
32#include <asm/compat.h>
31 33
32static u64 perf_event_mask __read_mostly; 34static u64 perf_event_mask __read_mostly;
33 35
@@ -73,10 +75,10 @@ struct debug_store {
73struct event_constraint { 75struct event_constraint {
74 union { 76 union {
75 unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 77 unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
76 u64 idxmsk64[1]; 78 u64 idxmsk64;
77 }; 79 };
78 int code; 80 u64 code;
79 int cmask; 81 u64 cmask;
80 int weight; 82 int weight;
81}; 83};
82 84
@@ -103,7 +105,7 @@ struct cpu_hw_events {
103}; 105};
104 106
105#define __EVENT_CONSTRAINT(c, n, m, w) {\ 107#define __EVENT_CONSTRAINT(c, n, m, w) {\
106 { .idxmsk64[0] = (n) }, \ 108 { .idxmsk64 = (n) }, \
107 .code = (c), \ 109 .code = (c), \
108 .cmask = (m), \ 110 .cmask = (m), \
109 .weight = (w), \ 111 .weight = (w), \
@@ -116,7 +118,7 @@ struct cpu_hw_events {
116 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK) 118 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK)
117 119
118#define FIXED_EVENT_CONSTRAINT(c, n) \ 120#define FIXED_EVENT_CONSTRAINT(c, n) \
119 EVENT_CONSTRAINT(c, n, INTEL_ARCH_FIXED_MASK) 121 EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK)
120 122
121#define EVENT_CONSTRAINT_END \ 123#define EVENT_CONSTRAINT_END \
122 EVENT_CONSTRAINT(0, 0, 0) 124 EVENT_CONSTRAINT(0, 0, 0)
@@ -133,8 +135,8 @@ struct x86_pmu {
133 int (*handle_irq)(struct pt_regs *); 135 int (*handle_irq)(struct pt_regs *);
134 void (*disable_all)(void); 136 void (*disable_all)(void);
135 void (*enable_all)(void); 137 void (*enable_all)(void);
136 void (*enable)(struct hw_perf_event *, int); 138 void (*enable)(struct perf_event *);
137 void (*disable)(struct hw_perf_event *, int); 139 void (*disable)(struct perf_event *);
138 unsigned eventsel; 140 unsigned eventsel;
139 unsigned perfctr; 141 unsigned perfctr;
140 u64 (*event_map)(int); 142 u64 (*event_map)(int);
@@ -157,6 +159,11 @@ struct x86_pmu {
157 void (*put_event_constraints)(struct cpu_hw_events *cpuc, 159 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
158 struct perf_event *event); 160 struct perf_event *event);
159 struct event_constraint *event_constraints; 161 struct event_constraint *event_constraints;
162
163 int (*cpu_prepare)(int cpu);
164 void (*cpu_starting)(int cpu);
165 void (*cpu_dying)(int cpu);
166 void (*cpu_dead)(int cpu);
160}; 167};
161 168
162static struct x86_pmu x86_pmu __read_mostly; 169static struct x86_pmu x86_pmu __read_mostly;
@@ -165,8 +172,7 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
165 .enabled = 1, 172 .enabled = 1,
166}; 173};
167 174
168static int x86_perf_event_set_period(struct perf_event *event, 175static int x86_perf_event_set_period(struct perf_event *event);
169 struct hw_perf_event *hwc, int idx);
170 176
171/* 177/*
172 * Generalized hw caching related hw_event table, filled 178 * Generalized hw caching related hw_event table, filled
@@ -189,11 +195,12 @@ static u64 __read_mostly hw_cache_event_ids
189 * Returns the delta events processed. 195 * Returns the delta events processed.
190 */ 196 */
191static u64 197static u64
192x86_perf_event_update(struct perf_event *event, 198x86_perf_event_update(struct perf_event *event)
193 struct hw_perf_event *hwc, int idx)
194{ 199{
200 struct hw_perf_event *hwc = &event->hw;
195 int shift = 64 - x86_pmu.event_bits; 201 int shift = 64 - x86_pmu.event_bits;
196 u64 prev_raw_count, new_raw_count; 202 u64 prev_raw_count, new_raw_count;
203 int idx = hwc->idx;
197 s64 delta; 204 s64 delta;
198 205
199 if (idx == X86_PMC_IDX_FIXED_BTS) 206 if (idx == X86_PMC_IDX_FIXED_BTS)
@@ -293,7 +300,7 @@ static inline bool bts_available(void)
293 return x86_pmu.enable_bts != NULL; 300 return x86_pmu.enable_bts != NULL;
294} 301}
295 302
296static inline void init_debug_store_on_cpu(int cpu) 303static void init_debug_store_on_cpu(int cpu)
297{ 304{
298 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; 305 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
299 306
@@ -305,7 +312,7 @@ static inline void init_debug_store_on_cpu(int cpu)
305 (u32)((u64)(unsigned long)ds >> 32)); 312 (u32)((u64)(unsigned long)ds >> 32));
306} 313}
307 314
308static inline void fini_debug_store_on_cpu(int cpu) 315static void fini_debug_store_on_cpu(int cpu)
309{ 316{
310 if (!per_cpu(cpu_hw_events, cpu).ds) 317 if (!per_cpu(cpu_hw_events, cpu).ds)
311 return; 318 return;
@@ -503,6 +510,9 @@ static int __hw_perf_event_init(struct perf_event *event)
503 */ 510 */
504 if (attr->type == PERF_TYPE_RAW) { 511 if (attr->type == PERF_TYPE_RAW) {
505 hwc->config |= x86_pmu.raw_event(attr->config); 512 hwc->config |= x86_pmu.raw_event(attr->config);
513 if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) &&
514 perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
515 return -EACCES;
506 return 0; 516 return 0;
507 } 517 }
508 518
@@ -553,9 +563,9 @@ static void x86_pmu_disable_all(void)
553 if (!test_bit(idx, cpuc->active_mask)) 563 if (!test_bit(idx, cpuc->active_mask))
554 continue; 564 continue;
555 rdmsrl(x86_pmu.eventsel + idx, val); 565 rdmsrl(x86_pmu.eventsel + idx, val);
556 if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) 566 if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
557 continue; 567 continue;
558 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; 568 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
559 wrmsrl(x86_pmu.eventsel + idx, val); 569 wrmsrl(x86_pmu.eventsel + idx, val);
560 } 570 }
561} 571}
@@ -590,7 +600,7 @@ static void x86_pmu_enable_all(void)
590 continue; 600 continue;
591 601
592 val = event->hw.config; 602 val = event->hw.config;
593 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 603 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
594 wrmsrl(x86_pmu.eventsel + idx, val); 604 wrmsrl(x86_pmu.eventsel + idx, val);
595 } 605 }
596} 606}
@@ -612,8 +622,8 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
612 bitmap_zero(used_mask, X86_PMC_IDX_MAX); 622 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
613 623
614 for (i = 0; i < n; i++) { 624 for (i = 0; i < n; i++) {
615 constraints[i] = 625 c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
616 x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); 626 constraints[i] = c;
617 } 627 }
618 628
619 /* 629 /*
@@ -635,7 +645,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
635 if (test_bit(hwc->idx, used_mask)) 645 if (test_bit(hwc->idx, used_mask))
636 break; 646 break;
637 647
638 set_bit(hwc->idx, used_mask); 648 __set_bit(hwc->idx, used_mask);
639 if (assign) 649 if (assign)
640 assign[i] = hwc->idx; 650 assign[i] = hwc->idx;
641 } 651 }
@@ -676,7 +686,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
676 if (c->weight != w) 686 if (c->weight != w)
677 continue; 687 continue;
678 688
679 for_each_bit(j, c->idxmsk, X86_PMC_IDX_MAX) { 689 for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) {
680 if (!test_bit(j, used_mask)) 690 if (!test_bit(j, used_mask))
681 break; 691 break;
682 } 692 }
@@ -684,7 +694,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
684 if (j == X86_PMC_IDX_MAX) 694 if (j == X86_PMC_IDX_MAX)
685 break; 695 break;
686 696
687 set_bit(j, used_mask); 697 __set_bit(j, used_mask);
688 698
689 if (assign) 699 if (assign)
690 assign[i] = j; 700 assign[i] = j;
@@ -777,6 +787,7 @@ static inline int match_prev_assignment(struct hw_perf_event *hwc,
777 hwc->last_tag == cpuc->tags[i]; 787 hwc->last_tag == cpuc->tags[i];
778} 788}
779 789
790static int x86_pmu_start(struct perf_event *event);
780static void x86_pmu_stop(struct perf_event *event); 791static void x86_pmu_stop(struct perf_event *event);
781 792
782void hw_perf_enable(void) 793void hw_perf_enable(void)
@@ -793,6 +804,7 @@ void hw_perf_enable(void)
793 return; 804 return;
794 805
795 if (cpuc->n_added) { 806 if (cpuc->n_added) {
807 int n_running = cpuc->n_events - cpuc->n_added;
796 /* 808 /*
797 * apply assignment obtained either from 809 * apply assignment obtained either from
798 * hw_perf_group_sched_in() or x86_pmu_enable() 810 * hw_perf_group_sched_in() or x86_pmu_enable()
@@ -800,8 +812,7 @@ void hw_perf_enable(void)
800 * step1: save events moving to new counters 812 * step1: save events moving to new counters
801 * step2: reprogram moved events into new counters 813 * step2: reprogram moved events into new counters
802 */ 814 */
803 for (i = 0; i < cpuc->n_events; i++) { 815 for (i = 0; i < n_running; i++) {
804
805 event = cpuc->event_list[i]; 816 event = cpuc->event_list[i];
806 hwc = &event->hw; 817 hwc = &event->hw;
807 818
@@ -816,29 +827,18 @@ void hw_perf_enable(void)
816 continue; 827 continue;
817 828
818 x86_pmu_stop(event); 829 x86_pmu_stop(event);
819
820 hwc->idx = -1;
821 } 830 }
822 831
823 for (i = 0; i < cpuc->n_events; i++) { 832 for (i = 0; i < cpuc->n_events; i++) {
824
825 event = cpuc->event_list[i]; 833 event = cpuc->event_list[i];
826 hwc = &event->hw; 834 hwc = &event->hw;
827 835
828 if (hwc->idx == -1) { 836 if (!match_prev_assignment(hwc, cpuc, i))
829 x86_assign_hw_event(event, cpuc, i); 837 x86_assign_hw_event(event, cpuc, i);
830 x86_perf_event_set_period(event, hwc, hwc->idx); 838 else if (i < n_running)
831 } 839 continue;
832 /*
833 * need to mark as active because x86_pmu_disable()
834 * clear active_mask and events[] yet it preserves
835 * idx
836 */
837 set_bit(hwc->idx, cpuc->active_mask);
838 cpuc->events[hwc->idx] = event;
839 840
840 x86_pmu.enable(hwc, hwc->idx); 841 x86_pmu_start(event);
841 perf_event_update_userpage(event);
842 } 842 }
843 cpuc->n_added = 0; 843 cpuc->n_added = 0;
844 perf_events_lapic_init(); 844 perf_events_lapic_init();
@@ -850,15 +850,16 @@ void hw_perf_enable(void)
850 x86_pmu.enable_all(); 850 x86_pmu.enable_all();
851} 851}
852 852
853static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) 853static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc)
854{ 854{
855 (void)checking_wrmsrl(hwc->config_base + idx, 855 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
856 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); 856 hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE);
857} 857}
858 858
859static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx) 859static inline void x86_pmu_disable_event(struct perf_event *event)
860{ 860{
861 (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); 861 struct hw_perf_event *hwc = &event->hw;
862 (void)checking_wrmsrl(hwc->config_base + hwc->idx, hwc->config);
862} 863}
863 864
864static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); 865static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -868,12 +869,12 @@ static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
868 * To be called with the event disabled in hw: 869 * To be called with the event disabled in hw:
869 */ 870 */
870static int 871static int
871x86_perf_event_set_period(struct perf_event *event, 872x86_perf_event_set_period(struct perf_event *event)
872 struct hw_perf_event *hwc, int idx)
873{ 873{
874 struct hw_perf_event *hwc = &event->hw;
874 s64 left = atomic64_read(&hwc->period_left); 875 s64 left = atomic64_read(&hwc->period_left);
875 s64 period = hwc->sample_period; 876 s64 period = hwc->sample_period;
876 int err, ret = 0; 877 int err, ret = 0, idx = hwc->idx;
877 878
878 if (idx == X86_PMC_IDX_FIXED_BTS) 879 if (idx == X86_PMC_IDX_FIXED_BTS)
879 return 0; 880 return 0;
@@ -919,11 +920,11 @@ x86_perf_event_set_period(struct perf_event *event,
919 return ret; 920 return ret;
920} 921}
921 922
922static void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) 923static void x86_pmu_enable_event(struct perf_event *event)
923{ 924{
924 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 925 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
925 if (cpuc->enabled) 926 if (cpuc->enabled)
926 __x86_pmu_enable_event(hwc, idx); 927 __x86_pmu_enable_event(&event->hw);
927} 928}
928 929
929/* 930/*
@@ -959,34 +960,32 @@ static int x86_pmu_enable(struct perf_event *event)
959 memcpy(cpuc->assign, assign, n*sizeof(int)); 960 memcpy(cpuc->assign, assign, n*sizeof(int));
960 961
961 cpuc->n_events = n; 962 cpuc->n_events = n;
962 cpuc->n_added = n - n0; 963 cpuc->n_added += n - n0;
963 964
964 return 0; 965 return 0;
965} 966}
966 967
967static int x86_pmu_start(struct perf_event *event) 968static int x86_pmu_start(struct perf_event *event)
968{ 969{
969 struct hw_perf_event *hwc = &event->hw; 970 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
971 int idx = event->hw.idx;
970 972
971 if (hwc->idx == -1) 973 if (idx == -1)
972 return -EAGAIN; 974 return -EAGAIN;
973 975
974 x86_perf_event_set_period(event, hwc, hwc->idx); 976 x86_perf_event_set_period(event);
975 x86_pmu.enable(hwc, hwc->idx); 977 cpuc->events[idx] = event;
978 __set_bit(idx, cpuc->active_mask);
979 x86_pmu.enable(event);
980 perf_event_update_userpage(event);
976 981
977 return 0; 982 return 0;
978} 983}
979 984
980static void x86_pmu_unthrottle(struct perf_event *event) 985static void x86_pmu_unthrottle(struct perf_event *event)
981{ 986{
982 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 987 int ret = x86_pmu_start(event);
983 struct hw_perf_event *hwc = &event->hw; 988 WARN_ON_ONCE(ret);
984
985 if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
986 cpuc->events[hwc->idx] != event))
987 return;
988
989 x86_pmu.enable(hwc, hwc->idx);
990} 989}
991 990
992void perf_event_print_debug(void) 991void perf_event_print_debug(void)
@@ -1046,18 +1045,16 @@ static void x86_pmu_stop(struct perf_event *event)
1046 struct hw_perf_event *hwc = &event->hw; 1045 struct hw_perf_event *hwc = &event->hw;
1047 int idx = hwc->idx; 1046 int idx = hwc->idx;
1048 1047
1049 /* 1048 if (!__test_and_clear_bit(idx, cpuc->active_mask))
1050 * Must be done before we disable, otherwise the nmi handler 1049 return;
1051 * could reenable again: 1050
1052 */ 1051 x86_pmu.disable(event);
1053 clear_bit(idx, cpuc->active_mask);
1054 x86_pmu.disable(hwc, idx);
1055 1052
1056 /* 1053 /*
1057 * Drain the remaining delta count out of a event 1054 * Drain the remaining delta count out of a event
1058 * that we are disabling: 1055 * that we are disabling:
1059 */ 1056 */
1060 x86_perf_event_update(event, hwc, idx); 1057 x86_perf_event_update(event);
1061 1058
1062 cpuc->events[idx] = NULL; 1059 cpuc->events[idx] = NULL;
1063} 1060}
@@ -1094,8 +1091,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1094 int idx, handled = 0; 1091 int idx, handled = 0;
1095 u64 val; 1092 u64 val;
1096 1093
1097 data.addr = 0; 1094 perf_sample_data_init(&data, 0);
1098 data.raw = NULL;
1099 1095
1100 cpuc = &__get_cpu_var(cpu_hw_events); 1096 cpuc = &__get_cpu_var(cpu_hw_events);
1101 1097
@@ -1106,7 +1102,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1106 event = cpuc->events[idx]; 1102 event = cpuc->events[idx];
1107 hwc = &event->hw; 1103 hwc = &event->hw;
1108 1104
1109 val = x86_perf_event_update(event, hwc, idx); 1105 val = x86_perf_event_update(event);
1110 if (val & (1ULL << (x86_pmu.event_bits - 1))) 1106 if (val & (1ULL << (x86_pmu.event_bits - 1)))
1111 continue; 1107 continue;
1112 1108
@@ -1116,11 +1112,11 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1116 handled = 1; 1112 handled = 1;
1117 data.period = event->hw.last_period; 1113 data.period = event->hw.last_period;
1118 1114
1119 if (!x86_perf_event_set_period(event, hwc, idx)) 1115 if (!x86_perf_event_set_period(event))
1120 continue; 1116 continue;
1121 1117
1122 if (perf_event_overflow(event, 1, &data, regs)) 1118 if (perf_event_overflow(event, 1, &data, regs))
1123 x86_pmu.disable(hwc, idx); 1119 x86_pmu_stop(event);
1124 } 1120 }
1125 1121
1126 if (handled) 1122 if (handled)
@@ -1307,7 +1303,7 @@ int hw_perf_group_sched_in(struct perf_event *leader,
1307 memcpy(cpuc->assign, assign, n0*sizeof(int)); 1303 memcpy(cpuc->assign, assign, n0*sizeof(int));
1308 1304
1309 cpuc->n_events = n0; 1305 cpuc->n_events = n0;
1310 cpuc->n_added = n1; 1306 cpuc->n_added += n1;
1311 ctx->nr_active += n1; 1307 ctx->nr_active += n1;
1312 1308
1313 /* 1309 /*
@@ -1335,6 +1331,41 @@ undo:
1335#include "perf_event_p6.c" 1331#include "perf_event_p6.c"
1336#include "perf_event_intel.c" 1332#include "perf_event_intel.c"
1337 1333
1334static int __cpuinit
1335x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
1336{
1337 unsigned int cpu = (long)hcpu;
1338 int ret = NOTIFY_OK;
1339
1340 switch (action & ~CPU_TASKS_FROZEN) {
1341 case CPU_UP_PREPARE:
1342 if (x86_pmu.cpu_prepare)
1343 ret = x86_pmu.cpu_prepare(cpu);
1344 break;
1345
1346 case CPU_STARTING:
1347 if (x86_pmu.cpu_starting)
1348 x86_pmu.cpu_starting(cpu);
1349 break;
1350
1351 case CPU_DYING:
1352 if (x86_pmu.cpu_dying)
1353 x86_pmu.cpu_dying(cpu);
1354 break;
1355
1356 case CPU_UP_CANCELED:
1357 case CPU_DEAD:
1358 if (x86_pmu.cpu_dead)
1359 x86_pmu.cpu_dead(cpu);
1360 break;
1361
1362 default:
1363 break;
1364 }
1365
1366 return ret;
1367}
1368
1338static void __init pmu_check_apic(void) 1369static void __init pmu_check_apic(void)
1339{ 1370{
1340 if (cpu_has_apic) 1371 if (cpu_has_apic)
@@ -1347,6 +1378,7 @@ static void __init pmu_check_apic(void)
1347 1378
1348void __init init_hw_perf_events(void) 1379void __init init_hw_perf_events(void)
1349{ 1380{
1381 struct event_constraint *c;
1350 int err; 1382 int err;
1351 1383
1352 pr_info("Performance Events: "); 1384 pr_info("Performance Events: ");
@@ -1395,6 +1427,16 @@ void __init init_hw_perf_events(void)
1395 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1, 1427 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1,
1396 0, x86_pmu.num_events); 1428 0, x86_pmu.num_events);
1397 1429
1430 if (x86_pmu.event_constraints) {
1431 for_each_event_constraint(c, x86_pmu.event_constraints) {
1432 if (c->cmask != INTEL_ARCH_FIXED_MASK)
1433 continue;
1434
1435 c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1;
1436 c->weight += x86_pmu.num_events;
1437 }
1438 }
1439
1398 pr_info("... version: %d\n", x86_pmu.version); 1440 pr_info("... version: %d\n", x86_pmu.version);
1399 pr_info("... bit width: %d\n", x86_pmu.event_bits); 1441 pr_info("... bit width: %d\n", x86_pmu.event_bits);
 	pr_info("... generic registers:       %d\n",     x86_pmu.num_events);
@@ -1402,11 +1444,13 @@ void __init init_hw_perf_events(void)
 	pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
 	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_events_fixed);
 	pr_info("... event mask:             %016Lx\n", perf_event_mask);
+
+	perf_cpu_notifier(x86_pmu_notifier);
 }
 
 static inline void x86_pmu_read(struct perf_event *event)
 {
-	x86_perf_event_update(event, &event->hw, event->hw.idx);
+	x86_perf_event_update(event);
 }
 
 static const struct pmu pmu = {
@@ -1588,14 +1632,42 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
 	return len;
 }
 
-static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+#ifdef CONFIG_COMPAT
+static inline int
+perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 {
-	unsigned long bytes;
+	/* 32-bit process in 64-bit kernel. */
+	struct stack_frame_ia32 frame;
+	const void __user *fp;
 
-	bytes = copy_from_user_nmi(frame, fp, sizeof(*frame));
+	if (!test_thread_flag(TIF_IA32))
+		return 0;
 
-	return bytes == sizeof(*frame);
+	fp = compat_ptr(regs->bp);
+	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+		unsigned long bytes;
+		frame.next_frame     = 0;
+		frame.return_address = 0;
+
+		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
+		if (bytes != sizeof(frame))
+			break;
+
+		if (fp < compat_ptr(regs->sp))
+			break;
+
+		callchain_store(entry, frame.return_address);
+		fp = compat_ptr(frame.next_frame);
+	}
+	return 1;
+}
+#else
+static inline int
+perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+	return 0;
 }
+#endif
 
 static void
 perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
@@ -1611,11 +1683,16 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
 	callchain_store(entry, PERF_CONTEXT_USER);
 	callchain_store(entry, regs->ip);
 
+	if (perf_callchain_user32(regs, entry))
+		return;
+
 	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+		unsigned long bytes;
 		frame.next_frame     = NULL;
 		frame.return_address = 0;
 
-		if (!copy_stack_frame(fp, &frame))
+		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
+		if (bytes != sizeof(frame))
 			break;
 
 		if ((unsigned long)fp < regs->sp)
@@ -1662,28 +1739,14 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 	return entry;
 }
 
-void hw_perf_event_setup_online(int cpu)
-{
-	init_debug_store_on_cpu(cpu);
-
-	switch (boot_cpu_data.x86_vendor) {
-	case X86_VENDOR_AMD:
-		amd_pmu_cpu_online(cpu);
-		break;
-	default:
-		return;
-	}
-}
-
-void hw_perf_event_setup_offline(int cpu)
+void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
 {
-	init_debug_store_on_cpu(cpu);
-
-	switch (boot_cpu_data.x86_vendor) {
-	case X86_VENDOR_AMD:
-		amd_pmu_cpu_offline(cpu);
-		break;
-	default:
-		return;
-	}
+	regs->ip = ip;
+	/*
+	 * perf_arch_fetch_caller_regs adds another call, we need to increment
+	 * the skip level
+	 */
+	regs->bp = rewind_frame_pointer(skip + 1);
+	regs->cs = __KERNEL_CS;
+	local_save_flags(regs->flags);
 }
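
The compat path added above walks 32-bit user frames through 32-bit frame-pointer links, stopping on a short copy, a frame pointer that moves below the stack pointer, or the depth cap. A minimal userspace sketch of that loop, with a hand-built frame chain standing in for copy_from_user_nmi() and the pt_regs state (everything here is illustrative, not kernel API):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MAX_STACK_DEPTH 4

/* Mirrors struct stack_frame_ia32: 32-bit link and return address. */
struct frame32 {
	uint32_t next_frame;
	uint32_t return_address;
};

/* Fake 32-bit "user stack": linked frames inside one buffer. */
static unsigned char ustack[64];

static size_t copy_from_user_sim(void *dst, uint32_t uptr, size_t n)
{
	if (uptr + n > sizeof(ustack))
		return 0;		/* fault: short copy ends the walk */
	memcpy(dst, ustack + uptr, n);
	return n;
}

int main(void)
{
	struct frame32 f;
	uint32_t fp = 0;		/* regs->bp of the 32-bit task */
	uint32_t sp = 0;		/* regs->sp: fp may never drop below it */
	int depth = 0;

	/* Build a frame chain: 0 -> 16 -> 32 -> 48 (48 reads back as zeroes). */
	for (int i = 0; i < 3; i++) {
		struct frame32 cur = { .next_frame = (uint32_t)(16 * (i + 1)),
				       .return_address = 0x08048000u + i };
		memcpy(ustack + 16 * i, &cur, sizeof(cur));
	}

	while (depth < MAX_STACK_DEPTH) {
		if (copy_from_user_sim(&f, fp, sizeof(f)) != sizeof(f))
			break;		/* unreadable frame */
		if (fp < sp)
			break;		/* walked off the stack */
		if (!f.return_address)
			break;		/* simplified end-of-chain check */
		printf("callchain[%d] = %#x\n", depth++, (unsigned)f.return_address);
		fp = f.next_frame;
	}
	return 0;
}

The kernel version relies on the fault and sp checks alone to terminate; the explicit zero-return-address test is a simplification for the simulation.
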
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 8f3dbfda3c4..db6f7d4056e 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -137,6 +137,13 @@ static inline int amd_is_nb_event(struct hw_perf_event *hwc)
 	return (hwc->config & 0xe0) == 0xe0;
 }
 
+static inline int amd_has_nb(struct cpu_hw_events *cpuc)
+{
+	struct amd_nb *nb = cpuc->amd_nb;
+
+	return nb && nb->nb_id != -1;
+}
+
 static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
 				      struct perf_event *event)
 {
@@ -147,7 +154,7 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
 	/*
 	 * only care about NB events
 	 */
-	if (!(nb && amd_is_nb_event(hwc)))
+	if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
 		return;
 
 	/*
@@ -214,7 +221,7 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
 	/*
 	 * if not NB event or no NB, then no constraints
 	 */
-	if (!(nb && amd_is_nb_event(hwc)))
+	if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
 		return &unconstrained;
 
 	/*
@@ -271,28 +278,6 @@ done:
 	return &emptyconstraint;
 }
 
-static __initconst struct x86_pmu amd_pmu = {
-	.name			= "AMD",
-	.handle_irq		= x86_pmu_handle_irq,
-	.disable_all		= x86_pmu_disable_all,
-	.enable_all		= x86_pmu_enable_all,
-	.enable			= x86_pmu_enable_event,
-	.disable		= x86_pmu_disable_event,
-	.eventsel		= MSR_K7_EVNTSEL0,
-	.perfctr		= MSR_K7_PERFCTR0,
-	.event_map		= amd_pmu_event_map,
-	.raw_event		= amd_pmu_raw_event,
-	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
-	.num_events		= 4,
-	.event_bits		= 48,
-	.event_mask		= (1ULL << 48) - 1,
-	.apic			= 1,
-	/* use highest bit to detect overflow */
-	.max_period		= (1ULL << 47) - 1,
-	.get_event_constraints	= amd_get_event_constraints,
-	.put_event_constraints	= amd_put_event_constraints
-};
-
 static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
 {
 	struct amd_nb *nb;
@@ -309,57 +294,61 @@ static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
 	 * initialize all possible NB constraints
 	 */
 	for (i = 0; i < x86_pmu.num_events; i++) {
-		set_bit(i, nb->event_constraints[i].idxmsk);
+		__set_bit(i, nb->event_constraints[i].idxmsk);
 		nb->event_constraints[i].weight = 1;
 	}
 	return nb;
 }
 
-static void amd_pmu_cpu_online(int cpu)
+static int amd_pmu_cpu_prepare(int cpu)
 {
-	struct cpu_hw_events *cpu1, *cpu2;
-	struct amd_nb *nb = NULL;
+	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+
+	WARN_ON_ONCE(cpuc->amd_nb);
+
+	if (boot_cpu_data.x86_max_cores < 2)
+		return NOTIFY_OK;
+
+	cpuc->amd_nb = amd_alloc_nb(cpu, -1);
+	if (!cpuc->amd_nb)
+		return NOTIFY_BAD;
+
+	return NOTIFY_OK;
+}
+
+static void amd_pmu_cpu_starting(int cpu)
+{
+	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+	struct amd_nb *nb;
 	int i, nb_id;
 
 	if (boot_cpu_data.x86_max_cores < 2)
 		return;
 
-	/*
-	 * function may be called too early in the
-	 * boot process, in which case nb_id is bogus
-	 */
 	nb_id = amd_get_nb_id(cpu);
-	if (nb_id == BAD_APICID)
-		return;
-
-	cpu1 = &per_cpu(cpu_hw_events, cpu);
-	cpu1->amd_nb = NULL;
+	WARN_ON_ONCE(nb_id == BAD_APICID);
 
 	raw_spin_lock(&amd_nb_lock);
 
 	for_each_online_cpu(i) {
-		cpu2 = &per_cpu(cpu_hw_events, i);
-		nb = cpu2->amd_nb;
-		if (!nb)
+		nb = per_cpu(cpu_hw_events, i).amd_nb;
+		if (WARN_ON_ONCE(!nb))
 			continue;
-		if (nb->nb_id == nb_id)
-			goto found;
-	}
 
-	nb = amd_alloc_nb(cpu, nb_id);
-	if (!nb) {
-		pr_err("perf_events: failed NB allocation for CPU%d\n", cpu);
-		raw_spin_unlock(&amd_nb_lock);
-		return;
-	}
-found:
-	nb->refcnt++;
-	cpu1->amd_nb = nb;
+		if (nb->nb_id == nb_id) {
+			kfree(cpuc->amd_nb);
+			cpuc->amd_nb = nb;
+			break;
+		}
+	}
+
+	cpuc->amd_nb->nb_id = nb_id;
+	cpuc->amd_nb->refcnt++;
 
 	raw_spin_unlock(&amd_nb_lock);
 }
 
-static void amd_pmu_cpu_offline(int cpu)
+static void amd_pmu_cpu_dead(int cpu)
 {
 	struct cpu_hw_events *cpuhw;
 
@@ -370,14 +359,44 @@ static void amd_pmu_cpu_offline(int cpu)
 
 	raw_spin_lock(&amd_nb_lock);
 
-	if (--cpuhw->amd_nb->refcnt == 0)
-		kfree(cpuhw->amd_nb);
+	if (cpuhw->amd_nb) {
+		struct amd_nb *nb = cpuhw->amd_nb;
+
+		if (nb->nb_id == -1 || --nb->refcnt == 0)
+			kfree(nb);
 
-	cpuhw->amd_nb = NULL;
+		cpuhw->amd_nb = NULL;
+	}
 
 	raw_spin_unlock(&amd_nb_lock);
 }
 
+static __initconst struct x86_pmu amd_pmu = {
+	.name			= "AMD",
+	.handle_irq		= x86_pmu_handle_irq,
+	.disable_all		= x86_pmu_disable_all,
+	.enable_all		= x86_pmu_enable_all,
+	.enable			= x86_pmu_enable_event,
+	.disable		= x86_pmu_disable_event,
+	.eventsel		= MSR_K7_EVNTSEL0,
+	.perfctr		= MSR_K7_PERFCTR0,
+	.event_map		= amd_pmu_event_map,
+	.raw_event		= amd_pmu_raw_event,
+	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
+	.num_events		= 4,
+	.event_bits		= 48,
+	.event_mask		= (1ULL << 48) - 1,
+	.apic			= 1,
+	/* use highest bit to detect overflow */
+	.max_period		= (1ULL << 47) - 1,
+	.get_event_constraints	= amd_get_event_constraints,
+	.put_event_constraints	= amd_put_event_constraints,
+
+	.cpu_prepare		= amd_pmu_cpu_prepare,
+	.cpu_starting		= amd_pmu_cpu_starting,
+	.cpu_dead		= amd_pmu_cpu_dead,
+};
+
 static __init int amd_pmu_init(void)
 {
 	/* Performance-monitoring supported from K7 and later: */
@@ -390,11 +409,6 @@ static __init int amd_pmu_init(void)
 	memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
 	       sizeof(hw_cache_event_ids));
 
-	/*
-	 * explicitly initialize the boot cpu, other cpus will get
-	 * the cpu hotplug callbacks from smp_init()
-	 */
-	amd_pmu_cpu_online(smp_processor_id());
 	return 0;
 }
 
@@ -405,12 +419,4 @@ static int amd_pmu_init(void)
 	return 0;
 }
 
-static void amd_pmu_cpu_online(int cpu)
-{
-}
-
-static void amd_pmu_cpu_offline(int cpu)
-{
-}
-
 #endif
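
The hunks above split northbridge bookkeeping across three hotplug callbacks: each CPU gets a placeholder amd_nb (nb_id == -1) in cpu_prepare, adopts a sibling's shared structure in cpu_starting, and drops its reference in cpu_dead. A compilable userspace sketch of that refcounted sharing (struct and function names below are illustrative only, not the kernel's):

#include <stdio.h>
#include <stdlib.h>

#define NCPUS 4

struct nb { int nb_id; int refcnt; };
static struct nb *cpu_nb[NCPUS];		/* per-CPU amd_nb pointer */
static int nb_of_cpu(int cpu) { return cpu / 2; }	/* 2 cores per node */

static void cpu_prepare(int cpu)
{
	cpu_nb[cpu] = calloc(1, sizeof(struct nb));
	cpu_nb[cpu]->nb_id = -1;		/* placeholder, not shared yet */
}

static void cpu_starting(int cpu)
{
	int nb_id = nb_of_cpu(cpu);

	for (int i = 0; i < NCPUS; i++) {
		if (i == cpu || !cpu_nb[i] || cpu_nb[i]->nb_id != nb_id)
			continue;
		free(cpu_nb[cpu]);		/* adopt the sibling's nb */
		cpu_nb[cpu] = cpu_nb[i];
		break;
	}
	cpu_nb[cpu]->nb_id = nb_id;
	cpu_nb[cpu]->refcnt++;
}

static void cpu_dead(int cpu)
{
	struct nb *nb = cpu_nb[cpu];

	if (!nb)
		return;
	/* unused placeholder, or last user of a shared nb: free it */
	if (nb->nb_id == -1 || --nb->refcnt == 0)
		free(nb);
	cpu_nb[cpu] = NULL;
}

int main(void)
{
	for (int c = 0; c < NCPUS; c++) { cpu_prepare(c); cpu_starting(c); }
	printf("cpu0 and cpu1 share: %d\n", cpu_nb[0] == cpu_nb[1]);	/* 1 */
	printf("cpu0 and cpu2 share: %d\n", cpu_nb[0] == cpu_nb[2]);	/* 0 */
	for (int c = 0; c < NCPUS; c++) cpu_dead(c);
	return 0;
}

The nb_id == -1 test in cpu_dead mirrors why the kernel frees an unadopted placeholder unconditionally: its refcount was never raised.
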
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index cf6590cf4a5..9c794ac8783 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1,7 +1,7 @@
 #ifdef CONFIG_CPU_SUP_INTEL
 
 /*
- * Intel PerfMon v3. Used on Core2 and later.
+ * Intel PerfMon, used on Core and later.
  */
 static const u64 intel_perfmon_event_map[] =
 {
@@ -27,8 +27,14 @@ static struct event_constraint intel_core_event_constraints[] =
 
 static struct event_constraint intel_core2_event_constraints[] =
 {
-	FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */
-	FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */
+	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+	/*
+	 * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event
+	 * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed
+	 * ratio between these counters.
+	 */
+	/* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
 	INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
 	INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
 	INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
@@ -37,14 +43,16 @@ static struct event_constraint intel_core2_event_constraints[] =
 	INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
 	INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
 	INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
+	INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */
 	INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
 	EVENT_CONSTRAINT_END
 };
 
 static struct event_constraint intel_nehalem_event_constraints[] =
 {
-	FIXED_EVENT_CONSTRAINT(0xc0, (0xf|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */
-	FIXED_EVENT_CONSTRAINT(0x3c, (0xf|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */
+	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+	/* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
 	INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
 	INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
 	INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
@@ -58,8 +66,9 @@ static struct event_constraint intel_nehalem_event_constraints[] =
 
 static struct event_constraint intel_westmere_event_constraints[] =
 {
-	FIXED_EVENT_CONSTRAINT(0xc0, (0xf|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */
-	FIXED_EVENT_CONSTRAINT(0x3c, (0xf|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */
+	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+	/* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
 	INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
 	INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
 	INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
@@ -68,8 +77,9 @@ static struct event_constraint intel_westmere_event_constraints[] =
 
 static struct event_constraint intel_gen_event_constraints[] =
 {
-	FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */
-	FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */
+	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+	/* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
 	EVENT_CONSTRAINT_END
 };
 
@@ -538,9 +548,9 @@ static inline void intel_pmu_ack_status(u64 ack)
 }
 
 static inline void
-intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx)
+intel_pmu_disable_fixed(struct hw_perf_event *hwc)
 {
-	int idx = __idx - X86_PMC_IDX_FIXED;
+	int idx = hwc->idx - X86_PMC_IDX_FIXED;
 	u64 ctrl_val, mask;
 
 	mask = 0xfULL << (idx * 4);
@@ -580,10 +590,9 @@ static void intel_pmu_drain_bts_buffer(void)
 
 	ds->bts_index = ds->bts_buffer_base;
 
+	perf_sample_data_init(&data, 0);
 
 	data.period	= event->hw.last_period;
-	data.addr	= 0;
-	data.raw	= NULL;
 	regs.ip		= 0;
 
 	/*
@@ -612,26 +621,28 @@ static void intel_pmu_drain_bts_buffer(void)
 }
 
 static inline void
-intel_pmu_disable_event(struct hw_perf_event *hwc, int idx)
+intel_pmu_disable_event(struct perf_event *event)
 {
-	if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
 		intel_pmu_disable_bts();
 		intel_pmu_drain_bts_buffer();
 		return;
 	}
 
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
-		intel_pmu_disable_fixed(hwc, idx);
+		intel_pmu_disable_fixed(hwc);
 		return;
 	}
 
-	x86_pmu_disable_event(hwc, idx);
+	x86_pmu_disable_event(event);
 }
 
 static inline void
-intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx)
+intel_pmu_enable_fixed(struct hw_perf_event *hwc)
 {
-	int idx = __idx - X86_PMC_IDX_FIXED;
+	int idx = hwc->idx - X86_PMC_IDX_FIXED;
 	u64 ctrl_val, bits, mask;
 	int err;
 
@@ -661,9 +672,11 @@ intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx)
 	err = checking_wrmsrl(hwc->config_base, ctrl_val);
 }
 
-static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx)
+static void intel_pmu_enable_event(struct perf_event *event)
 {
-	if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
 		if (!__get_cpu_var(cpu_hw_events).enabled)
 			return;
 
@@ -672,11 +685,11 @@ static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx)
 	}
 
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
-		intel_pmu_enable_fixed(hwc, idx);
+		intel_pmu_enable_fixed(hwc);
 		return;
 	}
 
-	__x86_pmu_enable_event(hwc, idx);
+	__x86_pmu_enable_event(hwc);
 }
 
 /*
@@ -685,14 +698,8 @@ static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx)
  */
 static int intel_pmu_save_and_restart(struct perf_event *event)
 {
-	struct hw_perf_event *hwc = &event->hw;
-	int idx = hwc->idx;
-	int ret;
-
-	x86_perf_event_update(event, hwc, idx);
-	ret = x86_perf_event_set_period(event, hwc, idx);
-
-	return ret;
+	x86_perf_event_update(event);
+	return x86_perf_event_set_period(event);
 }
 
 static void intel_pmu_reset(void)
@@ -732,16 +739,15 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
 	int bit, loops;
 	u64 ack, status;
 
-	data.addr = 0;
-	data.raw = NULL;
+	perf_sample_data_init(&data, 0);
 
 	cpuc = &__get_cpu_var(cpu_hw_events);
 
-	perf_disable();
+	intel_pmu_disable_all();
 	intel_pmu_drain_bts_buffer();
 	status = intel_pmu_get_status();
 	if (!status) {
-		perf_enable();
+		intel_pmu_enable_all();
 		return 0;
 	}
 
@@ -751,16 +757,14 @@ again:
 		WARN_ONCE(1, "perfevents: irq loop stuck!\n");
 		perf_event_print_debug();
 		intel_pmu_reset();
-		perf_enable();
-		return 1;
+		goto done;
 	}
 
 	inc_irq_stat(apic_perf_irqs);
 	ack = status;
-	for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
+	for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
 		struct perf_event *event = cpuc->events[bit];
 
-		clear_bit(bit, (unsigned long *) &status);
 		if (!test_bit(bit, cpuc->active_mask))
 			continue;
 
@@ -770,7 +774,7 @@ again:
 		data.period = event->hw.last_period;
 
 		if (perf_event_overflow(event, 1, &data, regs))
-			intel_pmu_disable_event(&event->hw, bit);
+			x86_pmu_stop(event);
 	}
 
 	intel_pmu_ack_status(ack);
@@ -782,8 +786,8 @@ again:
 	if (status)
 		goto again;
 
-	perf_enable();
-
+done:
+	intel_pmu_enable_all();
 	return 1;
 }
 
@@ -862,7 +866,10 @@ static __initconst struct x86_pmu intel_pmu = {
 	.max_period		= (1ULL << 31) - 1,
 	.enable_bts		= intel_pmu_enable_bts,
 	.disable_bts		= intel_pmu_disable_bts,
-	.get_event_constraints	= intel_get_event_constraints
+	.get_event_constraints	= intel_get_event_constraints,
+
+	.cpu_starting		= init_debug_store_on_cpu,
+	.cpu_dying		= fini_debug_store_on_cpu,
 };
 
 static __init int intel_pmu_init(void)
@@ -929,13 +936,14 @@ static __init int intel_pmu_init(void)
 
 	case 26: /* 45 nm nehalem, "Bloomfield" */
 	case 30: /* 45 nm nehalem, "Lynnfield" */
+	case 46: /* 45 nm nehalem-ex, "Beckton" */
 		memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 
 		x86_pmu.event_constraints = intel_nehalem_event_constraints;
 		pr_cont("Nehalem/Corei7 events, ");
 		break;
-	case 28:
+	case 28: /* Atom */
 		memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 
@@ -951,6 +959,7 @@ static __init int intel_pmu_init(void)
 		x86_pmu.event_constraints = intel_westmere_event_constraints;
 		pr_cont("Westmere events, ");
 		break;
+
 	default:
 		/*
 		 * default constraints for v2 and up
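
The fixed-counter paths above program 4-bit control fields inside MSR_ARCH_PERFMON_FIXED_CTR_CTRL, one nibble per fixed counter, computed as 0xf << (idx * 4). A small userspace sketch of just that bit arithmetic, with a plain u64 standing in for the MSR (names illustrative):

#include <stdint.h>
#include <stdio.h>

/* Each fixed counter owns a 4-bit field in FIXED_CTR_CTRL:
 * bit 0 = count in ring 0, bit 1 = count in ring 3, bit 3 = enable PMI.
 */
static uint64_t fixed_ctrl;		/* stands in for the real MSR */

static void enable_fixed(int idx, uint64_t bits)
{
	uint64_t mask = 0xfULL << (idx * 4);

	fixed_ctrl &= ~mask;			/* clear the counter's nibble */
	fixed_ctrl |= bits << (idx * 4);	/* program the new control bits */
}

static void disable_fixed(int idx)
{
	fixed_ctrl &= ~(0xfULL << (idx * 4));
}

int main(void)
{
	enable_fixed(0, 0x3);	/* fixed ctr 0: count ring 0 + ring 3 */
	enable_fixed(1, 0xb);	/* fixed ctr 1: same, plus PMI on overflow */
	printf("ctrl = %#llx\n", (unsigned long long)fixed_ctrl);	/* 0xb3 */
	disable_fixed(0);
	printf("ctrl = %#llx\n", (unsigned long long)fixed_ctrl);	/* 0xb0 */
	return 0;
}
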
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index 1ca5ba078af..a330485d14d 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -62,7 +62,7 @@ static void p6_pmu_disable_all(void)
 
 	/* p6 only has one enable register */
 	rdmsrl(MSR_P6_EVNTSEL0, val);
-	val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+	val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
 	wrmsrl(MSR_P6_EVNTSEL0, val);
 }
 
@@ -72,32 +72,34 @@ static void p6_pmu_enable_all(void)
 
 	/* p6 only has one enable register */
 	rdmsrl(MSR_P6_EVNTSEL0, val);
-	val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+	val |= ARCH_PERFMON_EVENTSEL_ENABLE;
 	wrmsrl(MSR_P6_EVNTSEL0, val);
 }
 
 static inline void
-p6_pmu_disable_event(struct hw_perf_event *hwc, int idx)
+p6_pmu_disable_event(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
 	u64 val = P6_NOP_EVENT;
 
 	if (cpuc->enabled)
-		val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+		val |= ARCH_PERFMON_EVENTSEL_ENABLE;
 
-	(void)checking_wrmsrl(hwc->config_base + idx, val);
+	(void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
 }
 
-static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx)
+static void p6_pmu_enable_event(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
 	u64 val;
 
 	val = hwc->config;
 	if (cpuc->enabled)
-		val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+		val |= ARCH_PERFMON_EVENTSEL_ENABLE;
 
-	(void)checking_wrmsrl(hwc->config_base + idx, val);
+	(void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
 }
 
 static __initconst struct x86_pmu p6_pmu = {
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 74f4e85a572..fb329e9f849 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -680,7 +680,7 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz)
 	cpu_nmi_set_wd_enabled();
 
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+	evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE;
 	wrmsr(evntsel_msr, evntsel, 0);
 	intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
 	return 1;
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 1cbed97b59c..dfdb4dba232 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -22,6 +22,7 @@
  */
 
 #include <linux/dmi.h>
+#include <linux/module.h>
 #include <asm/div64.h>
 #include <asm/vmware.h>
 #include <asm/x86_init.h>
@@ -101,6 +102,7 @@ int vmware_platform(void)
 
 	return 0;
 }
+EXPORT_SYMBOL(vmware_platform);
 
 /*
  * VMware hypervisor takes care of exporting a reliable TSC to the guest.
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 83e5e628de7..8b862d5900f 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -40,6 +40,7 @@
 #include <linux/cpu.h>
 #include <linux/notifier.h>
 #include <linux/uaccess.h>
+#include <linux/gfp.h>
 
 #include <asm/processor.h>
 #include <asm/msr.h>
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index a4849c10a77..ebd4c51d096 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -27,7 +27,6 @@
 #include <asm/cpu.h>
 #include <asm/reboot.h>
 #include <asm/virtext.h>
-#include <asm/x86_init.h>
 
 #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
 
@@ -103,10 +102,5 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
 #ifdef CONFIG_HPET_TIMER
 	hpet_disable();
 #endif
-
-#ifdef CONFIG_X86_64
-	x86_platform.iommu_shutdown();
-#endif
-
 	crash_save_cpu(regs, safe_smp_processor_id());
 }
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index cd97ce18c29..67414550c3c 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -5,6 +5,7 @@
  *	Copyright (C) IBM Corporation, 2004. All rights reserved
  */
 
+#include <linux/slab.h>
 #include <linux/errno.h>
 #include <linux/highmem.h>
 #include <linux/crash_dump.h>
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
index 4fd1420faff..e1a93be4fd4 100644
--- a/arch/x86/kernel/dumpstack.h
+++ b/arch/x86/kernel/dumpstack.h
@@ -14,6 +14,8 @@
 #define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
 #endif
 
+#include <linux/uaccess.h>
+
 extern void
 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 		unsigned long *stack, unsigned long bp, char *log_lvl);
@@ -29,4 +31,26 @@ struct stack_frame {
 	struct stack_frame *next_frame;
 	unsigned long return_address;
 };
+
+struct stack_frame_ia32 {
+	u32 next_frame;
+	u32 return_address;
+};
+
+static inline unsigned long rewind_frame_pointer(int n)
+{
+	struct stack_frame *frame;
+
+	get_bp(frame);
+
+#ifdef CONFIG_FRAME_POINTER
+	while (n--) {
+		if (probe_kernel_address(&frame->next_frame, frame))
+			break;
+	}
 #endif
+
+	return (unsigned long)frame;
+}
+
+#endif /* DUMPSTACK_H */
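
rewind_frame_pointer() pops n saved frame-pointer links to recover a caller's bp; perf_arch_fetch_caller_regs (in the perf_event.c hunk earlier) calls it with skip + 1 to discount its own frame. A userspace sketch of the same walk over a hand-built frame list, where the kernel's safe probe_kernel_address() is replaced by a plain dereference (all names illustrative):

#include <stdio.h>

struct stack_frame {
	struct stack_frame *next_frame;
	unsigned long return_address;
};

/* Follow n saved frame-pointer links, as rewind_frame_pointer() does. */
static struct stack_frame *rewind(struct stack_frame *frame, int n)
{
	while (n-- && frame->next_frame)
		frame = frame->next_frame;	/* kernel uses a fault-safe probe here */
	return frame;
}

int main(void)
{
	/* leaf -> mid -> outer, as pushed %rbp values would chain them */
	struct stack_frame outer = { 0,      0x1000 };
	struct stack_frame mid   = { &outer, 0x2000 };
	struct stack_frame leaf  = { &mid,   0x3000 };

	printf("skip 0: ra=%#lx\n", rewind(&leaf, 0)->return_address);	/* 0x3000 */
	printf("skip 2: ra=%#lx\n", rewind(&leaf, 2)->return_address);	/* 0x1000 */
	return 0;
}
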
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index dce99abb449..272c9f1f05f 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -120,9 +120,15 @@ fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
 {
 #ifdef CONFIG_FRAME_POINTER
 	struct stack_frame *frame = (struct stack_frame *)bp;
+	unsigned long next;
 
-	if (!in_irq_stack(stack, irq_stack, irq_stack_end))
-		return (unsigned long)frame->next_frame;
+	if (!in_irq_stack(stack, irq_stack, irq_stack_end)) {
+		if (!probe_kernel_address(&frame->next_frame, next))
+			return next;
+		else
+			WARN_ONCE(1, "Perf: bad frame pointer = %p in "
+				  "callchain\n", &frame->next_frame);
+	}
 #endif
 	return bp;
 }
@@ -202,7 +208,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 		if (in_irq_stack(stack, irq_stack, irq_stack_end)) {
 			if (ops->stack(data, "IRQ") < 0)
 				break;
-			bp = print_context_stack(tinfo, stack, bp,
+			bp = ops->walk_stack(tinfo, stack, bp,
 				ops, data, irq_stack_end, &graph);
 			/*
 			 * We link to the next stack (which would be
@@ -223,7 +229,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 	/*
 	 * This handles the process stack:
 	 */
-	bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph);
+	bp = ops->walk_stack(tinfo, stack, bp, ops, data, NULL, &graph);
 	put_cpu();
 }
 EXPORT_SYMBOL(dump_trace);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index a966b753e49..7bca3c6a02f 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -12,21 +12,13 @@
 #include <linux/types.h>
 #include <linux/init.h>
 #include <linux/bootmem.h>
-#include <linux/ioport.h>
-#include <linux/string.h>
-#include <linux/kexec.h>
-#include <linux/module.h>
-#include <linux/mm.h>
 #include <linux/pfn.h>
 #include <linux/suspend.h>
 #include <linux/firmware-map.h>
 
-#include <asm/pgtable.h>
-#include <asm/page.h>
 #include <asm/e820.h>
 #include <asm/proto.h>
 #include <asm/setup.h>
-#include <asm/trampoline.h>
 
 /*
  * The e820 map is the map that gets modified e.g. with command line parameters
@@ -527,29 +519,45 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
 	printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ",
 	       (unsigned long long) start,
 	       (unsigned long long) end);
-	e820_print_type(old_type);
+	if (checktype)
+		e820_print_type(old_type);
 	printk(KERN_CONT "\n");
 
 	for (i = 0; i < e820.nr_map; i++) {
 		struct e820entry *ei = &e820.map[i];
 		u64 final_start, final_end;
+		u64 ei_end;
 
 		if (checktype && ei->type != old_type)
 			continue;
+
+		ei_end = ei->addr + ei->size;
 		/* totally covered? */
-		if (ei->addr >= start &&
-		    (ei->addr + ei->size) <= (start + size)) {
+		if (ei->addr >= start && ei_end <= end) {
 			real_removed_size += ei->size;
 			memset(ei, 0, sizeof(struct e820entry));
 			continue;
 		}
+
+		/* new range is totally covered? */
+		if (ei->addr < start && ei_end > end) {
+			e820_add_region(end, ei_end - end, ei->type);
+			ei->size = start - ei->addr;
+			real_removed_size += size;
+			continue;
+		}
+
 		/* partially covered */
 		final_start = max(start, ei->addr);
-		final_end = min(start + size, ei->addr + ei->size);
+		final_end = min(end, ei_end);
 		if (final_start >= final_end)
 			continue;
 		real_removed_size += final_end - final_start;
 
+		/*
+		 * left range could be head or tail, so need to update
+		 * size at first.
+		 */
 		ei->size -= final_end - final_start;
 		if (ei->addr < final_start)
 			continue;
@@ -730,319 +738,44 @@ core_initcall(e820_mark_nvs_memory);
 #endif
 
 /*
- * Early reserved memory areas.
- */
-#define MAX_EARLY_RES 32
-
-struct early_res {
-	u64 start, end;
-	char name[16];
-	char overlap_ok;
-};
-static struct early_res early_res[MAX_EARLY_RES] __initdata = {
-	{ 0, PAGE_SIZE, "BIOS data page", 1 },	/* BIOS data page */
-#if defined(CONFIG_X86_32) && defined(CONFIG_X86_TRAMPOLINE)
-	/*
-	 * But first pinch a few for the stack/trampoline stuff
-	 * FIXME: Don't need the extra page at 4K, but need to fix
-	 * trampoline before removing it. (see the GDT stuff)
-	 */
-	{ PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE", 1 },
-#endif
-
-	{}
-};
-
-static int __init find_overlapped_early(u64 start, u64 end)
-{
-	int i;
-	struct early_res *r;
-
-	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
-		r = &early_res[i];
-		if (end > r->start && start < r->end)
-			break;
-	}
-
-	return i;
-}
-
-/*
- * Drop the i-th range from the early reservation map,
- * by copying any higher ranges down one over it, and
- * clearing what had been the last slot.
- */
-static void __init drop_range(int i)
-{
-	int j;
-
-	for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
-		;
-
-	memmove(&early_res[i], &early_res[i + 1],
-	       (j - 1 - i) * sizeof(struct early_res));
-
-	early_res[j - 1].end = 0;
-}
-
-/*
- * Split any existing ranges that:
- *  1) are marked 'overlap_ok', and
- *  2) overlap with the stated range [start, end)
- * into whatever portion (if any) of the existing range is entirely
- * below or entirely above the stated range.  Drop the portion
- * of the existing range that overlaps with the stated range,
- * which will allow the caller of this routine to then add that
- * stated range without conflicting with any existing range.
+ * Find a free area with specified alignment in a specific range.
  */
-static void __init drop_overlaps_that_are_ok(u64 start, u64 end)
+u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
 {
 	int i;
-	struct early_res *r;
-	u64 lower_start, lower_end;
-	u64 upper_start, upper_end;
-	char name[16];
 
-	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
-		r = &early_res[i];
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+		u64 addr;
+		u64 ei_start, ei_last;
 
-		/* Continue past non-overlapping ranges */
-		if (end <= r->start || start >= r->end)
+		if (ei->type != E820_RAM)
 			continue;
 
-		/*
-		 * Leave non-ok overlaps as is; let caller
-		 * panic "Overlapping early reservations"
-		 * when it hits this overlap.
-		 */
-		if (!r->overlap_ok)
-			return;
-
-		/*
-		 * We have an ok overlap.  We will drop it from the early
-		 * reservation map, and add back in any non-overlapping
-		 * portions (lower or upper) as separate, overlap_ok,
-		 * non-overlapping ranges.
-		 */
-
-		/* 1. Note any non-overlapping (lower or upper) ranges. */
-		strncpy(name, r->name, sizeof(name) - 1);
-
-		lower_start = lower_end = 0;
-		upper_start = upper_end = 0;
-		if (r->start < start) {
-			lower_start = r->start;
-			lower_end = start;
-		}
-		if (r->end > end) {
-			upper_start = end;
-			upper_end = r->end;
-		}
-
-		/* 2. Drop the original ok overlapping range */
-		drop_range(i);
-
-		i--;		/* resume for-loop on copied down entry */
-
-		/* 3. Add back in any non-overlapping ranges. */
-		if (lower_end)
-			reserve_early_overlap_ok(lower_start, lower_end, name);
-		if (upper_end)
-			reserve_early_overlap_ok(upper_start, upper_end, name);
-	}
-}
-
-static void __init __reserve_early(u64 start, u64 end, char *name,
-						int overlap_ok)
-{
-	int i;
-	struct early_res *r;
-
-	i = find_overlapped_early(start, end);
-	if (i >= MAX_EARLY_RES)
-		panic("Too many early reservations");
-	r = &early_res[i];
-	if (r->end)
-		panic("Overlapping early reservations "
-		      "%llx-%llx %s to %llx-%llx %s\n",
-		      start, end - 1, name?name:"", r->start,
-		      r->end - 1, r->name);
-	r->start = start;
-	r->end = end;
-	r->overlap_ok = overlap_ok;
-	if (name)
-		strncpy(r->name, name, sizeof(r->name) - 1);
-}
-
-/*
- * A few early reservtations come here.
- *
- * The 'overlap_ok' in the name of this routine does -not- mean it
- * is ok for these reservations to overlap an earlier reservation.
- * Rather it means that it is ok for subsequent reservations to
- * overlap this one.
- *
- * Use this entry point to reserve early ranges when you are doing
- * so out of "Paranoia", reserving perhaps more memory than you need,
- * just in case, and don't mind a subsequent overlapping reservation
- * that is known to be needed.
- *
- * The drop_overlaps_that_are_ok() call here isn't really needed.
- * It would be needed if we had two colliding 'overlap_ok'
- * reservations, so that the second such would not panic on the
- * overlap with the first.  We don't have any such as of this
- * writing, but might as well tolerate such if it happens in
- * the future.
- */
-void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
-{
-	drop_overlaps_that_are_ok(start, end);
-	__reserve_early(start, end, name, 1);
-}
-
-/*
- * Most early reservations come here.
- *
- * We first have drop_overlaps_that_are_ok() drop any pre-existing
- * 'overlap_ok' ranges, so that we can then reserve this memory
- * range without risk of panic'ing on an overlapping overlap_ok
- * early reservation.
- */
-void __init reserve_early(u64 start, u64 end, char *name)
-{
-	if (start >= end)
-		return;
-
-	drop_overlaps_that_are_ok(start, end);
-	__reserve_early(start, end, name, 0);
-}
-
-void __init free_early(u64 start, u64 end)
-{
-	struct early_res *r;
-	int i;
-
-	i = find_overlapped_early(start, end);
-	r = &early_res[i];
-	if (i >= MAX_EARLY_RES || r->end != end || r->start != start)
-		panic("free_early on not reserved area: %llx-%llx!",
-			 start, end - 1);
+		ei_last = ei->addr + ei->size;
+		ei_start = ei->addr;
+		addr = find_early_area(ei_start, ei_last, start, end,
+				       size, align);
 
-	drop_range(i);
-}
-
-void __init early_res_to_bootmem(u64 start, u64 end)
-{
-	int i, count;
-	u64 final_start, final_end;
-
-	count  = 0;
-	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
-		count++;
-
-	printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
-			 count, start, end);
-	for (i = 0; i < count; i++) {
-		struct early_res *r = &early_res[i];
-		printk(KERN_INFO "  #%d [%010llx - %010llx] %16s", i,
-			r->start, r->end, r->name);
-		final_start = max(start, r->start);
-		final_end = min(end, r->end);
-		if (final_start >= final_end) {
-			printk(KERN_CONT "\n");
-			continue;
-		}
-		printk(KERN_CONT " ==> [%010llx - %010llx]\n",
-			final_start, final_end);
-		reserve_bootmem_generic(final_start, final_end - final_start,
-				BOOTMEM_DEFAULT);
+		if (addr != -1ULL)
+			return addr;
 	}
+	return -1ULL;
 }
 
-/* Check for already reserved areas */
-static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
-{
-	int i;
-	u64 addr = *addrp;
-	int changed = 0;
-	struct early_res *r;
-again:
-	i = find_overlapped_early(addr, addr + size);
-	r = &early_res[i];
-	if (i < MAX_EARLY_RES && r->end) {
-		*addrp = addr = round_up(r->end, align);
-		changed = 1;
-		goto again;
-	}
-	return changed;
-}
-
-/* Check for already reserved areas */
-static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
+u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align)
 {
-	int i;
-	u64 addr = *addrp, last;
-	u64 size = *sizep;
-	int changed = 0;
-again:
-	last = addr + size;
-	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
-		struct early_res *r = &early_res[i];
-		if (last > r->start && addr < r->start) {
-			size = r->start - addr;
-			changed = 1;
-			goto again;
-		}
-		if (last > r->end && addr < r->end) {
-			addr = round_up(r->end, align);
-			size = last - addr;
-			changed = 1;
-			goto again;
-		}
-		if (last <= r->end && addr >= r->start) {
-			(*sizep)++;
-			return 0;
-		}
-	}
-	if (changed) {
-		*addrp = addr;
-		*sizep = size;
-	}
-	return changed;
+	return find_e820_area(start, end, size, align);
 }
 
-/*
- * Find a free area with specified alignment in a specific range.
- */
-u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
+u64 __init get_max_mapped(void)
 {
-	int i;
+	u64 end = max_pfn_mapped;
 
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-		u64 addr, last;
-		u64 ei_last;
+	end <<= PAGE_SHIFT;
 
-		if (ei->type != E820_RAM)
-			continue;
-		addr = round_up(ei->addr, align);
-		ei_last = ei->addr + ei->size;
-		if (addr < start)
-			addr = round_up(start, align);
-		if (addr >= ei_last)
-			continue;
-		while (bad_addr(&addr, size, align) && addr+size <= ei_last)
-			;
-		last = addr + size;
-		if (last > ei_last)
-			continue;
-		if (last > end)
-			continue;
-		return addr;
-	}
-	return -1ULL;
+	return end;
 }
-
 /*
  * Find next free range after *start
  */
@@ -1052,25 +785,19 @@ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
 
 	for (i = 0; i < e820.nr_map; i++) {
 		struct e820entry *ei = &e820.map[i];
-		u64 addr, last;
-		u64 ei_last;
+		u64 addr;
+		u64 ei_start, ei_last;
 
 		if (ei->type != E820_RAM)
 			continue;
-		addr = round_up(ei->addr, align);
+
 		ei_last = ei->addr + ei->size;
-		if (addr < start)
-			addr = round_up(start, align);
-		if (addr >= ei_last)
-			continue;
-		*sizep = ei_last - addr;
-		while (bad_addr_size(&addr, sizep, align) &&
-			addr + *sizep <= ei_last)
-			;
-		last = addr + *sizep;
-		if (last > ei_last)
-			continue;
-		return addr;
+		ei_start = ei->addr;
+		addr = find_early_area_size(ei_start, ei_last, start,
+					    sizep, align);
+
+		if (addr != -1ULL)
+			return addr;
 	}
 
 	return -1ULL;
@@ -1429,6 +1156,8 @@ void __init e820_reserve_resources_late(void)
 		end = MAX_RESOURCE_SIZE;
 		if (start >= end)
 			continue;
+		printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx ",
+		       start, end);
 		reserve_region_with_split(&iomem_resource, start, end,
 					  "RAM buffer");
 	}
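
The reworked e820_remove_range() above distinguishes three overlap cases; the new one is a removal strictly inside a single entry, which now trims that entry to the head and adds a fresh entry for the tail. A compilable userspace sketch of the same case analysis over a toy map (illustrative names only, not the kernel API):

#include <stdio.h>
#include <stdint.h>

struct ent { uint64_t addr, size; };
static struct ent map[8] = { { 0x0, 0x100000 } };	/* one 1 MiB entry */
static int nr = 1;

static void remove_range(uint64_t start, uint64_t end)
{
	for (int i = 0; i < nr; i++) {
		struct ent *ei = &map[i];
		uint64_t ei_end = ei->addr + ei->size;

		/* entry totally covered: delete it */
		if (ei->addr >= start && ei_end <= end) {
			ei->size = 0;
			continue;
		}
		/* removed range strictly inside: keep head, add tail entry */
		if (ei->addr < start && ei_end > end) {
			map[nr++] = (struct ent){ end, ei_end - end };
			ei->size = start - ei->addr;
			continue;
		}
		/* partial overlap: size must shrink first, since the
		 * leftover may be either the head or the tail */
		uint64_t fs = start > ei->addr ? start : ei->addr;
		uint64_t fe = end < ei_end ? end : ei_end;
		if (fs >= fe)
			continue;
		ei->size -= fe - fs;
		if (ei->addr >= fs)
			ei->addr = fe;	/* the overlap was at the head */
	}
}

int main(void)
{
	remove_range(0x40000, 0x50000);	/* punch a hole in the middle */
	for (int i = 0; i < nr; i++)
		if (map[i].size)
			printf("[%#llx-%#llx)\n",
			       (unsigned long long)map[i].addr,
			       (unsigned long long)(map[i].addr + map[i].size));
	return 0;
}

Running this prints [0x0-0x40000) and [0x50000-0x100000): the single entry is split around the removed hole.
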
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 5051b94c906..b2e24603739 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -7,6 +7,7 @@
 
 #include <linux/init.h>
 #include <linux/start_kernel.h>
+#include <linux/mm.h>
 
 #include <asm/setup.h>
 #include <asm/sections.h>
@@ -29,14 +30,25 @@ static void __init i386_default_early_setup(void)
 
 void __init i386_start_kernel(void)
 {
+#ifdef CONFIG_X86_TRAMPOLINE
+	/*
+	 * But first pinch a few for the stack/trampoline stuff
+	 * FIXME: Don't need the extra page at 4K, but need to fix
+	 * trampoline before removing it. (see the GDT stuff)
+	 */
+	reserve_early_overlap_ok(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE,
+				 "EX TRAMPOLINE");
+#endif
+
 	reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
 
 #ifdef CONFIG_BLK_DEV_INITRD
 	/* Reserve INITRD */
 	if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
+		/* Assume only end is not page aligned */
 		u64 ramdisk_image = boot_params.hdr.ramdisk_image;
 		u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
-		u64 ramdisk_end   = ramdisk_image + ramdisk_size;
+		u64 ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);
 		reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
 	}
 #endif
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index b5a9896ca1e..7147143fd61 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -103,9 +103,10 @@ void __init x86_64_start_reservations(char *real_mode_data)
 #ifdef CONFIG_BLK_DEV_INITRD
 	/* Reserve INITRD */
 	if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
+		/* Assume only end is not page aligned */
 		unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
 		unsigned long ramdisk_size  = boot_params.hdr.ramdisk_size;
-		unsigned long ramdisk_end   = ramdisk_image + ramdisk_size;
+		unsigned long ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);
 		reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
 	}
 #endif
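
Both ramdisk hunks round the reservation end up to a page boundary; PAGE_ALIGN is the usual add-and-mask trick. A two-line demonstration of the arithmetic, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long image = 0x37f0000, size = 0x1234;

	/* end 0x37f1234 rounds up to the next page boundary, 0x37f2000 */
	printf("reserve [%#lx, %#lx)\n", image, PAGE_ALIGN(image + size));
	return 0;
}
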
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 7fd318bac59..37c3d4b17d8 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -442,8 +442,8 @@ is386:	movl $2,%ecx		# set MP
 	 */
 	cmpb $0,ready
 	jne 1f
-	movl $per_cpu__gdt_page,%eax
-	movl $per_cpu__stack_canary,%ecx
+	movl $gdt_page,%eax
+	movl $stack_canary,%ecx
 	movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
 	shrl $16, %ecx
 	movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
@@ -706,7 +706,7 @@ idt_descr:
 	.word 0				# 32 bit align gdt_desc.address
 ENTRY(early_gdt_descr)
 	.word GDT_ENTRIES*8-1
-	.long per_cpu__gdt_page		/* Overwritten for secondary CPUs */
+	.long gdt_page			/* Overwritten for secondary CPUs */
 
 /*
  * The boot_gdt must mirror the equivalent in setup.S and is
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 2d8b5035371..3d1e6f16b7a 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -27,7 +27,7 @@
 #define GET_CR2_INTO_RCX movq %cr2, %rcx
 #endif
 
-/* we are not able to switch in one step to the final KERNEL ADRESS SPACE
+/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE
  * because we need identity-mapped pages.
  *
  */
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ad80a1c718c..23b4ecdffa9 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -4,6 +4,7 @@
 #include <linux/sysdev.h>
 #include <linux/delay.h>
 #include <linux/errno.h>
+#include <linux/slab.h>
 #include <linux/hpet.h>
 #include <linux/init.h>
 #include <linux/cpu.h>
@@ -266,7 +267,7 @@ static void hpet_resume_device(void)
 	force_hpet_resume();
 }
 
-static void hpet_resume_counter(void)
+static void hpet_resume_counter(struct clocksource *cs)
 {
 	hpet_resume_device();
 	hpet_restart_counter();
@@ -399,9 +400,15 @@ static int hpet_next_event(unsigned long delta,
 	 * then we might have a real hardware problem. We can not do
 	 * much about it here, but at least alert the user/admin with
 	 * a prominent warning.
+	 * An erratum on some chipsets (ICH9,..), results in comparator read
+	 * immediately following a write returning old value. Workaround
+	 * for this is to read this value second time, when first
+	 * read returns old value.
 	 */
-	WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt,
+	if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) {
+		WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt,
 		  KERN_WARNING "hpet: compare register read back failed.\n");
+	}
 
 	return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
 }
@@ -1143,6 +1150,7 @@ int hpet_set_periodic_freq(unsigned long freq)
 		do_div(clc, freq);
 		clc >>= hpet_clockevent.shift;
 		hpet_pie_delta = clc;
+		hpet_pie_limit = 0;
 	}
 	return 1;
 }
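
The hpet_next_event() hunk above works around the ICH9-class erratum where a comparator read immediately after a write returns the stale value: it only warns if a second read still mismatches. A userspace sketch of that read-twice pattern against a simulated latched register (names illustrative):

#include <stdio.h>
#include <stdint.h>

/* Simulated comparator: the first read after a write returns stale data. */
static uint32_t cmp_real, cmp_stale;
static int first_read_pending;

static void cmp_write(uint32_t v)
{
	cmp_stale = cmp_real;	/* erratum: old value latched for one read */
	cmp_real = v;
	first_read_pending = 1;
}

static uint32_t cmp_read(void)
{
	if (first_read_pending) {
		first_read_pending = 0;
		return cmp_stale;
	}
	return cmp_real;
}

int main(void)
{
	cmp_write(0x1234);

	/* naive check: a single read-back falsely reports a failure */
	if (cmp_read() != 0x1234)
		printf("first read stale, re-reading...\n");

	/* workaround: only complain if the second read still mismatches */
	if (cmp_read() != 0x1234)
		printf("hpet: compare register read back failed.\n");
	else
		printf("second read OK\n");
	return 0;
}
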
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index dca2802c666..d6cc065f519 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -344,13 +344,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp,
 	}
 
 	/*
-	 * For kernel-addresses, either the address or symbol name can be
-	 * specified.
-	 */
-	if (info->name)
-		info->address = (unsigned long)
-				kallsyms_lookup_name(info->name);
-	/*
 	 * Check that the low-order bits of the address are appropriate
 	 * for the alignment implied by len.
 	 */
@@ -535,8 +528,3 @@ void hw_breakpoint_pmu_read(struct perf_event *bp)
 {
 	/* TODO */
 }
-
-void hw_breakpoint_pmu_unthrottle(struct perf_event *bp)
-{
-	/* TODO */
-}
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index c01a2b846d4..54c31c28548 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -8,6 +8,7 @@
 #include <linux/module.h>
 #include <linux/regset.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 
 #include <asm/sigcontext.h>
 #include <asm/processor.h>
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index df89102bef8..7c9f02c130f 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -5,7 +5,6 @@
5#include <linux/ioport.h> 5#include <linux/ioport.h>
6#include <linux/interrupt.h> 6#include <linux/interrupt.h>
7#include <linux/timex.h> 7#include <linux/timex.h>
8#include <linux/slab.h>
9#include <linux/random.h> 8#include <linux/random.h>
10#include <linux/init.h> 9#include <linux/init.h>
11#include <linux/kernel_stat.h> 10#include <linux/kernel_stat.h>
@@ -32,8 +31,14 @@
32 */ 31 */
33 32
34static int i8259A_auto_eoi; 33static int i8259A_auto_eoi;
35DEFINE_SPINLOCK(i8259A_lock); 34DEFINE_RAW_SPINLOCK(i8259A_lock);
36static void mask_and_ack_8259A(unsigned int); 35static void mask_and_ack_8259A(unsigned int);
36static void mask_8259A(void);
37static void unmask_8259A(void);
38static void disable_8259A_irq(unsigned int irq);
39static void enable_8259A_irq(unsigned int irq);
40static void init_8259A(int auto_eoi);
41static int i8259A_irq_pending(unsigned int irq);
37 42
38struct irq_chip i8259A_chip = { 43struct irq_chip i8259A_chip = {
39 .name = "XT-PIC", 44 .name = "XT-PIC",
@@ -63,51 +68,51 @@ unsigned int cached_irq_mask = 0xffff;
63 */ 68 */
64unsigned long io_apic_irqs; 69unsigned long io_apic_irqs;
65 70
66void disable_8259A_irq(unsigned int irq) 71static void disable_8259A_irq(unsigned int irq)
67{ 72{
68 unsigned int mask = 1 << irq; 73 unsigned int mask = 1 << irq;
69 unsigned long flags; 74 unsigned long flags;
70 75
71 spin_lock_irqsave(&i8259A_lock, flags); 76 raw_spin_lock_irqsave(&i8259A_lock, flags);
72 cached_irq_mask |= mask; 77 cached_irq_mask |= mask;
73 if (irq & 8) 78 if (irq & 8)
74 outb(cached_slave_mask, PIC_SLAVE_IMR); 79 outb(cached_slave_mask, PIC_SLAVE_IMR);
75 else 80 else
76 outb(cached_master_mask, PIC_MASTER_IMR); 81 outb(cached_master_mask, PIC_MASTER_IMR);
77 spin_unlock_irqrestore(&i8259A_lock, flags); 82 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
78} 83}
79 84
80void enable_8259A_irq(unsigned int irq) 85static void enable_8259A_irq(unsigned int irq)
81{ 86{
82 unsigned int mask = ~(1 << irq); 87 unsigned int mask = ~(1 << irq);
83 unsigned long flags; 88 unsigned long flags;
84 89
85 spin_lock_irqsave(&i8259A_lock, flags); 90 raw_spin_lock_irqsave(&i8259A_lock, flags);
86 cached_irq_mask &= mask; 91 cached_irq_mask &= mask;
87 if (irq & 8) 92 if (irq & 8)
88 outb(cached_slave_mask, PIC_SLAVE_IMR); 93 outb(cached_slave_mask, PIC_SLAVE_IMR);
89 else 94 else
90 outb(cached_master_mask, PIC_MASTER_IMR); 95 outb(cached_master_mask, PIC_MASTER_IMR);
91 spin_unlock_irqrestore(&i8259A_lock, flags); 96 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
92} 97}
93 98
94int i8259A_irq_pending(unsigned int irq) 99static int i8259A_irq_pending(unsigned int irq)
95{ 100{
96 unsigned int mask = 1<<irq; 101 unsigned int mask = 1<<irq;
97 unsigned long flags; 102 unsigned long flags;
98 int ret; 103 int ret;
99 104
100 spin_lock_irqsave(&i8259A_lock, flags); 105 raw_spin_lock_irqsave(&i8259A_lock, flags);
101 if (irq < 8) 106 if (irq < 8)
102 ret = inb(PIC_MASTER_CMD) & mask; 107 ret = inb(PIC_MASTER_CMD) & mask;
103 else 108 else
104 ret = inb(PIC_SLAVE_CMD) & (mask >> 8); 109 ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
105 spin_unlock_irqrestore(&i8259A_lock, flags); 110 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
106 111
107 return ret; 112 return ret;
108} 113}
109 114
110void make_8259A_irq(unsigned int irq) 115static void make_8259A_irq(unsigned int irq)
111{ 116{
112 disable_irq_nosync(irq); 117 disable_irq_nosync(irq);
113 io_apic_irqs &= ~(1<<irq); 118 io_apic_irqs &= ~(1<<irq);
@@ -150,7 +155,7 @@ static void mask_and_ack_8259A(unsigned int irq)
150 unsigned int irqmask = 1 << irq; 155 unsigned int irqmask = 1 << irq;
151 unsigned long flags; 156 unsigned long flags;
152 157
153 spin_lock_irqsave(&i8259A_lock, flags); 158 raw_spin_lock_irqsave(&i8259A_lock, flags);
154 /* 159 /*
155 * Lightweight spurious IRQ detection. We do not want 160 * Lightweight spurious IRQ detection. We do not want
156 * to overdo spurious IRQ handling - it's usually a sign 161 * to overdo spurious IRQ handling - it's usually a sign
@@ -183,7 +188,7 @@ handle_real_irq:
183 outb(cached_master_mask, PIC_MASTER_IMR); 188 outb(cached_master_mask, PIC_MASTER_IMR);
184 outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */ 189 outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */
185 } 190 }
186 spin_unlock_irqrestore(&i8259A_lock, flags); 191 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
187 return; 192 return;
188 193
189spurious_8259A_irq: 194spurious_8259A_irq:
@@ -281,37 +286,37 @@ static int __init i8259A_init_sysfs(void)
281 286
282device_initcall(i8259A_init_sysfs); 287device_initcall(i8259A_init_sysfs);
283 288
284void mask_8259A(void) 289static void mask_8259A(void)
285{ 290{
286 unsigned long flags; 291 unsigned long flags;
287 292
288 spin_lock_irqsave(&i8259A_lock, flags); 293 raw_spin_lock_irqsave(&i8259A_lock, flags);
289 294
290 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ 295 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
291 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ 296 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
292 297
293 spin_unlock_irqrestore(&i8259A_lock, flags); 298 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
294} 299}
295 300
296void unmask_8259A(void) 301static void unmask_8259A(void)
297{ 302{
298 unsigned long flags; 303 unsigned long flags;
299 304
300 spin_lock_irqsave(&i8259A_lock, flags); 305 raw_spin_lock_irqsave(&i8259A_lock, flags);
301 306
302 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ 307 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
303 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ 308 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
304 309
305 spin_unlock_irqrestore(&i8259A_lock, flags); 310 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
306} 311}
307 312
308void init_8259A(int auto_eoi) 313static void init_8259A(int auto_eoi)
309{ 314{
310 unsigned long flags; 315 unsigned long flags;
311 316
312 i8259A_auto_eoi = auto_eoi; 317 i8259A_auto_eoi = auto_eoi;
313 318
314 spin_lock_irqsave(&i8259A_lock, flags); 319 raw_spin_lock_irqsave(&i8259A_lock, flags);
315 320
316 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ 321 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
317 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ 322 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
@@ -356,5 +361,49 @@ void init_8259A(int auto_eoi)
356 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ 361 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
357 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ 362 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
358 363
359 spin_unlock_irqrestore(&i8259A_lock, flags); 364 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
360} 365}
366
367/*
368 * Make the i8259 a driver so that we can select the PIC functions at run time.
369 * The goal is to make the x86 binary compatible across PC-compatible and
370 * non-PC-compatible platforms, such as x86 MID.
371 */
372
373static void legacy_pic_noop(void) { };
374static void legacy_pic_uint_noop(unsigned int unused) { };
375static void legacy_pic_int_noop(int unused) { };
376
377static struct irq_chip dummy_pic_chip = {
378 .name = "dummy pic",
379 .mask = legacy_pic_uint_noop,
380 .unmask = legacy_pic_uint_noop,
381 .disable = legacy_pic_uint_noop,
382 .mask_ack = legacy_pic_uint_noop,
383};
384static int legacy_pic_irq_pending_noop(unsigned int irq)
385{
386 return 0;
387}
388
389struct legacy_pic null_legacy_pic = {
390 .nr_legacy_irqs = 0,
391 .chip = &dummy_pic_chip,
392 .mask_all = legacy_pic_noop,
393 .restore_mask = legacy_pic_noop,
394 .init = legacy_pic_int_noop,
395 .irq_pending = legacy_pic_irq_pending_noop,
396 .make_irq = legacy_pic_uint_noop,
397};
398
399struct legacy_pic default_legacy_pic = {
400 .nr_legacy_irqs = NR_IRQS_LEGACY,
401 .chip = &i8259A_chip,
402 .mask_all = mask_8259A,
403 .restore_mask = unmask_8259A,
404 .init = init_8259A,
405 .irq_pending = i8259A_irq_pending,
406 .make_irq = make_8259A_irq,
407};
408
409struct legacy_pic *legacy_pic = &default_legacy_pic;
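The hunks above replace direct calls into the 8259A code with an ops-table indirection: generic code now goes through legacy_pic, which a platform may repoint at null_legacy_pic before the IRQ code runs. A minimal user-space sketch of the same pattern (the pic_ops type, function names, and the platform-detection flag are illustrative, not from the patch):

#include <stdio.h>

/* Ops table modelled on struct legacy_pic: a size plus function pointers. */
struct pic_ops {
	int nr_legacy_irqs;
	void (*init)(int auto_eoi);
	void (*mask_all)(void);
};

static void real_init(int auto_eoi) { printf("8259A init, auto_eoi=%d\n", auto_eoi); }
static void real_mask_all(void)     { printf("8259A mask all\n"); }
static void noop_init(int unused)   { (void)unused; }
static void noop_mask_all(void)     { }

static struct pic_ops default_pic = { 16, real_init, real_mask_all };
static struct pic_ops null_pic    = {  0, noop_init, noop_mask_all };

/* Like the kernel, default to the real PIC; a platform hook may repoint it. */
static struct pic_ops *pic = &default_pic;

int main(void)
{
	int mid_platform = 1;		/* pretend we detected an x86 MID board */

	if (mid_platform)
		pic = &null_pic;	/* mrst.c does: legacy_pic = &null_legacy_pic */

	pic->init(0);			/* irqinit.c does: legacy_pic->init(0) */
	printf("legacy irqs: %d\n", pic->nr_legacy_irqs);
	return 0;
}

Run on a "PC", this takes the 8259A path; on the MID case every call quietly no-ops, which is the binary-compatibility goal the comment states.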
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index d5932226614..0ed2d300cd4 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -5,7 +5,6 @@
5#include <linux/ioport.h> 5#include <linux/ioport.h>
6#include <linux/interrupt.h> 6#include <linux/interrupt.h>
7#include <linux/timex.h> 7#include <linux/timex.h>
8#include <linux/slab.h>
9#include <linux/random.h> 8#include <linux/random.h>
10#include <linux/kprobes.h> 9#include <linux/kprobes.h>
11#include <linux/init.h> 10#include <linux/init.h>
@@ -84,24 +83,7 @@ static struct irqaction irq2 = {
84}; 83};
85 84
86DEFINE_PER_CPU(vector_irq_t, vector_irq) = { 85DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
87 [0 ... IRQ0_VECTOR - 1] = -1, 86 [0 ... NR_VECTORS - 1] = -1,
88 [IRQ0_VECTOR] = 0,
89 [IRQ1_VECTOR] = 1,
90 [IRQ2_VECTOR] = 2,
91 [IRQ3_VECTOR] = 3,
92 [IRQ4_VECTOR] = 4,
93 [IRQ5_VECTOR] = 5,
94 [IRQ6_VECTOR] = 6,
95 [IRQ7_VECTOR] = 7,
96 [IRQ8_VECTOR] = 8,
97 [IRQ9_VECTOR] = 9,
98 [IRQ10_VECTOR] = 10,
99 [IRQ11_VECTOR] = 11,
100 [IRQ12_VECTOR] = 12,
101 [IRQ13_VECTOR] = 13,
102 [IRQ14_VECTOR] = 14,
103 [IRQ15_VECTOR] = 15,
104 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
105}; 87};
106 88
107int vector_used_by_percpu_irq(unsigned int vector) 89int vector_used_by_percpu_irq(unsigned int vector)
@@ -123,12 +105,12 @@ void __init init_ISA_irqs(void)
123#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) 105#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
124 init_bsp_APIC(); 106 init_bsp_APIC();
125#endif 107#endif
126 init_8259A(0); 108 legacy_pic->init(0);
127 109
128 /* 110 /*
129 * 16 old-style INTA-cycle interrupts: 111 * 16 old-style INTA-cycle interrupts:
130 */ 112 */
131 for (i = 0; i < NR_IRQS_LEGACY; i++) { 113 for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) {
132 struct irq_desc *desc = irq_to_desc(i); 114 struct irq_desc *desc = irq_to_desc(i);
133 115
134 desc->status = IRQ_DISABLED; 116 desc->status = IRQ_DISABLED;
@@ -142,9 +124,44 @@ void __init init_ISA_irqs(void)
142 124
143void __init init_IRQ(void) 125void __init init_IRQ(void)
144{ 126{
127 int i;
128
129 /*
130 * On cpu 0, assign IRQ0_VECTOR..IRQ15_VECTOR to IRQs 0..15.
131 * If these IRQs are handled by legacy interrupt controllers like the PIC,
132 * then this configuration will likely be static after boot. If these
133 * IRQs are handled by more modern controllers like the IO-APIC, then
134 * this vector space can be freed and re-used dynamically as the IRQs
135 * migrate.
136 */
137 for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
138 per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i;
139
145 x86_init.irqs.intr_init(); 140 x86_init.irqs.intr_init();
146} 141}
147 142
143/*
144 * Setup the vector to irq mappings.
145 */
146void setup_vector_irq(int cpu)
147{
148#ifndef CONFIG_X86_IO_APIC
149 int irq;
150
151 /*
152 * On most platforms, the legacy PIC delivers interrupts on the boot cpu.
153 * But there are certain platforms where PIC interrupts are delivered to
154 * multiple cpus. If a legacy IRQ is handled by the legacy PIC, set up the
155 * static legacy vector to irq mapping for the new cpu that is coming
156 * online:
157 */
158 for (irq = 0; irq < legacy_pic->nr_legacy_irqs; irq++)
159 per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq;
160#endif
161
162 __setup_vector_irq(cpu);
163}
164
148static void __init smp_intr_init(void) 165static void __init smp_intr_init(void)
149{ 166{
150#ifdef CONFIG_SMP 167#ifdef CONFIG_SMP
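For reference, the effect of the collapsed initializer plus the two new loops can be modelled in a few lines. The IRQ0_VECTOR value below is an assumed placeholder; the real base is arch-defined:

#include <stdio.h>

#define NR_VECTORS   256
#define IRQ0_VECTOR  0x30	/* illustrative base, not taken from the patch */

static int vector_irq[NR_VECTORS];

int main(void)
{
	int nr_legacy_irqs = 16;	/* legacy_pic->nr_legacy_irqs; 0 on a PIC-less MID */
	int i;

	/* New initializer: every vector starts unassigned... */
	for (i = 0; i < NR_VECTORS; i++)
		vector_irq[i] = -1;

	/* ...and init_IRQ() fills in only as many legacy slots as the PIC has. */
	for (i = 0; i < nr_legacy_irqs; i++)
		vector_irq[IRQ0_VECTOR + i] = i;

	printf("vector 0x%x -> irq %d\n", IRQ0_VECTOR + 3, vector_irq[IRQ0_VECTOR + 3]);
	return 0;
}

With nr_legacy_irqs set to 0, the whole table stays at -1, which is exactly what null_legacy_pic buys a PIC-less platform.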
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c
index cbc4332a77b..0f7bc20cfcd 100644
--- a/arch/x86/kernel/k8.c
+++ b/arch/x86/kernel/k8.c
@@ -2,8 +2,8 @@
2 * Shared support code for AMD K8 northbridges and derivatives. 2 * Shared support code for AMD K8 northbridges and derivatives.
3 * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. 3 * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
4 */ 4 */
5#include <linux/gfp.h>
6#include <linux/types.h> 5#include <linux/types.h>
6#include <linux/slab.h>
7#include <linux/init.h> 7#include <linux/init.h>
8#include <linux/errno.h> 8#include <linux/errno.h>
9#include <linux/module.h> 9#include <linux/module.h>
@@ -121,3 +121,17 @@ void k8_flush_garts(void)
121} 121}
122EXPORT_SYMBOL_GPL(k8_flush_garts); 122EXPORT_SYMBOL_GPL(k8_flush_garts);
123 123
124static __init int init_k8_nbs(void)
125{
126 int err = 0;
127
128 err = cache_k8_northbridges();
129
130 if (err < 0)
131 printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n");
132
133 return err;
134}
135
136/* This has to go after the PCI subsystem */
137fs_initcall(init_k8_nbs);
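init_k8_nbs() relies on initcall ordering rather than an explicit dependency: the PCI core comes up at subsys_initcall time, so an fs_initcall can safely enumerate the northbridge devices, as the closing comment notes. A quick sketch of the level order as I read the initcall macros (rootfs_initcall omitted):

#include <stdio.h>

/* Initcall levels in execution order. The PCI core registers at "subsys",
 * so an fs_initcall such as init_k8_nbs() runs after PCI enumeration. */
static const char *levels[] = {
	"pure", "core", "postcore", "arch", "subsys", "fs", "device", "late",
};

int main(void)
{
	for (unsigned int i = 0; i < sizeof(levels) / sizeof(levels[0]); i++)
		printf("%u: %s_initcall\n", i, levels[i]);
	return 0;
}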
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index e444357375c..8afd9f321f1 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -9,6 +9,7 @@
9#include <linux/debugfs.h> 9#include <linux/debugfs.h>
10#include <linux/uaccess.h> 10#include <linux/uaccess.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/slab.h>
12#include <linux/init.h> 13#include <linux/init.h>
13#include <linux/stat.h> 14#include <linux/stat.h>
14#include <linux/io.h> 15#include <linux/io.h>
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index bfba6019d76..b2258ca9100 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -618,8 +618,8 @@ int kgdb_arch_init(void)
618 * portion of kgdb because this operation requires mutexes to 618 * portion of kgdb because this operation requires mutexes to
619 * complete. 619 * complete.
620 */ 620 */
621 hw_breakpoint_init(&attr);
621 attr.bp_addr = (unsigned long)kgdb_arch_init; 622 attr.bp_addr = (unsigned long)kgdb_arch_init;
622 attr.type = PERF_TYPE_BREAKPOINT;
623 attr.bp_len = HW_BREAKPOINT_LEN_1; 623 attr.bp_len = HW_BREAKPOINT_LEN_1;
624 attr.bp_type = HW_BREAKPOINT_W; 624 attr.bp_type = HW_BREAKPOINT_W;
625 attr.disabled = 1; 625 attr.disabled = 1;
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 5de9f4a9c3f..b43bbaebe2c 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -49,6 +49,7 @@
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/kdebug.h> 50#include <linux/kdebug.h>
51#include <linux/kallsyms.h> 51#include <linux/kallsyms.h>
52#include <linux/ftrace.h>
52 53
53#include <asm/cacheflush.h> 54#include <asm/cacheflush.h>
54#include <asm/desc.h> 55#include <asm/desc.h>
@@ -106,16 +107,22 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {
106}; 107};
107const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); 108const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
108 109
109/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ 110static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
110static void __kprobes set_jmp_op(void *from, void *to)
111{ 111{
112 struct __arch_jmp_op { 112 struct __arch_relative_insn {
113 char op; 113 u8 op;
114 s32 raddr; 114 s32 raddr;
115 } __attribute__((packed)) * jop; 115 } __attribute__((packed)) *insn;
116 jop = (struct __arch_jmp_op *)from; 116
117 jop->raddr = (s32)((long)(to) - ((long)(from) + 5)); 117 insn = (struct __arch_relative_insn *)from;
118 jop->op = RELATIVEJUMP_INSTRUCTION; 118 insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
119 insn->op = op;
120}
121
122/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
123static void __kprobes synthesize_reljump(void *from, void *to)
124{
125 __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
119} 126}
120 127
121/* 128/*
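__synthesize_relative_insn packs a 1-byte opcode with a 32-bit displacement that is relative to the end of the 5-byte instruction, hence the "+ 5". A self-contained sketch of the same arithmetic; 0xe9 is the documented x86 "jmp rel32" opcode, and the buffer is only written, never executed:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Emit "jmp rel32" at 'from' targeting 'to'. The displacement is measured
 * from the byte after the 5-byte instruction. */
static void synthesize_reljump(uint8_t *from, uint8_t *to)
{
	int32_t raddr = (int32_t)((intptr_t)to - ((intptr_t)from + 5));

	from[0] = 0xe9;			/* RELATIVEJUMP_OPCODE */
	memcpy(from + 1, &raddr, sizeof(raddr));
}

int main(void)
{
	uint8_t buf[16] = { 0 };

	/* Pretend code at buf jumps 0x100 bytes past its own end. */
	synthesize_reljump(buf, buf + 0x105);
	printf("opcode %02x disp %02x%02x%02x%02x\n",
	       buf[0], buf[4], buf[3], buf[2], buf[1]);
	return 0;
}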
@@ -202,7 +209,7 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
202 /* 209 /*
203 * Basically, kp->ainsn.insn has an original instruction. 210 * Basically, kp->ainsn.insn has an original instruction.
204 * However, RIP-relative instruction can not do single-stepping 211 * However, RIP-relative instruction can not do single-stepping
205 * at different place, fix_riprel() tweaks the displacement of 212 * at different place, __copy_instruction() tweaks the displacement of
206 * that instruction. In that case, we can't recover the instruction 213 * that instruction. In that case, we can't recover the instruction
207 * from the kp->ainsn.insn. 214 * from the kp->ainsn.insn.
208 * 215 *
@@ -284,21 +291,37 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
284} 291}
285 292
286/* 293/*
287 * Adjust the displacement if the instruction uses the %rip-relative 294 * Copy an instruction and adjust the displacement if the instruction
288 * addressing mode. 295 * uses the %rip-relative addressing mode.
289 * If it does, Return the address of the 32-bit displacement word. 296 * If it does, fix up the displacement in the copied instruction.
290 * If not, return null. 297 * Returns the length of the copied instruction, or 0 on failure.
291 * Only applicable to 64-bit x86. 298 * The RIP-relative fixup is only applicable to 64-bit x86.
292 */ 299 */
293static void __kprobes fix_riprel(struct kprobe *p) 300static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
294{ 301{
295#ifdef CONFIG_X86_64
296 struct insn insn; 302 struct insn insn;
297 kernel_insn_init(&insn, p->ainsn.insn); 303 int ret;
304 kprobe_opcode_t buf[MAX_INSN_SIZE];
298 305
306 kernel_insn_init(&insn, src);
307 if (recover) {
308 insn_get_opcode(&insn);
309 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
310 ret = recover_probed_instruction(buf,
311 (unsigned long)src);
312 if (ret)
313 return 0;
314 kernel_insn_init(&insn, buf);
315 }
316 }
317 insn_get_length(&insn);
318 memcpy(dest, insn.kaddr, insn.length);
319
320#ifdef CONFIG_X86_64
299 if (insn_rip_relative(&insn)) { 321 if (insn_rip_relative(&insn)) {
300 s64 newdisp; 322 s64 newdisp;
301 u8 *disp; 323 u8 *disp;
324 kernel_insn_init(&insn, dest);
302 insn_get_displacement(&insn); 325 insn_get_displacement(&insn);
303 /* 326 /*
304 * The copied instruction uses the %rip-relative addressing 327 * The copied instruction uses the %rip-relative addressing
@@ -312,20 +335,23 @@ static void __kprobes fix_riprel(struct kprobe *p)
312 * extension of the original signed 32-bit displacement would 335 * extension of the original signed 32-bit displacement would
313 * have given. 336 * have given.
314 */ 337 */
315 newdisp = (u8 *) p->addr + (s64) insn.displacement.value - 338 newdisp = (u8 *) src + (s64) insn.displacement.value -
316 (u8 *) p->ainsn.insn; 339 (u8 *) dest;
317 BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ 340 BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */
318 disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn); 341 disp = (u8 *) dest + insn_offset_displacement(&insn);
319 *(s32 *) disp = (s32) newdisp; 342 *(s32 *) disp = (s32) newdisp;
320 } 343 }
321#endif 344#endif
345 return insn.length;
322} 346}
323 347
324static void __kprobes arch_copy_kprobe(struct kprobe *p) 348static void __kprobes arch_copy_kprobe(struct kprobe *p)
325{ 349{
326 memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); 350 /*
327 351 * Copy an instruction without recovering int3, because it will be
328 fix_riprel(p); 352 * put by another subsystem.
353 */
354 __copy_instruction(p->ainsn.insn, p->addr, 0);
329 355
330 if (can_boost(p->addr)) 356 if (can_boost(p->addr))
331 p->ainsn.boostable = 0; 357 p->ainsn.boostable = 0;
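The displacement rewrite in __copy_instruction is plain pointer arithmetic: the copy at dest must still reach the same absolute target src + disp, so the new displacement is src + disp - dest. A hedged sketch with made-up addresses:

#include <stdio.h>
#include <stdint.h>

/* Rebase a RIP-relative displacement when an instruction is copied from
 * 'src' to 'dest': the absolute target src + disp must stay the same. */
static int64_t rebase_disp(intptr_t src, intptr_t dest, int32_t disp)
{
	return (int64_t)src + disp - dest;
}

int main(void)
{
	intptr_t src = 0x400000, dest = 0x600000;
	int32_t disp = 0x1234;
	int64_t newdisp = rebase_disp(src, dest, disp);

	/* Must still fit the instruction's 32-bit field (the BUG_ON check). */
	printf("newdisp=%lld fits=%d\n", (long long)newdisp,
	       (int64_t)(int32_t)newdisp == newdisp);
	return 0;
}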
@@ -406,18 +432,6 @@ static void __kprobes restore_btf(void)
406 update_debugctlmsr(current->thread.debugctlmsr); 432 update_debugctlmsr(current->thread.debugctlmsr);
407} 433}
408 434
409static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
410{
411 clear_btf();
412 regs->flags |= X86_EFLAGS_TF;
413 regs->flags &= ~X86_EFLAGS_IF;
414 /* single step inline if the instruction is an int3 */
415 if (p->opcode == BREAKPOINT_INSTRUCTION)
416 regs->ip = (unsigned long)p->addr;
417 else
418 regs->ip = (unsigned long)p->ainsn.insn;
419}
420
421void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, 435void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
422 struct pt_regs *regs) 436 struct pt_regs *regs)
423{ 437{
@@ -429,20 +443,50 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
429 *sara = (unsigned long) &kretprobe_trampoline; 443 *sara = (unsigned long) &kretprobe_trampoline;
430} 444}
431 445
446#ifdef CONFIG_OPTPROBES
447static int __kprobes setup_detour_execution(struct kprobe *p,
448 struct pt_regs *regs,
449 int reenter);
450#else
451#define setup_detour_execution(p, regs, reenter) (0)
452#endif
453
432static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, 454static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
433 struct kprobe_ctlblk *kcb) 455 struct kprobe_ctlblk *kcb, int reenter)
434{ 456{
457 if (setup_detour_execution(p, regs, reenter))
458 return;
459
435#if !defined(CONFIG_PREEMPT) 460#if !defined(CONFIG_PREEMPT)
436 if (p->ainsn.boostable == 1 && !p->post_handler) { 461 if (p->ainsn.boostable == 1 && !p->post_handler) {
437 /* Boost up -- we can execute copied instructions directly */ 462 /* Boost up -- we can execute copied instructions directly */
438 reset_current_kprobe(); 463 if (!reenter)
464 reset_current_kprobe();
465 /*
466 * Reentering boosted probe doesn't reset current_kprobe,
467 * nor set current_kprobe, because it doesn't use single
468 * stepping.
469 */
439 regs->ip = (unsigned long)p->ainsn.insn; 470 regs->ip = (unsigned long)p->ainsn.insn;
440 preempt_enable_no_resched(); 471 preempt_enable_no_resched();
441 return; 472 return;
442 } 473 }
443#endif 474#endif
444 prepare_singlestep(p, regs); 475 if (reenter) {
445 kcb->kprobe_status = KPROBE_HIT_SS; 476 save_previous_kprobe(kcb);
477 set_current_kprobe(p, regs, kcb);
478 kcb->kprobe_status = KPROBE_REENTER;
479 } else
480 kcb->kprobe_status = KPROBE_HIT_SS;
481 /* Prepare real single stepping */
482 clear_btf();
483 regs->flags |= X86_EFLAGS_TF;
484 regs->flags &= ~X86_EFLAGS_IF;
485 /* single step inline if the instruction is an int3 */
486 if (p->opcode == BREAKPOINT_INSTRUCTION)
487 regs->ip = (unsigned long)p->addr;
488 else
489 regs->ip = (unsigned long)p->ainsn.insn;
446} 490}
447 491
448/* 492/*
@@ -456,11 +500,8 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
456 switch (kcb->kprobe_status) { 500 switch (kcb->kprobe_status) {
457 case KPROBE_HIT_SSDONE: 501 case KPROBE_HIT_SSDONE:
458 case KPROBE_HIT_ACTIVE: 502 case KPROBE_HIT_ACTIVE:
459 save_previous_kprobe(kcb);
460 set_current_kprobe(p, regs, kcb);
461 kprobes_inc_nmissed_count(p); 503 kprobes_inc_nmissed_count(p);
462 prepare_singlestep(p, regs); 504 setup_singlestep(p, regs, kcb, 1);
463 kcb->kprobe_status = KPROBE_REENTER;
464 break; 505 break;
465 case KPROBE_HIT_SS: 506 case KPROBE_HIT_SS:
466 /* A probe has been hit in the codepath leading up to, or just 507 /* A probe has been hit in the codepath leading up to, or just
@@ -535,13 +576,13 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
535 * more here. 576 * more here.
536 */ 577 */
537 if (!p->pre_handler || !p->pre_handler(p, regs)) 578 if (!p->pre_handler || !p->pre_handler(p, regs))
538 setup_singlestep(p, regs, kcb); 579 setup_singlestep(p, regs, kcb, 0);
539 return 1; 580 return 1;
540 } 581 }
541 } else if (kprobe_running()) { 582 } else if (kprobe_running()) {
542 p = __get_cpu_var(current_kprobe); 583 p = __get_cpu_var(current_kprobe);
543 if (p->break_handler && p->break_handler(p, regs)) { 584 if (p->break_handler && p->break_handler(p, regs)) {
544 setup_singlestep(p, regs, kcb); 585 setup_singlestep(p, regs, kcb, 0);
545 return 1; 586 return 1;
546 } 587 }
547 } /* else: not a kprobe fault; let the kernel handle it */ 588 } /* else: not a kprobe fault; let the kernel handle it */
@@ -550,6 +591,69 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
550 return 0; 591 return 0;
551} 592}
552 593
594#ifdef CONFIG_X86_64
595#define SAVE_REGS_STRING \
596 /* Skip cs, ip, orig_ax. */ \
597 " subq $24, %rsp\n" \
598 " pushq %rdi\n" \
599 " pushq %rsi\n" \
600 " pushq %rdx\n" \
601 " pushq %rcx\n" \
602 " pushq %rax\n" \
603 " pushq %r8\n" \
604 " pushq %r9\n" \
605 " pushq %r10\n" \
606 " pushq %r11\n" \
607 " pushq %rbx\n" \
608 " pushq %rbp\n" \
609 " pushq %r12\n" \
610 " pushq %r13\n" \
611 " pushq %r14\n" \
612 " pushq %r15\n"
613#define RESTORE_REGS_STRING \
614 " popq %r15\n" \
615 " popq %r14\n" \
616 " popq %r13\n" \
617 " popq %r12\n" \
618 " popq %rbp\n" \
619 " popq %rbx\n" \
620 " popq %r11\n" \
621 " popq %r10\n" \
622 " popq %r9\n" \
623 " popq %r8\n" \
624 " popq %rax\n" \
625 " popq %rcx\n" \
626 " popq %rdx\n" \
627 " popq %rsi\n" \
628 " popq %rdi\n" \
629 /* Skip orig_ax, ip, cs */ \
630 " addq $24, %rsp\n"
631#else
632#define SAVE_REGS_STRING \
633 /* Skip cs, ip, orig_ax and gs. */ \
634 " subl $16, %esp\n" \
635 " pushl %fs\n" \
636 " pushl %ds\n" \
637 " pushl %es\n" \
638 " pushl %eax\n" \
639 " pushl %ebp\n" \
640 " pushl %edi\n" \
641 " pushl %esi\n" \
642 " pushl %edx\n" \
643 " pushl %ecx\n" \
644 " pushl %ebx\n"
645#define RESTORE_REGS_STRING \
646 " popl %ebx\n" \
647 " popl %ecx\n" \
648 " popl %edx\n" \
649 " popl %esi\n" \
650 " popl %edi\n" \
651 " popl %ebp\n" \
652 " popl %eax\n" \
653 /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\
654 " addl $24, %esp\n"
655#endif
656
553/* 657/*
554 * When a retprobed function returns, this code saves registers and 658 * When a retprobed function returns, this code saves registers and
555 * calls trampoline_handler(), which in turn calls the kretprobe's handler. 659
@@ -563,65 +667,16 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
563 /* We don't bother saving the ss register */ 667 /* We don't bother saving the ss register */
564 " pushq %rsp\n" 668 " pushq %rsp\n"
565 " pushfq\n" 669 " pushfq\n"
566 /* 670 SAVE_REGS_STRING
567 * Skip cs, ip, orig_ax.
568 * trampoline_handler() will plug in these values
569 */
570 " subq $24, %rsp\n"
571 " pushq %rdi\n"
572 " pushq %rsi\n"
573 " pushq %rdx\n"
574 " pushq %rcx\n"
575 " pushq %rax\n"
576 " pushq %r8\n"
577 " pushq %r9\n"
578 " pushq %r10\n"
579 " pushq %r11\n"
580 " pushq %rbx\n"
581 " pushq %rbp\n"
582 " pushq %r12\n"
583 " pushq %r13\n"
584 " pushq %r14\n"
585 " pushq %r15\n"
586 " movq %rsp, %rdi\n" 671 " movq %rsp, %rdi\n"
587 " call trampoline_handler\n" 672 " call trampoline_handler\n"
588 /* Replace saved sp with true return address. */ 673 /* Replace saved sp with true return address. */
589 " movq %rax, 152(%rsp)\n" 674 " movq %rax, 152(%rsp)\n"
590 " popq %r15\n" 675 RESTORE_REGS_STRING
591 " popq %r14\n"
592 " popq %r13\n"
593 " popq %r12\n"
594 " popq %rbp\n"
595 " popq %rbx\n"
596 " popq %r11\n"
597 " popq %r10\n"
598 " popq %r9\n"
599 " popq %r8\n"
600 " popq %rax\n"
601 " popq %rcx\n"
602 " popq %rdx\n"
603 " popq %rsi\n"
604 " popq %rdi\n"
605 /* Skip orig_ax, ip, cs */
606 " addq $24, %rsp\n"
607 " popfq\n" 676 " popfq\n"
608#else 677#else
609 " pushf\n" 678 " pushf\n"
610 /* 679 SAVE_REGS_STRING
611 * Skip cs, ip, orig_ax and gs.
612 * trampoline_handler() will plug in these values
613 */
614 " subl $16, %esp\n"
615 " pushl %fs\n"
616 " pushl %es\n"
617 " pushl %ds\n"
618 " pushl %eax\n"
619 " pushl %ebp\n"
620 " pushl %edi\n"
621 " pushl %esi\n"
622 " pushl %edx\n"
623 " pushl %ecx\n"
624 " pushl %ebx\n"
625 " movl %esp, %eax\n" 680 " movl %esp, %eax\n"
626 " call trampoline_handler\n" 681 " call trampoline_handler\n"
627 /* Move flags to cs */ 682 /* Move flags to cs */
@@ -629,15 +684,7 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
629 " movl %edx, 52(%esp)\n" 684 " movl %edx, 52(%esp)\n"
630 /* Replace saved flags with true return address. */ 685 /* Replace saved flags with true return address. */
631 " movl %eax, 56(%esp)\n" 686 " movl %eax, 56(%esp)\n"
632 " popl %ebx\n" 687 RESTORE_REGS_STRING
633 " popl %ecx\n"
634 " popl %edx\n"
635 " popl %esi\n"
636 " popl %edi\n"
637 " popl %ebp\n"
638 " popl %eax\n"
639 /* Skip ds, es, fs, gs, orig_ax and ip */
640 " addl $24, %esp\n"
641 " popf\n" 688 " popf\n"
642#endif 689#endif
643 " ret\n"); 690 " ret\n");
@@ -805,8 +852,8 @@ static void __kprobes resume_execution(struct kprobe *p,
805 * These instructions can be executed directly if it 852 * These instructions can be executed directly if it
806 * jumps back to correct address. 853 * jumps back to correct address.
807 */ 854 */
808 set_jmp_op((void *)regs->ip, 855 synthesize_reljump((void *)regs->ip,
809 (void *)orig_ip + (regs->ip - copy_ip)); 856 (void *)orig_ip + (regs->ip - copy_ip));
810 p->ainsn.boostable = 1; 857 p->ainsn.boostable = 1;
811 } else { 858 } else {
812 p->ainsn.boostable = -1; 859 p->ainsn.boostable = -1;
@@ -1033,6 +1080,358 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
1033 return 0; 1080 return 0;
1034} 1081}
1035 1082
1083
1084#ifdef CONFIG_OPTPROBES
1085
1086/* Insert a call instruction at address 'from', which calls address 'to'.*/
1087static void __kprobes synthesize_relcall(void *from, void *to)
1088{
1089 __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
1090}
1091
1092/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
1093static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
1094 unsigned long val)
1095{
1096#ifdef CONFIG_X86_64
1097 *addr++ = 0x48;
1098 *addr++ = 0xbf;
1099#else
1100 *addr++ = 0xb8;
1101#endif
1102 *(unsigned long *)addr = val;
1103}
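synthesize_set_arg1 hand-assembles "load this constant into the first-argument register": 0x48 0xbf is REX.W plus MOV imm64 into %rdi, while the 32-bit 0xb8 form is MOV imm32 into %eax, the first argument under the kernel's regparm(3) ABI. A sketch of the 64-bit encoding (buffer only, nothing executed):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Emit "mov $val, %rdi"; 0x48 0xbf is REX.W + MOV imm64 into register 7. */
static size_t emit_set_arg1_64(uint8_t *buf, uint64_t val)
{
	buf[0] = 0x48;
	buf[1] = 0xbf;
	memcpy(buf + 2, &val, sizeof(val));
	return 2 + sizeof(val);
}

int main(void)
{
	uint8_t buf[10];
	size_t n = emit_set_arg1_64(buf, 0xdeadbeefcafef00dULL);

	for (size_t i = 0; i < n; i++)
		printf("%02x ", buf[i]);
	printf("\n");
	return 0;
}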
1104
1105void __kprobes kprobes_optinsn_template_holder(void)
1106{
1107 asm volatile (
1108 ".global optprobe_template_entry\n"
1109 "optprobe_template_entry: \n"
1110#ifdef CONFIG_X86_64
1111 /* We don't bother saving the ss register */
1112 " pushq %rsp\n"
1113 " pushfq\n"
1114 SAVE_REGS_STRING
1115 " movq %rsp, %rsi\n"
1116 ".global optprobe_template_val\n"
1117 "optprobe_template_val: \n"
1118 ASM_NOP5
1119 ASM_NOP5
1120 ".global optprobe_template_call\n"
1121 "optprobe_template_call: \n"
1122 ASM_NOP5
1123 /* Move flags to rsp */
1124 " movq 144(%rsp), %rdx\n"
1125 " movq %rdx, 152(%rsp)\n"
1126 RESTORE_REGS_STRING
1127 /* Skip flags entry */
1128 " addq $8, %rsp\n"
1129 " popfq\n"
1130#else /* CONFIG_X86_32 */
1131 " pushf\n"
1132 SAVE_REGS_STRING
1133 " movl %esp, %edx\n"
1134 ".global optprobe_template_val\n"
1135 "optprobe_template_val: \n"
1136 ASM_NOP5
1137 ".global optprobe_template_call\n"
1138 "optprobe_template_call: \n"
1139 ASM_NOP5
1140 RESTORE_REGS_STRING
1141 " addl $4, %esp\n" /* skip cs */
1142 " popf\n"
1143#endif
1144 ".global optprobe_template_end\n"
1145 "optprobe_template_end: \n");
1146}
1147
1148#define TMPL_MOVE_IDX \
1149 ((long)&optprobe_template_val - (long)&optprobe_template_entry)
1150#define TMPL_CALL_IDX \
1151 ((long)&optprobe_template_call - (long)&optprobe_template_entry)
1152#define TMPL_END_IDX \
1153 ((long)&optprobe_template_end - (long)&optprobe_template_entry)
1154
1155#define INT3_SIZE sizeof(kprobe_opcode_t)
1156
1157/* Optimized kprobe callback function: called from the optimized code */
1158static void __kprobes optimized_callback(struct optimized_kprobe *op,
1159 struct pt_regs *regs)
1160{
1161 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1162
1163 preempt_disable();
1164 if (kprobe_running()) {
1165 kprobes_inc_nmissed_count(&op->kp);
1166 } else {
1167 /* Save skipped registers */
1168#ifdef CONFIG_X86_64
1169 regs->cs = __KERNEL_CS;
1170#else
1171 regs->cs = __KERNEL_CS | get_kernel_rpl();
1172 regs->gs = 0;
1173#endif
1174 regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
1175 regs->orig_ax = ~0UL;
1176
1177 __get_cpu_var(current_kprobe) = &op->kp;
1178 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
1179 opt_pre_handler(&op->kp, regs);
1180 __get_cpu_var(current_kprobe) = NULL;
1181 }
1182 preempt_enable_no_resched();
1183}
1184
1185static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
1186{
1187 int len = 0, ret;
1188
1189 while (len < RELATIVEJUMP_SIZE) {
1190 ret = __copy_instruction(dest + len, src + len, 1);
1191 if (!ret || !can_boost(dest + len))
1192 return -EINVAL;
1193 len += ret;
1194 }
1195 /* Check whether the address range is reserved */
1196 if (ftrace_text_reserved(src, src + len - 1) ||
1197 alternatives_text_reserved(src, src + len - 1))
1198 return -EBUSY;
1199
1200 return len;
1201}
1202
1203/* Check whether insn is an indirect jump */
1204static int __kprobes insn_is_indirect_jump(struct insn *insn)
1205{
1206 return ((insn->opcode.bytes[0] == 0xff &&
1207 (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
1208 insn->opcode.bytes[0] == 0xea); /* Segment based jump */
1209}
1210
1211/* Check whether insn jumps into the specified address range */
1212static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
1213{
1214 unsigned long target = 0;
1215
1216 switch (insn->opcode.bytes[0]) {
1217 case 0xe0: /* loopne */
1218 case 0xe1: /* loope */
1219 case 0xe2: /* loop */
1220 case 0xe3: /* jcxz */
1221 case 0xe9: /* near relative jump */
1222 case 0xeb: /* short relative jump */
1223 break;
1224 case 0x0f:
1225 if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
1226 break;
1227 return 0;
1228 default:
1229 if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
1230 break;
1231 return 0;
1232 }
1233 target = (unsigned long)insn->next_byte + insn->immediate.value;
1234
1235 return (start <= target && target <= start + len);
1236}
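For every opcode the switch accepts, the branch target is the address of the next instruction plus the sign-extended immediate, and optimization is later refused if that target lands inside the region being displaced. A standalone sketch decoding a short conditional jump by hand (addresses are made up):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* "je +0x10": 0x74 is a short conditional jump with a rel8 immediate. */
	uint8_t insn[] = { 0x74, 0x10 };
	uintptr_t addr = 0x1000;		/* where the insn lives */
	uintptr_t next = addr + sizeof(insn);	/* insn->next_byte */
	intptr_t target = (intptr_t)next + (int8_t)insn[1];

	/* The optimizer refuses if target lands in [start, start + len]. */
	uintptr_t start = 0x1001, len = 4;
	printf("target=0x%lx inside=%d\n", (unsigned long)target,
	       start <= (uintptr_t)target && (uintptr_t)target <= start + len);
	return 0;
}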
1237
1238/* Decode the whole function to ensure no instruction jumps into the target */
1239static int __kprobes can_optimize(unsigned long paddr)
1240{
1241 int ret;
1242 unsigned long addr, size = 0, offset = 0;
1243 struct insn insn;
1244 kprobe_opcode_t buf[MAX_INSN_SIZE];
1245 /* Dummy buffer for kallsyms_lookup */
1246 static char __dummy_buf[KSYM_NAME_LEN];
1247
1248 /* Lookup symbol including addr */
1249 if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf))
1250 return 0;
1251
1252 /* Check there is enough space for a relative jump. */
1253 if (size - offset < RELATIVEJUMP_SIZE)
1254 return 0;
1255
1256 /* Decode instructions */
1257 addr = paddr - offset;
1258 while (addr < paddr - offset + size) { /* Decode until function end */
1259 if (search_exception_tables(addr))
1260 /*
1261 * Since some fixup code jumps into this function,
1262 * we can't optimize a kprobe in this function.
1263 */
1264 return 0;
1265 kernel_insn_init(&insn, (void *)addr);
1266 insn_get_opcode(&insn);
1267 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
1268 ret = recover_probed_instruction(buf, addr);
1269 if (ret)
1270 return 0;
1271 kernel_insn_init(&insn, buf);
1272 }
1273 insn_get_length(&insn);
1274 /* Recover address */
1275 insn.kaddr = (void *)addr;
1276 insn.next_byte = (void *)(addr + insn.length);
1277 /* Check that this instruction doesn't jump into the target */
1278 if (insn_is_indirect_jump(&insn) ||
1279 insn_jump_into_range(&insn, paddr + INT3_SIZE,
1280 RELATIVE_ADDR_SIZE))
1281 return 0;
1282 addr += insn.length;
1283 }
1284
1285 return 1;
1286}
1287
1288/* Check optimized_kprobe can actually be optimized. */
1289int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
1290{
1291 int i;
1292 struct kprobe *p;
1293
1294 for (i = 1; i < op->optinsn.size; i++) {
1295 p = get_kprobe(op->kp.addr + i);
1296 if (p && !kprobe_disabled(p))
1297 return -EEXIST;
1298 }
1299
1300 return 0;
1301}
1302
1303/* Check the addr is within the optimized instructions. */
1304int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op,
1305 unsigned long addr)
1306{
1307 return ((unsigned long)op->kp.addr <= addr &&
1308 (unsigned long)op->kp.addr + op->optinsn.size > addr);
1309}
1310
1311/* Free optimized instruction slot */
1312static __kprobes
1313void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
1314{
1315 if (op->optinsn.insn) {
1316 free_optinsn_slot(op->optinsn.insn, dirty);
1317 op->optinsn.insn = NULL;
1318 op->optinsn.size = 0;
1319 }
1320}
1321
1322void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
1323{
1324 __arch_remove_optimized_kprobe(op, 1);
1325}
1326
1327/*
1328 * Copy the target instructions that the jump will replace.
1329 * Target instructions MUST be relocatable (checked inside).
1330 */
1331int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
1332{
1333 u8 *buf;
1334 int ret;
1335 long rel;
1336
1337 if (!can_optimize((unsigned long)op->kp.addr))
1338 return -EILSEQ;
1339
1340 op->optinsn.insn = get_optinsn_slot();
1341 if (!op->optinsn.insn)
1342 return -ENOMEM;
1343
1344 /*
1345 * Verify that the address gap is within the +/-2GB range reachable
1346 * by a relative jump.
1347 */
1348 rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
1349 if (abs(rel) > 0x7fffffff)
1350 return -ERANGE;
1351
1352 buf = (u8 *)op->optinsn.insn;
1353
1354 /* Copy instructions into the out-of-line buffer */
1355 ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
1356 if (ret < 0) {
1357 __arch_remove_optimized_kprobe(op, 0);
1358 return ret;
1359 }
1360 op->optinsn.size = ret;
1361
1362 /* Copy arch-dep-instance from template */
1363 memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
1364
1365 /* Set probe information */
1366 synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
1367
1368 /* Set probe function call */
1369 synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
1370
1371 /* Set returning jmp instruction at the tail of out-of-line buffer */
1372 synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
1373 (u8 *)op->kp.addr + op->optinsn.size);
1374
1375 flush_icache_range((unsigned long) buf,
1376 (unsigned long) buf + TMPL_END_IDX +
1377 op->optinsn.size + RELATIVEJUMP_SIZE);
1378 return 0;
1379}
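The buffer assembled here has three parts laid end to end: the register save/call/restore template, the relocated original instructions, and a jump back to the first byte after the displaced region. A sketch of the layout arithmetic with illustrative sizes (the real TMPL_END_IDX is computed from the template symbols, and the copied length comes from copy_optimized_instructions()):

#include <stdio.h>

int main(void)
{
	int tmpl_end = 96;	/* [0, tmpl_end)        : save/call/restore template */
	int copied   = 7;	/* [tmpl_end, +copied)  : relocated original insns    */
	int reljump  = 5;	/* trailing jmp back to kp.addr + copied              */

	printf("template: [0,%d) copied: [%d,%d) jmp-back at %d, total %d bytes\n",
	       tmpl_end, tmpl_end, tmpl_end + copied, tmpl_end + copied,
	       tmpl_end + copied + reljump);
	return 0;
}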
1380
1381/* Replace a breakpoint (int3) with a relative jump. */
1382int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op)
1383{
1384 unsigned char jmp_code[RELATIVEJUMP_SIZE];
1385 s32 rel = (s32)((long)op->optinsn.insn -
1386 ((long)op->kp.addr + RELATIVEJUMP_SIZE));
1387
1388 /* Backup instructions which will be replaced by jump address */
1389 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
1390 RELATIVE_ADDR_SIZE);
1391
1392 jmp_code[0] = RELATIVEJUMP_OPCODE;
1393 *(s32 *)(&jmp_code[1]) = rel;
1394
1395 /*
1396 * text_poke_smp doesn't support NMI/MCE code modifying.
1397 * However, since kprobes itself also doesn't support NMI/MCE
1398 * code probing, it's not a problem.
1399 */
1400 text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE);
1401 return 0;
1402}
1403
1404/* Replace a relative jump with a breakpoint (int3). */
1405void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
1406{
1407 u8 buf[RELATIVEJUMP_SIZE];
1408
1409 /* Set int3 to first byte for kprobes */
1410 buf[0] = BREAKPOINT_INSTRUCTION;
1411 memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
1412 text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE);
1413}
1414
1415static int __kprobes setup_detour_execution(struct kprobe *p,
1416 struct pt_regs *regs,
1417 int reenter)
1418{
1419 struct optimized_kprobe *op;
1420
1421 if (p->flags & KPROBE_FLAG_OPTIMIZED) {
1422 /* This kprobe can really run the optimized path. */
1423 op = container_of(p, struct optimized_kprobe, kp);
1424 /* Detour through copied instructions */
1425 regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
1426 if (!reenter)
1427 reset_current_kprobe();
1428 preempt_enable_no_resched();
1429 return 1;
1430 }
1431 return 0;
1432}
1433#endif
1434
1036int __init arch_init_kprobes(void) 1435int __init arch_init_kprobes(void)
1037{ 1436{
1038 return 0; 1437 return 0;
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index ec6ef60cbd1..ea697263b37 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/errno.h> 9#include <linux/errno.h>
10#include <linux/gfp.h>
10#include <linux/sched.h> 11#include <linux/sched.h>
11#include <linux/string.h> 12#include <linux/string.h>
12#include <linux/mm.h> 13#include <linux/mm.h>
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 4a8bb82248a..035c8c52918 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -9,6 +9,7 @@
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/kexec.h> 10#include <linux/kexec.h>
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/gfp.h>
12#include <linux/reboot.h> 13#include <linux/reboot.h>
13#include <linux/numa.h> 14#include <linux/numa.h>
14#include <linux/ftrace.h> 15#include <linux/ftrace.h>
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
index 845d80ce1ef..63eaf659623 100644
--- a/arch/x86/kernel/mca_32.c
+++ b/arch/x86/kernel/mca_32.c
@@ -42,6 +42,7 @@
42#include <linux/kernel.h> 42#include <linux/kernel.h>
43#include <linux/mca.h> 43#include <linux/mca.h>
44#include <linux/kprobes.h> 44#include <linux/kprobes.h>
45#include <linux/slab.h>
45#include <asm/system.h> 46#include <asm/system.h>
46#include <asm/io.h> 47#include <asm/io.h>
47#include <linux/proc_fs.h> 48#include <linux/proc_fs.h>
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index 712d15fdc41..71825806cd4 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -7,6 +7,8 @@
7#include <linux/string.h> 7#include <linux/string.h>
8#include <linux/pci.h> 8#include <linux/pci.h>
9#include <linux/dmi.h> 9#include <linux/dmi.h>
10#include <linux/range.h>
11
10#include <asm/pci-direct.h> 12#include <asm/pci-direct.h>
11#include <linux/sort.h> 13#include <linux/sort.h>
12#include <asm/io.h> 14#include <asm/io.h>
@@ -30,11 +32,6 @@ static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = {
30 { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 }, 32 { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 },
31}; 33};
32 34
33struct range {
34 u64 start;
35 u64 end;
36};
37
38static int __cpuinit cmp_range(const void *x1, const void *x2) 35static int __cpuinit cmp_range(const void *x1, const void *x2)
39{ 36{
40 const struct range *r1 = x1; 37 const struct range *r1 = x1;
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 89f386f044e..e0bc186d750 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -23,6 +23,7 @@
23#include <linux/kernel.h> 23#include <linux/kernel.h>
24#include <linux/bug.h> 24#include <linux/bug.h>
25#include <linux/mm.h> 25#include <linux/mm.h>
26#include <linux/gfp.h>
26 27
27#include <asm/system.h> 28#include <asm/system.h>
28#include <asm/page.h> 29#include <asm/page.h>
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index a2c1edd2d3a..e81030f71a8 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -664,7 +664,7 @@ static void __init smp_reserve_memory(struct mpf_intel *mpf)
664{ 664{
665 unsigned long size = get_mpc_size(mpf->physptr); 665 unsigned long size = get_mpc_size(mpf->physptr);
666 666
667 reserve_early(mpf->physptr, mpf->physptr+size, "MP-table mpc"); 667 reserve_early_overlap_ok(mpf->physptr, mpf->physptr+size, "MP-table mpc");
668} 668}
669 669
670static int __init smp_scan_config(unsigned long base, unsigned long length) 670static int __init smp_scan_config(unsigned long base, unsigned long length)
@@ -693,7 +693,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
693 mpf, (u64)virt_to_phys(mpf)); 693 mpf, (u64)virt_to_phys(mpf));
694 694
695 mem = virt_to_phys(mpf); 695 mem = virt_to_phys(mpf);
696 reserve_early(mem, mem + sizeof(*mpf), "MP-table mpf"); 696 reserve_early_overlap_ok(mem, mem + sizeof(*mpf), "MP-table mpf");
697 if (mpf->physptr) 697 if (mpf->physptr)
698 smp_reserve_memory(mpf); 698 smp_reserve_memory(mpf);
699 699
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
index 3b7078abc87..0aad8670858 100644
--- a/arch/x86/kernel/mrst.c
+++ b/arch/x86/kernel/mrst.c
@@ -10,8 +10,211 @@
10 * of the License. 10 * of the License.
11 */ 11 */
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/kernel.h>
14#include <linux/sfi.h>
15#include <linux/irq.h>
16#include <linux/module.h>
13 17
14#include <asm/setup.h> 18#include <asm/setup.h>
19#include <asm/mpspec_def.h>
20#include <asm/hw_irq.h>
21#include <asm/apic.h>
22#include <asm/io_apic.h>
23#include <asm/mrst.h>
24#include <asm/io.h>
25#include <asm/i8259.h>
26#include <asm/apb_timer.h>
27
28static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM];
29static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM];
30int sfi_mtimer_num;
31
32struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX];
33EXPORT_SYMBOL_GPL(sfi_mrtc_array);
34int sfi_mrtc_num;
35
36static inline void assign_to_mp_irq(struct mpc_intsrc *m,
37 struct mpc_intsrc *mp_irq)
38{
39 memcpy(mp_irq, m, sizeof(struct mpc_intsrc));
40}
41
42static inline int mp_irq_cmp(struct mpc_intsrc *mp_irq,
43 struct mpc_intsrc *m)
44{
45 return memcmp(mp_irq, m, sizeof(struct mpc_intsrc));
46}
47
48static void save_mp_irq(struct mpc_intsrc *m)
49{
50 int i;
51
52 for (i = 0; i < mp_irq_entries; i++) {
53 if (!mp_irq_cmp(&mp_irqs[i], m))
54 return;
55 }
56
57 assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
58 if (++mp_irq_entries == MAX_IRQ_SOURCES)
59 panic("Max # of irq sources exceeded!!\n");
60}
61
62/* parse all the mtimer info to a static mtimer array */
63static int __init sfi_parse_mtmr(struct sfi_table_header *table)
64{
65 struct sfi_table_simple *sb;
66 struct sfi_timer_table_entry *pentry;
67 struct mpc_intsrc mp_irq;
68 int totallen;
69
70 sb = (struct sfi_table_simple *)table;
71 if (!sfi_mtimer_num) {
72 sfi_mtimer_num = SFI_GET_NUM_ENTRIES(sb,
73 struct sfi_timer_table_entry);
74 pentry = (struct sfi_timer_table_entry *) sb->pentry;
75 totallen = sfi_mtimer_num * sizeof(*pentry);
76 memcpy(sfi_mtimer_array, pentry, totallen);
77 }
78
79 printk(KERN_INFO "SFI: MTIMER info (num = %d):\n", sfi_mtimer_num);
80 pentry = sfi_mtimer_array;
81 for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) {
82 printk(KERN_INFO "timer[%d]: paddr = 0x%08x, freq = %dHz,"
83 " irq = %d\n", totallen, (u32)pentry->phys_addr,
84 pentry->freq_hz, pentry->irq);
85 if (!pentry->irq)
86 continue;
87 mp_irq.type = MP_IOAPIC;
88 mp_irq.irqtype = mp_INT;
89/* trigger mode: edge (bits 2-3); polarity: active high (bits 0-1) */
90 mp_irq.irqflag = 5;
91 mp_irq.srcbus = 0;
92 mp_irq.srcbusirq = pentry->irq; /* IRQ */
93 mp_irq.dstapic = MP_APIC_ALL;
94 mp_irq.dstirq = pentry->irq;
95 save_mp_irq(&mp_irq);
96 }
97
98 return 0;
99}
100
101struct sfi_timer_table_entry *sfi_get_mtmr(int hint)
102{
103 int i;
104 if (hint < sfi_mtimer_num) {
105 if (!sfi_mtimer_usage[hint]) {
106 pr_debug("hint taken for timer %d irq %d\n",\
107 hint, sfi_mtimer_array[hint].irq);
108 sfi_mtimer_usage[hint] = 1;
109 return &sfi_mtimer_array[hint];
110 }
111 }
112 /* take the first timer available */
113 for (i = 0; i < sfi_mtimer_num;) {
114 if (!sfi_mtimer_usage[i]) {
115 sfi_mtimer_usage[i] = 1;
116 return &sfi_mtimer_array[i];
117 }
118 i++;
119 }
120 return NULL;
121}
122
123void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr)
124{
125 int i;
126 for (i = 0; i < sfi_mtimer_num;) {
127 if (mtmr->irq == sfi_mtimer_array[i].irq) {
128 sfi_mtimer_usage[i] = 0;
129 return;
130 }
131 i++;
132 }
133}
134
135/* parse all the mrtc info to a global mrtc array */
136int __init sfi_parse_mrtc(struct sfi_table_header *table)
137{
138 struct sfi_table_simple *sb;
139 struct sfi_rtc_table_entry *pentry;
140 struct mpc_intsrc mp_irq;
141
142 int totallen;
143
144 sb = (struct sfi_table_simple *)table;
145 if (!sfi_mrtc_num) {
146 sfi_mrtc_num = SFI_GET_NUM_ENTRIES(sb,
147 struct sfi_rtc_table_entry);
148 pentry = (struct sfi_rtc_table_entry *)sb->pentry;
149 totallen = sfi_mrtc_num * sizeof(*pentry);
150 memcpy(sfi_mrtc_array, pentry, totallen);
151 }
152
153 printk(KERN_INFO "SFI: RTC info (num = %d):\n", sfi_mrtc_num);
154 pentry = sfi_mrtc_array;
155 for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) {
156 printk(KERN_INFO "RTC[%d]: paddr = 0x%08x, irq = %d\n",
157 totallen, (u32)pentry->phys_addr, pentry->irq);
158 mp_irq.type = MP_IOAPIC;
159 mp_irq.irqtype = mp_INT;
160 mp_irq.irqflag = 0;
161 mp_irq.srcbus = 0;
162 mp_irq.srcbusirq = pentry->irq; /* IRQ */
163 mp_irq.dstapic = MP_APIC_ALL;
164 mp_irq.dstirq = pentry->irq;
165 save_mp_irq(&mp_irq);
166 }
167 return 0;
168}
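Both parsers lean on the SFI "simple table" shape: a common header followed by a packed array of fixed-size entries, so the entry count is just (table length - header size) / entry size, which is what SFI_GET_NUM_ENTRIES computes. A sketch with illustrative sizes; the 24-byte header is my reading of the SFI spec, not a value from the patch:

#include <stdio.h>

int main(void)
{
	unsigned int hdr = 24, entry = 16;	/* illustrative sizes */
	unsigned int table_len = hdr + 3 * entry;

	/* Essentially SFI_GET_NUM_ENTRIES(sb, entry_type). */
	printf("entries: %u\n", (table_len - hdr) / entry);
	return 0;
}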
169
170/*
171 * The secondary clock in Moorestown can be the APBT or the LAPIC clock; it
172 * defaults to the APBT, but a cmdline option can override that.
173 */
174static void __cpuinit mrst_setup_secondary_clock(void)
175{
176 /* restore default lapic clock if disabled by cmdline */
177 if (disable_apbt_percpu)
178 return setup_secondary_APIC_clock();
179 apbt_setup_secondary_clock();
180}
181
182static unsigned long __init mrst_calibrate_tsc(void)
183{
184 unsigned long flags, fast_calibrate;
185
186 local_irq_save(flags);
187 fast_calibrate = apbt_quick_calibrate();
188 local_irq_restore(flags);
189
190 if (fast_calibrate)
191 return fast_calibrate;
192
193 return 0;
194}
195
196void __init mrst_time_init(void)
197{
198 sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
199 pre_init_apic_IRQ0();
200 apbt_time_init();
201}
202
203void __init mrst_rtc_init(void)
204{
205 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
206}
207
208/*
209 * If we use the per-cpu APB timer, the boot clock is already set up. If we use
210 * the LAPIC timer and one APBT timer for broadcast, set up the LAPIC boot clock.
211 */
212static void __init mrst_setup_boot_clock(void)
213{
214 pr_info("%s: per cpu apbt flag %d\n", __func__, disable_apbt_percpu);
215 if (disable_apbt_percpu)
216 setup_boot_APIC_clock();
217};
15 218
16/* 219/*
17 * Moorestown specific x86_init function overrides and early setup 220 * Moorestown specific x86_init function overrides and early setup
@@ -21,4 +224,17 @@ void __init x86_mrst_early_setup(void)
21{ 224{
22 x86_init.resources.probe_roms = x86_init_noop; 225 x86_init.resources.probe_roms = x86_init_noop;
23 x86_init.resources.reserve_resources = x86_init_noop; 226 x86_init.resources.reserve_resources = x86_init_noop;
227
228 x86_init.timers.timer_init = mrst_time_init;
229 x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock;
230
231 x86_init.irqs.pre_vector_init = x86_init_noop;
232
233 x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock;
234
235 x86_platform.calibrate_tsc = mrst_calibrate_tsc;
236 x86_init.pci.init = pci_mrst_init;
237 x86_init.pci.fixup_irqs = x86_init_noop;
238
239 legacy_pic = &null_legacy_pic;
24} 240}
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 206735ac8cb..4d4468e9f47 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -37,6 +37,7 @@
37#include <linux/cpu.h> 37#include <linux/cpu.h>
38#include <linux/notifier.h> 38#include <linux/notifier.h>
39#include <linux/uaccess.h> 39#include <linux/uaccess.h>
40#include <linux/gfp.h>
40 41
41#include <asm/processor.h> 42#include <asm/processor.h>
42#include <asm/msr.h> 43#include <asm/msr.h>
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 9d1d263f786..8297160c41b 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -17,7 +17,9 @@
17#include <linux/spinlock.h> 17#include <linux/spinlock.h>
18#include <linux/io.h> 18#include <linux/io.h>
19#include <linux/string.h> 19#include <linux/string.h>
20
20#include <asm/geode.h> 21#include <asm/geode.h>
22#include <asm/setup.h>
21#include <asm/olpc.h> 23#include <asm/olpc.h>
22 24
23#ifdef CONFIG_OPEN_FIRMWARE 25#ifdef CONFIG_OPEN_FIRMWARE
@@ -243,9 +245,11 @@ static int __init olpc_init(void)
243 olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0, 245 olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0,
244 (unsigned char *) &olpc_platform_info.ecver, 1); 246 (unsigned char *) &olpc_platform_info.ecver, 1);
245 247
246 /* check to see if the VSA exists */ 248#ifdef CONFIG_PCI_OLPC
247 if (cs5535_has_vsa2()) 249 /* If the VSA exists let it emulate PCI, if not emulate in kernel */
248 olpc_platform_info.flags |= OLPC_F_VSA; 250 if (!cs5535_has_vsa2())
251 x86_init.pci.arch_init = pci_olpc_init;
252#endif
249 253
250 printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n", 254 printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n",
251 ((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "", 255 ((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "",
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1b1739d1631..1db183ed7c0 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -428,10 +428,6 @@ struct pv_mmu_ops pv_mmu_ops = {
428 .ptep_modify_prot_start = __ptep_modify_prot_start, 428 .ptep_modify_prot_start = __ptep_modify_prot_start,
429 .ptep_modify_prot_commit = __ptep_modify_prot_commit, 429 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
430 430
431#ifdef CONFIG_HIGHPTE
432 .kmap_atomic_pte = kmap_atomic,
433#endif
434
435#if PAGETABLE_LEVELS >= 3 431#if PAGETABLE_LEVELS >= 3
436#ifdef CONFIG_X86_PAE 432#ifdef CONFIG_X86_PAE
437 .set_pte_atomic = native_set_pte_atomic, 433 .set_pte_atomic = native_set_pte_atomic,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 2bbde607814..fb99f7edb34 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -1309,7 +1309,7 @@ static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
1309/* 1309/*
1310 * get_tce_space_from_tar(): 1310 * get_tce_space_from_tar():
1311 * Function for kdump case. Get the tce tables from first kernel 1311 * Function for kdump case. Get the tce tables from first kernel
1312 * by reading the contents of the base adress register of calgary iommu 1312 * by reading the contents of the base address register of calgary iommu
1313 */ 1313 */
1314static void __init get_tce_space_from_tar(void) 1314static void __init get_tce_space_from_tar(void)
1315{ 1315{
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 75e14e21f61..4b7e3d8b01d 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -2,6 +2,7 @@
2#include <linux/dma-debug.h> 2#include <linux/dma-debug.h>
3#include <linux/dmar.h> 3#include <linux/dmar.h>
4#include <linux/bootmem.h> 4#include <linux/bootmem.h>
5#include <linux/gfp.h>
5#include <linux/pci.h> 6#include <linux/pci.h>
6#include <linux/kmemleak.h> 7#include <linux/kmemleak.h>
7 8
@@ -38,7 +39,7 @@ int iommu_detected __read_mostly = 0;
38 * This variable becomes 1 if iommu=pt is passed on the kernel command line. 39 * This variable becomes 1 if iommu=pt is passed on the kernel command line.
39 * If this variable is 1, IOMMU implementations do no DMA translation for 40 * If this variable is 1, IOMMU implementations do no DMA translation for
40 * devices and allow every device to access to whole physical memory. This is 41 * devices and allow every device to access to whole physical memory. This is
41 * useful if a user want to use an IOMMU only for KVM device assignment to 42 * useful if a user wants to use an IOMMU only for KVM device assignment to
42 * guests and not for driver dma translation. 43 * guests and not for driver dma translation.
43 */ 44 */
44int iommu_pass_through __read_mostly; 45int iommu_pass_through __read_mostly;
@@ -65,7 +66,7 @@ int dma_set_mask(struct device *dev, u64 mask)
65} 66}
66EXPORT_SYMBOL(dma_set_mask); 67EXPORT_SYMBOL(dma_set_mask);
67 68
68#ifdef CONFIG_X86_64 69#if defined(CONFIG_X86_64) && !defined(CONFIG_NUMA)
69static __initdata void *dma32_bootmem_ptr; 70static __initdata void *dma32_bootmem_ptr;
70static unsigned long dma32_bootmem_size __initdata = (128ULL<<20); 71static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
71 72
@@ -116,14 +117,21 @@ static void __init dma32_free_bootmem(void)
116 dma32_bootmem_ptr = NULL; 117 dma32_bootmem_ptr = NULL;
117 dma32_bootmem_size = 0; 118 dma32_bootmem_size = 0;
118} 119}
120#else
121void __init dma32_reserve_bootmem(void)
122{
123}
124static void __init dma32_free_bootmem(void)
125{
126}
127
119#endif 128#endif
120 129
121void __init pci_iommu_alloc(void) 130void __init pci_iommu_alloc(void)
122{ 131{
123#ifdef CONFIG_X86_64
124 /* free the range so iommu could get some range less than 4G */ 132 /* free the range so iommu could get some range less than 4G */
125 dma32_free_bootmem(); 133 dma32_free_bootmem();
126#endif 134
127 if (pci_swiotlb_detect()) 135 if (pci_swiotlb_detect())
128 goto out; 136 goto out;
129 137
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 34de53b46f8..0f7f130caa6 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -29,6 +29,7 @@
29#include <linux/iommu-helper.h> 29#include <linux/iommu-helper.h>
30#include <linux/sysdev.h> 30#include <linux/sysdev.h>
31#include <linux/io.h> 31#include <linux/io.h>
32#include <linux/gfp.h>
32#include <asm/atomic.h> 33#include <asm/atomic.h>
33#include <asm/mtrr.h> 34#include <asm/mtrr.h>
34#include <asm/pgtable.h> 35#include <asm/pgtable.h>
@@ -564,6 +565,9 @@ static void enable_gart_translations(void)
564 565
565 enable_gart_translation(dev, __pa(agp_gatt_table)); 566 enable_gart_translation(dev, __pa(agp_gatt_table));
566 } 567 }
568
569 /* Flush the GART-TLB to remove stale entries */
570 k8_flush_garts();
567} 571}
568 572
569/* 573/*
@@ -735,7 +739,7 @@ int __init gart_iommu_init(void)
735 unsigned long scratch; 739 unsigned long scratch;
736 long i; 740 long i;
737 741
738 if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) 742 if (num_k8_northbridges == 0)
739 return 0; 743 return 0;
740 744
741#ifndef CONFIG_AGP_AMD64 745#ifndef CONFIG_AGP_AMD64
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 22be12b60a8..3af4af810c0 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -4,6 +4,7 @@
4#include <linux/scatterlist.h> 4#include <linux/scatterlist.h>
5#include <linux/string.h> 5#include <linux/string.h>
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/gfp.h>
7#include <linux/pci.h> 8#include <linux/pci.h>
8#include <linux/mm.h> 9#include <linux/mm.h>
9 10
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 02d678065d7..28ad9f4d8b9 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -526,21 +526,37 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
526} 526}
527 527
528/* 528/*
529 * Check for AMD CPUs, which have potentially C1E support 529 * Check for AMD CPUs, where the APIC timer interrupt does not wake the CPU from C1E.
530 * For more information see
531 * - Erratum #400 for NPT family 0xf and family 0x10 CPUs
532 * - Erratum #365 for family 0x11 (not affected because C1E is not in use)
530 */ 533 */
531static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) 534static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
532{ 535{
536 u64 val;
533 if (c->x86_vendor != X86_VENDOR_AMD) 537 if (c->x86_vendor != X86_VENDOR_AMD)
534 return 0; 538 goto no_c1e_idle;
535
536 if (c->x86 < 0x0F)
537 return 0;
538 539
539 /* Family 0x0f models < rev F do not have C1E */ 540 /* Family 0x0f models < rev F do not have C1E */
540 if (c->x86 == 0x0f && c->x86_model < 0x40) 541 if (c->x86 == 0x0F && c->x86_model >= 0x40)
541 return 0; 542 return 1;
542 543
543 return 1; 544 if (c->x86 == 0x10) {
545 /*
546 * check OSVW bit for CPUs that are not affected
547 * by erratum #400
548 */
549 rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
550 if (val >= 2) {
551 rdmsrl(MSR_AMD64_OSVW_STATUS, val);
552 if (!(val & BIT(1)))
553 goto no_c1e_idle;
554 }
555 return 1;
556 }
557
558no_c1e_idle:
559 return 0;
544} 560}
545 561
546static cpumask_var_t c1e_mask; 562static cpumask_var_t c1e_mask;
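The new family-0x10 branch queries OSVW (OS Visible Workarounds): the ID-length MSR says how many status bits are architecturally valid, and only then is bit 1 of the status MSR meaningful (by the usual OSVW numbering, ID 1 corresponds to erratum 400). When OSVW cannot rule the erratum out, the code conservatively assumes it applies. A user-space sketch via the msr device node; it assumes the msr module, root, an AMD CPU, and the MSR numbers as I recall them from msr-index.h:

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140
#define MSR_AMD64_OSVW_STATUS    0xc0010141

/* Read an MSR on cpu0 via /dev/cpu/0/msr; the MSR number is the file offset. */
static int rdmsr(uint32_t reg, uint64_t *val)
{
	int fd = open("/dev/cpu/0/msr", O_RDONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = pread(fd, val, sizeof(*val), reg);
	close(fd);
	return n == (ssize_t)sizeof(*val) ? 0 : -1;
}

int main(void)
{
	uint64_t len, status;

	/* Mirror check_c1e_idle(): with fewer than 2 valid status bits,
	 * OSVW can't rule erratum 400 out, so assume it applies. */
	if (rdmsr(MSR_AMD64_OSVW_ID_LENGTH, &len) || len < 2) {
		puts("OSVW inconclusive: assume erratum 400 applies");
		return 0;
	}
	if (rdmsr(MSR_AMD64_OSVW_STATUS, &status))
		return 1;
	printf("erratum 400 %s\n", (status & 2) ? "applies" : "worked around");
	return 0;
}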
@@ -607,7 +623,7 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
607{ 623{
608#ifdef CONFIG_SMP 624#ifdef CONFIG_SMP
609 if (pm_idle == poll_idle && smp_num_siblings > 1) { 625 if (pm_idle == poll_idle && smp_num_siblings > 1) {
610 printk(KERN_WARNING "WARNING: polling idle and HT enabled," 626 printk_once(KERN_WARNING "WARNING: polling idle and HT enabled,"
611 " performance may degrade.\n"); 627 " performance may degrade.\n");
612 } 628 }
613#endif 629#endif
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index dc9690b4c4c..17cb3295cbf 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -276,12 +276,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
276 276
277 set_tsk_thread_flag(p, TIF_FORK); 277 set_tsk_thread_flag(p, TIF_FORK);
278 278
279 p->thread.fs = me->thread.fs;
280 p->thread.gs = me->thread.gs;
281 p->thread.io_bitmap_ptr = NULL; 279 p->thread.io_bitmap_ptr = NULL;
282 280
283 savesegment(gs, p->thread.gsindex); 281 savesegment(gs, p->thread.gsindex);
282 p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
284 savesegment(fs, p->thread.fsindex); 283 savesegment(fs, p->thread.fsindex);
284 p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
285 savesegment(es, p->thread.es); 285 savesegment(es, p->thread.es);
286 savesegment(ds, p->thread.ds); 286 savesegment(ds, p->thread.ds);
287 287
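
The copy_thread() reordering above captures the current fs/gs selectors first and copies the parent's saved base only when the selector is zero; with a nonzero selector the base is defined by the GDT/LDT entry, so inheriting the MSR value would be stale. A compile-only sketch of the invariant (struct and function names here are illustrative, not the kernel's):

    #include <stdint.h>

    /* Illustrative fields only; the kernel's thread_struct differs. */
    struct thread_sketch {
        uint64_t fs, gs;            /* saved MSR base values (selector == 0) */
        uint16_t fsindex, gsindex;  /* saved segment selectors               */
    };

    /* The child inherits the parent's base only in the selector==0 case;
     * a nonzero selector means the base comes from the descriptor table. */
    static void copy_segments(struct thread_sketch *child,
                              const struct thread_sketch *parent,
                              uint16_t cur_fsindex, uint16_t cur_gsindex)
    {
        child->gsindex = cur_gsindex;
        child->gs = cur_gsindex ? 0 : parent->gs;
        child->fsindex = cur_fsindex;
        child->fs = cur_fsindex ? 0 : parent->fs;
    }
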
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 2d96aab82a4..2e9b55027b7 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -12,6 +12,7 @@
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/smp.h> 13#include <linux/smp.h>
14#include <linux/errno.h> 14#include <linux/errno.h>
15#include <linux/slab.h>
15#include <linux/ptrace.h> 16#include <linux/ptrace.h>
16#include <linux/regset.h> 17#include <linux/regset.h>
17#include <linux/tracehook.h> 18#include <linux/tracehook.h>
@@ -581,7 +582,7 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
581 struct perf_event_attr attr; 582 struct perf_event_attr attr;
582 583
583 /* 584 /*
584 * We shoud have at least an inactive breakpoint at this 585 * We should have at least an inactive breakpoint at this
585 * slot. It means the user is writing dr7 without having 586 * slot. It means the user is writing dr7 without having
586 * written the address register first 587 * written the address register first
587 */ 588 */
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 704bddcdf64..8e1aac86b50 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -461,6 +461,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
461 DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"), 461 DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"),
462 }, 462 },
463 }, 463 },
464 { /* Handle problems with rebooting on the iMac9,1. */
465 .callback = set_pci_reboot,
466 .ident = "Apple iMac9,1",
467 .matches = {
468 DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
469 DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
470 },
471 },
464 { } 472 { }
465}; 473};
466 474
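
The new iMac9,1 entry follows the usual DMI-quirk pattern: match firmware vendor/product strings, and let the callback switch the reboot method to PCI. A small user-space sketch of substring-based matching against such a table (the struct is simplified; the real dmi_system_id carries more fields):

    #include <stdio.h>
    #include <string.h>

    /* Simplified stand-in for struct dmi_system_id. */
    struct dmi_id {
        const char *vendor, *product;
        void (*callback)(void);
    };

    static void set_pci_reboot(void) { puts("reboot: using the PCI method"); }

    static const struct dmi_id quirks[] = {
        { "Apple Inc.", "Macmini3,1", set_pci_reboot },
        { "Apple Inc.", "iMac9,1",    set_pci_reboot },
        { NULL, NULL, NULL },
    };

    static void dmi_check(const char *vendor, const char *product)
    {
        const struct dmi_id *q;

        for (q = quirks; q->vendor; q++)
            if (strstr(vendor, q->vendor) && strstr(product, q->product))
                q->callback();
    }

    int main(void)
    {
        dmi_check("Apple Inc.", "iMac9,1");  /* matches the new entry */
        return 0;
    }
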
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index cb42109a55b..c4851eff57b 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -55,7 +55,6 @@
55#include <linux/stddef.h> 55#include <linux/stddef.h>
56#include <linux/unistd.h> 56#include <linux/unistd.h>
57#include <linux/ptrace.h> 57#include <linux/ptrace.h>
58#include <linux/slab.h>
59#include <linux/user.h> 58#include <linux/user.h>
60#include <linux/delay.h> 59#include <linux/delay.h>
61 60
@@ -314,16 +313,17 @@ static void __init reserve_brk(void)
314#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) 313#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
315static void __init relocate_initrd(void) 314static void __init relocate_initrd(void)
316{ 315{
317 316 /* Assume only end is not page aligned */
318 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 317 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
319 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 318 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
319 u64 area_size = PAGE_ALIGN(ramdisk_size);
320 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; 320 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
321 u64 ramdisk_here; 321 u64 ramdisk_here;
322 unsigned long slop, clen, mapaddr; 322 unsigned long slop, clen, mapaddr;
323 char *p, *q; 323 char *p, *q;
324 324
325 /* We need to move the initrd down into lowmem */ 325 /* We need to move the initrd down into lowmem */
326 ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size, 326 ramdisk_here = find_e820_area(0, end_of_lowmem, area_size,
327 PAGE_SIZE); 327 PAGE_SIZE);
328 328
329 if (ramdisk_here == -1ULL) 329 if (ramdisk_here == -1ULL)
@@ -332,7 +332,7 @@ static void __init relocate_initrd(void)
332 332
333 /* Note: this includes all the lowmem currently occupied by 333 /* Note: this includes all the lowmem currently occupied by
334 the initrd, we rely on that fact to keep the data intact. */ 334 the initrd, we rely on that fact to keep the data intact. */
335 reserve_early(ramdisk_here, ramdisk_here + ramdisk_size, 335 reserve_early(ramdisk_here, ramdisk_here + area_size,
336 "NEW RAMDISK"); 336 "NEW RAMDISK");
337 initrd_start = ramdisk_here + PAGE_OFFSET; 337 initrd_start = ramdisk_here + PAGE_OFFSET;
338 initrd_end = initrd_start + ramdisk_size; 338 initrd_end = initrd_start + ramdisk_size;
@@ -376,9 +376,10 @@ static void __init relocate_initrd(void)
376 376
377static void __init reserve_initrd(void) 377static void __init reserve_initrd(void)
378{ 378{
379 /* Assume only end is not page aligned */
379 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 380 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
380 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 381 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
381 u64 ramdisk_end = ramdisk_image + ramdisk_size; 382 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
382 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; 383 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
383 384
384 if (!boot_params.hdr.type_of_loader || 385 if (!boot_params.hdr.type_of_loader ||
@@ -606,6 +607,16 @@ static int __init setup_elfcorehdr(char *arg)
606early_param("elfcorehdr", setup_elfcorehdr); 607early_param("elfcorehdr", setup_elfcorehdr);
607#endif 608#endif
608 609
610static __init void reserve_ibft_region(void)
611{
612 unsigned long addr, size = 0;
613
614 addr = find_ibft_region(&size);
615
616 if (size)
617 reserve_early_overlap_ok(addr, addr + size, "ibft");
618}
619
609#ifdef CONFIG_X86_RESERVE_LOW_64K 620#ifdef CONFIG_X86_RESERVE_LOW_64K
610static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) 621static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
611{ 622{
@@ -908,6 +919,8 @@ void __init setup_arch(char **cmdline_p)
908 */ 919 */
909 find_smp_config(); 920 find_smp_config();
910 921
922 reserve_ibft_region();
923
911 reserve_trampoline_memory(); 924 reserve_trampoline_memory();
912 925
913#ifdef CONFIG_ACPI_SLEEP 926#ifdef CONFIG_ACPI_SLEEP
@@ -969,17 +982,11 @@ void __init setup_arch(char **cmdline_p)
969#endif 982#endif
970 983
971 initmem_init(0, max_pfn, acpi, k8); 984 initmem_init(0, max_pfn, acpi, k8);
972 985#ifndef CONFIG_NO_BOOTMEM
973#ifdef CONFIG_X86_64 986 early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
974 /*
975 * dma32_reserve_bootmem() allocates bootmem which may conflict
976 * with the crashkernel command line, so do that after
977 * reserve_crashkernel()
978 */
979 dma32_reserve_bootmem();
980#endif 987#endif
981 988
982 reserve_ibft_region(); 989 dma32_reserve_bootmem();
983 990
984#ifdef CONFIG_KVM_CLOCK 991#ifdef CONFIG_KVM_CLOCK
985 kvmclock_init(); 992 kvmclock_init();
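
Both initrd hunks above rest on the stated assumption that only the end of the image may be unaligned: the e820 search and the early reservation are widened to PAGE_ALIGN(ramdisk_size), while initrd_start/initrd_end keep the exact byte extent. A small runnable sketch of the arithmetic (4 KiB pages assumed):

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SIZE     4096ULL
    #define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

    int main(void)
    {
        uint64_t ramdisk_image = 0x100000;             /* page-aligned start */
        uint64_t ramdisk_size  = 10 * PAGE_SIZE + 123; /* ragged end         */
        uint64_t area_size     = PAGE_ALIGN(ramdisk_size);

        /* The reservation must cover whole pages (11 here)... */
        printf("reserve %llu bytes for a %llu-byte image\n",
               (unsigned long long)area_size,
               (unsigned long long)ramdisk_size);

        /* ...while initrd_start/initrd_end keep the exact extent. */
        printf("initrd: %#llx - %#llx\n",
               (unsigned long long)ramdisk_image,
               (unsigned long long)(ramdisk_image + ramdisk_size));
        return 0;
    }
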
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 35abcb8b00e..ef6370b00e7 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -137,7 +137,13 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
137 137
138static void __init pcpu_fc_free(void *ptr, size_t size) 138static void __init pcpu_fc_free(void *ptr, size_t size)
139{ 139{
140#ifdef CONFIG_NO_BOOTMEM
141 u64 start = __pa(ptr);
142 u64 end = start + size;
143 free_early_partial(start, end);
144#else
140 free_bootmem(__pa(ptr), size); 145 free_bootmem(__pa(ptr), size);
146#endif
141} 147}
142 148
143static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) 149static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index ec1de97600e..d801210945d 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -21,6 +21,7 @@
21#include <linux/cache.h> 21#include <linux/cache.h>
22#include <linux/interrupt.h> 22#include <linux/interrupt.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/gfp.h>
24 25
25#include <asm/mtrr.h> 26#include <asm/mtrr.h>
26#include <asm/tlbflush.h> 27#include <asm/tlbflush.h>
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9b4401115ea..763d815e27a 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -48,6 +48,8 @@
48#include <linux/err.h> 48#include <linux/err.h>
49#include <linux/nmi.h> 49#include <linux/nmi.h>
50#include <linux/tboot.h> 50#include <linux/tboot.h>
51#include <linux/stackprotector.h>
52#include <linux/gfp.h>
51 53
52#include <asm/acpi.h> 54#include <asm/acpi.h>
53#include <asm/desc.h> 55#include <asm/desc.h>
@@ -67,6 +69,7 @@
67#include <linux/mc146818rtc.h> 69#include <linux/mc146818rtc.h>
68 70
69#include <asm/smpboot_hooks.h> 71#include <asm/smpboot_hooks.h>
72#include <asm/i8259.h>
70 73
71#ifdef CONFIG_X86_32 74#ifdef CONFIG_X86_32
72u8 apicid_2_node[MAX_APICID]; 75u8 apicid_2_node[MAX_APICID];
@@ -240,7 +243,10 @@ static void __cpuinit smp_callin(void)
240 end_local_APIC_setup(); 243 end_local_APIC_setup();
241 map_cpu_to_logical_apicid(); 244 map_cpu_to_logical_apicid();
242 245
243 notify_cpu_starting(cpuid); 246 /*
247 * Need to setup vector mappings before we enable interrupts.
248 */
249 setup_vector_irq(smp_processor_id());
244 /* 250 /*
245 * Get our bogomips. 251 * Get our bogomips.
246 * 252 *
@@ -257,6 +263,8 @@ static void __cpuinit smp_callin(void)
257 */ 263 */
258 smp_store_cpu_info(cpuid); 264 smp_store_cpu_info(cpuid);
259 265
266 notify_cpu_starting(cpuid);
267
260 /* 268 /*
261 * Allow the master to continue. 269 * Allow the master to continue.
262 */ 270 */
@@ -286,9 +294,9 @@ notrace static void __cpuinit start_secondary(void *unused)
286 check_tsc_sync_target(); 294 check_tsc_sync_target();
287 295
288 if (nmi_watchdog == NMI_IO_APIC) { 296 if (nmi_watchdog == NMI_IO_APIC) {
289 disable_8259A_irq(0); 297 legacy_pic->chip->mask(0);
290 enable_NMI_through_LVT0(); 298 enable_NMI_through_LVT0();
291 enable_8259A_irq(0); 299 legacy_pic->chip->unmask(0);
292 } 300 }
293 301
294#ifdef CONFIG_X86_32 302#ifdef CONFIG_X86_32
@@ -315,7 +323,6 @@ notrace static void __cpuinit start_secondary(void *unused)
315 */ 323 */
316 ipi_call_lock(); 324 ipi_call_lock();
317 lock_vector_lock(); 325 lock_vector_lock();
318 __setup_vector_irq(smp_processor_id());
319 set_cpu_online(smp_processor_id(), true); 326 set_cpu_online(smp_processor_id(), true);
320 unlock_vector_lock(); 327 unlock_vector_lock();
321 ipi_call_unlock(); 328 ipi_call_unlock();
@@ -325,6 +332,9 @@ notrace static void __cpuinit start_secondary(void *unused)
325 /* enable local interrupts */ 332 /* enable local interrupts */
326 local_irq_enable(); 333 local_irq_enable();
327 334
335 /* to prevent fake stack check failure in clock setup */
336 boot_init_stack_canary();
337
328 x86_cpuinit.setup_percpu_clockev(); 338 x86_cpuinit.setup_percpu_clockev();
329 339
330 wmb(); 340 wmb();
@@ -1212,11 +1222,12 @@ __init void prefill_possible_map(void)
1212 1222
1213 total_cpus = max_t(int, possible, num_processors + disabled_cpus); 1223 total_cpus = max_t(int, possible, num_processors + disabled_cpus);
1214 1224
1215 if (possible > CONFIG_NR_CPUS) { 1225 /* nr_cpu_ids could be reduced via nr_cpus= */
1226 if (possible > nr_cpu_ids) {
1216 printk(KERN_WARNING 1227 printk(KERN_WARNING
1217 "%d Processors exceeds NR_CPUS limit of %d\n", 1228 "%d Processors exceeds NR_CPUS limit of %d\n",
1218 possible, CONFIG_NR_CPUS); 1229 possible, nr_cpu_ids);
1219 possible = CONFIG_NR_CPUS; 1230 possible = nr_cpu_ids;
1220 } 1231 }
1221 1232
1222 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", 1233 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
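
prefill_possible_map() now clamps against nr_cpu_ids rather than the compile-time CONFIG_NR_CPUS, since the nr_cpus= boot option can lower the runtime limit below the build-time one. The clamp in isolation, as a runnable sketch (values are illustrative):

    #include <stdio.h>

    int main(void)
    {
        int nr_cpu_ids = 4;   /* runtime limit, e.g. booted with nr_cpus=4 */
        int possible   = 16;  /* CPUs the platform claims could appear     */

        /* Clamp against the runtime value, not CONFIG_NR_CPUS, because
         * nr_cpus= may have lowered it. */
        if (possible > nr_cpu_ids) {
            printf("%d Processors exceeds NR_CPUS limit of %d\n",
                   possible, nr_cpu_ids);
            possible = nr_cpu_ids;
        }
        printf("SMP: Allowing %d CPUs\n", possible);
        return 0;
    }
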
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index dee1ff7cba5..196552bb412 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -25,191 +25,6 @@
25#include <asm/syscalls.h> 25#include <asm/syscalls.h>
26 26
27/* 27/*
28 * Perform the select(nd, in, out, ex, tv) and mmap() system
29 * calls. Linux/i386 didn't use to be able to handle more than
30 * 4 system call parameters, so these system calls used a memory
31 * block for parameter passing..
32 */
33
34struct mmap_arg_struct {
35 unsigned long addr;
36 unsigned long len;
37 unsigned long prot;
38 unsigned long flags;
39 unsigned long fd;
40 unsigned long offset;
41};
42
43asmlinkage int old_mmap(struct mmap_arg_struct __user *arg)
44{
45 struct mmap_arg_struct a;
46 int err = -EFAULT;
47
48 if (copy_from_user(&a, arg, sizeof(a)))
49 goto out;
50
51 err = -EINVAL;
52 if (a.offset & ~PAGE_MASK)
53 goto out;
54
55 err = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags,
56 a.fd, a.offset >> PAGE_SHIFT);
57out:
58 return err;
59}
60
61
62struct sel_arg_struct {
63 unsigned long n;
64 fd_set __user *inp, *outp, *exp;
65 struct timeval __user *tvp;
66};
67
68asmlinkage int old_select(struct sel_arg_struct __user *arg)
69{
70 struct sel_arg_struct a;
71
72 if (copy_from_user(&a, arg, sizeof(a)))
73 return -EFAULT;
74 /* sys_select() does the appropriate kernel locking */
75 return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
76}
77
78/*
79 * sys_ipc() is the de-multiplexer for the SysV IPC calls..
80 *
81 * This is really horribly ugly.
82 */
83asmlinkage int sys_ipc(uint call, int first, int second,
84 int third, void __user *ptr, long fifth)
85{
86 int version, ret;
87
88 version = call >> 16; /* hack for backward compatibility */
89 call &= 0xffff;
90
91 switch (call) {
92 case SEMOP:
93 return sys_semtimedop(first, (struct sembuf __user *)ptr, second, NULL);
94 case SEMTIMEDOP:
95 return sys_semtimedop(first, (struct sembuf __user *)ptr, second,
96 (const struct timespec __user *)fifth);
97
98 case SEMGET:
99 return sys_semget(first, second, third);
100 case SEMCTL: {
101 union semun fourth;
102 if (!ptr)
103 return -EINVAL;
104 if (get_user(fourth.__pad, (void __user * __user *) ptr))
105 return -EFAULT;
106 return sys_semctl(first, second, third, fourth);
107 }
108
109 case MSGSND:
110 return sys_msgsnd(first, (struct msgbuf __user *) ptr,
111 second, third);
112 case MSGRCV:
113 switch (version) {
114 case 0: {
115 struct ipc_kludge tmp;
116 if (!ptr)
117 return -EINVAL;
118
119 if (copy_from_user(&tmp,
120 (struct ipc_kludge __user *) ptr,
121 sizeof(tmp)))
122 return -EFAULT;
123 return sys_msgrcv(first, tmp.msgp, second,
124 tmp.msgtyp, third);
125 }
126 default:
127 return sys_msgrcv(first,
128 (struct msgbuf __user *) ptr,
129 second, fifth, third);
130 }
131 case MSGGET:
132 return sys_msgget((key_t) first, second);
133 case MSGCTL:
134 return sys_msgctl(first, second, (struct msqid_ds __user *) ptr);
135
136 case SHMAT:
137 switch (version) {
138 default: {
139 ulong raddr;
140 ret = do_shmat(first, (char __user *) ptr, second, &raddr);
141 if (ret)
142 return ret;
143 return put_user(raddr, (ulong __user *) third);
144 }
145 case 1: /* iBCS2 emulator entry point */
146 if (!segment_eq(get_fs(), get_ds()))
147 return -EINVAL;
148 /* The "(ulong *) third" is valid _only_ because of the kernel segment thing */
149 return do_shmat(first, (char __user *) ptr, second, (ulong *) third);
150 }
151 case SHMDT:
152 return sys_shmdt((char __user *)ptr);
153 case SHMGET:
154 return sys_shmget(first, second, third);
155 case SHMCTL:
156 return sys_shmctl(first, second,
157 (struct shmid_ds __user *) ptr);
158 default:
159 return -ENOSYS;
160 }
161}
162
163/*
164 * Old cruft
165 */
166asmlinkage int sys_uname(struct old_utsname __user *name)
167{
168 int err;
169 if (!name)
170 return -EFAULT;
171 down_read(&uts_sem);
172 err = copy_to_user(name, utsname(), sizeof(*name));
173 up_read(&uts_sem);
174 return err? -EFAULT:0;
175}
176
177asmlinkage int sys_olduname(struct oldold_utsname __user *name)
178{
179 int error;
180
181 if (!name)
182 return -EFAULT;
183 if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
184 return -EFAULT;
185
186 down_read(&uts_sem);
187
188 error = __copy_to_user(&name->sysname, &utsname()->sysname,
189 __OLD_UTS_LEN);
190 error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
191 error |= __copy_to_user(&name->nodename, &utsname()->nodename,
192 __OLD_UTS_LEN);
193 error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
194 error |= __copy_to_user(&name->release, &utsname()->release,
195 __OLD_UTS_LEN);
196 error |= __put_user(0, name->release + __OLD_UTS_LEN);
197 error |= __copy_to_user(&name->version, &utsname()->version,
198 __OLD_UTS_LEN);
199 error |= __put_user(0, name->version + __OLD_UTS_LEN);
200 error |= __copy_to_user(&name->machine, &utsname()->machine,
201 __OLD_UTS_LEN);
202 error |= __put_user(0, name->machine + __OLD_UTS_LEN);
203
204 up_read(&uts_sem);
205
206 error = error ? -EFAULT : 0;
207
208 return error;
209}
210
211
212/*
213 * Do a system call from kernel instead of calling sys_execve so we 28 * Do a system call from kernel instead of calling sys_execve so we
214 * end up with proper pt_regs. 29 * end up with proper pt_regs.
215 */ 30 */
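
The removed wrappers implemented the historical i386 convention of passing more than four syscall arguments through a block of user memory; they drop out here in favor of generic versions (the syscall_table_32.S hunk below switches the table to sys_old_mmap and sys_old_select). A user-space sketch of the memory-block convention, with copy_from_user() stood in by memcpy() and the 0xfff page mask assumed for 4 KiB pages:

    #include <stdio.h>
    #include <string.h>

    /* The six-argument block the removed old_mmap() unpacked. */
    struct mmap_arg_struct {
        unsigned long addr, len, prot, flags, fd, offset;
    };

    static long demo_old_mmap(const void *user_arg)
    {
        struct mmap_arg_struct a;

        memcpy(&a, user_arg, sizeof(a));  /* kernel: copy_from_user() */
        if (a.offset & 0xfffUL)
            return -22;                   /* -EINVAL: unaligned offset */
        printf("mmap(addr=%#lx, len=%lu, fd=%lu, offset=%lu)\n",
               a.addr, a.len, a.fd, a.offset);
        return 0;
    }

    int main(void)
    {
        struct mmap_arg_struct a = { 0, 8192, 3, 2, 4, 0 };

        return (int)demo_old_mmap(&a);
    }
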
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 8aa2057efd1..ff14a5044ce 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -209,15 +209,3 @@ bottomup:
209 209
210 return addr; 210 return addr;
211} 211}
212
213
214SYSCALL_DEFINE1(uname, struct new_utsname __user *, name)
215{
216 int err;
217 down_read(&uts_sem);
218 err = copy_to_user(name, utsname(), sizeof(*name));
219 up_read(&uts_sem);
220 if (personality(current->personality) == PER_LINUX32)
221 err |= copy_to_user(&name->machine, "i686", 5);
222 return err ? -EFAULT : 0;
223}
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 15228b5d3eb..8b372934121 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -81,7 +81,7 @@ ENTRY(sys_call_table)
81 .long sys_settimeofday 81 .long sys_settimeofday
82 .long sys_getgroups16 /* 80 */ 82 .long sys_getgroups16 /* 80 */
83 .long sys_setgroups16 83 .long sys_setgroups16
84 .long old_select 84 .long sys_old_select
85 .long sys_symlink 85 .long sys_symlink
86 .long sys_lstat 86 .long sys_lstat
87 .long sys_readlink /* 85 */ 87 .long sys_readlink /* 85 */
@@ -89,7 +89,7 @@ ENTRY(sys_call_table)
89 .long sys_swapon 89 .long sys_swapon
90 .long sys_reboot 90 .long sys_reboot
91 .long sys_old_readdir 91 .long sys_old_readdir
92 .long old_mmap /* 90 */ 92 .long sys_old_mmap /* 90 */
93 .long sys_munmap 93 .long sys_munmap
94 .long sys_truncate 94 .long sys_truncate
95 .long sys_ftruncate 95 .long sys_ftruncate
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index be2573448ed..fb5cc5e14cf 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -70,11 +70,11 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
70 * manually to deassert NMI lines for the watchdog if run 70 * manually to deassert NMI lines for the watchdog if run
71 * on an 82489DX-based system. 71 * on an 82489DX-based system.
72 */ 72 */
73 spin_lock(&i8259A_lock); 73 raw_spin_lock(&i8259A_lock);
74 outb(0x0c, PIC_MASTER_OCW3); 74 outb(0x0c, PIC_MASTER_OCW3);
75 /* Ack the IRQ; AEOI will end it automatically. */ 75 /* Ack the IRQ; AEOI will end it automatically. */
76 inb(PIC_MASTER_POLL); 76 inb(PIC_MASTER_POLL);
77 spin_unlock(&i8259A_lock); 77 raw_spin_unlock(&i8259A_lock);
78 } 78 }
79 79
80 global_clock_event->event_handler(global_clock_event); 80 global_clock_event->event_handler(global_clock_event);
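
The i8259A_lock conversion is part of moving locks that must spin even on PREEMPT_RT over to raw_spinlock_t; this one is taken in the timer interrupt to poke the PIC, where sleeping is impossible. A minimal kernel-style fragment of the pattern (illustrative, not the actual i8259 code; assumes the raw_spinlock API introduced in this kernel series):

    #include <linux/spinlock.h>

    /* A lock taken from hard-interrupt context must never sleep, so
     * under PREEMPT_RT it has to stay a real spinning lock. */
    static DEFINE_RAW_SPINLOCK(pic_lock);

    static void ack_pic_sketch(void)
    {
        unsigned long flags;

        raw_spin_lock_irqsave(&pic_lock, flags);
        /* ... outb()/inb() against the PIC would go here ... */
        raw_spin_unlock_irqrestore(&pic_lock, flags);
    }
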
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 364d015efeb..17b03dd3a6b 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -9,6 +9,7 @@
9#include <linux/seq_file.h> 9#include <linux/seq_file.h>
10#include <linux/proc_fs.h> 10#include <linux/proc_fs.h>
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/slab.h>
12 13
13#include <asm/mmu_context.h> 14#include <asm/mmu_context.h>
14#include <asm/uv/uv.h> 15#include <asm/uv/uv.h>
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 23066ecf12f..9faf91ae184 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -50,7 +50,7 @@ u64 native_sched_clock(void)
50 * unstable. We do this because unlike Time Of Day, 50 * unstable. We do this because unlike Time Of Day,
51 * the scheduler clock tolerates small errors and it's 51 * the scheduler clock tolerates small errors and it's
52 * very important for it to be as fast as the platform 52 * very important for it to be as fast as the platform
53 * can achive it. ) 53 * can achieve it. )
54 */ 54 */
55 if (unlikely(tsc_disabled)) { 55 if (unlikely(tsc_disabled)) {
56 /* No locking but a rare wrong value is not a big deal: */ 56 /* No locking but a rare wrong value is not a big deal: */
@@ -740,7 +740,7 @@ static cycle_t __vsyscall_fn vread_tsc(void)
740} 740}
741#endif 741#endif
742 742
743static void resume_tsc(void) 743static void resume_tsc(struct clocksource *cs)
744{ 744{
745 clocksource_tsc.cycle_last = 0; 745 clocksource_tsc.cycle_last = 0;
746} 746}
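
resume_tsc() gaining a struct clocksource * argument tracks a clocksource-API change: the resume hook now receives the clocksource being resumed, matching how .read already takes its clocksource. A hedged sketch of a clocksource written against that API (the name and the stub counter are invented for illustration):

    #include <linux/clocksource.h>

    static cycle_t dummy_read(struct clocksource *cs)
    {
        return 0;  /* stub: a real clocksource samples its counter here */
    }

    /* With the new signature the reset can be written against the
     * passed-in pointer instead of a file-scope variable. */
    static void dummy_resume(struct clocksource *cs)
    {
        cs->cycle_last = 0;  /* the same reset resume_tsc() performs */
    }

    static struct clocksource dummy_cs = {
        .name   = "dummy",
        .rating = 1,
        .read   = dummy_read,
        .resume = dummy_resume,
        .mask   = CLOCKSOURCE_MASK(64),
    };
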
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
index ece73d8e324..1d40336b030 100644
--- a/arch/x86/kernel/uv_irq.c
+++ b/arch/x86/kernel/uv_irq.c
@@ -10,6 +10,7 @@
10 10
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/rbtree.h> 12#include <linux/rbtree.h>
13#include <linux/slab.h>
13#include <linux/irq.h> 14#include <linux/irq.h>
14 15
15#include <asm/apic.h> 16#include <asm/apic.h>
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
index 2b75ef638db..56e421bc379 100644
--- a/arch/x86/kernel/uv_time.c
+++ b/arch/x86/kernel/uv_time.c
@@ -19,6 +19,7 @@
19 * Copyright (c) Dimitri Sivanich 19 * Copyright (c) Dimitri Sivanich
20 */ 20 */
21#include <linux/clockchips.h> 21#include <linux/clockchips.h>
22#include <linux/slab.h>
22 23
23#include <asm/uv/uv_mmrs.h> 24#include <asm/uv/uv_mmrs.h>
24#include <asm/uv/uv_hub.h> 25#include <asm/uv/uv_hub.h>
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 34a279a7471..e680ea52db9 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -49,11 +49,6 @@ extern int no_broadcast;
49char visws_board_type = -1; 49char visws_board_type = -1;
50char visws_board_rev = -1; 50char visws_board_rev = -1;
51 51
52int is_visws_box(void)
53{
54 return visws_board_type >= 0;
55}
56
57static void __init visws_time_init(void) 52static void __init visws_time_init(void)
58{ 53{
59 printk(KERN_INFO "Starting Cobalt Timer system clock\n"); 54 printk(KERN_INFO "Starting Cobalt Timer system clock\n");
@@ -242,6 +237,8 @@ void __init visws_early_detect(void)
242 x86_init.irqs.pre_vector_init = visws_pre_intr_init; 237 x86_init.irqs.pre_vector_init = visws_pre_intr_init;
243 x86_init.irqs.trap_init = visws_trap_init; 238 x86_init.irqs.trap_init = visws_trap_init;
244 x86_init.timers.timer_init = visws_time_init; 239 x86_init.timers.timer_init = visws_time_init;
240 x86_init.pci.init = pci_visws_init;
241 x86_init.pci.init_irq = x86_init_noop;
245 242
246 /* 243 /*
247 * Install reboot quirks: 244 * Install reboot quirks:
@@ -508,7 +505,7 @@ static struct irq_chip cobalt_irq_type = {
508 */ 505 */
509static unsigned int startup_piix4_master_irq(unsigned int irq) 506static unsigned int startup_piix4_master_irq(unsigned int irq)
510{ 507{
511 init_8259A(0); 508 legacy_pic->init(0);
512 509
513 return startup_cobalt_irq(irq); 510 return startup_cobalt_irq(irq);
514} 511}
@@ -532,9 +529,6 @@ static struct irq_chip piix4_master_irq_type = {
532 529
533static struct irq_chip piix4_virtual_irq_type = { 530static struct irq_chip piix4_virtual_irq_type = {
534 .name = "PIIX4-virtual", 531 .name = "PIIX4-virtual",
535 .shutdown = disable_8259A_irq,
536 .enable = enable_8259A_irq,
537 .disable = disable_8259A_irq,
538}; 532};
539 533
540 534
@@ -559,7 +553,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
559 struct irq_desc *desc; 553 struct irq_desc *desc;
560 unsigned long flags; 554 unsigned long flags;
561 555
562 spin_lock_irqsave(&i8259A_lock, flags); 556 raw_spin_lock_irqsave(&i8259A_lock, flags);
563 557
564 /* Find out what's interrupting in the PIIX4 master 8259 */ 558 /* Find out what's interrupting in the PIIX4 master 8259 */
565 outb(0x0c, 0x20); /* OCW3 Poll command */ 559 outb(0x0c, 0x20); /* OCW3 Poll command */
@@ -596,7 +590,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
596 outb(0x60 + realirq, 0x20); 590 outb(0x60 + realirq, 0x20);
597 } 591 }
598 592
599 spin_unlock_irqrestore(&i8259A_lock, flags); 593 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
600 594
601 desc = irq_to_desc(realirq); 595 desc = irq_to_desc(realirq);
602 596
@@ -609,12 +603,12 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
609 handle_IRQ_event(realirq, desc->action); 603 handle_IRQ_event(realirq, desc->action);
610 604
611 if (!(desc->status & IRQ_DISABLED)) 605 if (!(desc->status & IRQ_DISABLED))
612 enable_8259A_irq(realirq); 606 legacy_pic->chip->unmask(realirq);
613 607
614 return IRQ_HANDLED; 608 return IRQ_HANDLED;
615 609
616out_unlock: 610out_unlock:
617 spin_unlock_irqrestore(&i8259A_lock, flags); 611 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
618 return IRQ_NONE; 612 return IRQ_NONE;
619} 613}
620 614
@@ -628,6 +622,12 @@ static struct irqaction cascade_action = {
628 .name = "cascade", 622 .name = "cascade",
629}; 623};
630 624
625static inline void set_piix4_virtual_irq_type(void)
626{
627 piix4_virtual_irq_type.shutdown = i8259A_chip.mask;
628 piix4_virtual_irq_type.enable = i8259A_chip.unmask;
629 piix4_virtual_irq_type.disable = i8259A_chip.mask;
630}
631 631
632void init_VISWS_APIC_irqs(void) 632void init_VISWS_APIC_irqs(void)
633{ 633{
@@ -653,6 +653,7 @@ void init_VISWS_APIC_irqs(void)
653 desc->chip = &piix4_master_irq_type; 653 desc->chip = &piix4_master_irq_type;
654 } 654 }
655 else if (i < CO_IRQ_APIC0) { 655 else if (i < CO_IRQ_APIC0) {
656 set_piix4_virtual_irq_type();
656 desc->chip = &piix4_virtual_irq_type; 657 desc->chip = &piix4_virtual_irq_type;
657 } 658 }
658 else if (IS_CO_APIC(i)) { 659 else if (IS_CO_APIC(i)) {
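
The 8259 calls in this file are rerouted through the new legacy_pic indirection, which is what lets PIC-less platforms (the Moorestown support elsewhere in this series) substitute no-op implementations. An abridged sketch of the shape of that indirection — field names beyond init and chip->mask/unmask, which appear in the hunks above, are assumptions:

    /* Abridged, illustrative shape of the indirection. */
    struct irq_chip_sketch {
        void (*mask)(unsigned int irq);
        void (*unmask)(unsigned int irq);
    };

    struct legacy_pic_sketch {
        int nr_legacy_irqs;
        struct irq_chip_sketch *chip;
        void (*init)(int auto_eoi);
    };

    /* A PIC-less platform plugs in no-ops, and callers like
     * legacy_pic->init(0) or legacy_pic->chip->unmask(irq) still work. */
    static void null_init(int auto_eoi) { (void)auto_eoi; }
    static void null_mask(unsigned int irq) { (void)irq; }
    static void null_unmask(unsigned int irq) { (void)irq; }

    static struct irq_chip_sketch null_chip = {
        .mask   = null_mask,
        .unmask = null_unmask,
    };

    static struct legacy_pic_sketch null_pic_sketch = {
        .nr_legacy_irqs = 0,
        .chip           = &null_chip,
        .init           = null_init,
    };
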
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index d430e4c3019..ce9fbacb752 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -28,11 +28,13 @@
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/sched.h> 30#include <linux/sched.h>
31#include <linux/gfp.h>
31#include <asm/vmi.h> 32#include <asm/vmi.h>
32#include <asm/io.h> 33#include <asm/io.h>
33#include <asm/fixmap.h> 34#include <asm/fixmap.h>
34#include <asm/apicdef.h> 35#include <asm/apicdef.h>
35#include <asm/apic.h> 36#include <asm/apic.h>
37#include <asm/pgalloc.h>
36#include <asm/processor.h> 38#include <asm/processor.h>
37#include <asm/timer.h> 39#include <asm/timer.h>
38#include <asm/vmi_time.h> 40#include <asm/vmi_time.h>
@@ -266,30 +268,6 @@ static void vmi_nop(void)
266{ 268{
267} 269}
268 270
269#ifdef CONFIG_HIGHPTE
270static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
271{
272 void *va = kmap_atomic(page, type);
273
274 /*
275 * Internally, the VMI ROM must map virtual addresses to physical
276 * addresses for processing MMU updates. By the time MMU updates
277 * are issued, this information is typically already lost.
278 * Fortunately, the VMI provides a cache of mapping slots for active
279 * page tables.
280 *
281 * We use slot zero for the linear mapping of physical memory, and
282 * in HIGHPTE kernels, slot 1 and 2 for KM_PTE0 and KM_PTE1.
283 *
284 * args: SLOT VA COUNT PFN
285 */
286 BUG_ON(type != KM_PTE0 && type != KM_PTE1);
287 vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page));
288
289 return va;
290}
291#endif
292
293static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn) 271static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
294{ 272{
295 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); 273 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
@@ -640,6 +618,12 @@ static inline int __init activate_vmi(void)
640 u64 reloc; 618 u64 reloc;
641 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc; 619 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
642 620
621 /*
622 * Prevent page tables from being allocated in highmem, even if
623 * CONFIG_HIGHPTE is enabled.
624 */
625 __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
626
643 if (call_vrom_func(vmi_rom, vmi_init) != 0) { 627 if (call_vrom_func(vmi_rom, vmi_init) != 0) {
644 printk(KERN_ERR "VMI ROM failed to initialize!"); 628 printk(KERN_ERR "VMI ROM failed to initialize!");
645 return 0; 629 return 0;
@@ -778,10 +762,6 @@ static inline int __init activate_vmi(void)
778 762
779 /* Set linear is needed in all cases */ 763 /* Set linear is needed in all cases */
780 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); 764 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
781#ifdef CONFIG_HIGHPTE
782 if (vmi_ops.set_linear_mapping)
783 pv_mmu_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
784#endif
785 765
786 /* 766 /*
787 * These MUST always be patched. Don't support indirect jumps 767 * These MUST always be patched. Don't support indirect jumps
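
Rather than supplying a kmap_atomic_pte hook, VMI now simply strips __GFP_HIGHMEM from the mask used to allocate user page tables, so PTE pages always live in lowmem and the fixmap-slot dance removed above is no longer needed. The and-not idiom in isolation (flag values are illustrative stand-ins for this era's <linux/gfp.h>):

    #include <stdio.h>

    #define __GFP_HIGHMEM 0x02u   /* illustrative bit value  */
    #define GFP_KERNELISH 0xd0u   /* illustrative base flags */

    int main(void)
    {
        unsigned int userpte_gfp = GFP_KERNELISH | __GFP_HIGHMEM;

        /* The VMI init path performs exactly this and-not on the mask
         * used for user PTE pages: */
        userpte_gfp &= ~__GFP_HIGHMEM;

        printf("highmem allowed for PTE pages: %s\n",
               (userpte_gfp & __GFP_HIGHMEM) ? "yes" : "no");
        return 0;
    }
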
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 74c92bb194d..5e1ff66ecd7 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -79,11 +79,7 @@ unsigned long vmi_tsc_khz(void)
79 79
80static inline unsigned int vmi_get_timer_vector(void) 80static inline unsigned int vmi_get_timer_vector(void)
81{ 81{
82#ifdef CONFIG_X86_IO_APIC 82 return IRQ0_VECTOR;
83 return FIRST_DEVICE_VECTOR;
84#else
85 return FIRST_EXTERNAL_VECTOR;
86#endif
87} 83}
88 84
89/** vmi clockchip */ 85/** vmi clockchip */
@@ -171,7 +167,7 @@ static int vmi_timer_next_event(unsigned long delta,
171{ 167{
172 /* Unfortunately, set_next_event interface only passes relative 168 /* Unfortunately, set_next_event interface only passes relative
173 * expiry, but we want absolute expiry. It'd be better if were 169 * expiry, but we want absolute expiry. It'd be better if were
174 * were passed an aboslute expiry, since a bunch of time may 170 * were passed an absolute expiry, since a bunch of time may
175 * have been stolen between the time the delta is computed and 171 * have been stolen between the time the delta is computed and
176 * when we set the alarm below. */ 172 * when we set the alarm below. */
177 cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT)); 173 cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index f92a0da608c..2cc249718c4 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -291,8 +291,8 @@ SECTIONS
291 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { 291 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
292 __smp_locks = .; 292 __smp_locks = .;
293 *(.smp_locks) 293 *(.smp_locks)
294 __smp_locks_end = .;
295 . = ALIGN(PAGE_SIZE); 294 . = ALIGN(PAGE_SIZE);
295 __smp_locks_end = .;
296 } 296 }
297 297
298#ifdef CONFIG_X86_64 298#ifdef CONFIG_X86_64
@@ -341,7 +341,7 @@ SECTIONS
341 * Per-cpu symbols which need to be offset from __per_cpu_load 341 * Per-cpu symbols which need to be offset from __per_cpu_load
342 * for the boot processor. 342 * for the boot processor.
343 */ 343 */
344#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load 344#define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load
345INIT_PER_CPU(gdt_page); 345INIT_PER_CPU(gdt_page);
346INIT_PER_CPU(irq_stack_union); 346INIT_PER_CPU(irq_stack_union);
347 347
@@ -352,7 +352,7 @@ INIT_PER_CPU(irq_stack_union);
352 "kernel image bigger than KERNEL_IMAGE_SIZE"); 352 "kernel image bigger than KERNEL_IMAGE_SIZE");
353 353
354#ifdef CONFIG_SMP 354#ifdef CONFIG_SMP
355. = ASSERT((per_cpu__irq_stack_union == 0), 355. = ASSERT((irq_stack_union == 0),
356 "irq_stack_union is not at start of per-cpu area"); 356 "irq_stack_union is not at start of per-cpu area");
357#endif 357#endif
358 358
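
Moving __smp_locks_end after the ALIGN makes the symbol pair span a whole number of pages, which presumably keeps page-granular bookkeeping of the region (reservation, or freeing it once it is no longer needed) from seeing a ragged end. A tiny sketch of the rounding, with fake addresses:

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096u

    int main(void)
    {
        uint32_t smp_locks = 0x1000;  /* section start (page aligned) */
        uint32_t raw_end   = 0x1f40;  /* end of the actual entries    */
        uint32_t aligned   = (raw_end + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);

        printf("entries end at %#x; __smp_locks_end now at %#x (%u whole pages)\n",
               raw_end, aligned, (aligned - smp_locks) / PAGE_SIZE);
        return 0;
    }
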
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index ee5746c9462..61a1e8c7e19 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -4,9 +4,11 @@
4 * For licencing details see kernel-base/COPYING 4 * For licencing details see kernel-base/COPYING
5 */ 5 */
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/ioport.h>
7 8
8#include <asm/bios_ebda.h> 9#include <asm/bios_ebda.h>
9#include <asm/paravirt.h> 10#include <asm/paravirt.h>
11#include <asm/pci_x86.h>
10#include <asm/mpspec.h> 12#include <asm/mpspec.h>
11#include <asm/setup.h> 13#include <asm/setup.h>
12#include <asm/apic.h> 14#include <asm/apic.h>
@@ -70,6 +72,12 @@ struct x86_init_ops x86_init __initdata = {
70 .iommu = { 72 .iommu = {
71 .iommu_init = iommu_init_noop, 73 .iommu_init = iommu_init_noop,
72 }, 74 },
75
76 .pci = {
77 .init = x86_default_pci_init,
78 .init_irq = x86_default_pci_init_irq,
79 .fixup_irqs = x86_default_pci_fixup_irqs,
80 },
73}; 81};
74 82
75struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { 83struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
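
The new x86_init.pci group follows the existing x86_init pattern: the table ships with generic defaults, and platform code (the VisWS hunk above, for instance) overwrites the slots early in boot so the rest of the kernel can call through them unconditionally. A user-space sketch of that ops-table override pattern (all names invented):

    #include <stdio.h>

    struct pci_ops_sketch {
        int  (*init)(void);
        void (*init_irq)(void);
    };

    static int  default_pci_init(void) { puts("generic PCI init"); return 0; }
    static void default_pci_irq(void)  { puts("generic PCI IRQ routing"); }
    static int  quirky_pci_init(void)  { puts("platform PCI init"); return 0; }
    static void noop(void)             { }

    static struct pci_ops_sketch pci_ops = {
        .init     = default_pci_init,
        .init_irq = default_pci_irq,
    };

    int main(void)
    {
        /* Early platform detection rewrites the slots... */
        pci_ops.init     = quirky_pci_init;
        pci_ops.init_irq = noop;

        /* ...and later code calls through the table unconditionally. */
        pci_ops.init();
        pci_ops.init_irq();
        return 0;
    }
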