aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2009-12-07 07:14:12 -0500
committerIngo Molnar <mingo@elte.hu>2009-12-07 07:14:18 -0500
commitf3d607c6b39bd9cb5000e03e2c0dc2afe1241374 (patch)
tree885b5e0b5bb3d87efc4bfbde69feff2ece32ecac /arch/x86/kernel
parent8055039c2a2454c7159dcbde3161943b757a6e0e (diff)
parent6ec22f9b037fc0c2e00ddb7023fad279c365324d (diff)
Merge branch 'linus' into x86/urgent
Merge reason: we want to queue up a dependent fix. Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/acpi/processor.c3
-rw-r--r--arch/x86/kernel/amd_iommu.c1247
-rw-r--r--arch/x86/kernel/amd_iommu_init.c94
-rw-r--r--arch/x86/kernel/aperture_64.c4
-rw-r--r--arch/x86/kernel/apic/Makefile2
-rw-r--r--arch/x86/kernel/apic/apic.c34
-rw-r--r--arch/x86/kernel/apic/apic_noop.c200
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c13
-rw-r--r--arch/x86/kernel/apic/es7000_32.c16
-rw-r--r--arch/x86/kernel/apic/io_apic.c364
-rw-r--r--arch/x86/kernel/apic/nmi.c11
-rw-r--r--arch/x86/kernel/apic/numaq_32.c13
-rw-r--r--arch/x86/kernel/apic/probe_32.c2
-rw-r--r--arch/x86/kernel/apic/summit_32.c10
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c8
-rw-r--r--arch/x86/kernel/apm_32.c14
-rw-r--r--arch/x86/kernel/cpu/Makefile1
-rw-r--r--arch/x86/kernel/cpu/amd.c2
-rw-r--r--arch/x86/kernel/cpu/centaur.c2
-rw-r--r--arch/x86/kernel/cpu/common.c34
-rw-r--r--arch/x86/kernel/cpu/cpu.h2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c23
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c19
-rw-r--r--arch/x86/kernel/cpu/cyrix.c2
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c16
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c103
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c29
-rw-r--r--arch/x86/kernel/cpu/perf_event.c205
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c2
-rw-r--r--arch/x86/kernel/cpu/transmeta.c2
-rw-r--r--arch/x86/kernel/cpuid.c17
-rw-r--r--arch/x86/kernel/crash.c5
-rw-r--r--arch/x86/kernel/dumpstack.c7
-rw-r--r--arch/x86/kernel/dumpstack_32.c9
-rw-r--r--arch/x86/kernel/dumpstack_64.c46
-rw-r--r--arch/x86/kernel/entry_32.S31
-rw-r--r--arch/x86/kernel/entry_64.S25
-rw-r--r--arch/x86/kernel/ftrace.c84
-rw-r--r--arch/x86/kernel/head_64.S4
-rw-r--r--arch/x86/kernel/hw_breakpoint.c555
-rw-r--r--arch/x86/kernel/irq.c102
-rw-r--r--arch/x86/kernel/irq_32.c45
-rw-r--r--arch/x86/kernel/irq_64.c58
-rw-r--r--arch/x86/kernel/kgdb.c9
-rw-r--r--arch/x86/kernel/kprobes.c257
-rw-r--r--arch/x86/kernel/machine_kexec_32.c2
-rw-r--r--arch/x86/kernel/machine_kexec_64.c2
-rw-r--r--arch/x86/kernel/microcode_core.c2
-rw-r--r--arch/x86/kernel/msr.c16
-rw-r--r--arch/x86/kernel/pci-calgary_64.c94
-rw-r--r--arch/x86/kernel/pci-dma.c45
-rw-r--r--arch/x86/kernel/pci-gart_64.c156
-rw-r--r--arch/x86/kernel/pci-nommu.c11
-rw-r--r--arch/x86/kernel/pci-swiotlb.c18
-rw-r--r--arch/x86/kernel/process.c21
-rw-r--r--arch/x86/kernel/process_32.c10
-rw-r--r--arch/x86/kernel/process_64.c10
-rw-r--r--arch/x86/kernel/ptrace.c415
-rw-r--r--arch/x86/kernel/reboot.c4
-rw-r--r--arch/x86/kernel/setup.c7
-rw-r--r--arch/x86/kernel/signal.c9
-rw-r--r--arch/x86/kernel/smpboot.c9
-rw-r--r--arch/x86/kernel/traps.c73
-rw-r--r--arch/x86/kernel/tsc_sync.c13
-rw-r--r--arch/x86/kernel/uv_irq.c239
-rw-r--r--arch/x86/kernel/visws_quirks.c8
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c5
-rw-r--r--arch/x86/kernel/x86_init.c8
71 files changed, 3125 insertions, 1789 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d8e5d0cdd678..4f2e66e29ecc 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -40,7 +40,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
40obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o 40obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
41obj-y += bootflag.o e820.o 41obj-y += bootflag.o e820.o
42obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o 42obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
43obj-y += alternative.o i8253.o pci-nommu.o 43obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
44obj-y += tsc.o io_delay.o rtc.o 44obj-y += tsc.o io_delay.o rtc.o
45 45
46obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o 46obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c
index d296f4a195c9..d85d1b2432ba 100644
--- a/arch/x86/kernel/acpi/processor.c
+++ b/arch/x86/kernel/acpi/processor.c
@@ -79,7 +79,8 @@ void arch_acpi_processor_init_pdc(struct acpi_processor *pr)
79 struct cpuinfo_x86 *c = &cpu_data(pr->id); 79 struct cpuinfo_x86 *c = &cpu_data(pr->id);
80 80
81 pr->pdc = NULL; 81 pr->pdc = NULL;
82 if (c->x86_vendor == X86_VENDOR_INTEL) 82 if (c->x86_vendor == X86_VENDOR_INTEL ||
83 c->x86_vendor == X86_VENDOR_CENTAUR)
83 init_intel_pdc(pr, c); 84 init_intel_pdc(pr, c);
84 85
85 return; 86 return;
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 0285521e0a99..32fb09102a13 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -28,6 +28,7 @@
28#include <asm/proto.h> 28#include <asm/proto.h>
29#include <asm/iommu.h> 29#include <asm/iommu.h>
30#include <asm/gart.h> 30#include <asm/gart.h>
31#include <asm/amd_iommu_proto.h>
31#include <asm/amd_iommu_types.h> 32#include <asm/amd_iommu_types.h>
32#include <asm/amd_iommu.h> 33#include <asm/amd_iommu.h>
33 34
@@ -56,20 +57,115 @@ struct iommu_cmd {
56 u32 data[4]; 57 u32 data[4];
57}; 58};
58 59
59static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
60 struct unity_map_entry *e);
61static struct dma_ops_domain *find_protection_domain(u16 devid);
62static u64 *alloc_pte(struct protection_domain *domain,
63 unsigned long address, int end_lvl,
64 u64 **pte_page, gfp_t gfp);
65static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
66 unsigned long start_page,
67 unsigned int pages);
68static void reset_iommu_command_buffer(struct amd_iommu *iommu); 60static void reset_iommu_command_buffer(struct amd_iommu *iommu);
69static u64 *fetch_pte(struct protection_domain *domain,
70 unsigned long address, int map_size);
71static void update_domain(struct protection_domain *domain); 61static void update_domain(struct protection_domain *domain);
72 62
63/****************************************************************************
64 *
65 * Helper functions
66 *
67 ****************************************************************************/
68
69static inline u16 get_device_id(struct device *dev)
70{
71 struct pci_dev *pdev = to_pci_dev(dev);
72
73 return calc_devid(pdev->bus->number, pdev->devfn);
74}
75
76static struct iommu_dev_data *get_dev_data(struct device *dev)
77{
78 return dev->archdata.iommu;
79}
80
81/*
82 * In this function the list of preallocated protection domains is traversed to
83 * find the domain for a specific device
84 */
85static struct dma_ops_domain *find_protection_domain(u16 devid)
86{
87 struct dma_ops_domain *entry, *ret = NULL;
88 unsigned long flags;
89 u16 alias = amd_iommu_alias_table[devid];
90
91 if (list_empty(&iommu_pd_list))
92 return NULL;
93
94 spin_lock_irqsave(&iommu_pd_list_lock, flags);
95
96 list_for_each_entry(entry, &iommu_pd_list, list) {
97 if (entry->target_dev == devid ||
98 entry->target_dev == alias) {
99 ret = entry;
100 break;
101 }
102 }
103
104 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
105
106 return ret;
107}
108
109/*
110 * This function checks if the driver got a valid device from the caller to
111 * avoid dereferencing invalid pointers.
112 */
113static bool check_device(struct device *dev)
114{
115 u16 devid;
116
117 if (!dev || !dev->dma_mask)
118 return false;
119
120 /* No device or no PCI device */
121 if (!dev || dev->bus != &pci_bus_type)
122 return false;
123
124 devid = get_device_id(dev);
125
126 /* Out of our scope? */
127 if (devid > amd_iommu_last_bdf)
128 return false;
129
130 if (amd_iommu_rlookup_table[devid] == NULL)
131 return false;
132
133 return true;
134}
135
136static int iommu_init_device(struct device *dev)
137{
138 struct iommu_dev_data *dev_data;
139 struct pci_dev *pdev;
140 u16 devid, alias;
141
142 if (dev->archdata.iommu)
143 return 0;
144
145 dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
146 if (!dev_data)
147 return -ENOMEM;
148
149 dev_data->dev = dev;
150
151 devid = get_device_id(dev);
152 alias = amd_iommu_alias_table[devid];
153 pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff);
154 if (pdev)
155 dev_data->alias = &pdev->dev;
156
157 atomic_set(&dev_data->bind, 0);
158
159 dev->archdata.iommu = dev_data;
160
161
162 return 0;
163}
164
165static void iommu_uninit_device(struct device *dev)
166{
167 kfree(dev->archdata.iommu);
168}
73#ifdef CONFIG_AMD_IOMMU_STATS 169#ifdef CONFIG_AMD_IOMMU_STATS
74 170
75/* 171/*
@@ -90,7 +186,6 @@ DECLARE_STATS_COUNTER(alloced_io_mem);
90DECLARE_STATS_COUNTER(total_map_requests); 186DECLARE_STATS_COUNTER(total_map_requests);
91 187
92static struct dentry *stats_dir; 188static struct dentry *stats_dir;
93static struct dentry *de_isolate;
94static struct dentry *de_fflush; 189static struct dentry *de_fflush;
95 190
96static void amd_iommu_stats_add(struct __iommu_counter *cnt) 191static void amd_iommu_stats_add(struct __iommu_counter *cnt)
@@ -108,9 +203,6 @@ static void amd_iommu_stats_init(void)
108 if (stats_dir == NULL) 203 if (stats_dir == NULL)
109 return; 204 return;
110 205
111 de_isolate = debugfs_create_bool("isolation", 0444, stats_dir,
112 (u32 *)&amd_iommu_isolate);
113
114 de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir, 206 de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir,
115 (u32 *)&amd_iommu_unmap_flush); 207 (u32 *)&amd_iommu_unmap_flush);
116 208
@@ -130,12 +222,6 @@ static void amd_iommu_stats_init(void)
130 222
131#endif 223#endif
132 224
133/* returns !0 if the IOMMU is caching non-present entries in its TLB */
134static int iommu_has_npcache(struct amd_iommu *iommu)
135{
136 return iommu->cap & (1UL << IOMMU_CAP_NPCACHE);
137}
138
139/**************************************************************************** 225/****************************************************************************
140 * 226 *
141 * Interrupt handling functions 227 * Interrupt handling functions
@@ -199,6 +285,7 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
199 break; 285 break;
200 case EVENT_TYPE_ILL_CMD: 286 case EVENT_TYPE_ILL_CMD:
201 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); 287 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
288 iommu->reset_in_progress = true;
202 reset_iommu_command_buffer(iommu); 289 reset_iommu_command_buffer(iommu);
203 dump_command(address); 290 dump_command(address);
204 break; 291 break;
@@ -321,11 +408,8 @@ static void __iommu_wait_for_completion(struct amd_iommu *iommu)
321 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; 408 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
322 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); 409 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
323 410
324 if (unlikely(i == EXIT_LOOP_COUNT)) { 411 if (unlikely(i == EXIT_LOOP_COUNT))
325 spin_unlock(&iommu->lock); 412 iommu->reset_in_progress = true;
326 reset_iommu_command_buffer(iommu);
327 spin_lock(&iommu->lock);
328 }
329} 413}
330 414
331/* 415/*
@@ -372,26 +456,46 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
372out: 456out:
373 spin_unlock_irqrestore(&iommu->lock, flags); 457 spin_unlock_irqrestore(&iommu->lock, flags);
374 458
459 if (iommu->reset_in_progress)
460 reset_iommu_command_buffer(iommu);
461
375 return 0; 462 return 0;
376} 463}
377 464
465static void iommu_flush_complete(struct protection_domain *domain)
466{
467 int i;
468
469 for (i = 0; i < amd_iommus_present; ++i) {
470 if (!domain->dev_iommu[i])
471 continue;
472
473 /*
474 * Devices of this domain are behind this IOMMU
475 * We need to wait for completion of all commands.
476 */
477 iommu_completion_wait(amd_iommus[i]);
478 }
479}
480
378/* 481/*
379 * Command send function for invalidating a device table entry 482 * Command send function for invalidating a device table entry
380 */ 483 */
381static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) 484static int iommu_flush_device(struct device *dev)
382{ 485{
486 struct amd_iommu *iommu;
383 struct iommu_cmd cmd; 487 struct iommu_cmd cmd;
384 int ret; 488 u16 devid;
385 489
386 BUG_ON(iommu == NULL); 490 devid = get_device_id(dev);
491 iommu = amd_iommu_rlookup_table[devid];
387 492
493 /* Build command */
388 memset(&cmd, 0, sizeof(cmd)); 494 memset(&cmd, 0, sizeof(cmd));
389 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); 495 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
390 cmd.data[0] = devid; 496 cmd.data[0] = devid;
391 497
392 ret = iommu_queue_command(iommu, &cmd); 498 return iommu_queue_command(iommu, &cmd);
393
394 return ret;
395} 499}
396 500
397static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, 501static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
@@ -430,11 +534,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
430 * It invalidates a single PTE if the range to flush is within a single 534 * It invalidates a single PTE if the range to flush is within a single
431 * page. Otherwise it flushes the whole TLB of the IOMMU. 535 * page. Otherwise it flushes the whole TLB of the IOMMU.
432 */ 536 */
433static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, 537static void __iommu_flush_pages(struct protection_domain *domain,
434 u64 address, size_t size) 538 u64 address, size_t size, int pde)
435{ 539{
436 int s = 0; 540 int s = 0, i;
437 unsigned pages = iommu_num_pages(address, size, PAGE_SIZE); 541 unsigned long pages = iommu_num_pages(address, size, PAGE_SIZE);
438 542
439 address &= PAGE_MASK; 543 address &= PAGE_MASK;
440 544
@@ -447,142 +551,212 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
447 s = 1; 551 s = 1;
448 } 552 }
449 553
450 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, s);
451 554
452 return 0; 555 for (i = 0; i < amd_iommus_present; ++i) {
556 if (!domain->dev_iommu[i])
557 continue;
558
559 /*
560 * Devices of this domain are behind this IOMMU
561 * We need a TLB flush
562 */
563 iommu_queue_inv_iommu_pages(amd_iommus[i], address,
564 domain->id, pde, s);
565 }
566
567 return;
453} 568}
454 569
455/* Flush the whole IO/TLB for a given protection domain */ 570static void iommu_flush_pages(struct protection_domain *domain,
456static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid) 571 u64 address, size_t size)
457{ 572{
458 u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; 573 __iommu_flush_pages(domain, address, size, 0);
459 574}
460 INC_STATS_COUNTER(domain_flush_single);
461 575
462 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1); 576/* Flush the whole IO/TLB for a given protection domain */
577static void iommu_flush_tlb(struct protection_domain *domain)
578{
579 __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
463} 580}
464 581
465/* Flush the whole IO/TLB for a given protection domain - including PDE */ 582/* Flush the whole IO/TLB for a given protection domain - including PDE */
466static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid) 583static void iommu_flush_tlb_pde(struct protection_domain *domain)
467{ 584{
468 u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; 585 __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
469
470 INC_STATS_COUNTER(domain_flush_single);
471
472 iommu_queue_inv_iommu_pages(iommu, address, domid, 1, 1);
473} 586}
474 587
588
475/* 589/*
476 * This function flushes one domain on one IOMMU 590 * This function flushes the DTEs for all devices in domain
477 */ 591 */
478static void flush_domain_on_iommu(struct amd_iommu *iommu, u16 domid) 592static void iommu_flush_domain_devices(struct protection_domain *domain)
479{ 593{
480 struct iommu_cmd cmd; 594 struct iommu_dev_data *dev_data;
481 unsigned long flags; 595 unsigned long flags;
482 596
483 __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 597 spin_lock_irqsave(&domain->lock, flags);
484 domid, 1, 1);
485 598
486 spin_lock_irqsave(&iommu->lock, flags); 599 list_for_each_entry(dev_data, &domain->dev_list, list)
487 __iommu_queue_command(iommu, &cmd); 600 iommu_flush_device(dev_data->dev);
488 __iommu_completion_wait(iommu); 601
489 __iommu_wait_for_completion(iommu); 602 spin_unlock_irqrestore(&domain->lock, flags);
490 spin_unlock_irqrestore(&iommu->lock, flags);
491} 603}
492 604
493static void flush_all_domains_on_iommu(struct amd_iommu *iommu) 605static void iommu_flush_all_domain_devices(void)
494{ 606{
495 int i; 607 struct protection_domain *domain;
608 unsigned long flags;
496 609
497 for (i = 1; i < MAX_DOMAIN_ID; ++i) { 610 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
498 if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) 611
499 continue; 612 list_for_each_entry(domain, &amd_iommu_pd_list, list) {
500 flush_domain_on_iommu(iommu, i); 613 iommu_flush_domain_devices(domain);
614 iommu_flush_complete(domain);
501 } 615 }
502 616
617 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
618}
619
620void amd_iommu_flush_all_devices(void)
621{
622 iommu_flush_all_domain_devices();
503} 623}
504 624
505/* 625/*
506 * This function is used to flush the IO/TLB for a given protection domain 626 * This function uses heavy locking and may disable irqs for some time. But
507 * on every IOMMU in the system 627 * this is no issue because it is only called during resume.
508 */ 628 */
509static void iommu_flush_domain(u16 domid) 629void amd_iommu_flush_all_domains(void)
510{ 630{
511 struct amd_iommu *iommu; 631 struct protection_domain *domain;
632 unsigned long flags;
512 633
513 INC_STATS_COUNTER(domain_flush_all); 634 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
514 635
515 for_each_iommu(iommu) 636 list_for_each_entry(domain, &amd_iommu_pd_list, list) {
516 flush_domain_on_iommu(iommu, domid); 637 spin_lock(&domain->lock);
638 iommu_flush_tlb_pde(domain);
639 iommu_flush_complete(domain);
640 spin_unlock(&domain->lock);
641 }
642
643 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
517} 644}
518 645
519void amd_iommu_flush_all_domains(void) 646static void reset_iommu_command_buffer(struct amd_iommu *iommu)
520{ 647{
521 struct amd_iommu *iommu; 648 pr_err("AMD-Vi: Resetting IOMMU command buffer\n");
522 649
523 for_each_iommu(iommu) 650 if (iommu->reset_in_progress)
524 flush_all_domains_on_iommu(iommu); 651 panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n");
652
653 amd_iommu_reset_cmd_buffer(iommu);
654 amd_iommu_flush_all_devices();
655 amd_iommu_flush_all_domains();
656
657 iommu->reset_in_progress = false;
525} 658}
526 659
527static void flush_all_devices_for_iommu(struct amd_iommu *iommu) 660/****************************************************************************
661 *
662 * The functions below are used the create the page table mappings for
663 * unity mapped regions.
664 *
665 ****************************************************************************/
666
667/*
668 * This function is used to add another level to an IO page table. Adding
669 * another level increases the size of the address space by 9 bits to a size up
670 * to 64 bits.
671 */
672static bool increase_address_space(struct protection_domain *domain,
673 gfp_t gfp)
528{ 674{
529 int i; 675 u64 *pte;
530 676
531 for (i = 0; i <= amd_iommu_last_bdf; ++i) { 677 if (domain->mode == PAGE_MODE_6_LEVEL)
532 if (iommu != amd_iommu_rlookup_table[i]) 678 /* address space already 64 bit large */
533 continue; 679 return false;
534 680
535 iommu_queue_inv_dev_entry(iommu, i); 681 pte = (void *)get_zeroed_page(gfp);
536 iommu_completion_wait(iommu); 682 if (!pte)
537 } 683 return false;
684
685 *pte = PM_LEVEL_PDE(domain->mode,
686 virt_to_phys(domain->pt_root));
687 domain->pt_root = pte;
688 domain->mode += 1;
689 domain->updated = true;
690
691 return true;
538} 692}
539 693
540static void flush_devices_by_domain(struct protection_domain *domain) 694static u64 *alloc_pte(struct protection_domain *domain,
695 unsigned long address,
696 int end_lvl,
697 u64 **pte_page,
698 gfp_t gfp)
541{ 699{
542 struct amd_iommu *iommu; 700 u64 *pte, *page;
543 int i; 701 int level;
544 702
545 for (i = 0; i <= amd_iommu_last_bdf; ++i) { 703 while (address > PM_LEVEL_SIZE(domain->mode))
546 if ((domain == NULL && amd_iommu_pd_table[i] == NULL) || 704 increase_address_space(domain, gfp);
547 (amd_iommu_pd_table[i] != domain))
548 continue;
549 705
550 iommu = amd_iommu_rlookup_table[i]; 706 level = domain->mode - 1;
551 if (!iommu) 707 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
552 continue;
553 708
554 iommu_queue_inv_dev_entry(iommu, i); 709 while (level > end_lvl) {
555 iommu_completion_wait(iommu); 710 if (!IOMMU_PTE_PRESENT(*pte)) {
711 page = (u64 *)get_zeroed_page(gfp);
712 if (!page)
713 return NULL;
714 *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
715 }
716
717 level -= 1;
718
719 pte = IOMMU_PTE_PAGE(*pte);
720
721 if (pte_page && level == end_lvl)
722 *pte_page = pte;
723
724 pte = &pte[PM_LEVEL_INDEX(level, address)];
556 } 725 }
726
727 return pte;
557} 728}
558 729
559static void reset_iommu_command_buffer(struct amd_iommu *iommu) 730/*
731 * This function checks if there is a PTE for a given dma address. If
732 * there is one, it returns the pointer to it.
733 */
734static u64 *fetch_pte(struct protection_domain *domain,
735 unsigned long address, int map_size)
560{ 736{
561 pr_err("AMD-Vi: Resetting IOMMU command buffer\n"); 737 int level;
738 u64 *pte;
562 739
563 if (iommu->reset_in_progress) 740 level = domain->mode - 1;
564 panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n"); 741 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
565 742
566 iommu->reset_in_progress = true; 743 while (level > map_size) {
744 if (!IOMMU_PTE_PRESENT(*pte))
745 return NULL;
567 746
568 amd_iommu_reset_cmd_buffer(iommu); 747 level -= 1;
569 flush_all_devices_for_iommu(iommu);
570 flush_all_domains_on_iommu(iommu);
571 748
572 iommu->reset_in_progress = false; 749 pte = IOMMU_PTE_PAGE(*pte);
573} 750 pte = &pte[PM_LEVEL_INDEX(level, address)];
574 751
575void amd_iommu_flush_all_devices(void) 752 if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) {
576{ 753 pte = NULL;
577 flush_devices_by_domain(NULL); 754 break;
578} 755 }
756 }
579 757
580/**************************************************************************** 758 return pte;
581 * 759}
582 * The functions below are used the create the page table mappings for
583 * unity mapped regions.
584 *
585 ****************************************************************************/
586 760
587/* 761/*
588 * Generic mapping functions. It maps a physical address into a DMA 762 * Generic mapping functions. It maps a physical address into a DMA
@@ -654,28 +828,6 @@ static int iommu_for_unity_map(struct amd_iommu *iommu,
654} 828}
655 829
656/* 830/*
657 * Init the unity mappings for a specific IOMMU in the system
658 *
659 * Basically iterates over all unity mapping entries and applies them to
660 * the default domain DMA of that IOMMU if necessary.
661 */
662static int iommu_init_unity_mappings(struct amd_iommu *iommu)
663{
664 struct unity_map_entry *entry;
665 int ret;
666
667 list_for_each_entry(entry, &amd_iommu_unity_map, list) {
668 if (!iommu_for_unity_map(iommu, entry))
669 continue;
670 ret = dma_ops_unity_map(iommu->default_dom, entry);
671 if (ret)
672 return ret;
673 }
674
675 return 0;
676}
677
678/*
679 * This function actually applies the mapping to the page table of the 831 * This function actually applies the mapping to the page table of the
680 * dma_ops domain. 832 * dma_ops domain.
681 */ 833 */
@@ -704,6 +856,28 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
704} 856}
705 857
706/* 858/*
859 * Init the unity mappings for a specific IOMMU in the system
860 *
861 * Basically iterates over all unity mapping entries and applies them to
862 * the default domain DMA of that IOMMU if necessary.
863 */
864static int iommu_init_unity_mappings(struct amd_iommu *iommu)
865{
866 struct unity_map_entry *entry;
867 int ret;
868
869 list_for_each_entry(entry, &amd_iommu_unity_map, list) {
870 if (!iommu_for_unity_map(iommu, entry))
871 continue;
872 ret = dma_ops_unity_map(iommu->default_dom, entry);
873 if (ret)
874 return ret;
875 }
876
877 return 0;
878}
879
880/*
707 * Inits the unity mappings required for a specific device 881 * Inits the unity mappings required for a specific device
708 */ 882 */
709static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, 883static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
@@ -740,34 +914,23 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
740 */ 914 */
741 915
742/* 916/*
743 * This function checks if there is a PTE for a given dma address. If 917 * Used to reserve address ranges in the aperture (e.g. for exclusion
744 * there is one, it returns the pointer to it. 918 * ranges.
745 */ 919 */
746static u64 *fetch_pte(struct protection_domain *domain, 920static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
747 unsigned long address, int map_size) 921 unsigned long start_page,
922 unsigned int pages)
748{ 923{
749 int level; 924 unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
750 u64 *pte;
751
752 level = domain->mode - 1;
753 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
754
755 while (level > map_size) {
756 if (!IOMMU_PTE_PRESENT(*pte))
757 return NULL;
758
759 level -= 1;
760 925
761 pte = IOMMU_PTE_PAGE(*pte); 926 if (start_page + pages > last_page)
762 pte = &pte[PM_LEVEL_INDEX(level, address)]; 927 pages = last_page - start_page;
763 928
764 if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) { 929 for (i = start_page; i < start_page + pages; ++i) {
765 pte = NULL; 930 int index = i / APERTURE_RANGE_PAGES;
766 break; 931 int page = i % APERTURE_RANGE_PAGES;
767 } 932 __set_bit(page, dom->aperture[index]->bitmap);
768 } 933 }
769
770 return pte;
771} 934}
772 935
773/* 936/*
@@ -775,11 +938,11 @@ static u64 *fetch_pte(struct protection_domain *domain,
775 * aperture in case of dma_ops domain allocation or address allocation 938 * aperture in case of dma_ops domain allocation or address allocation
776 * failure. 939 * failure.
777 */ 940 */
778static int alloc_new_range(struct amd_iommu *iommu, 941static int alloc_new_range(struct dma_ops_domain *dma_dom,
779 struct dma_ops_domain *dma_dom,
780 bool populate, gfp_t gfp) 942 bool populate, gfp_t gfp)
781{ 943{
782 int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; 944 int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
945 struct amd_iommu *iommu;
783 int i; 946 int i;
784 947
785#ifdef CONFIG_IOMMU_STRESS 948#ifdef CONFIG_IOMMU_STRESS
@@ -819,14 +982,17 @@ static int alloc_new_range(struct amd_iommu *iommu,
819 dma_dom->aperture_size += APERTURE_RANGE_SIZE; 982 dma_dom->aperture_size += APERTURE_RANGE_SIZE;
820 983
821 /* Intialize the exclusion range if necessary */ 984 /* Intialize the exclusion range if necessary */
822 if (iommu->exclusion_start && 985 for_each_iommu(iommu) {
823 iommu->exclusion_start >= dma_dom->aperture[index]->offset && 986 if (iommu->exclusion_start &&
824 iommu->exclusion_start < dma_dom->aperture_size) { 987 iommu->exclusion_start >= dma_dom->aperture[index]->offset
825 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; 988 && iommu->exclusion_start < dma_dom->aperture_size) {
826 int pages = iommu_num_pages(iommu->exclusion_start, 989 unsigned long startpage;
827 iommu->exclusion_length, 990 int pages = iommu_num_pages(iommu->exclusion_start,
828 PAGE_SIZE); 991 iommu->exclusion_length,
829 dma_ops_reserve_addresses(dma_dom, startpage, pages); 992 PAGE_SIZE);
993 startpage = iommu->exclusion_start >> PAGE_SHIFT;
994 dma_ops_reserve_addresses(dma_dom, startpage, pages);
995 }
830 } 996 }
831 997
832 /* 998 /*
@@ -928,7 +1094,7 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
928 } 1094 }
929 1095
930 if (unlikely(address == -1)) 1096 if (unlikely(address == -1))
931 address = bad_dma_address; 1097 address = DMA_ERROR_CODE;
932 1098
933 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); 1099 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
934 1100
@@ -973,6 +1139,31 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
973 * 1139 *
974 ****************************************************************************/ 1140 ****************************************************************************/
975 1141
1142/*
1143 * This function adds a protection domain to the global protection domain list
1144 */
1145static void add_domain_to_list(struct protection_domain *domain)
1146{
1147 unsigned long flags;
1148
1149 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
1150 list_add(&domain->list, &amd_iommu_pd_list);
1151 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
1152}
1153
1154/*
1155 * This function removes a protection domain to the global
1156 * protection domain list
1157 */
1158static void del_domain_from_list(struct protection_domain *domain)
1159{
1160 unsigned long flags;
1161
1162 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
1163 list_del(&domain->list);
1164 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
1165}
1166
976static u16 domain_id_alloc(void) 1167static u16 domain_id_alloc(void)
977{ 1168{
978 unsigned long flags; 1169 unsigned long flags;
@@ -1000,26 +1191,6 @@ static void domain_id_free(int id)
1000 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1191 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1001} 1192}
1002 1193
1003/*
1004 * Used to reserve address ranges in the aperture (e.g. for exclusion
1005 * ranges.
1006 */
1007static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
1008 unsigned long start_page,
1009 unsigned int pages)
1010{
1011 unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
1012
1013 if (start_page + pages > last_page)
1014 pages = last_page - start_page;
1015
1016 for (i = start_page; i < start_page + pages; ++i) {
1017 int index = i / APERTURE_RANGE_PAGES;
1018 int page = i % APERTURE_RANGE_PAGES;
1019 __set_bit(page, dom->aperture[index]->bitmap);
1020 }
1021}
1022
1023static void free_pagetable(struct protection_domain *domain) 1194static void free_pagetable(struct protection_domain *domain)
1024{ 1195{
1025 int i, j; 1196 int i, j;
@@ -1061,6 +1232,8 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
1061 if (!dom) 1232 if (!dom)
1062 return; 1233 return;
1063 1234
1235 del_domain_from_list(&dom->domain);
1236
1064 free_pagetable(&dom->domain); 1237 free_pagetable(&dom->domain);
1065 1238
1066 for (i = 0; i < APERTURE_MAX_RANGES; ++i) { 1239 for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
@@ -1078,7 +1251,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
1078 * It also intializes the page table and the address allocator data 1251 * It also intializes the page table and the address allocator data
1079 * structures required for the dma_ops interface 1252 * structures required for the dma_ops interface
1080 */ 1253 */
1081static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) 1254static struct dma_ops_domain *dma_ops_domain_alloc(void)
1082{ 1255{
1083 struct dma_ops_domain *dma_dom; 1256 struct dma_ops_domain *dma_dom;
1084 1257
@@ -1091,6 +1264,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
1091 dma_dom->domain.id = domain_id_alloc(); 1264 dma_dom->domain.id = domain_id_alloc();
1092 if (dma_dom->domain.id == 0) 1265 if (dma_dom->domain.id == 0)
1093 goto free_dma_dom; 1266 goto free_dma_dom;
1267 INIT_LIST_HEAD(&dma_dom->domain.dev_list);
1094 dma_dom->domain.mode = PAGE_MODE_2_LEVEL; 1268 dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
1095 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); 1269 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
1096 dma_dom->domain.flags = PD_DMA_OPS_MASK; 1270 dma_dom->domain.flags = PD_DMA_OPS_MASK;
@@ -1101,7 +1275,9 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
1101 dma_dom->need_flush = false; 1275 dma_dom->need_flush = false;
1102 dma_dom->target_dev = 0xffff; 1276 dma_dom->target_dev = 0xffff;
1103 1277
1104 if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL)) 1278 add_domain_to_list(&dma_dom->domain);
1279
1280 if (alloc_new_range(dma_dom, true, GFP_KERNEL))
1105 goto free_dma_dom; 1281 goto free_dma_dom;
1106 1282
1107 /* 1283 /*
@@ -1129,22 +1305,6 @@ static bool dma_ops_domain(struct protection_domain *domain)
1129 return domain->flags & PD_DMA_OPS_MASK; 1305 return domain->flags & PD_DMA_OPS_MASK;
1130} 1306}
1131 1307
1132/*
1133 * Find out the protection domain structure for a given PCI device. This
1134 * will give us the pointer to the page table root for example.
1135 */
1136static struct protection_domain *domain_for_device(u16 devid)
1137{
1138 struct protection_domain *dom;
1139 unsigned long flags;
1140
1141 read_lock_irqsave(&amd_iommu_devtable_lock, flags);
1142 dom = amd_iommu_pd_table[devid];
1143 read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1144
1145 return dom;
1146}
1147
1148static void set_dte_entry(u16 devid, struct protection_domain *domain) 1308static void set_dte_entry(u16 devid, struct protection_domain *domain)
1149{ 1309{
1150 u64 pte_root = virt_to_phys(domain->pt_root); 1310 u64 pte_root = virt_to_phys(domain->pt_root);
@@ -1156,42 +1316,123 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain)
1156 amd_iommu_dev_table[devid].data[2] = domain->id; 1316 amd_iommu_dev_table[devid].data[2] = domain->id;
1157 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); 1317 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
1158 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); 1318 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
1319}
1320
1321static void clear_dte_entry(u16 devid)
1322{
1323 /* remove entry from the device table seen by the hardware */
1324 amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
1325 amd_iommu_dev_table[devid].data[1] = 0;
1326 amd_iommu_dev_table[devid].data[2] = 0;
1159 1327
1160 amd_iommu_pd_table[devid] = domain; 1328 amd_iommu_apply_erratum_63(devid);
1329}
1330
1331static void do_attach(struct device *dev, struct protection_domain *domain)
1332{
1333 struct iommu_dev_data *dev_data;
1334 struct amd_iommu *iommu;
1335 u16 devid;
1336
1337 devid = get_device_id(dev);
1338 iommu = amd_iommu_rlookup_table[devid];
1339 dev_data = get_dev_data(dev);
1340
1341 /* Update data structures */
1342 dev_data->domain = domain;
1343 list_add(&dev_data->list, &domain->dev_list);
1344 set_dte_entry(devid, domain);
1345
1346 /* Do reference counting */
1347 domain->dev_iommu[iommu->index] += 1;
1348 domain->dev_cnt += 1;
1349
1350 /* Flush the DTE entry */
1351 iommu_flush_device(dev);
1352}
1353
1354static void do_detach(struct device *dev)
1355{
1356 struct iommu_dev_data *dev_data;
1357 struct amd_iommu *iommu;
1358 u16 devid;
1359
1360 devid = get_device_id(dev);
1361 iommu = amd_iommu_rlookup_table[devid];
1362 dev_data = get_dev_data(dev);
1363
1364 /* decrease reference counters */
1365 dev_data->domain->dev_iommu[iommu->index] -= 1;
1366 dev_data->domain->dev_cnt -= 1;
1367
1368 /* Update data structures */
1369 dev_data->domain = NULL;
1370 list_del(&dev_data->list);
1371 clear_dte_entry(devid);
1372
1373 /* Flush the DTE entry */
1374 iommu_flush_device(dev);
1161} 1375}
1162 1376
1163/* 1377/*
1164 * If a device is not yet associated with a domain, this function does 1378 * If a device is not yet associated with a domain, this function does
1165 * assigns it visible for the hardware 1379 * assigns it visible for the hardware
1166 */ 1380 */
1167static void __attach_device(struct amd_iommu *iommu, 1381static int __attach_device(struct device *dev,
1168 struct protection_domain *domain, 1382 struct protection_domain *domain)
1169 u16 devid)
1170{ 1383{
1384 struct iommu_dev_data *dev_data, *alias_data;
1385
1386 dev_data = get_dev_data(dev);
1387 alias_data = get_dev_data(dev_data->alias);
1388
1389 if (!alias_data)
1390 return -EINVAL;
1391
1171 /* lock domain */ 1392 /* lock domain */
1172 spin_lock(&domain->lock); 1393 spin_lock(&domain->lock);
1173 1394
1174 /* update DTE entry */ 1395 /* Some sanity checks */
1175 set_dte_entry(devid, domain); 1396 if (alias_data->domain != NULL &&
1397 alias_data->domain != domain)
1398 return -EBUSY;
1176 1399
1177 domain->dev_cnt += 1; 1400 if (dev_data->domain != NULL &&
1401 dev_data->domain != domain)
1402 return -EBUSY;
1403
1404 /* Do real assignment */
1405 if (dev_data->alias != dev) {
1406 alias_data = get_dev_data(dev_data->alias);
1407 if (alias_data->domain == NULL)
1408 do_attach(dev_data->alias, domain);
1409
1410 atomic_inc(&alias_data->bind);
1411 }
1412
1413 if (dev_data->domain == NULL)
1414 do_attach(dev, domain);
1415
1416 atomic_inc(&dev_data->bind);
1178 1417
1179 /* ready */ 1418 /* ready */
1180 spin_unlock(&domain->lock); 1419 spin_unlock(&domain->lock);
1420
1421 return 0;
1181} 1422}
1182 1423
1183/* 1424/*
1184 * If a device is not yet associated with a domain, this function does 1425 * If a device is not yet associated with a domain, this function does
1185 * assigns it visible for the hardware 1426 * assigns it visible for the hardware
1186 */ 1427 */
1187static void attach_device(struct amd_iommu *iommu, 1428static int attach_device(struct device *dev,
1188 struct protection_domain *domain, 1429 struct protection_domain *domain)
1189 u16 devid)
1190{ 1430{
1191 unsigned long flags; 1431 unsigned long flags;
1432 int ret;
1192 1433
1193 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 1434 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1194 __attach_device(iommu, domain, devid); 1435 ret = __attach_device(dev, domain);
1195 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1436 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1196 1437
1197 /* 1438 /*
@@ -1199,98 +1440,125 @@ static void attach_device(struct amd_iommu *iommu,
1199 * left the caches in the IOMMU dirty. So we have to flush 1440 * left the caches in the IOMMU dirty. So we have to flush
1200 * here to evict all dirty stuff. 1441 * here to evict all dirty stuff.
1201 */ 1442 */
1202 iommu_queue_inv_dev_entry(iommu, devid); 1443 iommu_flush_tlb_pde(domain);
1203 iommu_flush_tlb_pde(iommu, domain->id); 1444
1445 return ret;
1204} 1446}
1205 1447
1206/* 1448/*
1207 * Removes a device from a protection domain (unlocked) 1449 * Removes a device from a protection domain (unlocked)
1208 */ 1450 */
1209static void __detach_device(struct protection_domain *domain, u16 devid) 1451static void __detach_device(struct device *dev)
1210{ 1452{
1453 struct iommu_dev_data *dev_data = get_dev_data(dev);
1454 struct iommu_dev_data *alias_data;
1455 unsigned long flags;
1211 1456
1212 /* lock domain */ 1457 BUG_ON(!dev_data->domain);
1213 spin_lock(&domain->lock);
1214
1215 /* remove domain from the lookup table */
1216 amd_iommu_pd_table[devid] = NULL;
1217 1458
1218 /* remove entry from the device table seen by the hardware */ 1459 spin_lock_irqsave(&dev_data->domain->lock, flags);
1219 amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
1220 amd_iommu_dev_table[devid].data[1] = 0;
1221 amd_iommu_dev_table[devid].data[2] = 0;
1222 1460
1223 amd_iommu_apply_erratum_63(devid); 1461 if (dev_data->alias != dev) {
1462 alias_data = get_dev_data(dev_data->alias);
1463 if (atomic_dec_and_test(&alias_data->bind))
1464 do_detach(dev_data->alias);
1465 }
1224 1466
1225 /* decrease reference counter */ 1467 if (atomic_dec_and_test(&dev_data->bind))
1226 domain->dev_cnt -= 1; 1468 do_detach(dev);
1227 1469
1228 /* ready */ 1470 spin_unlock_irqrestore(&dev_data->domain->lock, flags);
1229 spin_unlock(&domain->lock);
1230 1471
1231 /* 1472 /*
1232 * If we run in passthrough mode the device must be assigned to the 1473 * If we run in passthrough mode the device must be assigned to the
1233 * passthrough domain if it is detached from any other domain 1474 * passthrough domain if it is detached from any other domain
1234 */ 1475 */
1235 if (iommu_pass_through) { 1476 if (iommu_pass_through && dev_data->domain == NULL)
1236 struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; 1477 __attach_device(dev, pt_domain);
1237 __attach_device(iommu, pt_domain, devid);
1238 }
1239} 1478}
1240 1479
1241/* 1480/*
1242 * Removes a device from a protection domain (with devtable_lock held) 1481 * Removes a device from a protection domain (with devtable_lock held)
1243 */ 1482 */
1244static void detach_device(struct protection_domain *domain, u16 devid) 1483static void detach_device(struct device *dev)
1245{ 1484{
1246 unsigned long flags; 1485 unsigned long flags;
1247 1486
1248 /* lock device table */ 1487 /* lock device table */
1249 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 1488 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1250 __detach_device(domain, devid); 1489 __detach_device(dev);
1251 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1490 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1252} 1491}
1253 1492
1493/*
1494 * Find out the protection domain structure for a given PCI device. This
1495 * will give us the pointer to the page table root for example.
1496 */
1497static struct protection_domain *domain_for_device(struct device *dev)
1498{
1499 struct protection_domain *dom;
1500 struct iommu_dev_data *dev_data, *alias_data;
1501 unsigned long flags;
1502 u16 devid, alias;
1503
1504 devid = get_device_id(dev);
1505 alias = amd_iommu_alias_table[devid];
1506 dev_data = get_dev_data(dev);
1507 alias_data = get_dev_data(dev_data->alias);
1508 if (!alias_data)
1509 return NULL;
1510
1511 read_lock_irqsave(&amd_iommu_devtable_lock, flags);
1512 dom = dev_data->domain;
1513 if (dom == NULL &&
1514 alias_data->domain != NULL) {
1515 __attach_device(dev, alias_data->domain);
1516 dom = alias_data->domain;
1517 }
1518
1519 read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1520
1521 return dom;
1522}
1523
1254static int device_change_notifier(struct notifier_block *nb, 1524static int device_change_notifier(struct notifier_block *nb,
1255 unsigned long action, void *data) 1525 unsigned long action, void *data)
1256{ 1526{
1257 struct device *dev = data; 1527 struct device *dev = data;
1258 struct pci_dev *pdev = to_pci_dev(dev); 1528 u16 devid;
1259 u16 devid = calc_devid(pdev->bus->number, pdev->devfn);
1260 struct protection_domain *domain; 1529 struct protection_domain *domain;
1261 struct dma_ops_domain *dma_domain; 1530 struct dma_ops_domain *dma_domain;
1262 struct amd_iommu *iommu; 1531 struct amd_iommu *iommu;
1263 unsigned long flags; 1532 unsigned long flags;
1264 1533
1265 if (devid > amd_iommu_last_bdf) 1534 if (!check_device(dev))
1266 goto out; 1535 return 0;
1267
1268 devid = amd_iommu_alias_table[devid];
1269
1270 iommu = amd_iommu_rlookup_table[devid];
1271 if (iommu == NULL)
1272 goto out;
1273
1274 domain = domain_for_device(devid);
1275 1536
1276 if (domain && !dma_ops_domain(domain)) 1537 devid = get_device_id(dev);
1277 WARN_ONCE(1, "AMD IOMMU WARNING: device %s already bound " 1538 iommu = amd_iommu_rlookup_table[devid];
1278 "to a non-dma-ops domain\n", dev_name(dev));
1279 1539
1280 switch (action) { 1540 switch (action) {
1281 case BUS_NOTIFY_UNBOUND_DRIVER: 1541 case BUS_NOTIFY_UNBOUND_DRIVER:
1542
1543 domain = domain_for_device(dev);
1544
1282 if (!domain) 1545 if (!domain)
1283 goto out; 1546 goto out;
1284 if (iommu_pass_through) 1547 if (iommu_pass_through)
1285 break; 1548 break;
1286 detach_device(domain, devid); 1549 detach_device(dev);
1287 break; 1550 break;
1288 case BUS_NOTIFY_ADD_DEVICE: 1551 case BUS_NOTIFY_ADD_DEVICE:
1552
1553 iommu_init_device(dev);
1554
1555 domain = domain_for_device(dev);
1556
1289 /* allocate a protection domain if a device is added */ 1557 /* allocate a protection domain if a device is added */
1290 dma_domain = find_protection_domain(devid); 1558 dma_domain = find_protection_domain(devid);
1291 if (dma_domain) 1559 if (dma_domain)
1292 goto out; 1560 goto out;
1293 dma_domain = dma_ops_domain_alloc(iommu); 1561 dma_domain = dma_ops_domain_alloc();
1294 if (!dma_domain) 1562 if (!dma_domain)
1295 goto out; 1563 goto out;
1296 dma_domain->target_dev = devid; 1564 dma_domain->target_dev = devid;
@@ -1300,11 +1568,15 @@ static int device_change_notifier(struct notifier_block *nb,
1300 spin_unlock_irqrestore(&iommu_pd_list_lock, flags); 1568 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
1301 1569
1302 break; 1570 break;
1571 case BUS_NOTIFY_DEL_DEVICE:
1572
1573 iommu_uninit_device(dev);
1574
1303 default: 1575 default:
1304 goto out; 1576 goto out;
1305 } 1577 }
1306 1578
1307 iommu_queue_inv_dev_entry(iommu, devid); 1579 iommu_flush_device(dev);
1308 iommu_completion_wait(iommu); 1580 iommu_completion_wait(iommu);
1309 1581
1310out: 1582out:
@@ -1322,106 +1594,46 @@ static struct notifier_block device_nb = {
1322 *****************************************************************************/ 1594 *****************************************************************************/
1323 1595
1324/* 1596/*
1325 * This function checks if the driver got a valid device from the caller to
1326 * avoid dereferencing invalid pointers.
1327 */
1328static bool check_device(struct device *dev)
1329{
1330 if (!dev || !dev->dma_mask)
1331 return false;
1332
1333 return true;
1334}
1335
1336/*
1337 * In this function the list of preallocated protection domains is traversed to
1338 * find the domain for a specific device
1339 */
1340static struct dma_ops_domain *find_protection_domain(u16 devid)
1341{
1342 struct dma_ops_domain *entry, *ret = NULL;
1343 unsigned long flags;
1344
1345 if (list_empty(&iommu_pd_list))
1346 return NULL;
1347
1348 spin_lock_irqsave(&iommu_pd_list_lock, flags);
1349
1350 list_for_each_entry(entry, &iommu_pd_list, list) {
1351 if (entry->target_dev == devid) {
1352 ret = entry;
1353 break;
1354 }
1355 }
1356
1357 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
1358
1359 return ret;
1360}
1361
1362/*
1363 * In the dma_ops path we only have the struct device. This function 1597 * In the dma_ops path we only have the struct device. This function
1364 * finds the corresponding IOMMU, the protection domain and the 1598 * finds the corresponding IOMMU, the protection domain and the
1365 * requestor id for a given device. 1599 * requestor id for a given device.
1366 * If the device is not yet associated with a domain this is also done 1600 * If the device is not yet associated with a domain this is also done
1367 * in this function. 1601 * in this function.
1368 */ 1602 */
1369static int get_device_resources(struct device *dev, 1603static struct protection_domain *get_domain(struct device *dev)
1370 struct amd_iommu **iommu,
1371 struct protection_domain **domain,
1372 u16 *bdf)
1373{ 1604{
1605 struct protection_domain *domain;
1374 struct dma_ops_domain *dma_dom; 1606 struct dma_ops_domain *dma_dom;
1375 struct pci_dev *pcidev; 1607 u16 devid = get_device_id(dev);
1376 u16 _bdf;
1377
1378 *iommu = NULL;
1379 *domain = NULL;
1380 *bdf = 0xffff;
1381
1382 if (dev->bus != &pci_bus_type)
1383 return 0;
1384 1608
1385 pcidev = to_pci_dev(dev); 1609 if (!check_device(dev))
1386 _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); 1610 return ERR_PTR(-EINVAL);
1387 1611
1388 /* device not translated by any IOMMU in the system? */ 1612 domain = domain_for_device(dev);
1389 if (_bdf > amd_iommu_last_bdf) 1613 if (domain != NULL && !dma_ops_domain(domain))
1390 return 0; 1614 return ERR_PTR(-EBUSY);
1391 1615
1392 *bdf = amd_iommu_alias_table[_bdf]; 1616 if (domain != NULL)
1617 return domain;
1393 1618
1394 *iommu = amd_iommu_rlookup_table[*bdf]; 1619 /* Device not bount yet - bind it */
1395 if (*iommu == NULL) 1620 dma_dom = find_protection_domain(devid);
1396 return 0; 1621 if (!dma_dom)
1397 *domain = domain_for_device(*bdf); 1622 dma_dom = amd_iommu_rlookup_table[devid]->default_dom;
1398 if (*domain == NULL) { 1623 attach_device(dev, &dma_dom->domain);
1399 dma_dom = find_protection_domain(*bdf); 1624 DUMP_printk("Using protection domain %d for device %s\n",
1400 if (!dma_dom) 1625 dma_dom->domain.id, dev_name(dev));
1401 dma_dom = (*iommu)->default_dom;
1402 *domain = &dma_dom->domain;
1403 attach_device(*iommu, *domain, *bdf);
1404 DUMP_printk("Using protection domain %d for device %s\n",
1405 (*domain)->id, dev_name(dev));
1406 }
1407
1408 if (domain_for_device(_bdf) == NULL)
1409 attach_device(*iommu, *domain, _bdf);
1410 1626
1411 return 1; 1627 return &dma_dom->domain;
1412} 1628}
1413 1629
1414static void update_device_table(struct protection_domain *domain) 1630static void update_device_table(struct protection_domain *domain)
1415{ 1631{
1416 unsigned long flags; 1632 struct iommu_dev_data *dev_data;
1417 int i;
1418 1633
1419 for (i = 0; i <= amd_iommu_last_bdf; ++i) { 1634 list_for_each_entry(dev_data, &domain->dev_list, list) {
1420 if (amd_iommu_pd_table[i] != domain) 1635 u16 devid = get_device_id(dev_data->dev);
1421 continue; 1636 set_dte_entry(devid, domain);
1422 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1423 set_dte_entry(i, domain);
1424 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1425 } 1637 }
1426} 1638}
1427 1639
@@ -1431,76 +1643,13 @@ static void update_domain(struct protection_domain *domain)
1431 return; 1643 return;
1432 1644
1433 update_device_table(domain); 1645 update_device_table(domain);
1434 flush_devices_by_domain(domain); 1646 iommu_flush_domain_devices(domain);
1435 iommu_flush_domain(domain->id); 1647 iommu_flush_tlb_pde(domain);
1436 1648
1437 domain->updated = false; 1649 domain->updated = false;
1438} 1650}
1439 1651
1440/* 1652/*
1441 * This function is used to add another level to an IO page table. Adding
1442 * another level increases the size of the address space by 9 bits to a size up
1443 * to 64 bits.
1444 */
1445static bool increase_address_space(struct protection_domain *domain,
1446 gfp_t gfp)
1447{
1448 u64 *pte;
1449
1450 if (domain->mode == PAGE_MODE_6_LEVEL)
1451 /* address space already 64 bit large */
1452 return false;
1453
1454 pte = (void *)get_zeroed_page(gfp);
1455 if (!pte)
1456 return false;
1457
1458 *pte = PM_LEVEL_PDE(domain->mode,
1459 virt_to_phys(domain->pt_root));
1460 domain->pt_root = pte;
1461 domain->mode += 1;
1462 domain->updated = true;
1463
1464 return true;
1465}
1466
1467static u64 *alloc_pte(struct protection_domain *domain,
1468 unsigned long address,
1469 int end_lvl,
1470 u64 **pte_page,
1471 gfp_t gfp)
1472{
1473 u64 *pte, *page;
1474 int level;
1475
1476 while (address > PM_LEVEL_SIZE(domain->mode))
1477 increase_address_space(domain, gfp);
1478
1479 level = domain->mode - 1;
1480 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
1481
1482 while (level > end_lvl) {
1483 if (!IOMMU_PTE_PRESENT(*pte)) {
1484 page = (u64 *)get_zeroed_page(gfp);
1485 if (!page)
1486 return NULL;
1487 *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
1488 }
1489
1490 level -= 1;
1491
1492 pte = IOMMU_PTE_PAGE(*pte);
1493
1494 if (pte_page && level == end_lvl)
1495 *pte_page = pte;
1496
1497 pte = &pte[PM_LEVEL_INDEX(level, address)];
1498 }
1499
1500 return pte;
1501}
1502
1503/*
1504 * This function fetches the PTE for a given address in the aperture 1653 * This function fetches the PTE for a given address in the aperture
1505 */ 1654 */
1506static u64* dma_ops_get_pte(struct dma_ops_domain *dom, 1655static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
@@ -1530,8 +1679,7 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
1530 * This is the generic map function. It maps one 4kb page at paddr to 1679 * This is the generic map function. It maps one 4kb page at paddr to
1531 * the given address in the DMA address space for the domain. 1680 * the given address in the DMA address space for the domain.
1532 */ 1681 */
1533static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, 1682static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom,
1534 struct dma_ops_domain *dom,
1535 unsigned long address, 1683 unsigned long address,
1536 phys_addr_t paddr, 1684 phys_addr_t paddr,
1537 int direction) 1685 int direction)
@@ -1544,7 +1692,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
1544 1692
1545 pte = dma_ops_get_pte(dom, address); 1693 pte = dma_ops_get_pte(dom, address);
1546 if (!pte) 1694 if (!pte)
1547 return bad_dma_address; 1695 return DMA_ERROR_CODE;
1548 1696
1549 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; 1697 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
1550 1698
@@ -1565,8 +1713,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
1565/* 1713/*
1566 * The generic unmapping function for on page in the DMA address space. 1714 * The generic unmapping function for on page in the DMA address space.
1567 */ 1715 */
1568static void dma_ops_domain_unmap(struct amd_iommu *iommu, 1716static void dma_ops_domain_unmap(struct dma_ops_domain *dom,
1569 struct dma_ops_domain *dom,
1570 unsigned long address) 1717 unsigned long address)
1571{ 1718{
1572 struct aperture_range *aperture; 1719 struct aperture_range *aperture;
@@ -1597,7 +1744,6 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
1597 * Must be called with the domain lock held. 1744 * Must be called with the domain lock held.
1598 */ 1745 */
1599static dma_addr_t __map_single(struct device *dev, 1746static dma_addr_t __map_single(struct device *dev,
1600 struct amd_iommu *iommu,
1601 struct dma_ops_domain *dma_dom, 1747 struct dma_ops_domain *dma_dom,
1602 phys_addr_t paddr, 1748 phys_addr_t paddr,
1603 size_t size, 1749 size_t size,
@@ -1625,7 +1771,7 @@ static dma_addr_t __map_single(struct device *dev,
1625retry: 1771retry:
1626 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, 1772 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
1627 dma_mask); 1773 dma_mask);
1628 if (unlikely(address == bad_dma_address)) { 1774 if (unlikely(address == DMA_ERROR_CODE)) {
1629 /* 1775 /*
1630 * setting next_address here will let the address 1776 * setting next_address here will let the address
1631 * allocator only scan the new allocated range in the 1777 * allocator only scan the new allocated range in the
@@ -1633,7 +1779,7 @@ retry:
1633 */ 1779 */
1634 dma_dom->next_address = dma_dom->aperture_size; 1780 dma_dom->next_address = dma_dom->aperture_size;
1635 1781
1636 if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC)) 1782 if (alloc_new_range(dma_dom, false, GFP_ATOMIC))
1637 goto out; 1783 goto out;
1638 1784
1639 /* 1785 /*
@@ -1645,8 +1791,8 @@ retry:
1645 1791
1646 start = address; 1792 start = address;
1647 for (i = 0; i < pages; ++i) { 1793 for (i = 0; i < pages; ++i) {
1648 ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); 1794 ret = dma_ops_domain_map(dma_dom, start, paddr, dir);
1649 if (ret == bad_dma_address) 1795 if (ret == DMA_ERROR_CODE)
1650 goto out_unmap; 1796 goto out_unmap;
1651 1797
1652 paddr += PAGE_SIZE; 1798 paddr += PAGE_SIZE;
@@ -1657,10 +1803,10 @@ retry:
1657 ADD_STATS_COUNTER(alloced_io_mem, size); 1803 ADD_STATS_COUNTER(alloced_io_mem, size);
1658 1804
1659 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { 1805 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
1660 iommu_flush_tlb(iommu, dma_dom->domain.id); 1806 iommu_flush_tlb(&dma_dom->domain);
1661 dma_dom->need_flush = false; 1807 dma_dom->need_flush = false;
1662 } else if (unlikely(iommu_has_npcache(iommu))) 1808 } else if (unlikely(amd_iommu_np_cache))
1663 iommu_flush_pages(iommu, dma_dom->domain.id, address, size); 1809 iommu_flush_pages(&dma_dom->domain, address, size);
1664 1810
1665out: 1811out:
1666 return address; 1812 return address;
@@ -1669,20 +1815,19 @@ out_unmap:
1669 1815
1670 for (--i; i >= 0; --i) { 1816 for (--i; i >= 0; --i) {
1671 start -= PAGE_SIZE; 1817 start -= PAGE_SIZE;
1672 dma_ops_domain_unmap(iommu, dma_dom, start); 1818 dma_ops_domain_unmap(dma_dom, start);
1673 } 1819 }
1674 1820
1675 dma_ops_free_addresses(dma_dom, address, pages); 1821 dma_ops_free_addresses(dma_dom, address, pages);
1676 1822
1677 return bad_dma_address; 1823 return DMA_ERROR_CODE;
1678} 1824}
1679 1825
1680/* 1826/*
1681 * Does the reverse of the __map_single function. Must be called with 1827 * Does the reverse of the __map_single function. Must be called with
1682 * the domain lock held too 1828 * the domain lock held too
1683 */ 1829 */
1684static void __unmap_single(struct amd_iommu *iommu, 1830static void __unmap_single(struct dma_ops_domain *dma_dom,
1685 struct dma_ops_domain *dma_dom,
1686 dma_addr_t dma_addr, 1831 dma_addr_t dma_addr,
1687 size_t size, 1832 size_t size,
1688 int dir) 1833 int dir)
@@ -1690,7 +1835,7 @@ static void __unmap_single(struct amd_iommu *iommu,
1690 dma_addr_t i, start; 1835 dma_addr_t i, start;
1691 unsigned int pages; 1836 unsigned int pages;
1692 1837
1693 if ((dma_addr == bad_dma_address) || 1838 if ((dma_addr == DMA_ERROR_CODE) ||
1694 (dma_addr + size > dma_dom->aperture_size)) 1839 (dma_addr + size > dma_dom->aperture_size))
1695 return; 1840 return;
1696 1841
@@ -1699,7 +1844,7 @@ static void __unmap_single(struct amd_iommu *iommu,
1699 start = dma_addr; 1844 start = dma_addr;
1700 1845
1701 for (i = 0; i < pages; ++i) { 1846 for (i = 0; i < pages; ++i) {
1702 dma_ops_domain_unmap(iommu, dma_dom, start); 1847 dma_ops_domain_unmap(dma_dom, start);
1703 start += PAGE_SIZE; 1848 start += PAGE_SIZE;
1704 } 1849 }
1705 1850
@@ -1708,7 +1853,7 @@ static void __unmap_single(struct amd_iommu *iommu,
1708 dma_ops_free_addresses(dma_dom, dma_addr, pages); 1853 dma_ops_free_addresses(dma_dom, dma_addr, pages);
1709 1854
1710 if (amd_iommu_unmap_flush || dma_dom->need_flush) { 1855 if (amd_iommu_unmap_flush || dma_dom->need_flush) {
1711 iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size); 1856 iommu_flush_pages(&dma_dom->domain, dma_addr, size);
1712 dma_dom->need_flush = false; 1857 dma_dom->need_flush = false;
1713 } 1858 }
1714} 1859}
@@ -1722,36 +1867,29 @@ static dma_addr_t map_page(struct device *dev, struct page *page,
1722 struct dma_attrs *attrs) 1867 struct dma_attrs *attrs)
1723{ 1868{
1724 unsigned long flags; 1869 unsigned long flags;
1725 struct amd_iommu *iommu;
1726 struct protection_domain *domain; 1870 struct protection_domain *domain;
1727 u16 devid;
1728 dma_addr_t addr; 1871 dma_addr_t addr;
1729 u64 dma_mask; 1872 u64 dma_mask;
1730 phys_addr_t paddr = page_to_phys(page) + offset; 1873 phys_addr_t paddr = page_to_phys(page) + offset;
1731 1874
1732 INC_STATS_COUNTER(cnt_map_single); 1875 INC_STATS_COUNTER(cnt_map_single);
1733 1876
1734 if (!check_device(dev)) 1877 domain = get_domain(dev);
1735 return bad_dma_address; 1878 if (PTR_ERR(domain) == -EINVAL)
1736
1737 dma_mask = *dev->dma_mask;
1738
1739 get_device_resources(dev, &iommu, &domain, &devid);
1740
1741 if (iommu == NULL || domain == NULL)
1742 /* device not handled by any AMD IOMMU */
1743 return (dma_addr_t)paddr; 1879 return (dma_addr_t)paddr;
1880 else if (IS_ERR(domain))
1881 return DMA_ERROR_CODE;
1744 1882
1745 if (!dma_ops_domain(domain)) 1883 dma_mask = *dev->dma_mask;
1746 return bad_dma_address;
1747 1884
1748 spin_lock_irqsave(&domain->lock, flags); 1885 spin_lock_irqsave(&domain->lock, flags);
1749 addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false, 1886
1887 addr = __map_single(dev, domain->priv, paddr, size, dir, false,
1750 dma_mask); 1888 dma_mask);
1751 if (addr == bad_dma_address) 1889 if (addr == DMA_ERROR_CODE)
1752 goto out; 1890 goto out;
1753 1891
1754 iommu_completion_wait(iommu); 1892 iommu_flush_complete(domain);
1755 1893
1756out: 1894out:
1757 spin_unlock_irqrestore(&domain->lock, flags); 1895 spin_unlock_irqrestore(&domain->lock, flags);
@@ -1766,25 +1904,19 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
1766 enum dma_data_direction dir, struct dma_attrs *attrs) 1904 enum dma_data_direction dir, struct dma_attrs *attrs)
1767{ 1905{
1768 unsigned long flags; 1906 unsigned long flags;
1769 struct amd_iommu *iommu;
1770 struct protection_domain *domain; 1907 struct protection_domain *domain;
1771 u16 devid;
1772 1908
1773 INC_STATS_COUNTER(cnt_unmap_single); 1909 INC_STATS_COUNTER(cnt_unmap_single);
1774 1910
1775 if (!check_device(dev) || 1911 domain = get_domain(dev);
1776 !get_device_resources(dev, &iommu, &domain, &devid)) 1912 if (IS_ERR(domain))
1777 /* device not handled by any AMD IOMMU */
1778 return;
1779
1780 if (!dma_ops_domain(domain))
1781 return; 1913 return;
1782 1914
1783 spin_lock_irqsave(&domain->lock, flags); 1915 spin_lock_irqsave(&domain->lock, flags);
1784 1916
1785 __unmap_single(iommu, domain->priv, dma_addr, size, dir); 1917 __unmap_single(domain->priv, dma_addr, size, dir);
1786 1918
1787 iommu_completion_wait(iommu); 1919 iommu_flush_complete(domain);
1788 1920
1789 spin_unlock_irqrestore(&domain->lock, flags); 1921 spin_unlock_irqrestore(&domain->lock, flags);
1790} 1922}
@@ -1816,9 +1948,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
1816 struct dma_attrs *attrs) 1948 struct dma_attrs *attrs)
1817{ 1949{
1818 unsigned long flags; 1950 unsigned long flags;
1819 struct amd_iommu *iommu;
1820 struct protection_domain *domain; 1951 struct protection_domain *domain;
1821 u16 devid;
1822 int i; 1952 int i;
1823 struct scatterlist *s; 1953 struct scatterlist *s;
1824 phys_addr_t paddr; 1954 phys_addr_t paddr;
@@ -1827,25 +1957,20 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
1827 1957
1828 INC_STATS_COUNTER(cnt_map_sg); 1958 INC_STATS_COUNTER(cnt_map_sg);
1829 1959
1830 if (!check_device(dev)) 1960 domain = get_domain(dev);
1961 if (PTR_ERR(domain) == -EINVAL)
1962 return map_sg_no_iommu(dev, sglist, nelems, dir);
1963 else if (IS_ERR(domain))
1831 return 0; 1964 return 0;
1832 1965
1833 dma_mask = *dev->dma_mask; 1966 dma_mask = *dev->dma_mask;
1834 1967
1835 get_device_resources(dev, &iommu, &domain, &devid);
1836
1837 if (!iommu || !domain)
1838 return map_sg_no_iommu(dev, sglist, nelems, dir);
1839
1840 if (!dma_ops_domain(domain))
1841 return 0;
1842
1843 spin_lock_irqsave(&domain->lock, flags); 1968 spin_lock_irqsave(&domain->lock, flags);
1844 1969
1845 for_each_sg(sglist, s, nelems, i) { 1970 for_each_sg(sglist, s, nelems, i) {
1846 paddr = sg_phys(s); 1971 paddr = sg_phys(s);
1847 1972
1848 s->dma_address = __map_single(dev, iommu, domain->priv, 1973 s->dma_address = __map_single(dev, domain->priv,
1849 paddr, s->length, dir, false, 1974 paddr, s->length, dir, false,
1850 dma_mask); 1975 dma_mask);
1851 1976
@@ -1856,7 +1981,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
1856 goto unmap; 1981 goto unmap;
1857 } 1982 }
1858 1983
1859 iommu_completion_wait(iommu); 1984 iommu_flush_complete(domain);
1860 1985
1861out: 1986out:
1862 spin_unlock_irqrestore(&domain->lock, flags); 1987 spin_unlock_irqrestore(&domain->lock, flags);
@@ -1865,7 +1990,7 @@ out:
1865unmap: 1990unmap:
1866 for_each_sg(sglist, s, mapped_elems, i) { 1991 for_each_sg(sglist, s, mapped_elems, i) {
1867 if (s->dma_address) 1992 if (s->dma_address)
1868 __unmap_single(iommu, domain->priv, s->dma_address, 1993 __unmap_single(domain->priv, s->dma_address,
1869 s->dma_length, dir); 1994 s->dma_length, dir);
1870 s->dma_address = s->dma_length = 0; 1995 s->dma_address = s->dma_length = 0;
1871 } 1996 }
@@ -1884,30 +2009,25 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
1884 struct dma_attrs *attrs) 2009 struct dma_attrs *attrs)
1885{ 2010{
1886 unsigned long flags; 2011 unsigned long flags;
1887 struct amd_iommu *iommu;
1888 struct protection_domain *domain; 2012 struct protection_domain *domain;
1889 struct scatterlist *s; 2013 struct scatterlist *s;
1890 u16 devid;
1891 int i; 2014 int i;
1892 2015
1893 INC_STATS_COUNTER(cnt_unmap_sg); 2016 INC_STATS_COUNTER(cnt_unmap_sg);
1894 2017
1895 if (!check_device(dev) || 2018 domain = get_domain(dev);
1896 !get_device_resources(dev, &iommu, &domain, &devid)) 2019 if (IS_ERR(domain))
1897 return;
1898
1899 if (!dma_ops_domain(domain))
1900 return; 2020 return;
1901 2021
1902 spin_lock_irqsave(&domain->lock, flags); 2022 spin_lock_irqsave(&domain->lock, flags);
1903 2023
1904 for_each_sg(sglist, s, nelems, i) { 2024 for_each_sg(sglist, s, nelems, i) {
1905 __unmap_single(iommu, domain->priv, s->dma_address, 2025 __unmap_single(domain->priv, s->dma_address,
1906 s->dma_length, dir); 2026 s->dma_length, dir);
1907 s->dma_address = s->dma_length = 0; 2027 s->dma_address = s->dma_length = 0;
1908 } 2028 }
1909 2029
1910 iommu_completion_wait(iommu); 2030 iommu_flush_complete(domain);
1911 2031
1912 spin_unlock_irqrestore(&domain->lock, flags); 2032 spin_unlock_irqrestore(&domain->lock, flags);
1913} 2033}
@@ -1920,49 +2040,44 @@ static void *alloc_coherent(struct device *dev, size_t size,
1920{ 2040{
1921 unsigned long flags; 2041 unsigned long flags;
1922 void *virt_addr; 2042 void *virt_addr;
1923 struct amd_iommu *iommu;
1924 struct protection_domain *domain; 2043 struct protection_domain *domain;
1925 u16 devid;
1926 phys_addr_t paddr; 2044 phys_addr_t paddr;
1927 u64 dma_mask = dev->coherent_dma_mask; 2045 u64 dma_mask = dev->coherent_dma_mask;
1928 2046
1929 INC_STATS_COUNTER(cnt_alloc_coherent); 2047 INC_STATS_COUNTER(cnt_alloc_coherent);
1930 2048
1931 if (!check_device(dev)) 2049 domain = get_domain(dev);
2050 if (PTR_ERR(domain) == -EINVAL) {
2051 virt_addr = (void *)__get_free_pages(flag, get_order(size));
2052 *dma_addr = __pa(virt_addr);
2053 return virt_addr;
2054 } else if (IS_ERR(domain))
1932 return NULL; 2055 return NULL;
1933 2056
1934 if (!get_device_resources(dev, &iommu, &domain, &devid)) 2057 dma_mask = dev->coherent_dma_mask;
1935 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); 2058 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
2059 flag |= __GFP_ZERO;
1936 2060
1937 flag |= __GFP_ZERO;
1938 virt_addr = (void *)__get_free_pages(flag, get_order(size)); 2061 virt_addr = (void *)__get_free_pages(flag, get_order(size));
1939 if (!virt_addr) 2062 if (!virt_addr)
1940 return NULL; 2063 return NULL;
1941 2064
1942 paddr = virt_to_phys(virt_addr); 2065 paddr = virt_to_phys(virt_addr);
1943 2066
1944 if (!iommu || !domain) {
1945 *dma_addr = (dma_addr_t)paddr;
1946 return virt_addr;
1947 }
1948
1949 if (!dma_ops_domain(domain))
1950 goto out_free;
1951
1952 if (!dma_mask) 2067 if (!dma_mask)
1953 dma_mask = *dev->dma_mask; 2068 dma_mask = *dev->dma_mask;
1954 2069
1955 spin_lock_irqsave(&domain->lock, flags); 2070 spin_lock_irqsave(&domain->lock, flags);
1956 2071
1957 *dma_addr = __map_single(dev, iommu, domain->priv, paddr, 2072 *dma_addr = __map_single(dev, domain->priv, paddr,
1958 size, DMA_BIDIRECTIONAL, true, dma_mask); 2073 size, DMA_BIDIRECTIONAL, true, dma_mask);
1959 2074
1960 if (*dma_addr == bad_dma_address) { 2075 if (*dma_addr == DMA_ERROR_CODE) {
1961 spin_unlock_irqrestore(&domain->lock, flags); 2076 spin_unlock_irqrestore(&domain->lock, flags);
1962 goto out_free; 2077 goto out_free;
1963 } 2078 }
1964 2079
1965 iommu_completion_wait(iommu); 2080 iommu_flush_complete(domain);
1966 2081
1967 spin_unlock_irqrestore(&domain->lock, flags); 2082 spin_unlock_irqrestore(&domain->lock, flags);
1968 2083
@@ -1982,28 +2097,19 @@ static void free_coherent(struct device *dev, size_t size,
1982 void *virt_addr, dma_addr_t dma_addr) 2097 void *virt_addr, dma_addr_t dma_addr)
1983{ 2098{
1984 unsigned long flags; 2099 unsigned long flags;
1985 struct amd_iommu *iommu;
1986 struct protection_domain *domain; 2100 struct protection_domain *domain;
1987 u16 devid;
1988 2101
1989 INC_STATS_COUNTER(cnt_free_coherent); 2102 INC_STATS_COUNTER(cnt_free_coherent);
1990 2103
1991 if (!check_device(dev)) 2104 domain = get_domain(dev);
1992 return; 2105 if (IS_ERR(domain))
1993
1994 get_device_resources(dev, &iommu, &domain, &devid);
1995
1996 if (!iommu || !domain)
1997 goto free_mem;
1998
1999 if (!dma_ops_domain(domain))
2000 goto free_mem; 2106 goto free_mem;
2001 2107
2002 spin_lock_irqsave(&domain->lock, flags); 2108 spin_lock_irqsave(&domain->lock, flags);
2003 2109
2004 __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); 2110 __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
2005 2111
2006 iommu_completion_wait(iommu); 2112 iommu_flush_complete(domain);
2007 2113
2008 spin_unlock_irqrestore(&domain->lock, flags); 2114 spin_unlock_irqrestore(&domain->lock, flags);
2009 2115
@@ -2017,22 +2123,7 @@ free_mem:
2017 */ 2123 */
2018static int amd_iommu_dma_supported(struct device *dev, u64 mask) 2124static int amd_iommu_dma_supported(struct device *dev, u64 mask)
2019{ 2125{
2020 u16 bdf; 2126 return check_device(dev);
2021 struct pci_dev *pcidev;
2022
2023 /* No device or no PCI device */
2024 if (!dev || dev->bus != &pci_bus_type)
2025 return 0;
2026
2027 pcidev = to_pci_dev(dev);
2028
2029 bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
2030
2031 /* Out of our scope? */
2032 if (bdf > amd_iommu_last_bdf)
2033 return 0;
2034
2035 return 1;
2036} 2127}
2037 2128
2038/* 2129/*
@@ -2046,25 +2137,30 @@ static void prealloc_protection_domains(void)
2046{ 2137{
2047 struct pci_dev *dev = NULL; 2138 struct pci_dev *dev = NULL;
2048 struct dma_ops_domain *dma_dom; 2139 struct dma_ops_domain *dma_dom;
2049 struct amd_iommu *iommu;
2050 u16 devid; 2140 u16 devid;
2051 2141
2052 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 2142 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
2053 devid = calc_devid(dev->bus->number, dev->devfn); 2143
2054 if (devid > amd_iommu_last_bdf) 2144 /* Do we handle this device? */
2055 continue; 2145 if (!check_device(&dev->dev))
2056 devid = amd_iommu_alias_table[devid];
2057 if (domain_for_device(devid))
2058 continue; 2146 continue;
2059 iommu = amd_iommu_rlookup_table[devid]; 2147
2060 if (!iommu) 2148 iommu_init_device(&dev->dev);
2149
2150 /* Is there already any domain for it? */
2151 if (domain_for_device(&dev->dev))
2061 continue; 2152 continue;
2062 dma_dom = dma_ops_domain_alloc(iommu); 2153
2154 devid = get_device_id(&dev->dev);
2155
2156 dma_dom = dma_ops_domain_alloc();
2063 if (!dma_dom) 2157 if (!dma_dom)
2064 continue; 2158 continue;
2065 init_unity_mappings_for_device(dma_dom, devid); 2159 init_unity_mappings_for_device(dma_dom, devid);
2066 dma_dom->target_dev = devid; 2160 dma_dom->target_dev = devid;
2067 2161
2162 attach_device(&dev->dev, &dma_dom->domain);
2163
2068 list_add_tail(&dma_dom->list, &iommu_pd_list); 2164 list_add_tail(&dma_dom->list, &iommu_pd_list);
2069 } 2165 }
2070} 2166}
@@ -2093,7 +2189,7 @@ int __init amd_iommu_init_dma_ops(void)
2093 * protection domain will be assigned to the default one. 2189 * protection domain will be assigned to the default one.
2094 */ 2190 */
2095 for_each_iommu(iommu) { 2191 for_each_iommu(iommu) {
2096 iommu->default_dom = dma_ops_domain_alloc(iommu); 2192 iommu->default_dom = dma_ops_domain_alloc();
2097 if (iommu->default_dom == NULL) 2193 if (iommu->default_dom == NULL)
2098 return -ENOMEM; 2194 return -ENOMEM;
2099 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; 2195 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
@@ -2103,15 +2199,12 @@ int __init amd_iommu_init_dma_ops(void)
2103 } 2199 }
2104 2200
2105 /* 2201 /*
2106 * If device isolation is enabled, pre-allocate the protection 2202 * Pre-allocate the protection domains for each device.
2107 * domains for each device.
2108 */ 2203 */
2109 if (amd_iommu_isolate) 2204 prealloc_protection_domains();
2110 prealloc_protection_domains();
2111 2205
2112 iommu_detected = 1; 2206 iommu_detected = 1;
2113 force_iommu = 1; 2207 swiotlb = 0;
2114 bad_dma_address = 0;
2115#ifdef CONFIG_GART_IOMMU 2208#ifdef CONFIG_GART_IOMMU
2116 gart_iommu_aperture_disabled = 1; 2209 gart_iommu_aperture_disabled = 1;
2117 gart_iommu_aperture = 0; 2210 gart_iommu_aperture = 0;
@@ -2150,14 +2243,17 @@ free_domains:
2150 2243
2151static void cleanup_domain(struct protection_domain *domain) 2244static void cleanup_domain(struct protection_domain *domain)
2152{ 2245{
2246 struct iommu_dev_data *dev_data, *next;
2153 unsigned long flags; 2247 unsigned long flags;
2154 u16 devid;
2155 2248
2156 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 2249 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
2157 2250
2158 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) 2251 list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) {
2159 if (amd_iommu_pd_table[devid] == domain) 2252 struct device *dev = dev_data->dev;
2160 __detach_device(domain, devid); 2253
2254 do_detach(dev);
2255 atomic_set(&dev_data->bind, 0);
2256 }
2161 2257
2162 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 2258 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
2163} 2259}
@@ -2167,6 +2263,8 @@ static void protection_domain_free(struct protection_domain *domain)
2167 if (!domain) 2263 if (!domain)
2168 return; 2264 return;
2169 2265
2266 del_domain_from_list(domain);
2267
2170 if (domain->id) 2268 if (domain->id)
2171 domain_id_free(domain->id); 2269 domain_id_free(domain->id);
2172 2270
@@ -2185,6 +2283,9 @@ static struct protection_domain *protection_domain_alloc(void)
2185 domain->id = domain_id_alloc(); 2283 domain->id = domain_id_alloc();
2186 if (!domain->id) 2284 if (!domain->id)
2187 goto out_err; 2285 goto out_err;
2286 INIT_LIST_HEAD(&domain->dev_list);
2287
2288 add_domain_to_list(domain);
2188 2289
2189 return domain; 2290 return domain;
2190 2291
@@ -2241,26 +2342,23 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom)
2241static void amd_iommu_detach_device(struct iommu_domain *dom, 2342static void amd_iommu_detach_device(struct iommu_domain *dom,
2242 struct device *dev) 2343 struct device *dev)
2243{ 2344{
2244 struct protection_domain *domain = dom->priv; 2345 struct iommu_dev_data *dev_data = dev->archdata.iommu;
2245 struct amd_iommu *iommu; 2346 struct amd_iommu *iommu;
2246 struct pci_dev *pdev;
2247 u16 devid; 2347 u16 devid;
2248 2348
2249 if (dev->bus != &pci_bus_type) 2349 if (!check_device(dev))
2250 return; 2350 return;
2251 2351
2252 pdev = to_pci_dev(dev); 2352 devid = get_device_id(dev);
2253
2254 devid = calc_devid(pdev->bus->number, pdev->devfn);
2255 2353
2256 if (devid > 0) 2354 if (dev_data->domain != NULL)
2257 detach_device(domain, devid); 2355 detach_device(dev);
2258 2356
2259 iommu = amd_iommu_rlookup_table[devid]; 2357 iommu = amd_iommu_rlookup_table[devid];
2260 if (!iommu) 2358 if (!iommu)
2261 return; 2359 return;
2262 2360
2263 iommu_queue_inv_dev_entry(iommu, devid); 2361 iommu_flush_device(dev);
2264 iommu_completion_wait(iommu); 2362 iommu_completion_wait(iommu);
2265} 2363}
2266 2364
@@ -2268,35 +2366,30 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
2268 struct device *dev) 2366 struct device *dev)
2269{ 2367{
2270 struct protection_domain *domain = dom->priv; 2368 struct protection_domain *domain = dom->priv;
2271 struct protection_domain *old_domain; 2369 struct iommu_dev_data *dev_data;
2272 struct amd_iommu *iommu; 2370 struct amd_iommu *iommu;
2273 struct pci_dev *pdev; 2371 int ret;
2274 u16 devid; 2372 u16 devid;
2275 2373
2276 if (dev->bus != &pci_bus_type) 2374 if (!check_device(dev))
2277 return -EINVAL; 2375 return -EINVAL;
2278 2376
2279 pdev = to_pci_dev(dev); 2377 dev_data = dev->archdata.iommu;
2280 2378
2281 devid = calc_devid(pdev->bus->number, pdev->devfn); 2379 devid = get_device_id(dev);
2282
2283 if (devid >= amd_iommu_last_bdf ||
2284 devid != amd_iommu_alias_table[devid])
2285 return -EINVAL;
2286 2380
2287 iommu = amd_iommu_rlookup_table[devid]; 2381 iommu = amd_iommu_rlookup_table[devid];
2288 if (!iommu) 2382 if (!iommu)
2289 return -EINVAL; 2383 return -EINVAL;
2290 2384
2291 old_domain = domain_for_device(devid); 2385 if (dev_data->domain)
2292 if (old_domain) 2386 detach_device(dev);
2293 detach_device(old_domain, devid);
2294 2387
2295 attach_device(iommu, domain, devid); 2388 ret = attach_device(dev, domain);
2296 2389
2297 iommu_completion_wait(iommu); 2390 iommu_completion_wait(iommu);
2298 2391
2299 return 0; 2392 return ret;
2300} 2393}
2301 2394
2302static int amd_iommu_map_range(struct iommu_domain *dom, 2395static int amd_iommu_map_range(struct iommu_domain *dom,
@@ -2342,7 +2435,7 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom,
2342 iova += PAGE_SIZE; 2435 iova += PAGE_SIZE;
2343 } 2436 }
2344 2437
2345 iommu_flush_domain(domain->id); 2438 iommu_flush_tlb_pde(domain);
2346} 2439}
2347 2440
2348static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, 2441static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
@@ -2393,8 +2486,9 @@ static struct iommu_ops amd_iommu_ops = {
2393 2486
2394int __init amd_iommu_init_passthrough(void) 2487int __init amd_iommu_init_passthrough(void)
2395{ 2488{
2489 struct amd_iommu *iommu;
2396 struct pci_dev *dev = NULL; 2490 struct pci_dev *dev = NULL;
2397 u16 devid, devid2; 2491 u16 devid;
2398 2492
2399 /* allocate passthroug domain */ 2493 /* allocate passthroug domain */
2400 pt_domain = protection_domain_alloc(); 2494 pt_domain = protection_domain_alloc();
@@ -2404,20 +2498,17 @@ int __init amd_iommu_init_passthrough(void)
2404 pt_domain->mode |= PAGE_MODE_NONE; 2498 pt_domain->mode |= PAGE_MODE_NONE;
2405 2499
2406 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 2500 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
2407 struct amd_iommu *iommu;
2408 2501
2409 devid = calc_devid(dev->bus->number, dev->devfn); 2502 if (!check_device(&dev->dev))
2410 if (devid > amd_iommu_last_bdf)
2411 continue; 2503 continue;
2412 2504
2413 devid2 = amd_iommu_alias_table[devid]; 2505 devid = get_device_id(&dev->dev);
2414 2506
2415 iommu = amd_iommu_rlookup_table[devid2]; 2507 iommu = amd_iommu_rlookup_table[devid];
2416 if (!iommu) 2508 if (!iommu)
2417 continue; 2509 continue;
2418 2510
2419 __attach_device(iommu, pt_domain, devid); 2511 attach_device(&dev->dev, pt_domain);
2420 __attach_device(iommu, pt_domain, devid2);
2421 } 2512 }
2422 2513
2423 pr_info("AMD-Vi: Initialized for Passthrough Mode\n"); 2514 pr_info("AMD-Vi: Initialized for Passthrough Mode\n");
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index c20001e4f556..7ffc39965233 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -25,10 +25,12 @@
25#include <linux/interrupt.h> 25#include <linux/interrupt.h>
26#include <linux/msi.h> 26#include <linux/msi.h>
27#include <asm/pci-direct.h> 27#include <asm/pci-direct.h>
28#include <asm/amd_iommu_proto.h>
28#include <asm/amd_iommu_types.h> 29#include <asm/amd_iommu_types.h>
29#include <asm/amd_iommu.h> 30#include <asm/amd_iommu.h>
30#include <asm/iommu.h> 31#include <asm/iommu.h>
31#include <asm/gart.h> 32#include <asm/gart.h>
33#include <asm/x86_init.h>
32 34
33/* 35/*
34 * definitions for the ACPI scanning code 36 * definitions for the ACPI scanning code
@@ -123,18 +125,24 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have
123 to handle */ 125 to handle */
124LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings 126LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
125 we find in ACPI */ 127 we find in ACPI */
126#ifdef CONFIG_IOMMU_STRESS
127bool amd_iommu_isolate = false;
128#else
129bool amd_iommu_isolate = true; /* if true, device isolation is
130 enabled */
131#endif
132
133bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ 128bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
134 129
135LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the 130LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
136 system */ 131 system */
137 132
133/* Array to assign indices to IOMMUs*/
134struct amd_iommu *amd_iommus[MAX_IOMMUS];
135int amd_iommus_present;
136
137/* IOMMUs have a non-present cache? */
138bool amd_iommu_np_cache __read_mostly;
139
140/*
141 * List of protection domains - used during resume
142 */
143LIST_HEAD(amd_iommu_pd_list);
144spinlock_t amd_iommu_pd_lock;
145
138/* 146/*
139 * Pointer to the device table which is shared by all AMD IOMMUs 147 * Pointer to the device table which is shared by all AMD IOMMUs
140 * it is indexed by the PCI device id or the HT unit id and contains 148 * it is indexed by the PCI device id or the HT unit id and contains
@@ -157,12 +165,6 @@ u16 *amd_iommu_alias_table;
157struct amd_iommu **amd_iommu_rlookup_table; 165struct amd_iommu **amd_iommu_rlookup_table;
158 166
159/* 167/*
160 * The pd table (protection domain table) is used to find the protection domain
161 * data structure a device belongs to. Indexed with the PCI device id too.
162 */
163struct protection_domain **amd_iommu_pd_table;
164
165/*
166 * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap 168 * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap
167 * to know which ones are already in use. 169 * to know which ones are already in use.
168 */ 170 */
@@ -838,7 +840,18 @@ static void __init free_iommu_all(void)
838static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) 840static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
839{ 841{
840 spin_lock_init(&iommu->lock); 842 spin_lock_init(&iommu->lock);
843
844 /* Add IOMMU to internal data structures */
841 list_add_tail(&iommu->list, &amd_iommu_list); 845 list_add_tail(&iommu->list, &amd_iommu_list);
846 iommu->index = amd_iommus_present++;
847
848 if (unlikely(iommu->index >= MAX_IOMMUS)) {
849 WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n");
850 return -ENOSYS;
851 }
852
853 /* Index is fine - add IOMMU to the array */
854 amd_iommus[iommu->index] = iommu;
842 855
843 /* 856 /*
844 * Copy data from ACPI table entry to the iommu struct 857 * Copy data from ACPI table entry to the iommu struct
@@ -868,6 +881,9 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
868 init_iommu_from_acpi(iommu, h); 881 init_iommu_from_acpi(iommu, h);
869 init_iommu_devices(iommu); 882 init_iommu_devices(iommu);
870 883
884 if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE))
885 amd_iommu_np_cache = true;
886
871 return pci_enable_device(iommu->dev); 887 return pci_enable_device(iommu->dev);
872} 888}
873 889
@@ -925,7 +941,7 @@ static int __init init_iommu_all(struct acpi_table_header *table)
925 * 941 *
926 ****************************************************************************/ 942 ****************************************************************************/
927 943
928static int __init iommu_setup_msi(struct amd_iommu *iommu) 944static int iommu_setup_msi(struct amd_iommu *iommu)
929{ 945{
930 int r; 946 int r;
931 947
@@ -1176,19 +1192,10 @@ static struct sys_device device_amd_iommu = {
1176 * functions. Finally it prints some information about AMD IOMMUs and 1192 * functions. Finally it prints some information about AMD IOMMUs and
1177 * the driver state and enables the hardware. 1193 * the driver state and enables the hardware.
1178 */ 1194 */
1179int __init amd_iommu_init(void) 1195static int __init amd_iommu_init(void)
1180{ 1196{
1181 int i, ret = 0; 1197 int i, ret = 0;
1182 1198
1183
1184 if (no_iommu) {
1185 printk(KERN_INFO "AMD-Vi disabled by kernel command line\n");
1186 return 0;
1187 }
1188
1189 if (!amd_iommu_detected)
1190 return -ENODEV;
1191
1192 /* 1199 /*
1193 * First parse ACPI tables to find the largest Bus/Dev/Func 1200 * First parse ACPI tables to find the largest Bus/Dev/Func
1194 * we need to handle. Upon this information the shared data 1201 * we need to handle. Upon this information the shared data
@@ -1225,15 +1232,6 @@ int __init amd_iommu_init(void)
1225 if (amd_iommu_rlookup_table == NULL) 1232 if (amd_iommu_rlookup_table == NULL)
1226 goto free; 1233 goto free;
1227 1234
1228 /*
1229 * Protection Domain table - maps devices to protection domains
1230 * This table has the same size as the rlookup_table
1231 */
1232 amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
1233 get_order(rlookup_table_size));
1234 if (amd_iommu_pd_table == NULL)
1235 goto free;
1236
1237 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages( 1235 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
1238 GFP_KERNEL | __GFP_ZERO, 1236 GFP_KERNEL | __GFP_ZERO,
1239 get_order(MAX_DOMAIN_ID/8)); 1237 get_order(MAX_DOMAIN_ID/8));
@@ -1255,6 +1253,8 @@ int __init amd_iommu_init(void)
1255 */ 1253 */
1256 amd_iommu_pd_alloc_bitmap[0] = 1; 1254 amd_iommu_pd_alloc_bitmap[0] = 1;
1257 1255
1256 spin_lock_init(&amd_iommu_pd_lock);
1257
1258 /* 1258 /*
1259 * now the data structures are allocated and basically initialized 1259 * now the data structures are allocated and basically initialized
1260 * start the real acpi table scan 1260 * start the real acpi table scan
@@ -1286,17 +1286,12 @@ int __init amd_iommu_init(void)
1286 if (iommu_pass_through) 1286 if (iommu_pass_through)
1287 goto out; 1287 goto out;
1288 1288
1289 printk(KERN_INFO "AMD-Vi: device isolation ");
1290 if (amd_iommu_isolate)
1291 printk("enabled\n");
1292 else
1293 printk("disabled\n");
1294
1295 if (amd_iommu_unmap_flush) 1289 if (amd_iommu_unmap_flush)
1296 printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n"); 1290 printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n");
1297 else 1291 else
1298 printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n"); 1292 printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
1299 1293
1294 x86_platform.iommu_shutdown = disable_iommus;
1300out: 1295out:
1301 return ret; 1296 return ret;
1302 1297
@@ -1304,9 +1299,6 @@ free:
1304 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1299 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
1305 get_order(MAX_DOMAIN_ID/8)); 1300 get_order(MAX_DOMAIN_ID/8));
1306 1301
1307 free_pages((unsigned long)amd_iommu_pd_table,
1308 get_order(rlookup_table_size));
1309
1310 free_pages((unsigned long)amd_iommu_rlookup_table, 1302 free_pages((unsigned long)amd_iommu_rlookup_table,
1311 get_order(rlookup_table_size)); 1303 get_order(rlookup_table_size));
1312 1304
@@ -1323,11 +1315,6 @@ free:
1323 goto out; 1315 goto out;
1324} 1316}
1325 1317
1326void amd_iommu_shutdown(void)
1327{
1328 disable_iommus();
1329}
1330
1331/**************************************************************************** 1318/****************************************************************************
1332 * 1319 *
1333 * Early detect code. This code runs at IOMMU detection time in the DMA 1320 * Early detect code. This code runs at IOMMU detection time in the DMA
@@ -1342,16 +1329,13 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table)
1342 1329
1343void __init amd_iommu_detect(void) 1330void __init amd_iommu_detect(void)
1344{ 1331{
1345 if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture)) 1332 if (no_iommu || (iommu_detected && !gart_iommu_aperture))
1346 return; 1333 return;
1347 1334
1348 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { 1335 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
1349 iommu_detected = 1; 1336 iommu_detected = 1;
1350 amd_iommu_detected = 1; 1337 amd_iommu_detected = 1;
1351#ifdef CONFIG_GART_IOMMU 1338 x86_init.iommu.iommu_init = amd_iommu_init;
1352 gart_iommu_aperture_disabled = 1;
1353 gart_iommu_aperture = 0;
1354#endif
1355 } 1339 }
1356} 1340}
1357 1341
@@ -1372,10 +1356,6 @@ static int __init parse_amd_iommu_dump(char *str)
1372static int __init parse_amd_iommu_options(char *str) 1356static int __init parse_amd_iommu_options(char *str)
1373{ 1357{
1374 for (; *str; ++str) { 1358 for (; *str; ++str) {
1375 if (strncmp(str, "isolate", 7) == 0)
1376 amd_iommu_isolate = true;
1377 if (strncmp(str, "share", 5) == 0)
1378 amd_iommu_isolate = false;
1379 if (strncmp(str, "fullflush", 9) == 0) 1359 if (strncmp(str, "fullflush", 9) == 0)
1380 amd_iommu_unmap_flush = true; 1360 amd_iommu_unmap_flush = true;
1381 } 1361 }
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 128111d8ffe0..e0dfb6856aa2 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -28,6 +28,7 @@
28#include <asm/pci-direct.h> 28#include <asm/pci-direct.h>
29#include <asm/dma.h> 29#include <asm/dma.h>
30#include <asm/k8.h> 30#include <asm/k8.h>
31#include <asm/x86_init.h>
31 32
32int gart_iommu_aperture; 33int gart_iommu_aperture;
33int gart_iommu_aperture_disabled __initdata; 34int gart_iommu_aperture_disabled __initdata;
@@ -400,6 +401,7 @@ void __init gart_iommu_hole_init(void)
400 401
401 iommu_detected = 1; 402 iommu_detected = 1;
402 gart_iommu_aperture = 1; 403 gart_iommu_aperture = 1;
404 x86_init.iommu.iommu_init = gart_iommu_init;
403 405
404 aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7; 406 aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7;
405 aper_size = (32 * 1024 * 1024) << aper_order; 407 aper_size = (32 * 1024 * 1024) << aper_order;
@@ -456,7 +458,7 @@ out:
456 458
457 if (aper_alloc) { 459 if (aper_alloc) {
458 /* Got the aperture from the AGP bridge */ 460 /* Got the aperture from the AGP bridge */
459 } else if (swiotlb && !valid_agp) { 461 } else if (!valid_agp) {
460 /* Do nothing */ 462 /* Do nothing */
461 } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) || 463 } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) ||
462 force_iommu || 464 force_iommu ||
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index da7b7b9f8bd8..565c1bfc507d 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,7 +2,7 @@
2# Makefile for local APIC drivers and for the IO-APIC code 2# Makefile for local APIC drivers and for the IO-APIC code
3# 3#
4 4
5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o probe_$(BITS).o ipi.o nmi.o 5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o
6obj-$(CONFIG_X86_IO_APIC) += io_apic.o 6obj-$(CONFIG_X86_IO_APIC) += io_apic.o
7obj-$(CONFIG_SMP) += ipi.o 7obj-$(CONFIG_SMP) += ipi.o
8 8
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 894aa97f0717..ad8c75b9e453 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -241,28 +241,13 @@ static int modern_apic(void)
241} 241}
242 242
243/* 243/*
244 * bare function to substitute write operation 244 * right after this call apic become NOOP driven
245 * and it's _that_ fast :) 245 * so apic->write/read doesn't do anything
246 */
247static void native_apic_write_dummy(u32 reg, u32 v)
248{
249 WARN_ON_ONCE((cpu_has_apic || !disable_apic));
250}
251
252static u32 native_apic_read_dummy(u32 reg)
253{
254 WARN_ON_ONCE((cpu_has_apic && !disable_apic));
255 return 0;
256}
257
258/*
259 * right after this call apic->write/read doesn't do anything
260 * note that there is no restore operation it works one way
261 */ 246 */
262void apic_disable(void) 247void apic_disable(void)
263{ 248{
264 apic->read = native_apic_read_dummy; 249 pr_info("APIC: switched to apic NOOP\n");
265 apic->write = native_apic_write_dummy; 250 apic = &apic_noop;
266} 251}
267 252
268void native_apic_wait_icr_idle(void) 253void native_apic_wait_icr_idle(void)
@@ -459,7 +444,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
459 v = apic_read(APIC_LVTT); 444 v = apic_read(APIC_LVTT);
460 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 445 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
461 apic_write(APIC_LVTT, v); 446 apic_write(APIC_LVTT, v);
462 apic_write(APIC_TMICT, 0xffffffff); 447 apic_write(APIC_TMICT, 0);
463 break; 448 break;
464 case CLOCK_EVT_MODE_RESUME: 449 case CLOCK_EVT_MODE_RESUME:
465 /* Nothing to do here */ 450 /* Nothing to do here */
@@ -1392,14 +1377,11 @@ void __init enable_IR_x2apic(void)
1392 unsigned long flags; 1377 unsigned long flags;
1393 struct IO_APIC_route_entry **ioapic_entries = NULL; 1378 struct IO_APIC_route_entry **ioapic_entries = NULL;
1394 int ret, x2apic_enabled = 0; 1379 int ret, x2apic_enabled = 0;
1395 int dmar_table_init_ret = 0; 1380 int dmar_table_init_ret;
1396 1381
1397#ifdef CONFIG_INTR_REMAP
1398 dmar_table_init_ret = dmar_table_init(); 1382 dmar_table_init_ret = dmar_table_init();
1399 if (dmar_table_init_ret) 1383 if (dmar_table_init_ret && !x2apic_supported())
1400 pr_debug("dmar_table_init() failed with %d:\n", 1384 return;
1401 dmar_table_init_ret);
1402#endif
1403 1385
1404 ioapic_entries = alloc_ioapic_entries(); 1386 ioapic_entries = alloc_ioapic_entries();
1405 if (!ioapic_entries) { 1387 if (!ioapic_entries) {
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
new file mode 100644
index 000000000000..d9acc3bee0f4
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -0,0 +1,200 @@
1/*
2 * NOOP APIC driver.
3 *
4 * Does almost nothing and should be substituted by a real apic driver via
5 * probe routine.
6 *
7 * Though in case if apic is disabled (for some reason) we try
8 * to not uglify the caller's code and allow to call (some) apic routines
9 * like self-ipi, etc...
10 */
11
12#include <linux/threads.h>
13#include <linux/cpumask.h>
14#include <linux/module.h>
15#include <linux/string.h>
16#include <linux/kernel.h>
17#include <linux/ctype.h>
18#include <linux/init.h>
19#include <linux/errno.h>
20#include <asm/fixmap.h>
21#include <asm/mpspec.h>
22#include <asm/apicdef.h>
23#include <asm/apic.h>
24#include <asm/setup.h>
25
26#include <linux/smp.h>
27#include <asm/ipi.h>
28
29#include <linux/interrupt.h>
30#include <asm/acpi.h>
31#include <asm/e820.h>
32
33static void noop_init_apic_ldr(void) { }
34static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { }
35static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { }
36static void noop_send_IPI_allbutself(int vector) { }
37static void noop_send_IPI_all(int vector) { }
38static void noop_send_IPI_self(int vector) { }
39static void noop_apic_wait_icr_idle(void) { }
40static void noop_apic_icr_write(u32 low, u32 id) { }
41
42static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip)
43{
44 return -1;
45}
46
47static u32 noop_safe_apic_wait_icr_idle(void)
48{
49 return 0;
50}
51
52static u64 noop_apic_icr_read(void)
53{
54 return 0;
55}
56
57static int noop_cpu_to_logical_apicid(int cpu)
58{
59 return 0;
60}
61
62static int noop_phys_pkg_id(int cpuid_apic, int index_msb)
63{
64 return 0;
65}
66
67static unsigned int noop_get_apic_id(unsigned long x)
68{
69 return 0;
70}
71
72static int noop_probe(void)
73{
74 /*
75 * NOOP apic should not ever be
76 * enabled via probe routine
77 */
78 return 0;
79}
80
81static int noop_apic_id_registered(void)
82{
83 /*
84 * if we would be really "pedantic"
85 * we should pass read_apic_id() here
86 * but since NOOP suppose APIC ID = 0
87 * lets save a few cycles
88 */
89 return physid_isset(0, phys_cpu_present_map);
90}
91
92static const struct cpumask *noop_target_cpus(void)
93{
94 /* only BSP here */
95 return cpumask_of(0);
96}
97
98static unsigned long noop_check_apicid_used(physid_mask_t *map, int apicid)
99{
100 return physid_isset(apicid, *map);
101}
102
103static unsigned long noop_check_apicid_present(int bit)
104{
105 return physid_isset(bit, phys_cpu_present_map);
106}
107
108static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask)
109{
110 if (cpu != 0)
111 pr_warning("APIC: Vector allocated for non-BSP cpu\n");
112 cpumask_clear(retmask);
113 cpumask_set_cpu(cpu, retmask);
114}
115
116int noop_apicid_to_node(int logical_apicid)
117{
118 /* we're always on node 0 */
119 return 0;
120}
121
122static u32 noop_apic_read(u32 reg)
123{
124 WARN_ON_ONCE((cpu_has_apic && !disable_apic));
125 return 0;
126}
127
128static void noop_apic_write(u32 reg, u32 v)
129{
130 WARN_ON_ONCE((cpu_has_apic || !disable_apic));
131}
132
133struct apic apic_noop = {
134 .name = "noop",
135 .probe = noop_probe,
136 .acpi_madt_oem_check = NULL,
137
138 .apic_id_registered = noop_apic_id_registered,
139
140 .irq_delivery_mode = dest_LowestPrio,
141 /* logical delivery broadcast to all CPUs: */
142 .irq_dest_mode = 1,
143
144 .target_cpus = noop_target_cpus,
145 .disable_esr = 0,
146 .dest_logical = APIC_DEST_LOGICAL,
147 .check_apicid_used = noop_check_apicid_used,
148 .check_apicid_present = noop_check_apicid_present,
149
150 .vector_allocation_domain = noop_vector_allocation_domain,
151 .init_apic_ldr = noop_init_apic_ldr,
152
153 .ioapic_phys_id_map = default_ioapic_phys_id_map,
154 .setup_apic_routing = NULL,
155 .multi_timer_check = NULL,
156 .apicid_to_node = noop_apicid_to_node,
157
158 .cpu_to_logical_apicid = noop_cpu_to_logical_apicid,
159 .cpu_present_to_apicid = default_cpu_present_to_apicid,
160 .apicid_to_cpu_present = physid_set_mask_of_physid,
161
162 .setup_portio_remap = NULL,
163 .check_phys_apicid_present = default_check_phys_apicid_present,
164 .enable_apic_mode = NULL,
165
166 .phys_pkg_id = noop_phys_pkg_id,
167
168 .mps_oem_check = NULL,
169
170 .get_apic_id = noop_get_apic_id,
171 .set_apic_id = NULL,
172 .apic_id_mask = 0x0F << 24,
173
174 .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
175 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
176
177 .send_IPI_mask = noop_send_IPI_mask,
178 .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself,
179 .send_IPI_allbutself = noop_send_IPI_allbutself,
180 .send_IPI_all = noop_send_IPI_all,
181 .send_IPI_self = noop_send_IPI_self,
182
183 .wakeup_secondary_cpu = noop_wakeup_secondary_cpu,
184
185 /* should be safe */
186 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
187 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
188
189 .wait_for_init_deassert = NULL,
190
191 .smp_callin_clear_local_apic = NULL,
192 .inquire_remote_apic = NULL,
193
194 .read = noop_apic_read,
195 .write = noop_apic_write,
196 .icr_read = noop_apic_icr_read,
197 .icr_write = noop_apic_icr_write,
198 .wait_icr_idle = noop_apic_wait_icr_idle,
199 .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle,
200};
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 77a06413b6b2..38dcecfa5818 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -35,7 +35,7 @@ static const struct cpumask *bigsmp_target_cpus(void)
35#endif 35#endif
36} 36}
37 37
38static unsigned long bigsmp_check_apicid_used(physid_mask_t bitmap, int apicid) 38static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
39{ 39{
40 return 0; 40 return 0;
41} 41}
@@ -93,11 +93,6 @@ static int bigsmp_cpu_present_to_apicid(int mps_cpu)
93 return BAD_APICID; 93 return BAD_APICID;
94} 94}
95 95
96static physid_mask_t bigsmp_apicid_to_cpu_present(int phys_apicid)
97{
98 return physid_mask_of_physid(phys_apicid);
99}
100
101/* Mapping from cpu number to logical apicid */ 96/* Mapping from cpu number to logical apicid */
102static inline int bigsmp_cpu_to_logical_apicid(int cpu) 97static inline int bigsmp_cpu_to_logical_apicid(int cpu)
103{ 98{
@@ -106,10 +101,10 @@ static inline int bigsmp_cpu_to_logical_apicid(int cpu)
106 return cpu_physical_id(cpu); 101 return cpu_physical_id(cpu);
107} 102}
108 103
109static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map) 104static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
110{ 105{
111 /* For clustered we don't have a good way to do this yet - hack */ 106 /* For clustered we don't have a good way to do this yet - hack */
112 return physids_promote(0xFFL); 107 physids_promote(0xFFL, retmap);
113} 108}
114 109
115static int bigsmp_check_phys_apicid_present(int phys_apicid) 110static int bigsmp_check_phys_apicid_present(int phys_apicid)
@@ -230,7 +225,7 @@ struct apic apic_bigsmp = {
230 .apicid_to_node = bigsmp_apicid_to_node, 225 .apicid_to_node = bigsmp_apicid_to_node,
231 .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid, 226 .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid,
232 .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, 227 .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
233 .apicid_to_cpu_present = bigsmp_apicid_to_cpu_present, 228 .apicid_to_cpu_present = physid_set_mask_of_physid,
234 .setup_portio_remap = NULL, 229 .setup_portio_remap = NULL,
235 .check_phys_apicid_present = bigsmp_check_phys_apicid_present, 230 .check_phys_apicid_present = bigsmp_check_phys_apicid_present,
236 .enable_apic_mode = NULL, 231 .enable_apic_mode = NULL,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 89174f847b49..e85f8fb7f8e7 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -466,11 +466,11 @@ static const struct cpumask *es7000_target_cpus(void)
466 return cpumask_of(smp_processor_id()); 466 return cpumask_of(smp_processor_id());
467} 467}
468 468
469static unsigned long 469static unsigned long es7000_check_apicid_used(physid_mask_t *map, int apicid)
470es7000_check_apicid_used(physid_mask_t bitmap, int apicid)
471{ 470{
472 return 0; 471 return 0;
473} 472}
473
474static unsigned long es7000_check_apicid_present(int bit) 474static unsigned long es7000_check_apicid_present(int bit)
475{ 475{
476 return physid_isset(bit, phys_cpu_present_map); 476 return physid_isset(bit, phys_cpu_present_map);
@@ -539,14 +539,10 @@ static int es7000_cpu_present_to_apicid(int mps_cpu)
539 539
540static int cpu_id; 540static int cpu_id;
541 541
542static physid_mask_t es7000_apicid_to_cpu_present(int phys_apicid) 542static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap)
543{ 543{
544 physid_mask_t mask; 544 physid_set_mask_of_physid(cpu_id, retmap);
545
546 mask = physid_mask_of_physid(cpu_id);
547 ++cpu_id; 545 ++cpu_id;
548
549 return mask;
550} 546}
551 547
552/* Mapping from cpu number to logical apicid */ 548/* Mapping from cpu number to logical apicid */
@@ -561,10 +557,10 @@ static int es7000_cpu_to_logical_apicid(int cpu)
561#endif 557#endif
562} 558}
563 559
564static physid_mask_t es7000_ioapic_phys_id_map(physid_mask_t phys_map) 560static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
565{ 561{
566 /* For clustered we don't have a good way to do this yet - hack */ 562 /* For clustered we don't have a good way to do this yet - hack */
567 return physids_promote(0xff); 563 physids_promote(0xFFL, retmap);
568} 564}
569 565
570static int es7000_check_phys_apicid_present(int cpu_physical_apicid) 566static int es7000_check_phys_apicid_present(int cpu_physical_apicid)
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index dc69f28489f5..c0b4468683f9 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -60,8 +60,6 @@
60#include <asm/irq_remapping.h> 60#include <asm/irq_remapping.h>
61#include <asm/hpet.h> 61#include <asm/hpet.h>
62#include <asm/hw_irq.h> 62#include <asm/hw_irq.h>
63#include <asm/uv/uv_hub.h>
64#include <asm/uv/uv_irq.h>
65 63
66#include <asm/apic.h> 64#include <asm/apic.h>
67 65
@@ -140,20 +138,6 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node)
140 return pin; 138 return pin;
141} 139}
142 140
143/*
144 * This is performance-critical, we want to do it O(1)
145 *
146 * Most irqs are mapped 1:1 with pins.
147 */
148struct irq_cfg {
149 struct irq_pin_list *irq_2_pin;
150 cpumask_var_t domain;
151 cpumask_var_t old_domain;
152 unsigned move_cleanup_count;
153 u8 vector;
154 u8 move_in_progress : 1;
155};
156
157/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ 141/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
158#ifdef CONFIG_SPARSE_IRQ 142#ifdef CONFIG_SPARSE_IRQ
159static struct irq_cfg irq_cfgx[] = { 143static struct irq_cfg irq_cfgx[] = {
@@ -209,7 +193,7 @@ int __init arch_early_irq_init(void)
209} 193}
210 194
211#ifdef CONFIG_SPARSE_IRQ 195#ifdef CONFIG_SPARSE_IRQ
212static struct irq_cfg *irq_cfg(unsigned int irq) 196struct irq_cfg *irq_cfg(unsigned int irq)
213{ 197{
214 struct irq_cfg *cfg = NULL; 198 struct irq_cfg *cfg = NULL;
215 struct irq_desc *desc; 199 struct irq_desc *desc;
@@ -361,7 +345,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
361/* end for move_irq_desc */ 345/* end for move_irq_desc */
362 346
363#else 347#else
364static struct irq_cfg *irq_cfg(unsigned int irq) 348struct irq_cfg *irq_cfg(unsigned int irq)
365{ 349{
366 return irq < nr_irqs ? irq_cfgx + irq : NULL; 350 return irq < nr_irqs ? irq_cfgx + irq : NULL;
367} 351}
@@ -555,23 +539,41 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
555 add_pin_to_irq_node(cfg, node, newapic, newpin); 539 add_pin_to_irq_node(cfg, node, newapic, newpin);
556} 540}
557 541
542static void __io_apic_modify_irq(struct irq_pin_list *entry,
543 int mask_and, int mask_or,
544 void (*final)(struct irq_pin_list *entry))
545{
546 unsigned int reg, pin;
547
548 pin = entry->pin;
549 reg = io_apic_read(entry->apic, 0x10 + pin * 2);
550 reg &= mask_and;
551 reg |= mask_or;
552 io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
553 if (final)
554 final(entry);
555}
556
558static void io_apic_modify_irq(struct irq_cfg *cfg, 557static void io_apic_modify_irq(struct irq_cfg *cfg,
559 int mask_and, int mask_or, 558 int mask_and, int mask_or,
560 void (*final)(struct irq_pin_list *entry)) 559 void (*final)(struct irq_pin_list *entry))
561{ 560{
562 int pin;
563 struct irq_pin_list *entry; 561 struct irq_pin_list *entry;
564 562
565 for_each_irq_pin(entry, cfg->irq_2_pin) { 563 for_each_irq_pin(entry, cfg->irq_2_pin)
566 unsigned int reg; 564 __io_apic_modify_irq(entry, mask_and, mask_or, final);
567 pin = entry->pin; 565}
568 reg = io_apic_read(entry->apic, 0x10 + pin * 2); 566
569 reg &= mask_and; 567static void __mask_and_edge_IO_APIC_irq(struct irq_pin_list *entry)
570 reg |= mask_or; 568{
571 io_apic_modify(entry->apic, 0x10 + pin * 2, reg); 569 __io_apic_modify_irq(entry, ~IO_APIC_REDIR_LEVEL_TRIGGER,
572 if (final) 570 IO_APIC_REDIR_MASKED, NULL);
573 final(entry); 571}
574 } 572
573static void __unmask_and_level_IO_APIC_irq(struct irq_pin_list *entry)
574{
575 __io_apic_modify_irq(entry, ~IO_APIC_REDIR_MASKED,
576 IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
575} 577}
576 578
577static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) 579static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
@@ -595,18 +597,6 @@ static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
595 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); 597 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
596} 598}
597 599
598static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
599{
600 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
601 IO_APIC_REDIR_MASKED, NULL);
602}
603
604static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
605{
606 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
607 IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
608}
609
610static void mask_IO_APIC_irq_desc(struct irq_desc *desc) 600static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
611{ 601{
612 struct irq_cfg *cfg = desc->chip_data; 602 struct irq_cfg *cfg = desc->chip_data;
@@ -1177,7 +1167,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1177 int cpu, err; 1167 int cpu, err;
1178 cpumask_var_t tmp_mask; 1168 cpumask_var_t tmp_mask;
1179 1169
1180 if ((cfg->move_in_progress) || cfg->move_cleanup_count) 1170 if (cfg->move_in_progress)
1181 return -EBUSY; 1171 return -EBUSY;
1182 1172
1183 if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) 1173 if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
@@ -1237,8 +1227,7 @@ next:
1237 return err; 1227 return err;
1238} 1228}
1239 1229
1240static int 1230int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1241assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1242{ 1231{
1243 int err; 1232 int err;
1244 unsigned long flags; 1233 unsigned long flags;
@@ -1599,9 +1588,6 @@ __apicdebuginit(void) print_IO_APIC(void)
1599 struct irq_desc *desc; 1588 struct irq_desc *desc;
1600 unsigned int irq; 1589 unsigned int irq;
1601 1590
1602 if (apic_verbosity == APIC_QUIET)
1603 return;
1604
1605 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); 1591 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
1606 for (i = 0; i < nr_ioapics; i++) 1592 for (i = 0; i < nr_ioapics; i++)
1607 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", 1593 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
@@ -1708,9 +1694,6 @@ __apicdebuginit(void) print_APIC_field(int base)
1708{ 1694{
1709 int i; 1695 int i;
1710 1696
1711 if (apic_verbosity == APIC_QUIET)
1712 return;
1713
1714 printk(KERN_DEBUG); 1697 printk(KERN_DEBUG);
1715 1698
1716 for (i = 0; i < 8; i++) 1699 for (i = 0; i < 8; i++)
@@ -1724,9 +1707,6 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1724 unsigned int i, v, ver, maxlvt; 1707 unsigned int i, v, ver, maxlvt;
1725 u64 icr; 1708 u64 icr;
1726 1709
1727 if (apic_verbosity == APIC_QUIET)
1728 return;
1729
1730 printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", 1710 printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
1731 smp_processor_id(), hard_smp_processor_id()); 1711 smp_processor_id(), hard_smp_processor_id());
1732 v = apic_read(APIC_ID); 1712 v = apic_read(APIC_ID);
@@ -1824,13 +1804,19 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1824 printk("\n"); 1804 printk("\n");
1825} 1805}
1826 1806
1827__apicdebuginit(void) print_all_local_APICs(void) 1807__apicdebuginit(void) print_local_APICs(int maxcpu)
1828{ 1808{
1829 int cpu; 1809 int cpu;
1830 1810
1811 if (!maxcpu)
1812 return;
1813
1831 preempt_disable(); 1814 preempt_disable();
1832 for_each_online_cpu(cpu) 1815 for_each_online_cpu(cpu) {
1816 if (cpu >= maxcpu)
1817 break;
1833 smp_call_function_single(cpu, print_local_APIC, NULL, 1); 1818 smp_call_function_single(cpu, print_local_APIC, NULL, 1);
1819 }
1834 preempt_enable(); 1820 preempt_enable();
1835} 1821}
1836 1822
@@ -1839,7 +1825,7 @@ __apicdebuginit(void) print_PIC(void)
1839 unsigned int v; 1825 unsigned int v;
1840 unsigned long flags; 1826 unsigned long flags;
1841 1827
1842 if (apic_verbosity == APIC_QUIET || !nr_legacy_irqs) 1828 if (!nr_legacy_irqs)
1843 return; 1829 return;
1844 1830
1845 printk(KERN_DEBUG "\nprinting PIC contents\n"); 1831 printk(KERN_DEBUG "\nprinting PIC contents\n");
@@ -1866,21 +1852,41 @@ __apicdebuginit(void) print_PIC(void)
1866 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); 1852 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1867} 1853}
1868 1854
1869__apicdebuginit(int) print_all_ICs(void) 1855static int __initdata show_lapic = 1;
1856static __init int setup_show_lapic(char *arg)
1870{ 1857{
1858 int num = -1;
1859
1860 if (strcmp(arg, "all") == 0) {
1861 show_lapic = CONFIG_NR_CPUS;
1862 } else {
1863 get_option(&arg, &num);
1864 if (num >= 0)
1865 show_lapic = num;
1866 }
1867
1868 return 1;
1869}
1870__setup("show_lapic=", setup_show_lapic);
1871
1872__apicdebuginit(int) print_ICs(void)
1873{
1874 if (apic_verbosity == APIC_QUIET)
1875 return 0;
1876
1871 print_PIC(); 1877 print_PIC();
1872 1878
1873 /* don't print out if apic is not there */ 1879 /* don't print out if apic is not there */
1874 if (!cpu_has_apic && !apic_from_smp_config()) 1880 if (!cpu_has_apic && !apic_from_smp_config())
1875 return 0; 1881 return 0;
1876 1882
1877 print_all_local_APICs(); 1883 print_local_APICs(show_lapic);
1878 print_IO_APIC(); 1884 print_IO_APIC();
1879 1885
1880 return 0; 1886 return 0;
1881} 1887}
1882 1888
1883fs_initcall(print_all_ICs); 1889fs_initcall(print_ICs);
1884 1890
1885 1891
1886/* Where if anywhere is the i8259 connect in external int mode */ 1892/* Where if anywhere is the i8259 connect in external int mode */
@@ -2031,7 +2037,7 @@ void __init setup_ioapic_ids_from_mpc(void)
2031 * This is broken; anything with a real cpu count has to 2037 * This is broken; anything with a real cpu count has to
2032 * circumvent this idiocy regardless. 2038 * circumvent this idiocy regardless.
2033 */ 2039 */
2034 phys_id_present_map = apic->ioapic_phys_id_map(phys_cpu_present_map); 2040 apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map);
2035 2041
2036 /* 2042 /*
2037 * Set the IOAPIC ID to the value stored in the MPC table. 2043 * Set the IOAPIC ID to the value stored in the MPC table.
@@ -2058,7 +2064,7 @@ void __init setup_ioapic_ids_from_mpc(void)
2058 * system must have a unique ID or we get lots of nice 2064 * system must have a unique ID or we get lots of nice
2059 * 'stuck on smp_invalidate_needed IPI wait' messages. 2065 * 'stuck on smp_invalidate_needed IPI wait' messages.
2060 */ 2066 */
2061 if (apic->check_apicid_used(phys_id_present_map, 2067 if (apic->check_apicid_used(&phys_id_present_map,
2062 mp_ioapics[apic_id].apicid)) { 2068 mp_ioapics[apic_id].apicid)) {
2063 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", 2069 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
2064 apic_id, mp_ioapics[apic_id].apicid); 2070 apic_id, mp_ioapics[apic_id].apicid);
@@ -2073,7 +2079,7 @@ void __init setup_ioapic_ids_from_mpc(void)
2073 mp_ioapics[apic_id].apicid = i; 2079 mp_ioapics[apic_id].apicid = i;
2074 } else { 2080 } else {
2075 physid_mask_t tmp; 2081 physid_mask_t tmp;
2076 tmp = apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid); 2082 apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid, &tmp);
2077 apic_printk(APIC_VERBOSE, "Setting %d in the " 2083 apic_printk(APIC_VERBOSE, "Setting %d in the "
2078 "phys_id_present_map\n", 2084 "phys_id_present_map\n",
2079 mp_ioapics[apic_id].apicid); 2085 mp_ioapics[apic_id].apicid);
@@ -2228,20 +2234,16 @@ static int ioapic_retrigger_irq(unsigned int irq)
2228 */ 2234 */
2229 2235
2230#ifdef CONFIG_SMP 2236#ifdef CONFIG_SMP
2231static void send_cleanup_vector(struct irq_cfg *cfg) 2237void send_cleanup_vector(struct irq_cfg *cfg)
2232{ 2238{
2233 cpumask_var_t cleanup_mask; 2239 cpumask_var_t cleanup_mask;
2234 2240
2235 if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { 2241 if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
2236 unsigned int i; 2242 unsigned int i;
2237 cfg->move_cleanup_count = 0;
2238 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
2239 cfg->move_cleanup_count++;
2240 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) 2243 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
2241 apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); 2244 apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
2242 } else { 2245 } else {
2243 cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); 2246 cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
2244 cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
2245 apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); 2247 apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
2246 free_cpumask_var(cleanup_mask); 2248 free_cpumask_var(cleanup_mask);
2247 } 2249 }
@@ -2272,15 +2274,12 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
2272 } 2274 }
2273} 2275}
2274 2276
2275static int
2276assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
2277
2278/* 2277/*
2279 * Either sets desc->affinity to a valid value, and returns 2278 * Either sets desc->affinity to a valid value, and returns
2280 * ->cpu_mask_to_apicid of that, or returns BAD_APICID and 2279 * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
2281 * leaves desc->affinity untouched. 2280 * leaves desc->affinity untouched.
2282 */ 2281 */
2283static unsigned int 2282unsigned int
2284set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) 2283set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
2285{ 2284{
2286 struct irq_cfg *cfg; 2285 struct irq_cfg *cfg;
@@ -2433,8 +2432,6 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2433 2432
2434 cfg = irq_cfg(irq); 2433 cfg = irq_cfg(irq);
2435 spin_lock(&desc->lock); 2434 spin_lock(&desc->lock);
2436 if (!cfg->move_cleanup_count)
2437 goto unlock;
2438 2435
2439 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) 2436 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
2440 goto unlock; 2437 goto unlock;
@@ -2452,7 +2449,6 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2452 goto unlock; 2449 goto unlock;
2453 } 2450 }
2454 __get_cpu_var(vector_irq)[vector] = -1; 2451 __get_cpu_var(vector_irq)[vector] = -1;
2455 cfg->move_cleanup_count--;
2456unlock: 2452unlock:
2457 spin_unlock(&desc->lock); 2453 spin_unlock(&desc->lock);
2458 } 2454 }
@@ -2460,21 +2456,33 @@ unlock:
2460 irq_exit(); 2456 irq_exit();
2461} 2457}
2462 2458
2463static void irq_complete_move(struct irq_desc **descp) 2459static void __irq_complete_move(struct irq_desc **descp, unsigned vector)
2464{ 2460{
2465 struct irq_desc *desc = *descp; 2461 struct irq_desc *desc = *descp;
2466 struct irq_cfg *cfg = desc->chip_data; 2462 struct irq_cfg *cfg = desc->chip_data;
2467 unsigned vector, me; 2463 unsigned me;
2468 2464
2469 if (likely(!cfg->move_in_progress)) 2465 if (likely(!cfg->move_in_progress))
2470 return; 2466 return;
2471 2467
2472 vector = ~get_irq_regs()->orig_ax;
2473 me = smp_processor_id(); 2468 me = smp_processor_id();
2474 2469
2475 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) 2470 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
2476 send_cleanup_vector(cfg); 2471 send_cleanup_vector(cfg);
2477} 2472}
2473
2474static void irq_complete_move(struct irq_desc **descp)
2475{
2476 __irq_complete_move(descp, ~get_irq_regs()->orig_ax);
2477}
2478
2479void irq_force_complete_move(int irq)
2480{
2481 struct irq_desc *desc = irq_to_desc(irq);
2482 struct irq_cfg *cfg = desc->chip_data;
2483
2484 __irq_complete_move(&desc, cfg->vector);
2485}
2478#else 2486#else
2479static inline void irq_complete_move(struct irq_desc **descp) {} 2487static inline void irq_complete_move(struct irq_desc **descp) {}
2480#endif 2488#endif
@@ -2490,6 +2498,59 @@ static void ack_apic_edge(unsigned int irq)
2490 2498
2491atomic_t irq_mis_count; 2499atomic_t irq_mis_count;
2492 2500
2501/*
2502 * IO-APIC versions below 0x20 don't support EOI register.
2503 * For the record, here is the information about various versions:
2504 * 0Xh 82489DX
2505 * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant
2506 * 2Xh I/O(x)APIC which is PCI 2.2 Compliant
2507 * 30h-FFh Reserved
2508 *
2509 * Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic
2510 * version as 0x2. This is an error with documentation and these ICH chips
2511 * use io-apic's of version 0x20.
2512 *
2513 * For IO-APIC's with EOI register, we use that to do an explicit EOI.
2514 * Otherwise, we simulate the EOI message manually by changing the trigger
2515 * mode to edge and then back to level, with RTE being masked during this.
2516*/
2517static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2518{
2519 struct irq_pin_list *entry;
2520
2521 for_each_irq_pin(entry, cfg->irq_2_pin) {
2522 if (mp_ioapics[entry->apic].apicver >= 0x20) {
2523 /*
2524 * Intr-remapping uses pin number as the virtual vector
2525 * in the RTE. Actual vector is programmed in
2526 * intr-remapping table entry. Hence for the io-apic
2527 * EOI we use the pin number.
2528 */
2529 if (irq_remapped(irq))
2530 io_apic_eoi(entry->apic, entry->pin);
2531 else
2532 io_apic_eoi(entry->apic, cfg->vector);
2533 } else {
2534 __mask_and_edge_IO_APIC_irq(entry);
2535 __unmask_and_level_IO_APIC_irq(entry);
2536 }
2537 }
2538}
2539
2540static void eoi_ioapic_irq(struct irq_desc *desc)
2541{
2542 struct irq_cfg *cfg;
2543 unsigned long flags;
2544 unsigned int irq;
2545
2546 irq = desc->irq;
2547 cfg = desc->chip_data;
2548
2549 spin_lock_irqsave(&ioapic_lock, flags);
2550 __eoi_ioapic_irq(irq, cfg);
2551 spin_unlock_irqrestore(&ioapic_lock, flags);
2552}
2553
2493static void ack_apic_level(unsigned int irq) 2554static void ack_apic_level(unsigned int irq)
2494{ 2555{
2495 struct irq_desc *desc = irq_to_desc(irq); 2556 struct irq_desc *desc = irq_to_desc(irq);
@@ -2525,6 +2586,19 @@ static void ack_apic_level(unsigned int irq)
2525 * level-triggered interrupt. We mask the source for the time of the 2586 * level-triggered interrupt. We mask the source for the time of the
2526 * operation to prevent an edge-triggered interrupt escaping meanwhile. 2587 * operation to prevent an edge-triggered interrupt escaping meanwhile.
2527 * The idea is from Manfred Spraul. --macro 2588 * The idea is from Manfred Spraul. --macro
2589 *
2590 * Also in the case when cpu goes offline, fixup_irqs() will forward
2591 * any unhandled interrupt on the offlined cpu to the new cpu
2592 * destination that is handling the corresponding interrupt. This
2593 * interrupt forwarding is done via IPI's. Hence, in this case also
2594 * level-triggered io-apic interrupt will be seen as an edge
2595 * interrupt in the IRR. And we can't rely on the cpu's EOI
2596 * to be broadcasted to the IO-APIC's which will clear the remoteIRR
2597 * corresponding to the level-triggered interrupt. Hence on IO-APIC's
2598 * supporting EOI register, we do an explicit EOI to clear the
2599 * remote IRR and on IO-APIC's which don't have an EOI register,
2600 * we use the above logic (mask+edge followed by unmask+level) from
2601 * Manfred Spraul to clear the remote IRR.
2528 */ 2602 */
2529 cfg = desc->chip_data; 2603 cfg = desc->chip_data;
2530 i = cfg->vector; 2604 i = cfg->vector;
@@ -2536,6 +2610,19 @@ static void ack_apic_level(unsigned int irq)
2536 */ 2610 */
2537 ack_APIC_irq(); 2611 ack_APIC_irq();
2538 2612
2613 /*
2614 * Tail end of clearing remote IRR bit (either by delivering the EOI
2615 * message via io-apic EOI register write or simulating it using
2616 * mask+edge followed by unnask+level logic) manually when the
2617 * level triggered interrupt is seen as the edge triggered interrupt
2618 * at the cpu.
2619 */
2620 if (!(v & (1 << (i & 0x1f)))) {
2621 atomic_inc(&irq_mis_count);
2622
2623 eoi_ioapic_irq(desc);
2624 }
2625
2539 /* Now we can move and renable the irq */ 2626 /* Now we can move and renable the irq */
2540 if (unlikely(do_unmask_irq)) { 2627 if (unlikely(do_unmask_irq)) {
2541 /* Only migrate the irq if the ack has been received. 2628 /* Only migrate the irq if the ack has been received.
@@ -2569,41 +2656,9 @@ static void ack_apic_level(unsigned int irq)
2569 move_masked_irq(irq); 2656 move_masked_irq(irq);
2570 unmask_IO_APIC_irq_desc(desc); 2657 unmask_IO_APIC_irq_desc(desc);
2571 } 2658 }
2572
2573 /* Tail end of version 0x11 I/O APIC bug workaround */
2574 if (!(v & (1 << (i & 0x1f)))) {
2575 atomic_inc(&irq_mis_count);
2576 spin_lock(&ioapic_lock);
2577 __mask_and_edge_IO_APIC_irq(cfg);
2578 __unmask_and_level_IO_APIC_irq(cfg);
2579 spin_unlock(&ioapic_lock);
2580 }
2581} 2659}
2582 2660
2583#ifdef CONFIG_INTR_REMAP 2661#ifdef CONFIG_INTR_REMAP
2584static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2585{
2586 struct irq_pin_list *entry;
2587
2588 for_each_irq_pin(entry, cfg->irq_2_pin)
2589 io_apic_eoi(entry->apic, entry->pin);
2590}
2591
2592static void
2593eoi_ioapic_irq(struct irq_desc *desc)
2594{
2595 struct irq_cfg *cfg;
2596 unsigned long flags;
2597 unsigned int irq;
2598
2599 irq = desc->irq;
2600 cfg = desc->chip_data;
2601
2602 spin_lock_irqsave(&ioapic_lock, flags);
2603 __eoi_ioapic_irq(irq, cfg);
2604 spin_unlock_irqrestore(&ioapic_lock, flags);
2605}
2606
2607static void ir_ack_apic_edge(unsigned int irq) 2662static void ir_ack_apic_edge(unsigned int irq)
2608{ 2663{
2609 ack_APIC_irq(); 2664 ack_APIC_irq();
@@ -3157,6 +3212,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
3157 continue; 3212 continue;
3158 3213
3159 desc_new = move_irq_desc(desc_new, node); 3214 desc_new = move_irq_desc(desc_new, node);
3215 cfg_new = desc_new->chip_data;
3160 3216
3161 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) 3217 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
3162 irq = new; 3218 irq = new;
@@ -3708,75 +3764,6 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3708} 3764}
3709#endif /* CONFIG_HT_IRQ */ 3765#endif /* CONFIG_HT_IRQ */
3710 3766
3711#ifdef CONFIG_X86_UV
3712/*
3713 * Re-target the irq to the specified CPU and enable the specified MMR located
3714 * on the specified blade to allow the sending of MSIs to the specified CPU.
3715 */
3716int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3717 unsigned long mmr_offset)
3718{
3719 const struct cpumask *eligible_cpu = cpumask_of(cpu);
3720 struct irq_cfg *cfg;
3721 int mmr_pnode;
3722 unsigned long mmr_value;
3723 struct uv_IO_APIC_route_entry *entry;
3724 unsigned long flags;
3725 int err;
3726
3727 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3728
3729 cfg = irq_cfg(irq);
3730
3731 err = assign_irq_vector(irq, cfg, eligible_cpu);
3732 if (err != 0)
3733 return err;
3734
3735 spin_lock_irqsave(&vector_lock, flags);
3736 set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
3737 irq_name);
3738 spin_unlock_irqrestore(&vector_lock, flags);
3739
3740 mmr_value = 0;
3741 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3742 entry->vector = cfg->vector;
3743 entry->delivery_mode = apic->irq_delivery_mode;
3744 entry->dest_mode = apic->irq_dest_mode;
3745 entry->polarity = 0;
3746 entry->trigger = 0;
3747 entry->mask = 0;
3748 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
3749
3750 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3751 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
3752
3753 if (cfg->move_in_progress)
3754 send_cleanup_vector(cfg);
3755
3756 return irq;
3757}
3758
3759/*
3760 * Disable the specified MMR located on the specified blade so that MSIs are
3761 * longer allowed to be sent.
3762 */
3763void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset)
3764{
3765 unsigned long mmr_value;
3766 struct uv_IO_APIC_route_entry *entry;
3767 int mmr_pnode;
3768
3769 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3770
3771 mmr_value = 0;
3772 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3773 entry->mask = 1;
3774
3775 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3776 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
3777}
3778#endif /* CONFIG_X86_64 */
3779
3780int __init io_apic_get_redir_entries (int ioapic) 3767int __init io_apic_get_redir_entries (int ioapic)
3781{ 3768{
3782 union IO_APIC_reg_01 reg_01; 3769 union IO_APIC_reg_01 reg_01;
@@ -3944,7 +3931,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3944 */ 3931 */
3945 3932
3946 if (physids_empty(apic_id_map)) 3933 if (physids_empty(apic_id_map))
3947 apic_id_map = apic->ioapic_phys_id_map(phys_cpu_present_map); 3934 apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map);
3948 3935
3949 spin_lock_irqsave(&ioapic_lock, flags); 3936 spin_lock_irqsave(&ioapic_lock, flags);
3950 reg_00.raw = io_apic_read(ioapic, 0); 3937 reg_00.raw = io_apic_read(ioapic, 0);
@@ -3960,10 +3947,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3960 * Every APIC in a system must have a unique ID or we get lots of nice 3947 * Every APIC in a system must have a unique ID or we get lots of nice
3961 * 'stuck on smp_invalidate_needed IPI wait' messages. 3948 * 'stuck on smp_invalidate_needed IPI wait' messages.
3962 */ 3949 */
3963 if (apic->check_apicid_used(apic_id_map, apic_id)) { 3950 if (apic->check_apicid_used(&apic_id_map, apic_id)) {
3964 3951
3965 for (i = 0; i < get_physical_broadcast(); i++) { 3952 for (i = 0; i < get_physical_broadcast(); i++) {
3966 if (!apic->check_apicid_used(apic_id_map, i)) 3953 if (!apic->check_apicid_used(&apic_id_map, i))
3967 break; 3954 break;
3968 } 3955 }
3969 3956
@@ -3976,7 +3963,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3976 apic_id = i; 3963 apic_id = i;
3977 } 3964 }
3978 3965
3979 tmp = apic->apicid_to_cpu_present(apic_id); 3966 apic->apicid_to_cpu_present(apic_id, &tmp);
3980 physids_or(apic_id_map, apic_id_map, tmp); 3967 physids_or(apic_id_map, apic_id_map, tmp);
3981 3968
3982 if (reg_00.bits.ID != apic_id) { 3969 if (reg_00.bits.ID != apic_id) {
@@ -4106,7 +4093,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics)
4106 for (i = 0; i < nr_ioapics; i++) { 4093 for (i = 0; i < nr_ioapics; i++) {
4107 res[i].name = mem; 4094 res[i].name = mem;
4108 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; 4095 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
4109 sprintf(mem, "IOAPIC %u", i); 4096 snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i);
4110 mem += IOAPIC_RESOURCE_NAME_SIZE; 4097 mem += IOAPIC_RESOURCE_NAME_SIZE;
4111 } 4098 }
4112 4099
@@ -4140,18 +4127,17 @@ void __init ioapic_init_mappings(void)
4140#ifdef CONFIG_X86_32 4127#ifdef CONFIG_X86_32
4141fake_ioapic_page: 4128fake_ioapic_page:
4142#endif 4129#endif
4143 ioapic_phys = (unsigned long) 4130 ioapic_phys = (unsigned long)alloc_bootmem_pages(PAGE_SIZE);
4144 alloc_bootmem_pages(PAGE_SIZE);
4145 ioapic_phys = __pa(ioapic_phys); 4131 ioapic_phys = __pa(ioapic_phys);
4146 } 4132 }
4147 set_fixmap_nocache(idx, ioapic_phys); 4133 set_fixmap_nocache(idx, ioapic_phys);
4148 apic_printk(APIC_VERBOSE, 4134 apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n",
4149 "mapped IOAPIC to %08lx (%08lx)\n", 4135 __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK),
4150 __fix_to_virt(idx), ioapic_phys); 4136 ioapic_phys);
4151 idx++; 4137 idx++;
4152 4138
4153 ioapic_res->start = ioapic_phys; 4139 ioapic_res->start = ioapic_phys;
4154 ioapic_res->end = ioapic_phys + (4 * 1024) - 1; 4140 ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1;
4155 ioapic_res++; 4141 ioapic_res++;
4156 } 4142 }
4157} 4143}
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 7ff61d6a188a..6389432a9dbf 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -39,7 +39,8 @@
39int unknown_nmi_panic; 39int unknown_nmi_panic;
40int nmi_watchdog_enabled; 40int nmi_watchdog_enabled;
41 41
42static cpumask_t backtrace_mask __read_mostly; 42/* For reliability, we're prepared to waste bits here. */
43static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
43 44
44/* nmi_active: 45/* nmi_active:
45 * >0: the lapic NMI watchdog is active, but can be disabled 46 * >0: the lapic NMI watchdog is active, but can be disabled
@@ -414,7 +415,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
414 } 415 }
415 416
416 /* We can be called before check_nmi_watchdog, hence NULL check. */ 417 /* We can be called before check_nmi_watchdog, hence NULL check. */
417 if (cpumask_test_cpu(cpu, &backtrace_mask)) { 418 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
418 static DEFINE_SPINLOCK(lock); /* Serialise the printks */ 419 static DEFINE_SPINLOCK(lock); /* Serialise the printks */
419 420
420 spin_lock(&lock); 421 spin_lock(&lock);
@@ -422,7 +423,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
422 show_regs(regs); 423 show_regs(regs);
423 dump_stack(); 424 dump_stack();
424 spin_unlock(&lock); 425 spin_unlock(&lock);
425 cpumask_clear_cpu(cpu, &backtrace_mask); 426 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
426 427
427 rc = 1; 428 rc = 1;
428 } 429 }
@@ -558,14 +559,14 @@ void arch_trigger_all_cpu_backtrace(void)
558{ 559{
559 int i; 560 int i;
560 561
561 cpumask_copy(&backtrace_mask, cpu_online_mask); 562 cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
562 563
563 printk(KERN_INFO "sending NMI to all CPUs:\n"); 564 printk(KERN_INFO "sending NMI to all CPUs:\n");
564 apic->send_IPI_all(NMI_VECTOR); 565 apic->send_IPI_all(NMI_VECTOR);
565 566
566 /* Wait for up to 10 seconds for all CPUs to do the backtrace */ 567 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
567 for (i = 0; i < 10 * 1000; i++) { 568 for (i = 0; i < 10 * 1000; i++) {
568 if (cpumask_empty(&backtrace_mask)) 569 if (cpumask_empty(to_cpumask(backtrace_mask)))
569 break; 570 break;
570 mdelay(1); 571 mdelay(1);
571 } 572 }
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index efa00e2b8505..07cdbdcd7a92 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -334,10 +334,9 @@ static inline const struct cpumask *numaq_target_cpus(void)
334 return cpu_all_mask; 334 return cpu_all_mask;
335} 335}
336 336
337static inline unsigned long 337static unsigned long numaq_check_apicid_used(physid_mask_t *map, int apicid)
338numaq_check_apicid_used(physid_mask_t bitmap, int apicid)
339{ 338{
340 return physid_isset(apicid, bitmap); 339 return physid_isset(apicid, *map);
341} 340}
342 341
343static inline unsigned long numaq_check_apicid_present(int bit) 342static inline unsigned long numaq_check_apicid_present(int bit)
@@ -371,10 +370,10 @@ static inline int numaq_multi_timer_check(int apic, int irq)
371 return apic != 0 && irq == 0; 370 return apic != 0 && irq == 0;
372} 371}
373 372
374static inline physid_mask_t numaq_ioapic_phys_id_map(physid_mask_t phys_map) 373static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
375{ 374{
376 /* We don't have a good way to do this yet - hack */ 375 /* We don't have a good way to do this yet - hack */
377 return physids_promote(0xFUL); 376 return physids_promote(0xFUL, retmap);
378} 377}
379 378
380static inline int numaq_cpu_to_logical_apicid(int cpu) 379static inline int numaq_cpu_to_logical_apicid(int cpu)
@@ -402,12 +401,12 @@ static inline int numaq_apicid_to_node(int logical_apicid)
402 return logical_apicid >> 4; 401 return logical_apicid >> 4;
403} 402}
404 403
405static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid) 404static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap)
406{ 405{
407 int node = numaq_apicid_to_node(logical_apicid); 406 int node = numaq_apicid_to_node(logical_apicid);
408 int cpu = __ffs(logical_apicid & 0xf); 407 int cpu = __ffs(logical_apicid & 0xf);
409 408
410 return physid_mask_of_physid(cpu + 4*node); 409 physid_set_mask_of_physid(cpu + 4*node, retmap);
411} 410}
412 411
413/* Where the IO area was mapped on multiquad, always 0 otherwise */ 412/* Where the IO area was mapped on multiquad, always 0 otherwise */
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 0c0182cc947d..1a6559f6768c 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -108,7 +108,7 @@ struct apic apic_default = {
108 .apicid_to_node = default_apicid_to_node, 108 .apicid_to_node = default_apicid_to_node,
109 .cpu_to_logical_apicid = default_cpu_to_logical_apicid, 109 .cpu_to_logical_apicid = default_cpu_to_logical_apicid,
110 .cpu_present_to_apicid = default_cpu_present_to_apicid, 110 .cpu_present_to_apicid = default_cpu_present_to_apicid,
111 .apicid_to_cpu_present = default_apicid_to_cpu_present, 111 .apicid_to_cpu_present = physid_set_mask_of_physid,
112 .setup_portio_remap = NULL, 112 .setup_portio_remap = NULL,
113 .check_phys_apicid_present = default_check_phys_apicid_present, 113 .check_phys_apicid_present = default_check_phys_apicid_present,
114 .enable_apic_mode = NULL, 114 .enable_apic_mode = NULL,
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 645ecc4ff0be..9b419263d90d 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -183,7 +183,7 @@ static const struct cpumask *summit_target_cpus(void)
183 return cpumask_of(0); 183 return cpumask_of(0);
184} 184}
185 185
186static unsigned long summit_check_apicid_used(physid_mask_t bitmap, int apicid) 186static unsigned long summit_check_apicid_used(physid_mask_t *map, int apicid)
187{ 187{
188 return 0; 188 return 0;
189} 189}
@@ -261,15 +261,15 @@ static int summit_cpu_present_to_apicid(int mps_cpu)
261 return BAD_APICID; 261 return BAD_APICID;
262} 262}
263 263
264static physid_mask_t summit_ioapic_phys_id_map(physid_mask_t phys_id_map) 264static void summit_ioapic_phys_id_map(physid_mask_t *phys_id_map, physid_mask_t *retmap)
265{ 265{
266 /* For clustered we don't have a good way to do this yet - hack */ 266 /* For clustered we don't have a good way to do this yet - hack */
267 return physids_promote(0x0F); 267 physids_promote(0x0FL, retmap);
268} 268}
269 269
270static physid_mask_t summit_apicid_to_cpu_present(int apicid) 270static void summit_apicid_to_cpu_present(int apicid, physid_mask_t *retmap)
271{ 271{
272 return physid_mask_of_physid(0); 272 physid_set_mask_of_physid(0, retmap);
273} 273}
274 274
275static int summit_check_phys_apicid_present(int physical_apicid) 275static int summit_check_phys_apicid_present(int physical_apicid)
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 326c25477d3d..130c4b934877 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -409,6 +409,12 @@ static __init void map_mmioh_high(int max_pnode)
409 map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc); 409 map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc);
410} 410}
411 411
412static __init void map_low_mmrs(void)
413{
414 init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);
415 init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);
416}
417
412static __init void uv_rtc_init(void) 418static __init void uv_rtc_init(void)
413{ 419{
414 long status; 420 long status;
@@ -550,6 +556,8 @@ void __init uv_system_init(void)
550 unsigned long mmr_base, present, paddr; 556 unsigned long mmr_base, present, paddr;
551 unsigned short pnode_mask; 557 unsigned short pnode_mask;
552 558
559 map_low_mmrs();
560
553 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); 561 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
554 m_val = m_n_config.s.m_skt; 562 m_val = m_n_config.s.m_skt;
555 n_val = m_n_config.s.n_skt; 563 n_val = m_n_config.s.n_skt;
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 151ace69a5aa..b5b6b23bce53 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -204,7 +204,6 @@
204#include <linux/module.h> 204#include <linux/module.h>
205 205
206#include <linux/poll.h> 206#include <linux/poll.h>
207#include <linux/smp_lock.h>
208#include <linux/types.h> 207#include <linux/types.h>
209#include <linux/stddef.h> 208#include <linux/stddef.h>
210#include <linux/timer.h> 209#include <linux/timer.h>
@@ -403,6 +402,7 @@ static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
403static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); 402static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
404static struct apm_user *user_list; 403static struct apm_user *user_list;
405static DEFINE_SPINLOCK(user_list_lock); 404static DEFINE_SPINLOCK(user_list_lock);
405static DEFINE_MUTEX(apm_mutex);
406 406
407/* 407/*
408 * Set up a segment that references the real mode segment 0x40 408 * Set up a segment that references the real mode segment 0x40
@@ -1531,7 +1531,7 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
1531 return -EPERM; 1531 return -EPERM;
1532 switch (cmd) { 1532 switch (cmd) {
1533 case APM_IOC_STANDBY: 1533 case APM_IOC_STANDBY:
1534 lock_kernel(); 1534 mutex_lock(&apm_mutex);
1535 if (as->standbys_read > 0) { 1535 if (as->standbys_read > 0) {
1536 as->standbys_read--; 1536 as->standbys_read--;
1537 as->standbys_pending--; 1537 as->standbys_pending--;
@@ -1540,10 +1540,10 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
1540 queue_event(APM_USER_STANDBY, as); 1540 queue_event(APM_USER_STANDBY, as);
1541 if (standbys_pending <= 0) 1541 if (standbys_pending <= 0)
1542 standby(); 1542 standby();
1543 unlock_kernel(); 1543 mutex_unlock(&apm_mutex);
1544 break; 1544 break;
1545 case APM_IOC_SUSPEND: 1545 case APM_IOC_SUSPEND:
1546 lock_kernel(); 1546 mutex_lock(&apm_mutex);
1547 if (as->suspends_read > 0) { 1547 if (as->suspends_read > 0) {
1548 as->suspends_read--; 1548 as->suspends_read--;
1549 as->suspends_pending--; 1549 as->suspends_pending--;
@@ -1552,13 +1552,14 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
1552 queue_event(APM_USER_SUSPEND, as); 1552 queue_event(APM_USER_SUSPEND, as);
1553 if (suspends_pending <= 0) { 1553 if (suspends_pending <= 0) {
1554 ret = suspend(1); 1554 ret = suspend(1);
1555 mutex_unlock(&apm_mutex);
1555 } else { 1556 } else {
1556 as->suspend_wait = 1; 1557 as->suspend_wait = 1;
1558 mutex_unlock(&apm_mutex);
1557 wait_event_interruptible(apm_suspend_waitqueue, 1559 wait_event_interruptible(apm_suspend_waitqueue,
1558 as->suspend_wait == 0); 1560 as->suspend_wait == 0);
1559 ret = as->suspend_result; 1561 ret = as->suspend_result;
1560 } 1562 }
1561 unlock_kernel();
1562 return ret; 1563 return ret;
1563 default: 1564 default:
1564 return -ENOTTY; 1565 return -ENOTTY;
@@ -1608,12 +1609,10 @@ static int do_open(struct inode *inode, struct file *filp)
1608{ 1609{
1609 struct apm_user *as; 1610 struct apm_user *as;
1610 1611
1611 lock_kernel();
1612 as = kmalloc(sizeof(*as), GFP_KERNEL); 1612 as = kmalloc(sizeof(*as), GFP_KERNEL);
1613 if (as == NULL) { 1613 if (as == NULL) {
1614 printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n", 1614 printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n",
1615 sizeof(*as)); 1615 sizeof(*as));
1616 unlock_kernel();
1617 return -ENOMEM; 1616 return -ENOMEM;
1618 } 1617 }
1619 as->magic = APM_BIOS_MAGIC; 1618 as->magic = APM_BIOS_MAGIC;
@@ -1635,7 +1634,6 @@ static int do_open(struct inode *inode, struct file *filp)
1635 user_list = as; 1634 user_list = as;
1636 spin_unlock(&user_list_lock); 1635 spin_unlock(&user_list_lock);
1637 filp->private_data = as; 1636 filp->private_data = as;
1638 unlock_kernel();
1639 return 0; 1637 return 0;
1640} 1638}
1641 1639
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 68537e957a9b..1d2cb383410e 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -5,6 +5,7 @@
5# Don't trace early stages of a secondary CPU boot 5# Don't trace early stages of a secondary CPU boot
6ifdef CONFIG_FUNCTION_TRACER 6ifdef CONFIG_FUNCTION_TRACER
7CFLAGS_REMOVE_common.o = -pg 7CFLAGS_REMOVE_common.o = -pg
8CFLAGS_REMOVE_perf_event.o = -pg
8endif 9endif
9 10
10# Make sure load_percpu_segment has no stackprotector 11# Make sure load_percpu_segment has no stackprotector
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index c910a716a71c..7128b3799cec 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -535,7 +535,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
535 } 535 }
536 } 536 }
537 537
538 display_cacheinfo(c); 538 cpu_detect_cache_sizes(c);
539 539
540 /* Multi core CPU? */ 540 /* Multi core CPU? */
541 if (c->extended_cpuid_level >= 0x80000008) { 541 if (c->extended_cpuid_level >= 0x80000008) {
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index c95e831bb095..e58d978e0758 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -294,7 +294,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
294 set_cpu_cap(c, X86_FEATURE_REP_GOOD); 294 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
295 } 295 }
296 296
297 display_cacheinfo(c); 297 cpu_detect_cache_sizes(c);
298} 298}
299 299
300enum { 300enum {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cc25c2b4a567..a4ec8b647544 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -61,7 +61,7 @@ void __init setup_cpu_local_masks(void)
61static void __cpuinit default_init(struct cpuinfo_x86 *c) 61static void __cpuinit default_init(struct cpuinfo_x86 *c)
62{ 62{
63#ifdef CONFIG_X86_64 63#ifdef CONFIG_X86_64
64 display_cacheinfo(c); 64 cpu_detect_cache_sizes(c);
65#else 65#else
66 /* Not much we can do here... */ 66 /* Not much we can do here... */
67 /* Check if at least it has cpuid */ 67 /* Check if at least it has cpuid */
@@ -383,7 +383,7 @@ static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
383 } 383 }
384} 384}
385 385
386void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) 386void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
387{ 387{
388 unsigned int n, dummy, ebx, ecx, edx, l2size; 388 unsigned int n, dummy, ebx, ecx, edx, l2size;
389 389
@@ -391,8 +391,6 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
391 391
392 if (n >= 0x80000005) { 392 if (n >= 0x80000005) {
393 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); 393 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
394 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
395 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
396 c->x86_cache_size = (ecx>>24) + (edx>>24); 394 c->x86_cache_size = (ecx>>24) + (edx>>24);
397#ifdef CONFIG_X86_64 395#ifdef CONFIG_X86_64
398 /* On K8 L1 TLB is inclusive, so don't count it */ 396 /* On K8 L1 TLB is inclusive, so don't count it */
@@ -422,9 +420,6 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
422#endif 420#endif
423 421
424 c->x86_cache_size = l2size; 422 c->x86_cache_size = l2size;
425
426 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
427 l2size, ecx & 0xFF);
428} 423}
429 424
430void __cpuinit detect_ht(struct cpuinfo_x86 *c) 425void __cpuinit detect_ht(struct cpuinfo_x86 *c)
@@ -659,24 +654,31 @@ void __init early_cpu_init(void)
659 const struct cpu_dev *const *cdev; 654 const struct cpu_dev *const *cdev;
660 int count = 0; 655 int count = 0;
661 656
657#ifdef PROCESSOR_SELECT
662 printk(KERN_INFO "KERNEL supported cpus:\n"); 658 printk(KERN_INFO "KERNEL supported cpus:\n");
659#endif
660
663 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { 661 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
664 const struct cpu_dev *cpudev = *cdev; 662 const struct cpu_dev *cpudev = *cdev;
665 unsigned int j;
666 663
667 if (count >= X86_VENDOR_NUM) 664 if (count >= X86_VENDOR_NUM)
668 break; 665 break;
669 cpu_devs[count] = cpudev; 666 cpu_devs[count] = cpudev;
670 count++; 667 count++;
671 668
672 for (j = 0; j < 2; j++) { 669#ifdef PROCESSOR_SELECT
673 if (!cpudev->c_ident[j]) 670 {
674 continue; 671 unsigned int j;
675 printk(KERN_INFO " %s %s\n", cpudev->c_vendor, 672
676 cpudev->c_ident[j]); 673 for (j = 0; j < 2; j++) {
674 if (!cpudev->c_ident[j])
675 continue;
676 printk(KERN_INFO " %s %s\n", cpudev->c_vendor,
677 cpudev->c_ident[j]);
678 }
677 } 679 }
680#endif
678 } 681 }
679
680 early_identify_cpu(&boot_cpu_data); 682 early_identify_cpu(&boot_cpu_data);
681} 683}
682 684
@@ -837,10 +839,8 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
837 boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; 839 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
838 } 840 }
839 841
840#ifdef CONFIG_X86_MCE
841 /* Init Machine Check Exception if available. */ 842 /* Init Machine Check Exception if available. */
842 mcheck_init(c); 843 mcheck_cpu_init(c);
843#endif
844 844
845 select_idle_routine(c); 845 select_idle_routine(c);
846 846
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 6de9a908e400..3624e8a0f71b 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -32,6 +32,6 @@ struct cpu_dev {
32extern const struct cpu_dev *const __x86_cpu_dev_start[], 32extern const struct cpu_dev *const __x86_cpu_dev_start[],
33 *const __x86_cpu_dev_end[]; 33 *const __x86_cpu_dev_end[];
34 34
35extern void display_cacheinfo(struct cpuinfo_x86 *c); 35extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
36 36
37#endif 37#endif
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 7d5c3b0ea8da..8b581d3905cb 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -526,15 +526,21 @@ static const struct dmi_system_id sw_any_bug_dmi_table[] = {
526 526
527static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c) 527static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c)
528{ 528{
529 /* http://www.intel.com/Assets/PDF/specupdate/314554.pdf 529 /* Intel Xeon Processor 7100 Series Specification Update
530 * http://www.intel.com/Assets/PDF/specupdate/314554.pdf
530 * AL30: A Machine Check Exception (MCE) Occurring during an 531 * AL30: A Machine Check Exception (MCE) Occurring during an
531 * Enhanced Intel SpeedStep Technology Ratio Change May Cause 532 * Enhanced Intel SpeedStep Technology Ratio Change May Cause
532 * Both Processor Cores to Lock Up when HT is enabled*/ 533 * Both Processor Cores to Lock Up. */
533 if (c->x86_vendor == X86_VENDOR_INTEL) { 534 if (c->x86_vendor == X86_VENDOR_INTEL) {
534 if ((c->x86 == 15) && 535 if ((c->x86 == 15) &&
535 (c->x86_model == 6) && 536 (c->x86_model == 6) &&
536 (c->x86_mask == 8) && smt_capable()) 537 (c->x86_mask == 8)) {
538 printk(KERN_INFO "acpi-cpufreq: Intel(R) "
539 "Xeon(R) 7100 Errata AL30, processors may "
540 "lock up on frequency changes: disabling "
541 "acpi-cpufreq.\n");
537 return -ENODEV; 542 return -ENODEV;
543 }
538 } 544 }
539 return 0; 545 return 0;
540} 546}
@@ -549,13 +555,18 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
549 unsigned int result = 0; 555 unsigned int result = 0;
550 struct cpuinfo_x86 *c = &cpu_data(policy->cpu); 556 struct cpuinfo_x86 *c = &cpu_data(policy->cpu);
551 struct acpi_processor_performance *perf; 557 struct acpi_processor_performance *perf;
558#ifdef CONFIG_SMP
559 static int blacklisted;
560#endif
552 561
553 dprintk("acpi_cpufreq_cpu_init\n"); 562 dprintk("acpi_cpufreq_cpu_init\n");
554 563
555#ifdef CONFIG_SMP 564#ifdef CONFIG_SMP
556 result = acpi_cpufreq_blacklist(c); 565 if (blacklisted)
557 if (result) 566 return blacklisted;
558 return result; 567 blacklisted = acpi_cpufreq_blacklist(c);
568 if (blacklisted)
569 return blacklisted;
559#endif 570#endif
560 571
561 data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL); 572 data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL);
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index ce2ed3e4aad9..cabd2fa3fc93 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -813,7 +813,7 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
813 memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr)); 813 memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr));
814 break; 814 break;
815 case 1 ... 15: 815 case 1 ... 15:
816 longhaul_version = TYPE_LONGHAUL_V1; 816 longhaul_version = TYPE_LONGHAUL_V2;
817 if (c->x86_mask < 8) { 817 if (c->x86_mask < 8) {
818 cpu_model = CPU_SAMUEL2; 818 cpu_model = CPU_SAMUEL2;
819 cpuname = "C3 'Samuel 2' [C5B]"; 819 cpuname = "C3 'Samuel 2' [C5B]";
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 6394aa5c7985..3f12dabeab52 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1022,7 +1022,7 @@ static int get_transition_latency(struct powernow_k8_data *data)
1022 * set it to 1 to avoid problems in the future. 1022 * set it to 1 to avoid problems in the future.
1023 * For all others it's a BIOS bug. 1023 * For all others it's a BIOS bug.
1024 */ 1024 */
1025 if (!boot_cpu_data.x86 == 0x11) 1025 if (boot_cpu_data.x86 != 0x11)
1026 printk(KERN_ERR FW_WARN PFX "Invalid zero transition " 1026 printk(KERN_ERR FW_WARN PFX "Invalid zero transition "
1027 "latency\n"); 1027 "latency\n");
1028 max_latency = 1; 1028 max_latency = 1;
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 6911e91fb4f6..3ae5a7a3a500 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -232,28 +232,23 @@ static unsigned int speedstep_detect_chipset(void)
232 return 0; 232 return 0;
233} 233}
234 234
235struct get_freq_data { 235static void get_freq_data(void *_speed)
236 unsigned int speed;
237 unsigned int processor;
238};
239
240static void get_freq_data(void *_data)
241{ 236{
242 struct get_freq_data *data = _data; 237 unsigned int *speed = _speed;
243 238
244 data->speed = speedstep_get_frequency(data->processor); 239 *speed = speedstep_get_frequency(speedstep_processor);
245} 240}
246 241
247static unsigned int speedstep_get(unsigned int cpu) 242static unsigned int speedstep_get(unsigned int cpu)
248{ 243{
249 struct get_freq_data data = { .processor = cpu }; 244 unsigned int speed;
250 245
251 /* You're supposed to ensure CPU is online. */ 246 /* You're supposed to ensure CPU is online. */
252 if (smp_call_function_single(cpu, get_freq_data, &data, 1) != 0) 247 if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0)
253 BUG(); 248 BUG();
254 249
255 dprintk("detected %u kHz as current frequency\n", data.speed); 250 dprintk("detected %u kHz as current frequency\n", speed);
256 return data.speed; 251 return speed;
257} 252}
258 253
259/** 254/**
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 19807b89f058..4fbd384fb645 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -373,7 +373,7 @@ static void __cpuinit init_nsc(struct cpuinfo_x86 *c)
373 /* Handle the GX (Formally known as the GX2) */ 373 /* Handle the GX (Formally known as the GX2) */
374 374
375 if (c->x86 == 5 && c->x86_model == 5) 375 if (c->x86 == 5 && c->x86_model == 5)
376 display_cacheinfo(c); 376 cpu_detect_cache_sizes(c);
377 else 377 else
378 init_cyrix(c); 378 init_cyrix(c);
379} 379}
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 8178d0352935..6c40f6b5b340 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -491,22 +491,6 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
491#endif 491#endif
492 } 492 }
493 493
494 if (trace)
495 printk(KERN_INFO "CPU: Trace cache: %dK uops", trace);
496 else if (l1i)
497 printk(KERN_INFO "CPU: L1 I cache: %dK", l1i);
498
499 if (l1d)
500 printk(KERN_CONT ", L1 D cache: %dK\n", l1d);
501 else
502 printk(KERN_CONT "\n");
503
504 if (l2)
505 printk(KERN_INFO "CPU: L2 cache: %dK\n", l2);
506
507 if (l3)
508 printk(KERN_INFO "CPU: L3 cache: %dK\n", l3);
509
510 c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d)); 494 c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
511 495
512 return l2; 496 return l2;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 4825a3d6eb40..d7ebf25d10ed 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -46,6 +46,9 @@
46 46
47#include "mce-internal.h" 47#include "mce-internal.h"
48 48
49#define CREATE_TRACE_POINTS
50#include <trace/events/mce.h>
51
49int mce_disabled __read_mostly; 52int mce_disabled __read_mostly;
50 53
51#define MISC_MCELOG_MINOR 227 54#define MISC_MCELOG_MINOR 227
@@ -85,18 +88,26 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
85static DEFINE_PER_CPU(struct mce, mces_seen); 88static DEFINE_PER_CPU(struct mce, mces_seen);
86static int cpu_missing; 89static int cpu_missing;
87 90
88static void default_decode_mce(struct mce *m) 91/*
92 * CPU/chipset specific EDAC code can register a notifier call here to print
93 * MCE errors in a human-readable form.
94 */
95ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
96EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
97
98static int default_decode_mce(struct notifier_block *nb, unsigned long val,
99 void *data)
89{ 100{
90 pr_emerg("No human readable MCE decoding support on this CPU type.\n"); 101 pr_emerg("No human readable MCE decoding support on this CPU type.\n");
91 pr_emerg("Run the message through 'mcelog --ascii' to decode.\n"); 102 pr_emerg("Run the message through 'mcelog --ascii' to decode.\n");
103
104 return NOTIFY_STOP;
92} 105}
93 106
94/* 107static struct notifier_block mce_dec_nb = {
95 * CPU/chipset specific EDAC code can register a callback here to print 108 .notifier_call = default_decode_mce,
96 * MCE errors in a human-readable form: 109 .priority = -1,
97 */ 110};
98void (*x86_mce_decode_callback)(struct mce *m) = default_decode_mce;
99EXPORT_SYMBOL(x86_mce_decode_callback);
100 111
101/* MCA banks polled by the period polling timer for corrected events */ 112/* MCA banks polled by the period polling timer for corrected events */
102DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 113DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
@@ -141,6 +152,9 @@ void mce_log(struct mce *mce)
141{ 152{
142 unsigned next, entry; 153 unsigned next, entry;
143 154
155 /* Emit the trace record: */
156 trace_mce_record(mce);
157
144 mce->finished = 0; 158 mce->finished = 0;
145 wmb(); 159 wmb();
146 for (;;) { 160 for (;;) {
@@ -204,9 +218,9 @@ static void print_mce(struct mce *m)
204 218
205 /* 219 /*
206 * Print out human-readable details about the MCE error, 220 * Print out human-readable details about the MCE error,
207 * (if the CPU has an implementation for that): 221 * (if the CPU has an implementation for that)
208 */ 222 */
209 x86_mce_decode_callback(m); 223 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
210} 224}
211 225
212static void print_mce_head(void) 226static void print_mce_head(void)
@@ -1122,7 +1136,7 @@ static int check_interval = 5 * 60; /* 5 minutes */
1122static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ 1136static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
1123static DEFINE_PER_CPU(struct timer_list, mce_timer); 1137static DEFINE_PER_CPU(struct timer_list, mce_timer);
1124 1138
1125static void mcheck_timer(unsigned long data) 1139static void mce_start_timer(unsigned long data)
1126{ 1140{
1127 struct timer_list *t = &per_cpu(mce_timer, data); 1141 struct timer_list *t = &per_cpu(mce_timer, data);
1128 int *n; 1142 int *n;
@@ -1187,7 +1201,7 @@ int mce_notify_irq(void)
1187} 1201}
1188EXPORT_SYMBOL_GPL(mce_notify_irq); 1202EXPORT_SYMBOL_GPL(mce_notify_irq);
1189 1203
1190static int mce_banks_init(void) 1204static int __cpuinit __mcheck_cpu_mce_banks_init(void)
1191{ 1205{
1192 int i; 1206 int i;
1193 1207
@@ -1206,7 +1220,7 @@ static int mce_banks_init(void)
1206/* 1220/*
1207 * Initialize Machine Checks for a CPU. 1221 * Initialize Machine Checks for a CPU.
1208 */ 1222 */
1209static int __cpuinit mce_cap_init(void) 1223static int __cpuinit __mcheck_cpu_cap_init(void)
1210{ 1224{
1211 unsigned b; 1225 unsigned b;
1212 u64 cap; 1226 u64 cap;
@@ -1228,7 +1242,7 @@ static int __cpuinit mce_cap_init(void)
1228 WARN_ON(banks != 0 && b != banks); 1242 WARN_ON(banks != 0 && b != banks);
1229 banks = b; 1243 banks = b;
1230 if (!mce_banks) { 1244 if (!mce_banks) {
1231 int err = mce_banks_init(); 1245 int err = __mcheck_cpu_mce_banks_init();
1232 1246
1233 if (err) 1247 if (err)
1234 return err; 1248 return err;
@@ -1244,7 +1258,7 @@ static int __cpuinit mce_cap_init(void)
1244 return 0; 1258 return 0;
1245} 1259}
1246 1260
1247static void mce_init(void) 1261static void __mcheck_cpu_init_generic(void)
1248{ 1262{
1249 mce_banks_t all_banks; 1263 mce_banks_t all_banks;
1250 u64 cap; 1264 u64 cap;
@@ -1273,7 +1287,7 @@ static void mce_init(void)
1273} 1287}
1274 1288
1275/* Add per CPU specific workarounds here */ 1289/* Add per CPU specific workarounds here */
1276static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) 1290static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1277{ 1291{
1278 if (c->x86_vendor == X86_VENDOR_UNKNOWN) { 1292 if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1279 pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); 1293 pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
@@ -1341,7 +1355,7 @@ static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
1341 return 0; 1355 return 0;
1342} 1356}
1343 1357
1344static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) 1358static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1345{ 1359{
1346 if (c->x86 != 5) 1360 if (c->x86 != 5)
1347 return; 1361 return;
@@ -1355,7 +1369,7 @@ static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
1355 } 1369 }
1356} 1370}
1357 1371
1358static void mce_cpu_features(struct cpuinfo_x86 *c) 1372static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1359{ 1373{
1360 switch (c->x86_vendor) { 1374 switch (c->x86_vendor) {
1361 case X86_VENDOR_INTEL: 1375 case X86_VENDOR_INTEL:
@@ -1369,7 +1383,7 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
1369 } 1383 }
1370} 1384}
1371 1385
1372static void mce_init_timer(void) 1386static void __mcheck_cpu_init_timer(void)
1373{ 1387{
1374 struct timer_list *t = &__get_cpu_var(mce_timer); 1388 struct timer_list *t = &__get_cpu_var(mce_timer);
1375 int *n = &__get_cpu_var(mce_next_interval); 1389 int *n = &__get_cpu_var(mce_next_interval);
@@ -1380,7 +1394,7 @@ static void mce_init_timer(void)
1380 *n = check_interval * HZ; 1394 *n = check_interval * HZ;
1381 if (!*n) 1395 if (!*n)
1382 return; 1396 return;
1383 setup_timer(t, mcheck_timer, smp_processor_id()); 1397 setup_timer(t, mce_start_timer, smp_processor_id());
1384 t->expires = round_jiffies(jiffies + *n); 1398 t->expires = round_jiffies(jiffies + *n);
1385 add_timer_on(t, smp_processor_id()); 1399 add_timer_on(t, smp_processor_id());
1386} 1400}
@@ -1400,27 +1414,28 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) =
1400 * Called for each booted CPU to set up machine checks. 1414 * Called for each booted CPU to set up machine checks.
1401 * Must be called with preempt off: 1415 * Must be called with preempt off:
1402 */ 1416 */
1403void __cpuinit mcheck_init(struct cpuinfo_x86 *c) 1417void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
1404{ 1418{
1405 if (mce_disabled) 1419 if (mce_disabled)
1406 return; 1420 return;
1407 1421
1408 mce_ancient_init(c); 1422 __mcheck_cpu_ancient_init(c);
1409 1423
1410 if (!mce_available(c)) 1424 if (!mce_available(c))
1411 return; 1425 return;
1412 1426
1413 if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) { 1427 if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1414 mce_disabled = 1; 1428 mce_disabled = 1;
1415 return; 1429 return;
1416 } 1430 }
1417 1431
1418 machine_check_vector = do_machine_check; 1432 machine_check_vector = do_machine_check;
1419 1433
1420 mce_init(); 1434 __mcheck_cpu_init_generic();
1421 mce_cpu_features(c); 1435 __mcheck_cpu_init_vendor(c);
1422 mce_init_timer(); 1436 __mcheck_cpu_init_timer();
1423 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); 1437 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
1438
1424} 1439}
1425 1440
1426/* 1441/*
@@ -1640,6 +1655,15 @@ static int __init mcheck_enable(char *str)
1640} 1655}
1641__setup("mce", mcheck_enable); 1656__setup("mce", mcheck_enable);
1642 1657
1658int __init mcheck_init(void)
1659{
1660 atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);
1661
1662 mcheck_intel_therm_init();
1663
1664 return 0;
1665}
1666
1643/* 1667/*
1644 * Sysfs support 1668 * Sysfs support
1645 */ 1669 */
@@ -1648,7 +1672,7 @@ __setup("mce", mcheck_enable);
1648 * Disable machine checks on suspend and shutdown. We can't really handle 1672 * Disable machine checks on suspend and shutdown. We can't really handle
1649 * them later. 1673 * them later.
1650 */ 1674 */
1651static int mce_disable(void) 1675static int mce_disable_error_reporting(void)
1652{ 1676{
1653 int i; 1677 int i;
1654 1678
@@ -1663,12 +1687,12 @@ static int mce_disable(void)
1663 1687
1664static int mce_suspend(struct sys_device *dev, pm_message_t state) 1688static int mce_suspend(struct sys_device *dev, pm_message_t state)
1665{ 1689{
1666 return mce_disable(); 1690 return mce_disable_error_reporting();
1667} 1691}
1668 1692
1669static int mce_shutdown(struct sys_device *dev) 1693static int mce_shutdown(struct sys_device *dev)
1670{ 1694{
1671 return mce_disable(); 1695 return mce_disable_error_reporting();
1672} 1696}
1673 1697
1674/* 1698/*
@@ -1678,8 +1702,8 @@ static int mce_shutdown(struct sys_device *dev)
1678 */ 1702 */
1679static int mce_resume(struct sys_device *dev) 1703static int mce_resume(struct sys_device *dev)
1680{ 1704{
1681 mce_init(); 1705 __mcheck_cpu_init_generic();
1682 mce_cpu_features(&current_cpu_data); 1706 __mcheck_cpu_init_vendor(&current_cpu_data);
1683 1707
1684 return 0; 1708 return 0;
1685} 1709}
@@ -1689,8 +1713,8 @@ static void mce_cpu_restart(void *data)
1689 del_timer_sync(&__get_cpu_var(mce_timer)); 1713 del_timer_sync(&__get_cpu_var(mce_timer));
1690 if (!mce_available(&current_cpu_data)) 1714 if (!mce_available(&current_cpu_data))
1691 return; 1715 return;
1692 mce_init(); 1716 __mcheck_cpu_init_generic();
1693 mce_init_timer(); 1717 __mcheck_cpu_init_timer();
1694} 1718}
1695 1719
1696/* Reinit MCEs after user configuration changes */ 1720/* Reinit MCEs after user configuration changes */
@@ -1716,7 +1740,7 @@ static void mce_enable_ce(void *all)
1716 cmci_reenable(); 1740 cmci_reenable();
1717 cmci_recheck(); 1741 cmci_recheck();
1718 if (all) 1742 if (all)
1719 mce_init_timer(); 1743 __mcheck_cpu_init_timer();
1720} 1744}
1721 1745
1722static struct sysdev_class mce_sysclass = { 1746static struct sysdev_class mce_sysclass = {
@@ -1929,13 +1953,14 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
1929} 1953}
1930 1954
1931/* Make sure there are no machine checks on offlined CPUs. */ 1955/* Make sure there are no machine checks on offlined CPUs. */
1932static void mce_disable_cpu(void *h) 1956static void __cpuinit mce_disable_cpu(void *h)
1933{ 1957{
1934 unsigned long action = *(unsigned long *)h; 1958 unsigned long action = *(unsigned long *)h;
1935 int i; 1959 int i;
1936 1960
1937 if (!mce_available(&current_cpu_data)) 1961 if (!mce_available(&current_cpu_data))
1938 return; 1962 return;
1963
1939 if (!(action & CPU_TASKS_FROZEN)) 1964 if (!(action & CPU_TASKS_FROZEN))
1940 cmci_clear(); 1965 cmci_clear();
1941 for (i = 0; i < banks; i++) { 1966 for (i = 0; i < banks; i++) {
@@ -1946,7 +1971,7 @@ static void mce_disable_cpu(void *h)
1946 } 1971 }
1947} 1972}
1948 1973
1949static void mce_reenable_cpu(void *h) 1974static void __cpuinit mce_reenable_cpu(void *h)
1950{ 1975{
1951 unsigned long action = *(unsigned long *)h; 1976 unsigned long action = *(unsigned long *)h;
1952 int i; 1977 int i;
@@ -2027,7 +2052,7 @@ static __init void mce_init_banks(void)
2027 } 2052 }
2028} 2053}
2029 2054
2030static __init int mce_init_device(void) 2055static __init int mcheck_init_device(void)
2031{ 2056{
2032 int err; 2057 int err;
2033 int i = 0; 2058 int i = 0;
@@ -2055,7 +2080,7 @@ static __init int mce_init_device(void)
2055 return err; 2080 return err;
2056} 2081}
2057 2082
2058device_initcall(mce_init_device); 2083device_initcall(mcheck_init_device);
2059 2084
2060/* 2085/*
2061 * Old style boot options parsing. Only for compatibility. 2086 * Old style boot options parsing. Only for compatibility.
@@ -2103,7 +2128,7 @@ static int fake_panic_set(void *data, u64 val)
2103DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, 2128DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
2104 fake_panic_set, "%llu\n"); 2129 fake_panic_set, "%llu\n");
2105 2130
2106static int __init mce_debugfs_init(void) 2131static int __init mcheck_debugfs_init(void)
2107{ 2132{
2108 struct dentry *dmce, *ffake_panic; 2133 struct dentry *dmce, *ffake_panic;
2109 2134
@@ -2117,5 +2142,5 @@ static int __init mce_debugfs_init(void)
2117 2142
2118 return 0; 2143 return 0;
2119} 2144}
2120late_initcall(mce_debugfs_init); 2145late_initcall(mcheck_debugfs_init);
2121#endif 2146#endif
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index b3a1dba75330..4fef985fc221 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -49,6 +49,8 @@ static DEFINE_PER_CPU(struct thermal_state, thermal_state);
49 49
50static atomic_t therm_throt_en = ATOMIC_INIT(0); 50static atomic_t therm_throt_en = ATOMIC_INIT(0);
51 51
52static u32 lvtthmr_init __read_mostly;
53
52#ifdef CONFIG_SYSFS 54#ifdef CONFIG_SYSFS
53#define define_therm_throt_sysdev_one_ro(_name) \ 55#define define_therm_throt_sysdev_one_ro(_name) \
54 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) 56 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
@@ -254,6 +256,18 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
254 ack_APIC_irq(); 256 ack_APIC_irq();
255} 257}
256 258
259void __init mcheck_intel_therm_init(void)
260{
261 /*
262 * This function is only called on boot CPU. Save the init thermal
263 * LVT value on BSP and use that value to restore APs' thermal LVT
264 * entry BIOS programmed later
265 */
266 if (cpu_has(&boot_cpu_data, X86_FEATURE_ACPI) &&
267 cpu_has(&boot_cpu_data, X86_FEATURE_ACC))
268 lvtthmr_init = apic_read(APIC_LVTTHMR);
269}
270
257void intel_init_thermal(struct cpuinfo_x86 *c) 271void intel_init_thermal(struct cpuinfo_x86 *c)
258{ 272{
259 unsigned int cpu = smp_processor_id(); 273 unsigned int cpu = smp_processor_id();
@@ -270,7 +284,20 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
270 * since it might be delivered via SMI already: 284 * since it might be delivered via SMI already:
271 */ 285 */
272 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 286 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
273 h = apic_read(APIC_LVTTHMR); 287
288 /*
289 * The initial value of thermal LVT entries on all APs always reads
290 * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
291 * sequence to them and LVT registers are reset to 0s except for
292 * the mask bits which are set to 1s when APs receive INIT IPI.
293 * Always restore the value that BIOS has programmed on AP based on
294 * BSP's info we saved since BIOS is always setting the same value
295 * for all threads/cores
296 */
297 apic_write(APIC_LVTTHMR, lvtthmr_init);
298
299 h = lvtthmr_init;
300
274 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { 301 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
275 printk(KERN_DEBUG 302 printk(KERN_DEBUG
276 "CPU%d: Thermal monitoring handled by SMI\n", cpu); 303 "CPU%d: Thermal monitoring handled by SMI\n", cpu);
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index b5801c311846..c1bbed1021d9 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -77,6 +77,18 @@ struct cpu_hw_events {
77 struct debug_store *ds; 77 struct debug_store *ds;
78}; 78};
79 79
80struct event_constraint {
81 unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
82 int code;
83};
84
85#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) }
86#define EVENT_CONSTRAINT_END { .code = 0, .idxmsk[0] = 0 }
87
88#define for_each_event_constraint(e, c) \
89 for ((e) = (c); (e)->idxmsk[0]; (e)++)
90
91
80/* 92/*
81 * struct x86_pmu - generic x86 pmu 93 * struct x86_pmu - generic x86 pmu
82 */ 94 */
@@ -102,6 +114,8 @@ struct x86_pmu {
102 u64 intel_ctrl; 114 u64 intel_ctrl;
103 void (*enable_bts)(u64 config); 115 void (*enable_bts)(u64 config);
104 void (*disable_bts)(void); 116 void (*disable_bts)(void);
117 int (*get_event_idx)(struct cpu_hw_events *cpuc,
118 struct hw_perf_event *hwc);
105}; 119};
106 120
107static struct x86_pmu x86_pmu __read_mostly; 121static struct x86_pmu x86_pmu __read_mostly;
@@ -110,6 +124,8 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
110 .enabled = 1, 124 .enabled = 1,
111}; 125};
112 126
127static const struct event_constraint *event_constraints;
128
113/* 129/*
114 * Not sure about some of these 130 * Not sure about some of these
115 */ 131 */
@@ -155,6 +171,16 @@ static u64 p6_pmu_raw_event(u64 hw_event)
155 return hw_event & P6_EVNTSEL_MASK; 171 return hw_event & P6_EVNTSEL_MASK;
156} 172}
157 173
174static const struct event_constraint intel_p6_event_constraints[] =
175{
176 EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
177 EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
178 EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */
179 EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
180 EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
181 EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
182 EVENT_CONSTRAINT_END
183};
158 184
159/* 185/*
160 * Intel PerfMon v3. Used on Core2 and later. 186 * Intel PerfMon v3. Used on Core2 and later.
@@ -170,6 +196,35 @@ static const u64 intel_perfmon_event_map[] =
170 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, 196 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
171}; 197};
172 198
199static const struct event_constraint intel_core_event_constraints[] =
200{
201 EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
202 EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
203 EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
204 EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
205 EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
206 EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
207 EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
208 EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
209 EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
210 EVENT_CONSTRAINT_END
211};
212
213static const struct event_constraint intel_nehalem_event_constraints[] =
214{
215 EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
216 EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
217 EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
218 EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
219 EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
220 EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */
221 EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
222 EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */
223 EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */
224 EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */
225 EVENT_CONSTRAINT_END
226};
227
173static u64 intel_pmu_event_map(int hw_event) 228static u64 intel_pmu_event_map(int hw_event)
174{ 229{
175 return intel_perfmon_event_map[hw_event]; 230 return intel_perfmon_event_map[hw_event];
@@ -190,7 +245,7 @@ static u64 __read_mostly hw_cache_event_ids
190 [PERF_COUNT_HW_CACHE_OP_MAX] 245 [PERF_COUNT_HW_CACHE_OP_MAX]
191 [PERF_COUNT_HW_CACHE_RESULT_MAX]; 246 [PERF_COUNT_HW_CACHE_RESULT_MAX];
192 247
193static const u64 nehalem_hw_cache_event_ids 248static __initconst u64 nehalem_hw_cache_event_ids
194 [PERF_COUNT_HW_CACHE_MAX] 249 [PERF_COUNT_HW_CACHE_MAX]
195 [PERF_COUNT_HW_CACHE_OP_MAX] 250 [PERF_COUNT_HW_CACHE_OP_MAX]
196 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 251 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -281,7 +336,7 @@ static const u64 nehalem_hw_cache_event_ids
281 }, 336 },
282}; 337};
283 338
284static const u64 core2_hw_cache_event_ids 339static __initconst u64 core2_hw_cache_event_ids
285 [PERF_COUNT_HW_CACHE_MAX] 340 [PERF_COUNT_HW_CACHE_MAX]
286 [PERF_COUNT_HW_CACHE_OP_MAX] 341 [PERF_COUNT_HW_CACHE_OP_MAX]
287 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 342 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -372,7 +427,7 @@ static const u64 core2_hw_cache_event_ids
372 }, 427 },
373}; 428};
374 429
375static const u64 atom_hw_cache_event_ids 430static __initconst u64 atom_hw_cache_event_ids
376 [PERF_COUNT_HW_CACHE_MAX] 431 [PERF_COUNT_HW_CACHE_MAX]
377 [PERF_COUNT_HW_CACHE_OP_MAX] 432 [PERF_COUNT_HW_CACHE_OP_MAX]
378 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 433 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -469,7 +524,7 @@ static u64 intel_pmu_raw_event(u64 hw_event)
469#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL 524#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
470#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL 525#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
471#define CORE_EVNTSEL_INV_MASK 0x00800000ULL 526#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
472#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL 527#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
473 528
474#define CORE_EVNTSEL_MASK \ 529#define CORE_EVNTSEL_MASK \
475 (CORE_EVNTSEL_EVENT_MASK | \ 530 (CORE_EVNTSEL_EVENT_MASK | \
@@ -481,7 +536,7 @@ static u64 intel_pmu_raw_event(u64 hw_event)
481 return hw_event & CORE_EVNTSEL_MASK; 536 return hw_event & CORE_EVNTSEL_MASK;
482} 537}
483 538
484static const u64 amd_hw_cache_event_ids 539static __initconst u64 amd_hw_cache_event_ids
485 [PERF_COUNT_HW_CACHE_MAX] 540 [PERF_COUNT_HW_CACHE_MAX]
486 [PERF_COUNT_HW_CACHE_OP_MAX] 541 [PERF_COUNT_HW_CACHE_OP_MAX]
487 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 542 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -932,6 +987,8 @@ static int __hw_perf_event_init(struct perf_event *event)
932 */ 987 */
933 hwc->config = ARCH_PERFMON_EVENTSEL_INT; 988 hwc->config = ARCH_PERFMON_EVENTSEL_INT;
934 989
990 hwc->idx = -1;
991
935 /* 992 /*
936 * Count user and OS events unless requested not to. 993 * Count user and OS events unless requested not to.
937 */ 994 */
@@ -1334,8 +1391,7 @@ static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1334 x86_pmu_enable_event(hwc, idx); 1391 x86_pmu_enable_event(hwc, idx);
1335} 1392}
1336 1393
1337static int 1394static int fixed_mode_idx(struct hw_perf_event *hwc)
1338fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
1339{ 1395{
1340 unsigned int hw_event; 1396 unsigned int hw_event;
1341 1397
@@ -1349,6 +1405,12 @@ fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
1349 if (!x86_pmu.num_events_fixed) 1405 if (!x86_pmu.num_events_fixed)
1350 return -1; 1406 return -1;
1351 1407
1408 /*
1409 * fixed counters do not take all possible filters
1410 */
1411 if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK)
1412 return -1;
1413
1352 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) 1414 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
1353 return X86_PMC_IDX_FIXED_INSTRUCTIONS; 1415 return X86_PMC_IDX_FIXED_INSTRUCTIONS;
1354 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) 1416 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
@@ -1360,22 +1422,57 @@ fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
1360} 1422}
1361 1423
1362/* 1424/*
1363 * Find a PMC slot for the freshly enabled / scheduled in event: 1425 * generic counter allocator: get next free counter
1364 */ 1426 */
1365static int x86_pmu_enable(struct perf_event *event) 1427static int
1428gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1429{
1430 int idx;
1431
1432 idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events);
1433 return idx == x86_pmu.num_events ? -1 : idx;
1434}
1435
1436/*
1437 * intel-specific counter allocator: check event constraints
1438 */
1439static int
1440intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1441{
1442 const struct event_constraint *event_constraint;
1443 int i, code;
1444
1445 if (!event_constraints)
1446 goto skip;
1447
1448 code = hwc->config & CORE_EVNTSEL_EVENT_MASK;
1449
1450 for_each_event_constraint(event_constraint, event_constraints) {
1451 if (code == event_constraint->code) {
1452 for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) {
1453 if (!test_and_set_bit(i, cpuc->used_mask))
1454 return i;
1455 }
1456 return -1;
1457 }
1458 }
1459skip:
1460 return gen_get_event_idx(cpuc, hwc);
1461}
1462
1463static int
1464x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1366{ 1465{
1367 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1368 struct hw_perf_event *hwc = &event->hw;
1369 int idx; 1466 int idx;
1370 1467
1371 idx = fixed_mode_idx(event, hwc); 1468 idx = fixed_mode_idx(hwc);
1372 if (idx == X86_PMC_IDX_FIXED_BTS) { 1469 if (idx == X86_PMC_IDX_FIXED_BTS) {
1373 /* BTS is already occupied. */ 1470 /* BTS is already occupied. */
1374 if (test_and_set_bit(idx, cpuc->used_mask)) 1471 if (test_and_set_bit(idx, cpuc->used_mask))
1375 return -EAGAIN; 1472 return -EAGAIN;
1376 1473
1377 hwc->config_base = 0; 1474 hwc->config_base = 0;
1378 hwc->event_base = 0; 1475 hwc->event_base = 0;
1379 hwc->idx = idx; 1476 hwc->idx = idx;
1380 } else if (idx >= 0) { 1477 } else if (idx >= 0) {
1381 /* 1478 /*
@@ -1396,20 +1493,35 @@ static int x86_pmu_enable(struct perf_event *event)
1396 } else { 1493 } else {
1397 idx = hwc->idx; 1494 idx = hwc->idx;
1398 /* Try to get the previous generic event again */ 1495 /* Try to get the previous generic event again */
1399 if (test_and_set_bit(idx, cpuc->used_mask)) { 1496 if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) {
1400try_generic: 1497try_generic:
1401 idx = find_first_zero_bit(cpuc->used_mask, 1498 idx = x86_pmu.get_event_idx(cpuc, hwc);
1402 x86_pmu.num_events); 1499 if (idx == -1)
1403 if (idx == x86_pmu.num_events)
1404 return -EAGAIN; 1500 return -EAGAIN;
1405 1501
1406 set_bit(idx, cpuc->used_mask); 1502 set_bit(idx, cpuc->used_mask);
1407 hwc->idx = idx; 1503 hwc->idx = idx;
1408 } 1504 }
1409 hwc->config_base = x86_pmu.eventsel; 1505 hwc->config_base = x86_pmu.eventsel;
1410 hwc->event_base = x86_pmu.perfctr; 1506 hwc->event_base = x86_pmu.perfctr;
1411 } 1507 }
1412 1508
1509 return idx;
1510}
1511
1512/*
1513 * Find a PMC slot for the freshly enabled / scheduled in event:
1514 */
1515static int x86_pmu_enable(struct perf_event *event)
1516{
1517 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1518 struct hw_perf_event *hwc = &event->hw;
1519 int idx;
1520
1521 idx = x86_schedule_event(cpuc, hwc);
1522 if (idx < 0)
1523 return idx;
1524
1413 perf_events_lapic_init(); 1525 perf_events_lapic_init();
1414 1526
1415 x86_pmu.disable(hwc, idx); 1527 x86_pmu.disable(hwc, idx);
@@ -1852,7 +1964,7 @@ static __read_mostly struct notifier_block perf_event_nmi_notifier = {
1852 .priority = 1 1964 .priority = 1
1853}; 1965};
1854 1966
1855static struct x86_pmu p6_pmu = { 1967static __initconst struct x86_pmu p6_pmu = {
1856 .name = "p6", 1968 .name = "p6",
1857 .handle_irq = p6_pmu_handle_irq, 1969 .handle_irq = p6_pmu_handle_irq,
1858 .disable_all = p6_pmu_disable_all, 1970 .disable_all = p6_pmu_disable_all,
@@ -1877,9 +1989,10 @@ static struct x86_pmu p6_pmu = {
1877 */ 1989 */
1878 .event_bits = 32, 1990 .event_bits = 32,
1879 .event_mask = (1ULL << 32) - 1, 1991 .event_mask = (1ULL << 32) - 1,
1992 .get_event_idx = intel_get_event_idx,
1880}; 1993};
1881 1994
1882static struct x86_pmu intel_pmu = { 1995static __initconst struct x86_pmu intel_pmu = {
1883 .name = "Intel", 1996 .name = "Intel",
1884 .handle_irq = intel_pmu_handle_irq, 1997 .handle_irq = intel_pmu_handle_irq,
1885 .disable_all = intel_pmu_disable_all, 1998 .disable_all = intel_pmu_disable_all,
@@ -1900,9 +2013,10 @@ static struct x86_pmu intel_pmu = {
1900 .max_period = (1ULL << 31) - 1, 2013 .max_period = (1ULL << 31) - 1,
1901 .enable_bts = intel_pmu_enable_bts, 2014 .enable_bts = intel_pmu_enable_bts,
1902 .disable_bts = intel_pmu_disable_bts, 2015 .disable_bts = intel_pmu_disable_bts,
2016 .get_event_idx = intel_get_event_idx,
1903}; 2017};
1904 2018
1905static struct x86_pmu amd_pmu = { 2019static __initconst struct x86_pmu amd_pmu = {
1906 .name = "AMD", 2020 .name = "AMD",
1907 .handle_irq = amd_pmu_handle_irq, 2021 .handle_irq = amd_pmu_handle_irq,
1908 .disable_all = amd_pmu_disable_all, 2022 .disable_all = amd_pmu_disable_all,
@@ -1920,9 +2034,10 @@ static struct x86_pmu amd_pmu = {
1920 .apic = 1, 2034 .apic = 1,
1921 /* use highest bit to detect overflow */ 2035 /* use highest bit to detect overflow */
1922 .max_period = (1ULL << 47) - 1, 2036 .max_period = (1ULL << 47) - 1,
2037 .get_event_idx = gen_get_event_idx,
1923}; 2038};
1924 2039
1925static int p6_pmu_init(void) 2040static __init int p6_pmu_init(void)
1926{ 2041{
1927 switch (boot_cpu_data.x86_model) { 2042 switch (boot_cpu_data.x86_model) {
1928 case 1: 2043 case 1:
@@ -1932,10 +2047,12 @@ static int p6_pmu_init(void)
1932 case 7: 2047 case 7:
1933 case 8: 2048 case 8:
1934 case 11: /* Pentium III */ 2049 case 11: /* Pentium III */
2050 event_constraints = intel_p6_event_constraints;
1935 break; 2051 break;
1936 case 9: 2052 case 9:
1937 case 13: 2053 case 13:
1938 /* Pentium M */ 2054 /* Pentium M */
2055 event_constraints = intel_p6_event_constraints;
1939 break; 2056 break;
1940 default: 2057 default:
1941 pr_cont("unsupported p6 CPU model %d ", 2058 pr_cont("unsupported p6 CPU model %d ",
@@ -1954,7 +2071,7 @@ static int p6_pmu_init(void)
1954 return 0; 2071 return 0;
1955} 2072}
1956 2073
1957static int intel_pmu_init(void) 2074static __init int intel_pmu_init(void)
1958{ 2075{
1959 union cpuid10_edx edx; 2076 union cpuid10_edx edx;
1960 union cpuid10_eax eax; 2077 union cpuid10_eax eax;
@@ -2007,12 +2124,14 @@ static int intel_pmu_init(void)
2007 sizeof(hw_cache_event_ids)); 2124 sizeof(hw_cache_event_ids));
2008 2125
2009 pr_cont("Core2 events, "); 2126 pr_cont("Core2 events, ");
2127 event_constraints = intel_core_event_constraints;
2010 break; 2128 break;
2011 default: 2129 default:
2012 case 26: 2130 case 26:
2013 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, 2131 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
2014 sizeof(hw_cache_event_ids)); 2132 sizeof(hw_cache_event_ids));
2015 2133
2134 event_constraints = intel_nehalem_event_constraints;
2016 pr_cont("Nehalem/Corei7 events, "); 2135 pr_cont("Nehalem/Corei7 events, ");
2017 break; 2136 break;
2018 case 28: 2137 case 28:
@@ -2025,7 +2144,7 @@ static int intel_pmu_init(void)
2025 return 0; 2144 return 0;
2026} 2145}
2027 2146
2028static int amd_pmu_init(void) 2147static __init int amd_pmu_init(void)
2029{ 2148{
2030 /* Performance-monitoring supported from K7 and later: */ 2149 /* Performance-monitoring supported from K7 and later: */
2031 if (boot_cpu_data.x86 < 6) 2150 if (boot_cpu_data.x86 < 6)
@@ -2105,11 +2224,47 @@ static const struct pmu pmu = {
2105 .unthrottle = x86_pmu_unthrottle, 2224 .unthrottle = x86_pmu_unthrottle,
2106}; 2225};
2107 2226
2227static int
2228validate_event(struct cpu_hw_events *cpuc, struct perf_event *event)
2229{
2230 struct hw_perf_event fake_event = event->hw;
2231
2232 if (event->pmu && event->pmu != &pmu)
2233 return 0;
2234
2235 return x86_schedule_event(cpuc, &fake_event) >= 0;
2236}
2237
2238static int validate_group(struct perf_event *event)
2239{
2240 struct perf_event *sibling, *leader = event->group_leader;
2241 struct cpu_hw_events fake_pmu;
2242
2243 memset(&fake_pmu, 0, sizeof(fake_pmu));
2244
2245 if (!validate_event(&fake_pmu, leader))
2246 return -ENOSPC;
2247
2248 list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
2249 if (!validate_event(&fake_pmu, sibling))
2250 return -ENOSPC;
2251 }
2252
2253 if (!validate_event(&fake_pmu, event))
2254 return -ENOSPC;
2255
2256 return 0;
2257}
2258
2108const struct pmu *hw_perf_event_init(struct perf_event *event) 2259const struct pmu *hw_perf_event_init(struct perf_event *event)
2109{ 2260{
2110 int err; 2261 int err;
2111 2262
2112 err = __hw_perf_event_init(event); 2263 err = __hw_perf_event_init(event);
2264 if (!err) {
2265 if (event->group_leader != event)
2266 err = validate_group(event);
2267 }
2113 if (err) { 2268 if (err) {
2114 if (event->destroy) 2269 if (event->destroy)
2115 event->destroy(event); 2270 event->destroy(event);
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index fab786f60ed6..898df9719afb 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -712,7 +712,7 @@ static void probe_nmi_watchdog(void)
712 switch (boot_cpu_data.x86_vendor) { 712 switch (boot_cpu_data.x86_vendor) {
713 case X86_VENDOR_AMD: 713 case X86_VENDOR_AMD:
714 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && 714 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
715 boot_cpu_data.x86 != 16) 715 boot_cpu_data.x86 != 16 && boot_cpu_data.x86 != 17)
716 return; 716 return;
717 wd_ops = &k7_wd_ops; 717 wd_ops = &k7_wd_ops;
718 break; 718 break;
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index bb62b3e5caad..28000743bbb0 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -26,7 +26,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
26 26
27 early_init_transmeta(c); 27 early_init_transmeta(c);
28 28
29 display_cacheinfo(c); 29 cpu_detect_cache_sizes(c);
30 30
31 /* Print CMS and CPU revision */ 31 /* Print CMS and CPU revision */
32 max = cpuid_eax(0x80860000); 32 max = cpuid_eax(0x80860000);
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 6a52d4b36a30..7ef24a796992 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -116,21 +116,16 @@ static int cpuid_open(struct inode *inode, struct file *file)
116{ 116{
117 unsigned int cpu; 117 unsigned int cpu;
118 struct cpuinfo_x86 *c; 118 struct cpuinfo_x86 *c;
119 int ret = 0;
120
121 lock_kernel();
122 119
123 cpu = iminor(file->f_path.dentry->d_inode); 120 cpu = iminor(file->f_path.dentry->d_inode);
124 if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { 121 if (cpu >= nr_cpu_ids || !cpu_online(cpu))
125 ret = -ENXIO; /* No such CPU */ 122 return -ENXIO; /* No such CPU */
126 goto out; 123
127 }
128 c = &cpu_data(cpu); 124 c = &cpu_data(cpu);
129 if (c->cpuid_level < 0) 125 if (c->cpuid_level < 0)
130 ret = -EIO; /* CPUID not supported */ 126 return -EIO; /* CPUID not supported */
131out: 127
132 unlock_kernel(); 128 return 0;
133 return ret;
134} 129}
135 130
136/* 131/*
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 5e409dc298a4..a4849c10a77e 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -27,8 +27,7 @@
27#include <asm/cpu.h> 27#include <asm/cpu.h>
28#include <asm/reboot.h> 28#include <asm/reboot.h>
29#include <asm/virtext.h> 29#include <asm/virtext.h>
30#include <asm/iommu.h> 30#include <asm/x86_init.h>
31
32 31
33#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) 32#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
34 33
@@ -106,7 +105,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
106#endif 105#endif
107 106
108#ifdef CONFIG_X86_64 107#ifdef CONFIG_X86_64
109 pci_iommu_shutdown(); 108 x86_platform.iommu_shutdown();
110#endif 109#endif
111 110
112 crash_save_cpu(regs, safe_smp_processor_id()); 111 crash_save_cpu(regs, safe_smp_processor_id());
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 2d8a371d4339..b8ce165dde5d 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -268,11 +268,12 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
268 268
269 show_registers(regs); 269 show_registers(regs);
270#ifdef CONFIG_X86_32 270#ifdef CONFIG_X86_32
271 sp = (unsigned long) (&regs->sp); 271 if (user_mode_vm(regs)) {
272 savesegment(ss, ss);
273 if (user_mode(regs)) {
274 sp = regs->sp; 272 sp = regs->sp;
275 ss = regs->ss & 0xffff; 273 ss = regs->ss & 0xffff;
274 } else {
275 sp = kernel_stack_pointer(regs);
276 savesegment(ss, ss);
276 } 277 }
277 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); 278 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
278 print_symbol("%s", regs->ip); 279 print_symbol("%s", regs->ip);
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index f7dd2a7c3bf4..e0ed4c7abb62 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -10,9 +10,9 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/ptrace.h> 11#include <linux/ptrace.h>
12#include <linux/kexec.h> 12#include <linux/kexec.h>
13#include <linux/sysfs.h>
13#include <linux/bug.h> 14#include <linux/bug.h>
14#include <linux/nmi.h> 15#include <linux/nmi.h>
15#include <linux/sysfs.h>
16 16
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
@@ -35,6 +35,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
35 35
36 if (!stack) { 36 if (!stack) {
37 unsigned long dummy; 37 unsigned long dummy;
38
38 stack = &dummy; 39 stack = &dummy;
39 if (task && task != current) 40 if (task && task != current)
40 stack = (unsigned long *)task->thread.sp; 41 stack = (unsigned long *)task->thread.sp;
@@ -57,8 +58,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
57 58
58 context = (struct thread_info *) 59 context = (struct thread_info *)
59 ((unsigned long)stack & (~(THREAD_SIZE - 1))); 60 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
60 bp = print_context_stack(context, stack, bp, ops, 61 bp = print_context_stack(context, stack, bp, ops, data, NULL, &graph);
61 data, NULL, &graph);
62 62
63 stack = (unsigned long *)context->previous_esp; 63 stack = (unsigned long *)context->previous_esp;
64 if (!stack) 64 if (!stack)
@@ -72,7 +72,7 @@ EXPORT_SYMBOL(dump_trace);
72 72
73void 73void
74show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 74show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
75 unsigned long *sp, unsigned long bp, char *log_lvl) 75 unsigned long *sp, unsigned long bp, char *log_lvl)
76{ 76{
77 unsigned long *stack; 77 unsigned long *stack;
78 int i; 78 int i;
@@ -156,4 +156,3 @@ int is_valid_bugaddr(unsigned long ip)
156 156
157 return ud2 == 0x0b0f; 157 return ud2 == 0x0b0f;
158} 158}
159
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index a071e6be177e..8e740934bd1f 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -10,26 +10,28 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/ptrace.h> 11#include <linux/ptrace.h>
12#include <linux/kexec.h> 12#include <linux/kexec.h>
13#include <linux/sysfs.h>
13#include <linux/bug.h> 14#include <linux/bug.h>
14#include <linux/nmi.h> 15#include <linux/nmi.h>
15#include <linux/sysfs.h>
16 16
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
19#include "dumpstack.h" 19#include "dumpstack.h"
20 20
21#define N_EXCEPTION_STACKS_END \
22 (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2)
21 23
22static char x86_stack_ids[][8] = { 24static char x86_stack_ids[][8] = {
23 [DEBUG_STACK - 1] = "#DB", 25 [ DEBUG_STACK-1 ] = "#DB",
24 [NMI_STACK - 1] = "NMI", 26 [ NMI_STACK-1 ] = "NMI",
25 [DOUBLEFAULT_STACK - 1] = "#DF", 27 [ DOUBLEFAULT_STACK-1 ] = "#DF",
26 [STACKFAULT_STACK - 1] = "#SS", 28 [ STACKFAULT_STACK-1 ] = "#SS",
27 [MCE_STACK - 1] = "#MC", 29 [ MCE_STACK-1 ] = "#MC",
28#if DEBUG_STKSZ > EXCEPTION_STKSZ 30#if DEBUG_STKSZ > EXCEPTION_STKSZ
29 [N_EXCEPTION_STACKS ... 31 [ N_EXCEPTION_STACKS ...
30 N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" 32 N_EXCEPTION_STACKS_END ] = "#DB[?]"
31#endif 33#endif
32 }; 34};
33 35
34int x86_is_stack_id(int id, char *name) 36int x86_is_stack_id(int id, char *name)
35{ 37{
@@ -37,7 +39,7 @@ int x86_is_stack_id(int id, char *name)
37} 39}
38 40
39static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 41static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
40 unsigned *usedp, char **idp) 42 unsigned *usedp, char **idp)
41{ 43{
42 unsigned k; 44 unsigned k;
43 45
@@ -202,21 +204,24 @@ EXPORT_SYMBOL(dump_trace);
202 204
203void 205void
204show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 206show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
205 unsigned long *sp, unsigned long bp, char *log_lvl) 207 unsigned long *sp, unsigned long bp, char *log_lvl)
206{ 208{
209 unsigned long *irq_stack_end;
210 unsigned long *irq_stack;
207 unsigned long *stack; 211 unsigned long *stack;
212 int cpu;
208 int i; 213 int i;
209 const int cpu = smp_processor_id(); 214
210 unsigned long *irq_stack_end = 215 preempt_disable();
211 (unsigned long *)(per_cpu(irq_stack_ptr, cpu)); 216 cpu = smp_processor_id();
212 unsigned long *irq_stack = 217
213 (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE); 218 irq_stack_end = (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
219 irq_stack = (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE);
214 220
215 /* 221 /*
216 * debugging aid: "show_stack(NULL, NULL);" prints the 222 * Debugging aid: "show_stack(NULL, NULL);" prints the
217 * back trace for this cpu. 223 * back trace for this cpu:
218 */ 224 */
219
220 if (sp == NULL) { 225 if (sp == NULL) {
221 if (task) 226 if (task)
222 sp = (unsigned long *)task->thread.sp; 227 sp = (unsigned long *)task->thread.sp;
@@ -240,6 +245,8 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
240 printk(" %016lx", *stack++); 245 printk(" %016lx", *stack++);
241 touch_nmi_watchdog(); 246 touch_nmi_watchdog();
242 } 247 }
248 preempt_enable();
249
243 printk("\n"); 250 printk("\n");
244 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 251 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
245} 252}
@@ -303,4 +310,3 @@ int is_valid_bugaddr(unsigned long ip)
303 310
304 return ud2 == 0x0b0f; 311 return ud2 == 0x0b0f;
305} 312}
306
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c097e7d607c6..50b9c220e121 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -334,6 +334,10 @@ ENTRY(ret_from_fork)
334END(ret_from_fork) 334END(ret_from_fork)
335 335
336/* 336/*
337 * Interrupt exit functions should be protected against kprobes
338 */
339 .pushsection .kprobes.text, "ax"
340/*
337 * Return to user mode is not as complex as all this looks, 341 * Return to user mode is not as complex as all this looks,
338 * but we want the default path for a system call return to 342 * but we want the default path for a system call return to
339 * go as quickly as possible which is why some of this is 343 * go as quickly as possible which is why some of this is
@@ -383,6 +387,10 @@ need_resched:
383END(resume_kernel) 387END(resume_kernel)
384#endif 388#endif
385 CFI_ENDPROC 389 CFI_ENDPROC
390/*
391 * End of kprobes section
392 */
393 .popsection
386 394
387/* SYSENTER_RETURN points to after the "sysenter" instruction in 395/* SYSENTER_RETURN points to after the "sysenter" instruction in
388 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ 396 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
@@ -513,6 +521,10 @@ sysexit_audit:
513 PTGS_TO_GS_EX 521 PTGS_TO_GS_EX
514ENDPROC(ia32_sysenter_target) 522ENDPROC(ia32_sysenter_target)
515 523
524/*
525 * syscall stub including irq exit should be protected against kprobes
526 */
527 .pushsection .kprobes.text, "ax"
516 # system call handler stub 528 # system call handler stub
517ENTRY(system_call) 529ENTRY(system_call)
518 RING0_INT_FRAME # can't unwind into user space anyway 530 RING0_INT_FRAME # can't unwind into user space anyway
@@ -705,6 +717,10 @@ syscall_badsys:
705 jmp resume_userspace 717 jmp resume_userspace
706END(syscall_badsys) 718END(syscall_badsys)
707 CFI_ENDPROC 719 CFI_ENDPROC
720/*
721 * End of kprobes section
722 */
723 .popsection
708 724
709/* 725/*
710 * System calls that need a pt_regs pointer. 726 * System calls that need a pt_regs pointer.
@@ -814,6 +830,10 @@ common_interrupt:
814ENDPROC(common_interrupt) 830ENDPROC(common_interrupt)
815 CFI_ENDPROC 831 CFI_ENDPROC
816 832
833/*
834 * Irq entries should be protected against kprobes
835 */
836 .pushsection .kprobes.text, "ax"
817#define BUILD_INTERRUPT3(name, nr, fn) \ 837#define BUILD_INTERRUPT3(name, nr, fn) \
818ENTRY(name) \ 838ENTRY(name) \
819 RING0_INT_FRAME; \ 839 RING0_INT_FRAME; \
@@ -980,6 +1000,10 @@ ENTRY(spurious_interrupt_bug)
980 jmp error_code 1000 jmp error_code
981 CFI_ENDPROC 1001 CFI_ENDPROC
982END(spurious_interrupt_bug) 1002END(spurious_interrupt_bug)
1003/*
1004 * End of kprobes section
1005 */
1006 .popsection
983 1007
984ENTRY(kernel_thread_helper) 1008ENTRY(kernel_thread_helper)
985 pushl $0 # fake return address for unwinder 1009 pushl $0 # fake return address for unwinder
@@ -1185,17 +1209,14 @@ END(ftrace_graph_caller)
1185 1209
1186.globl return_to_handler 1210.globl return_to_handler
1187return_to_handler: 1211return_to_handler:
1188 pushl $0
1189 pushl %eax 1212 pushl %eax
1190 pushl %ecx
1191 pushl %edx 1213 pushl %edx
1192 movl %ebp, %eax 1214 movl %ebp, %eax
1193 call ftrace_return_to_handler 1215 call ftrace_return_to_handler
1194 movl %eax, 0xc(%esp) 1216 movl %eax, %ecx
1195 popl %edx 1217 popl %edx
1196 popl %ecx
1197 popl %eax 1218 popl %eax
1198 ret 1219 jmp *%ecx
1199#endif 1220#endif
1200 1221
1201.section .rodata,"a" 1222.section .rodata,"a"
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b5c061f8f358..4deb8fc849dd 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -155,11 +155,11 @@ GLOBAL(return_to_handler)
155 155
156 call ftrace_return_to_handler 156 call ftrace_return_to_handler
157 157
158 movq %rax, 16(%rsp) 158 movq %rax, %rdi
159 movq 8(%rsp), %rdx 159 movq 8(%rsp), %rdx
160 movq (%rsp), %rax 160 movq (%rsp), %rax
161 addq $16, %rsp 161 addq $24, %rsp
162 retq 162 jmp *%rdi
163#endif 163#endif
164 164
165 165
@@ -803,6 +803,10 @@ END(interrupt)
803 call \func 803 call \func
804 .endm 804 .endm
805 805
806/*
807 * Interrupt entry/exit should be protected against kprobes
808 */
809 .pushsection .kprobes.text, "ax"
806 /* 810 /*
807 * The interrupt stubs push (~vector+0x80) onto the stack and 811 * The interrupt stubs push (~vector+0x80) onto the stack and
808 * then jump to common_interrupt. 812 * then jump to common_interrupt.
@@ -941,6 +945,10 @@ ENTRY(retint_kernel)
941 945
942 CFI_ENDPROC 946 CFI_ENDPROC
943END(common_interrupt) 947END(common_interrupt)
948/*
949 * End of kprobes section
950 */
951 .popsection
944 952
945/* 953/*
946 * APIC interrupts. 954 * APIC interrupts.
@@ -1491,12 +1499,17 @@ error_kernelspace:
1491 leaq irq_return(%rip),%rcx 1499 leaq irq_return(%rip),%rcx
1492 cmpq %rcx,RIP+8(%rsp) 1500 cmpq %rcx,RIP+8(%rsp)
1493 je error_swapgs 1501 je error_swapgs
1494 movl %ecx,%ecx /* zero extend */ 1502 movl %ecx,%eax /* zero extend */
1495 cmpq %rcx,RIP+8(%rsp) 1503 cmpq %rax,RIP+8(%rsp)
1496 je error_swapgs 1504 je bstep_iret
1497 cmpq $gs_change,RIP+8(%rsp) 1505 cmpq $gs_change,RIP+8(%rsp)
1498 je error_swapgs 1506 je error_swapgs
1499 jmp error_sti 1507 jmp error_sti
1508
1509bstep_iret:
1510 /* Fix truncated RIP */
1511 movq %rcx,RIP+8(%rsp)
1512 jmp error_swapgs
1500END(error_entry) 1513END(error_entry)
1501 1514
1502 1515
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 9dbb527e1652..5a1b9758fd62 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -9,6 +9,8 @@
9 * the dangers of modifying code on the run. 9 * the dangers of modifying code on the run.
10 */ 10 */
11 11
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13
12#include <linux/spinlock.h> 14#include <linux/spinlock.h>
13#include <linux/hardirq.h> 15#include <linux/hardirq.h>
14#include <linux/uaccess.h> 16#include <linux/uaccess.h>
@@ -336,15 +338,15 @@ int __init ftrace_dyn_arch_init(void *data)
336 338
337 switch (faulted) { 339 switch (faulted) {
338 case 0: 340 case 0:
339 pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n"); 341 pr_info("converting mcount calls to 0f 1f 44 00 00\n");
340 memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE); 342 memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE);
341 break; 343 break;
342 case 1: 344 case 1:
343 pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n"); 345 pr_info("converting mcount calls to 66 66 66 66 90\n");
344 memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE); 346 memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE);
345 break; 347 break;
346 case 2: 348 case 2:
347 pr_info("ftrace: converting mcount calls to jmp . + 5\n"); 349 pr_info("converting mcount calls to jmp . + 5\n");
348 memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE); 350 memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE);
349 break; 351 break;
350 } 352 }
@@ -468,82 +470,10 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
468 470
469#ifdef CONFIG_FTRACE_SYSCALLS 471#ifdef CONFIG_FTRACE_SYSCALLS
470 472
471extern unsigned long __start_syscalls_metadata[];
472extern unsigned long __stop_syscalls_metadata[];
473extern unsigned long *sys_call_table; 473extern unsigned long *sys_call_table;
474 474
475static struct syscall_metadata **syscalls_metadata; 475unsigned long __init arch_syscall_addr(int nr)
476
477static struct syscall_metadata *find_syscall_meta(unsigned long *syscall)
478{
479 struct syscall_metadata *start;
480 struct syscall_metadata *stop;
481 char str[KSYM_SYMBOL_LEN];
482
483
484 start = (struct syscall_metadata *)__start_syscalls_metadata;
485 stop = (struct syscall_metadata *)__stop_syscalls_metadata;
486 kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str);
487
488 for ( ; start < stop; start++) {
489 if (start->name && !strcmp(start->name, str))
490 return start;
491 }
492 return NULL;
493}
494
495struct syscall_metadata *syscall_nr_to_meta(int nr)
496{
497 if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
498 return NULL;
499
500 return syscalls_metadata[nr];
501}
502
503int syscall_name_to_nr(char *name)
504{ 476{
505 int i; 477 return (unsigned long)(&sys_call_table)[nr];
506
507 if (!syscalls_metadata)
508 return -1;
509
510 for (i = 0; i < NR_syscalls; i++) {
511 if (syscalls_metadata[i]) {
512 if (!strcmp(syscalls_metadata[i]->name, name))
513 return i;
514 }
515 }
516 return -1;
517}
518
519void set_syscall_enter_id(int num, int id)
520{
521 syscalls_metadata[num]->enter_id = id;
522}
523
524void set_syscall_exit_id(int num, int id)
525{
526 syscalls_metadata[num]->exit_id = id;
527}
528
529static int __init arch_init_ftrace_syscalls(void)
530{
531 int i;
532 struct syscall_metadata *meta;
533 unsigned long **psys_syscall_table = &sys_call_table;
534
535 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
536 NR_syscalls, GFP_KERNEL);
537 if (!syscalls_metadata) {
538 WARN_ON(1);
539 return -ENOMEM;
540 }
541
542 for (i = 0; i < NR_syscalls; i++) {
543 meta = find_syscall_meta(psys_syscall_table[i]);
544 syscalls_metadata[i] = meta;
545 }
546 return 0;
547} 478}
548arch_initcall(arch_init_ftrace_syscalls);
549#endif 479#endif
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 780cd928fcd5..22db86a37643 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -212,8 +212,8 @@ ENTRY(secondary_startup_64)
212 */ 212 */
213 lgdt early_gdt_descr(%rip) 213 lgdt early_gdt_descr(%rip)
214 214
215 /* set up data segments. actually 0 would do too */ 215 /* set up data segments */
216 movl $__KERNEL_DS,%eax 216 xorl %eax,%eax
217 movl %eax,%ds 217 movl %eax,%ds
218 movl %eax,%ss 218 movl %eax,%ss
219 movl %eax,%es 219 movl %eax,%es
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
new file mode 100644
index 000000000000..d42f65ac4927
--- /dev/null
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -0,0 +1,555 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) 2007 Alan Stern
17 * Copyright (C) 2009 IBM Corporation
18 * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com>
19 *
20 * Authors: Alan Stern <stern@rowland.harvard.edu>
21 * K.Prasad <prasad@linux.vnet.ibm.com>
22 * Frederic Weisbecker <fweisbec@gmail.com>
23 */
24
25/*
26 * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
27 * using the CPU's debug registers.
28 */
29
30#include <linux/perf_event.h>
31#include <linux/hw_breakpoint.h>
32#include <linux/irqflags.h>
33#include <linux/notifier.h>
34#include <linux/kallsyms.h>
35#include <linux/kprobes.h>
36#include <linux/percpu.h>
37#include <linux/kdebug.h>
38#include <linux/kernel.h>
39#include <linux/module.h>
40#include <linux/sched.h>
41#include <linux/init.h>
42#include <linux/smp.h>
43
44#include <asm/hw_breakpoint.h>
45#include <asm/processor.h>
46#include <asm/debugreg.h>
47
48/* Per cpu debug control register value */
49DEFINE_PER_CPU(unsigned long, cpu_dr7);
50EXPORT_PER_CPU_SYMBOL(cpu_dr7);
51
52/* Per cpu debug address registers values */
53static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]);
54
55/*
56 * Stores the breakpoints currently in use on each breakpoint address
57 * register for each cpus
58 */
59static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]);
60
61
62static inline unsigned long
63__encode_dr7(int drnum, unsigned int len, unsigned int type)
64{
65 unsigned long bp_info;
66
67 bp_info = (len | type) & 0xf;
68 bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
69 bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE));
70
71 return bp_info;
72}
73
74/*
75 * Encode the length, type, Exact, and Enable bits for a particular breakpoint
76 * as stored in debug register 7.
77 */
78unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
79{
80 return __encode_dr7(drnum, len, type) | DR_GLOBAL_SLOWDOWN;
81}
82
83/*
84 * Decode the length and type bits for a particular breakpoint as
85 * stored in debug register 7. Return the "enabled" status.
86 */
87int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type)
88{
89 int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);
90
91 *len = (bp_info & 0xc) | 0x40;
92 *type = (bp_info & 0x3) | 0x80;
93
94 return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3;
95}
96
97/*
98 * Install a perf counter breakpoint.
99 *
100 * We seek a free debug address register and use it for this
101 * breakpoint. Eventually we enable it in the debug control register.
102 *
103 * Atomic: we hold the counter->ctx->lock and we only handle variables
104 * and registers local to this cpu.
105 */
106int arch_install_hw_breakpoint(struct perf_event *bp)
107{
108 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
109 unsigned long *dr7;
110 int i;
111
112 for (i = 0; i < HBP_NUM; i++) {
113 struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
114
115 if (!*slot) {
116 *slot = bp;
117 break;
118 }
119 }
120
121 if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
122 return -EBUSY;
123
124 set_debugreg(info->address, i);
125 __get_cpu_var(cpu_debugreg[i]) = info->address;
126
127 dr7 = &__get_cpu_var(cpu_dr7);
128 *dr7 |= encode_dr7(i, info->len, info->type);
129
130 set_debugreg(*dr7, 7);
131
132 return 0;
133}
134
135/*
136 * Uninstall the breakpoint contained in the given counter.
137 *
138 * First we search the debug address register it uses and then we disable
139 * it.
140 *
141 * Atomic: we hold the counter->ctx->lock and we only handle variables
142 * and registers local to this cpu.
143 */
144void arch_uninstall_hw_breakpoint(struct perf_event *bp)
145{
146 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
147 unsigned long *dr7;
148 int i;
149
150 for (i = 0; i < HBP_NUM; i++) {
151 struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
152
153 if (*slot == bp) {
154 *slot = NULL;
155 break;
156 }
157 }
158
159 if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
160 return;
161
162 dr7 = &__get_cpu_var(cpu_dr7);
163 *dr7 &= ~__encode_dr7(i, info->len, info->type);
164
165 set_debugreg(*dr7, 7);
166}
167
168static int get_hbp_len(u8 hbp_len)
169{
170 unsigned int len_in_bytes = 0;
171
172 switch (hbp_len) {
173 case X86_BREAKPOINT_LEN_1:
174 len_in_bytes = 1;
175 break;
176 case X86_BREAKPOINT_LEN_2:
177 len_in_bytes = 2;
178 break;
179 case X86_BREAKPOINT_LEN_4:
180 len_in_bytes = 4;
181 break;
182#ifdef CONFIG_X86_64
183 case X86_BREAKPOINT_LEN_8:
184 len_in_bytes = 8;
185 break;
186#endif
187 }
188 return len_in_bytes;
189}
190
191/*
192 * Check for virtual address in user space.
193 */
194int arch_check_va_in_userspace(unsigned long va, u8 hbp_len)
195{
196 unsigned int len;
197
198 len = get_hbp_len(hbp_len);
199
200 return (va <= TASK_SIZE - len);
201}
202
203/*
204 * Check for virtual address in kernel space.
205 */
206static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
207{
208 unsigned int len;
209
210 len = get_hbp_len(hbp_len);
211
212 return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
213}
214
215/*
216 * Store a breakpoint's encoded address, length, and type.
217 */
218static int arch_store_info(struct perf_event *bp)
219{
220 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
221 /*
222 * For kernel-addresses, either the address or symbol name can be
223 * specified.
224 */
225 if (info->name)
226 info->address = (unsigned long)
227 kallsyms_lookup_name(info->name);
228 if (info->address)
229 return 0;
230
231 return -EINVAL;
232}
233
234int arch_bp_generic_fields(int x86_len, int x86_type,
235 int *gen_len, int *gen_type)
236{
237 /* Len */
238 switch (x86_len) {
239 case X86_BREAKPOINT_LEN_1:
240 *gen_len = HW_BREAKPOINT_LEN_1;
241 break;
242 case X86_BREAKPOINT_LEN_2:
243 *gen_len = HW_BREAKPOINT_LEN_2;
244 break;
245 case X86_BREAKPOINT_LEN_4:
246 *gen_len = HW_BREAKPOINT_LEN_4;
247 break;
248#ifdef CONFIG_X86_64
249 case X86_BREAKPOINT_LEN_8:
250 *gen_len = HW_BREAKPOINT_LEN_8;
251 break;
252#endif
253 default:
254 return -EINVAL;
255 }
256
257 /* Type */
258 switch (x86_type) {
259 case X86_BREAKPOINT_EXECUTE:
260 *gen_type = HW_BREAKPOINT_X;
261 break;
262 case X86_BREAKPOINT_WRITE:
263 *gen_type = HW_BREAKPOINT_W;
264 break;
265 case X86_BREAKPOINT_RW:
266 *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
267 break;
268 default:
269 return -EINVAL;
270 }
271
272 return 0;
273}
274
275
276static int arch_build_bp_info(struct perf_event *bp)
277{
278 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
279
280 info->address = bp->attr.bp_addr;
281
282 /* Len */
283 switch (bp->attr.bp_len) {
284 case HW_BREAKPOINT_LEN_1:
285 info->len = X86_BREAKPOINT_LEN_1;
286 break;
287 case HW_BREAKPOINT_LEN_2:
288 info->len = X86_BREAKPOINT_LEN_2;
289 break;
290 case HW_BREAKPOINT_LEN_4:
291 info->len = X86_BREAKPOINT_LEN_4;
292 break;
293#ifdef CONFIG_X86_64
294 case HW_BREAKPOINT_LEN_8:
295 info->len = X86_BREAKPOINT_LEN_8;
296 break;
297#endif
298 default:
299 return -EINVAL;
300 }
301
302 /* Type */
303 switch (bp->attr.bp_type) {
304 case HW_BREAKPOINT_W:
305 info->type = X86_BREAKPOINT_WRITE;
306 break;
307 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
308 info->type = X86_BREAKPOINT_RW;
309 break;
310 case HW_BREAKPOINT_X:
311 info->type = X86_BREAKPOINT_EXECUTE;
312 break;
313 default:
314 return -EINVAL;
315 }
316
317 return 0;
318}
319/*
320 * Validate the arch-specific HW Breakpoint register settings
321 */
322int arch_validate_hwbkpt_settings(struct perf_event *bp,
323 struct task_struct *tsk)
324{
325 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
326 unsigned int align;
327 int ret;
328
329
330 ret = arch_build_bp_info(bp);
331 if (ret)
332 return ret;
333
334 ret = -EINVAL;
335
336 if (info->type == X86_BREAKPOINT_EXECUTE)
337 /*
338 * Ptrace-refactoring code
339 * For now, we'll allow instruction breakpoint only for user-space
340 * addresses
341 */
342 if ((!arch_check_va_in_userspace(info->address, info->len)) &&
343 info->len != X86_BREAKPOINT_EXECUTE)
344 return ret;
345
346 switch (info->len) {
347 case X86_BREAKPOINT_LEN_1:
348 align = 0;
349 break;
350 case X86_BREAKPOINT_LEN_2:
351 align = 1;
352 break;
353 case X86_BREAKPOINT_LEN_4:
354 align = 3;
355 break;
356#ifdef CONFIG_X86_64
357 case X86_BREAKPOINT_LEN_8:
358 align = 7;
359 break;
360#endif
361 default:
362 return ret;
363 }
364
365 if (bp->callback)
366 ret = arch_store_info(bp);
367
368 if (ret < 0)
369 return ret;
370 /*
371 * Check that the low-order bits of the address are appropriate
372 * for the alignment implied by len.
373 */
374 if (info->address & align)
375 return -EINVAL;
376
377 /* Check that the virtual address is in the proper range */
378 if (tsk) {
379 if (!arch_check_va_in_userspace(info->address, info->len))
380 return -EFAULT;
381 } else {
382 if (!arch_check_va_in_kernelspace(info->address, info->len))
383 return -EFAULT;
384 }
385
386 return 0;
387}
388
389/*
390 * Dump the debug register contents to the user.
391 * We can't dump our per cpu values because it
392 * may contain cpu wide breakpoint, something that
393 * doesn't belong to the current task.
394 *
395 * TODO: include non-ptrace user breakpoints (perf)
396 */
397void aout_dump_debugregs(struct user *dump)
398{
399 int i;
400 int dr7 = 0;
401 struct perf_event *bp;
402 struct arch_hw_breakpoint *info;
403 struct thread_struct *thread = &current->thread;
404
405 for (i = 0; i < HBP_NUM; i++) {
406 bp = thread->ptrace_bps[i];
407
408 if (bp && !bp->attr.disabled) {
409 dump->u_debugreg[i] = bp->attr.bp_addr;
410 info = counter_arch_bp(bp);
411 dr7 |= encode_dr7(i, info->len, info->type);
412 } else {
413 dump->u_debugreg[i] = 0;
414 }
415 }
416
417 dump->u_debugreg[4] = 0;
418 dump->u_debugreg[5] = 0;
419 dump->u_debugreg[6] = current->thread.debugreg6;
420
421 dump->u_debugreg[7] = dr7;
422}
423EXPORT_SYMBOL_GPL(aout_dump_debugregs);
424
425/*
426 * Release the user breakpoints used by ptrace
427 */
428void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
429{
430 int i;
431 struct thread_struct *t = &tsk->thread;
432
433 for (i = 0; i < HBP_NUM; i++) {
434 unregister_hw_breakpoint(t->ptrace_bps[i]);
435 t->ptrace_bps[i] = NULL;
436 }
437}
438
439void hw_breakpoint_restore(void)
440{
441 set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0);
442 set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1);
443 set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2);
444 set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3);
445 set_debugreg(current->thread.debugreg6, 6);
446 set_debugreg(__get_cpu_var(cpu_dr7), 7);
447}
448EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
449
450/*
451 * Handle debug exception notifications.
452 *
453 * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below.
454 *
455 * NOTIFY_DONE returned if one of the following conditions is true.
456 * i) When the causative address is from user-space and the exception
457 * is a valid one, i.e. not triggered as a result of lazy debug register
458 * switching
459 * ii) When there are more bits than trap<n> set in DR6 register (such
460 * as BD, BS or BT) indicating that more than one debug condition is
461 * met and requires some more action in do_debug().
462 *
463 * NOTIFY_STOP returned for all other cases
464 *
465 */
466static int __kprobes hw_breakpoint_handler(struct die_args *args)
467{
468 int i, cpu, rc = NOTIFY_STOP;
469 struct perf_event *bp;
470 unsigned long dr7, dr6;
471 unsigned long *dr6_p;
472
473 /* The DR6 value is pointed by args->err */
474 dr6_p = (unsigned long *)ERR_PTR(args->err);
475 dr6 = *dr6_p;
476
477 /* Do an early return if no trap bits are set in DR6 */
478 if ((dr6 & DR_TRAP_BITS) == 0)
479 return NOTIFY_DONE;
480
481 get_debugreg(dr7, 7);
482 /* Disable breakpoints during exception handling */
483 set_debugreg(0UL, 7);
484 /*
485 * Assert that local interrupts are disabled
486 * Reset the DRn bits in the virtualized register value.
487 * The ptrace trigger routine will add in whatever is needed.
488 */
489 current->thread.debugreg6 &= ~DR_TRAP_BITS;
490 cpu = get_cpu();
491
492 /* Handle all the breakpoints that were triggered */
493 for (i = 0; i < HBP_NUM; ++i) {
494 if (likely(!(dr6 & (DR_TRAP0 << i))))
495 continue;
496
497 /*
498 * The counter may be concurrently released but that can only
499 * occur from a call_rcu() path. We can then safely fetch
500 * the breakpoint, use its callback, touch its counter
501 * while we are in an rcu_read_lock() path.
502 */
503 rcu_read_lock();
504
505 bp = per_cpu(bp_per_reg[i], cpu);
506 if (bp)
507 rc = NOTIFY_DONE;
508 /*
509 * Reset the 'i'th TRAP bit in dr6 to denote completion of
510 * exception handling
511 */
512 (*dr6_p) &= ~(DR_TRAP0 << i);
513 /*
514 * bp can be NULL due to lazy debug register switching
515 * or due to concurrent perf counter removing.
516 */
517 if (!bp) {
518 rcu_read_unlock();
519 break;
520 }
521
522 (bp->callback)(bp, args->regs);
523
524 rcu_read_unlock();
525 }
526 if (dr6 & (~DR_TRAP_BITS))
527 rc = NOTIFY_DONE;
528
529 set_debugreg(dr7, 7);
530 put_cpu();
531
532 return rc;
533}
534
535/*
536 * Handle debug exception notifications.
537 */
538int __kprobes hw_breakpoint_exceptions_notify(
539 struct notifier_block *unused, unsigned long val, void *data)
540{
541 if (val != DIE_DEBUG)
542 return NOTIFY_DONE;
543
544 return hw_breakpoint_handler(data);
545}
546
547void hw_breakpoint_pmu_read(struct perf_event *bp)
548{
549 /* TODO */
550}
551
552void hw_breakpoint_pmu_unthrottle(struct perf_event *bp)
553{
554 /* TODO */
555}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 04bbd5278568..fee6cc2b2079 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -92,17 +92,17 @@ static int show_other_interrupts(struct seq_file *p, int prec)
92 seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); 92 seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
93 seq_printf(p, " TLB shootdowns\n"); 93 seq_printf(p, " TLB shootdowns\n");
94#endif 94#endif
95#ifdef CONFIG_X86_MCE 95#ifdef CONFIG_X86_THERMAL_VECTOR
96 seq_printf(p, "%*s: ", prec, "TRM"); 96 seq_printf(p, "%*s: ", prec, "TRM");
97 for_each_online_cpu(j) 97 for_each_online_cpu(j)
98 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); 98 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
99 seq_printf(p, " Thermal event interrupts\n"); 99 seq_printf(p, " Thermal event interrupts\n");
100# ifdef CONFIG_X86_MCE_THRESHOLD 100#endif
101#ifdef CONFIG_X86_MCE_THRESHOLD
101 seq_printf(p, "%*s: ", prec, "THR"); 102 seq_printf(p, "%*s: ", prec, "THR");
102 for_each_online_cpu(j) 103 for_each_online_cpu(j)
103 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); 104 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
104 seq_printf(p, " Threshold APIC interrupts\n"); 105 seq_printf(p, " Threshold APIC interrupts\n");
105# endif
106#endif 106#endif
107#ifdef CONFIG_X86_MCE 107#ifdef CONFIG_X86_MCE
108 seq_printf(p, "%*s: ", prec, "MCE"); 108 seq_printf(p, "%*s: ", prec, "MCE");
@@ -194,11 +194,11 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
194 sum += irq_stats(cpu)->irq_call_count; 194 sum += irq_stats(cpu)->irq_call_count;
195 sum += irq_stats(cpu)->irq_tlb_count; 195 sum += irq_stats(cpu)->irq_tlb_count;
196#endif 196#endif
197#ifdef CONFIG_X86_MCE 197#ifdef CONFIG_X86_THERMAL_VECTOR
198 sum += irq_stats(cpu)->irq_thermal_count; 198 sum += irq_stats(cpu)->irq_thermal_count;
199# ifdef CONFIG_X86_MCE_THRESHOLD 199#endif
200#ifdef CONFIG_X86_MCE_THRESHOLD
200 sum += irq_stats(cpu)->irq_threshold_count; 201 sum += irq_stats(cpu)->irq_threshold_count;
201# endif
202#endif 202#endif
203#ifdef CONFIG_X86_MCE 203#ifdef CONFIG_X86_MCE
204 sum += per_cpu(mce_exception_count, cpu); 204 sum += per_cpu(mce_exception_count, cpu);
@@ -274,3 +274,93 @@ void smp_generic_interrupt(struct pt_regs *regs)
274} 274}
275 275
276EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); 276EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
277
278#ifdef CONFIG_HOTPLUG_CPU
279/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
280void fixup_irqs(void)
281{
282 unsigned int irq, vector;
283 static int warned;
284 struct irq_desc *desc;
285
286 for_each_irq_desc(irq, desc) {
287 int break_affinity = 0;
288 int set_affinity = 1;
289 const struct cpumask *affinity;
290
291 if (!desc)
292 continue;
293 if (irq == 2)
294 continue;
295
296 /* interrupt's are disabled at this point */
297 spin_lock(&desc->lock);
298
299 affinity = desc->affinity;
300 if (!irq_has_action(irq) ||
301 cpumask_equal(affinity, cpu_online_mask)) {
302 spin_unlock(&desc->lock);
303 continue;
304 }
305
306 /*
307 * Complete the irq move. This cpu is going down and for
308 * non intr-remapping case, we can't wait till this interrupt
309 * arrives at this cpu before completing the irq move.
310 */
311 irq_force_complete_move(irq);
312
313 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
314 break_affinity = 1;
315 affinity = cpu_all_mask;
316 }
317
318 if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask)
319 desc->chip->mask(irq);
320
321 if (desc->chip->set_affinity)
322 desc->chip->set_affinity(irq, affinity);
323 else if (!(warned++))
324 set_affinity = 0;
325
326 if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask)
327 desc->chip->unmask(irq);
328
329 spin_unlock(&desc->lock);
330
331 if (break_affinity && set_affinity)
332 printk("Broke affinity for irq %i\n", irq);
333 else if (!set_affinity)
334 printk("Cannot set affinity for irq %i\n", irq);
335 }
336
337 /*
338 * We can remove mdelay() and then send spuriuous interrupts to
339 * new cpu targets for all the irqs that were handled previously by
340 * this cpu. While it works, I have seen spurious interrupt messages
341 * (nothing wrong but still...).
342 *
343 * So for now, retain mdelay(1) and check the IRR and then send those
344 * interrupts to new targets as this cpu is already offlined...
345 */
346 mdelay(1);
347
348 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
349 unsigned int irr;
350
351 if (__get_cpu_var(vector_irq)[vector] < 0)
352 continue;
353
354 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
355 if (irr & (1 << (vector % 32))) {
356 irq = __get_cpu_var(vector_irq)[vector];
357
358 desc = irq_to_desc(irq);
359 spin_lock(&desc->lock);
360 if (desc->chip->retrigger)
361 desc->chip->retrigger(irq);
362 spin_unlock(&desc->lock);
363 }
364 }
365}
366#endif
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 7d35d0fe2329..10709f29d166 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -211,48 +211,3 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
211 211
212 return true; 212 return true;
213} 213}
214
215#ifdef CONFIG_HOTPLUG_CPU
216
217/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
218void fixup_irqs(void)
219{
220 unsigned int irq;
221 struct irq_desc *desc;
222
223 for_each_irq_desc(irq, desc) {
224 const struct cpumask *affinity;
225
226 if (!desc)
227 continue;
228 if (irq == 2)
229 continue;
230
231 affinity = desc->affinity;
232 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
233 printk("Breaking affinity for irq %i\n", irq);
234 affinity = cpu_all_mask;
235 }
236 if (desc->chip->set_affinity)
237 desc->chip->set_affinity(irq, affinity);
238 else if (desc->action)
239 printk_once("Cannot set affinity for irq %i\n", irq);
240 }
241
242#if 0
243 barrier();
244 /* Ingo Molnar says: "after the IO-APIC masks have been redirected
245 [note the nop - the interrupt-enable boundary on x86 is two
246 instructions from sti] - to flush out pending hardirqs and
247 IPIs. After this point nothing is supposed to reach this CPU." */
248 __asm__ __volatile__("sti; nop; cli");
249 barrier();
250#else
251 /* That doesn't seem sufficient. Give it 1ms. */
252 local_irq_enable();
253 mdelay(1);
254 local_irq_disable();
255#endif
256}
257#endif
258
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 977d8b43a0dd..acf8fbf8fbda 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -62,64 +62,6 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
62 return true; 62 return true;
63} 63}
64 64
65#ifdef CONFIG_HOTPLUG_CPU
66/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
67void fixup_irqs(void)
68{
69 unsigned int irq;
70 static int warned;
71 struct irq_desc *desc;
72
73 for_each_irq_desc(irq, desc) {
74 int break_affinity = 0;
75 int set_affinity = 1;
76 const struct cpumask *affinity;
77
78 if (!desc)
79 continue;
80 if (irq == 2)
81 continue;
82
83 /* interrupt's are disabled at this point */
84 spin_lock(&desc->lock);
85
86 affinity = desc->affinity;
87 if (!irq_has_action(irq) ||
88 cpumask_equal(affinity, cpu_online_mask)) {
89 spin_unlock(&desc->lock);
90 continue;
91 }
92
93 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
94 break_affinity = 1;
95 affinity = cpu_all_mask;
96 }
97
98 if (desc->chip->mask)
99 desc->chip->mask(irq);
100
101 if (desc->chip->set_affinity)
102 desc->chip->set_affinity(irq, affinity);
103 else if (!(warned++))
104 set_affinity = 0;
105
106 if (desc->chip->unmask)
107 desc->chip->unmask(irq);
108
109 spin_unlock(&desc->lock);
110
111 if (break_affinity && set_affinity)
112 printk("Broke affinity for irq %i\n", irq);
113 else if (!set_affinity)
114 printk("Cannot set affinity for irq %i\n", irq);
115 }
116
117 /* That doesn't seem sufficient. Give it 1ms. */
118 local_irq_enable();
119 mdelay(1);
120 local_irq_disable();
121}
122#endif
123 65
124extern void call_softirq(void); 66extern void call_softirq(void);
125 67
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 8d82a77a3f3b..20a5b3689463 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -43,6 +43,7 @@
43#include <linux/smp.h> 43#include <linux/smp.h>
44#include <linux/nmi.h> 44#include <linux/nmi.h>
45 45
46#include <asm/debugreg.h>
46#include <asm/apicdef.h> 47#include <asm/apicdef.h>
47#include <asm/system.h> 48#include <asm/system.h>
48 49
@@ -88,7 +89,6 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
88 gdb_regs[GDB_SS] = __KERNEL_DS; 89 gdb_regs[GDB_SS] = __KERNEL_DS;
89 gdb_regs[GDB_FS] = 0xFFFF; 90 gdb_regs[GDB_FS] = 0xFFFF;
90 gdb_regs[GDB_GS] = 0xFFFF; 91 gdb_regs[GDB_GS] = 0xFFFF;
91 gdb_regs[GDB_SP] = (int)&regs->sp;
92#else 92#else
93 gdb_regs[GDB_R8] = regs->r8; 93 gdb_regs[GDB_R8] = regs->r8;
94 gdb_regs[GDB_R9] = regs->r9; 94 gdb_regs[GDB_R9] = regs->r9;
@@ -101,8 +101,8 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
101 gdb_regs32[GDB_PS] = regs->flags; 101 gdb_regs32[GDB_PS] = regs->flags;
102 gdb_regs32[GDB_CS] = regs->cs; 102 gdb_regs32[GDB_CS] = regs->cs;
103 gdb_regs32[GDB_SS] = regs->ss; 103 gdb_regs32[GDB_SS] = regs->ss;
104 gdb_regs[GDB_SP] = regs->sp;
105#endif 104#endif
105 gdb_regs[GDB_SP] = kernel_stack_pointer(regs);
106} 106}
107 107
108/** 108/**
@@ -434,6 +434,11 @@ single_step_cont(struct pt_regs *regs, struct die_args *args)
434 "resuming...\n"); 434 "resuming...\n");
435 kgdb_arch_handle_exception(args->trapnr, args->signr, 435 kgdb_arch_handle_exception(args->trapnr, args->signr,
436 args->err, "c", "", regs); 436 args->err, "c", "", regs);
437 /*
438 * Reset the BS bit in dr6 (pointed by args->err) to
439 * denote completion of processing
440 */
441 (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
437 442
438 return NOTIFY_STOP; 443 return NOTIFY_STOP;
439} 444}
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 7b5169d2b000..1f3186ce213c 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -48,31 +48,22 @@
48#include <linux/preempt.h> 48#include <linux/preempt.h>
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/kdebug.h> 50#include <linux/kdebug.h>
51#include <linux/kallsyms.h>
51 52
52#include <asm/cacheflush.h> 53#include <asm/cacheflush.h>
53#include <asm/desc.h> 54#include <asm/desc.h>
54#include <asm/pgtable.h> 55#include <asm/pgtable.h>
55#include <asm/uaccess.h> 56#include <asm/uaccess.h>
56#include <asm/alternative.h> 57#include <asm/alternative.h>
58#include <asm/insn.h>
59#include <asm/debugreg.h>
57 60
58void jprobe_return_end(void); 61void jprobe_return_end(void);
59 62
60DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; 63DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
61DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); 64DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
62 65
63#ifdef CONFIG_X86_64 66#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs))
64#define stack_addr(regs) ((unsigned long *)regs->sp)
65#else
66/*
67 * "&regs->sp" looks wrong, but it's correct for x86_32. x86_32 CPUs
68 * don't save the ss and esp registers if the CPU is already in kernel
69 * mode when it traps. So for kprobes, regs->sp and regs->ss are not
70 * the [nonexistent] saved stack pointer and ss register, but rather
71 * the top 8 bytes of the pre-int3 stack. So &regs->sp happens to
72 * point to the top of the pre-int3 stack.
73 */
74#define stack_addr(regs) ((unsigned long *)&regs->sp)
75#endif
76 67
77#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ 68#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
78 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ 69 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
@@ -106,50 +97,6 @@ static const u32 twobyte_is_boostable[256 / 32] = {
106 /* ----------------------------------------------- */ 97 /* ----------------------------------------------- */
107 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 98 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
108}; 99};
109static const u32 onebyte_has_modrm[256 / 32] = {
110 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
111 /* ----------------------------------------------- */
112 W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */
113 W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */
114 W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */
115 W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */
116 W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
117 W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
118 W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */
119 W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */
120 W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
121 W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */
122 W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */
123 W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */
124 W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */
125 W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
126 W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */
127 W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) /* f0 */
128 /* ----------------------------------------------- */
129 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
130};
131static const u32 twobyte_has_modrm[256 / 32] = {
132 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
133 /* ----------------------------------------------- */
134 W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */
135 W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */
136 W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */
137 W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */
138 W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */
139 W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */
140 W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */
141 W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */
142 W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */
143 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */
144 W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */
145 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */
146 W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */
147 W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */
148 W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */
149 W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* ff */
150 /* ----------------------------------------------- */
151 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
152};
153#undef W 100#undef W
154 101
155struct kretprobe_blackpoint kretprobe_blacklist[] = { 102struct kretprobe_blackpoint kretprobe_blacklist[] = {
@@ -244,6 +191,75 @@ retry:
244 } 191 }
245} 192}
246 193
194/* Recover the probed instruction at addr for further analysis. */
195static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
196{
197 struct kprobe *kp;
198 kp = get_kprobe((void *)addr);
199 if (!kp)
200 return -EINVAL;
201
202 /*
203 * Basically, kp->ainsn.insn has an original instruction.
204 * However, RIP-relative instruction can not do single-stepping
205 * at different place, fix_riprel() tweaks the displacement of
206 * that instruction. In that case, we can't recover the instruction
207 * from the kp->ainsn.insn.
208 *
209 * On the other hand, kp->opcode has a copy of the first byte of
210 * the probed instruction, which is overwritten by int3. And
211 * the instruction at kp->addr is not modified by kprobes except
212 * for the first byte, we can recover the original instruction
213 * from it and kp->opcode.
214 */
215 memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
216 buf[0] = kp->opcode;
217 return 0;
218}
219
220/* Dummy buffers for kallsyms_lookup */
221static char __dummy_buf[KSYM_NAME_LEN];
222
223/* Check if paddr is at an instruction boundary */
224static int __kprobes can_probe(unsigned long paddr)
225{
226 int ret;
227 unsigned long addr, offset = 0;
228 struct insn insn;
229 kprobe_opcode_t buf[MAX_INSN_SIZE];
230
231 if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf))
232 return 0;
233
234 /* Decode instructions */
235 addr = paddr - offset;
236 while (addr < paddr) {
237 kernel_insn_init(&insn, (void *)addr);
238 insn_get_opcode(&insn);
239
240 /*
241 * Check if the instruction has been modified by another
242 * kprobe, in which case we replace the breakpoint by the
243 * original instruction in our buffer.
244 */
245 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
246 ret = recover_probed_instruction(buf, addr);
247 if (ret)
248 /*
249 * Another debugging subsystem might insert
250 * this breakpoint. In that case, we can't
251 * recover it.
252 */
253 return 0;
254 kernel_insn_init(&insn, buf);
255 }
256 insn_get_length(&insn);
257 addr += insn.length;
258 }
259
260 return (addr == paddr);
261}
262
247/* 263/*
248 * Returns non-zero if opcode modifies the interrupt flag. 264 * Returns non-zero if opcode modifies the interrupt flag.
249 */ 265 */
@@ -277,68 +293,30 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
277static void __kprobes fix_riprel(struct kprobe *p) 293static void __kprobes fix_riprel(struct kprobe *p)
278{ 294{
279#ifdef CONFIG_X86_64 295#ifdef CONFIG_X86_64
280 u8 *insn = p->ainsn.insn; 296 struct insn insn;
281 s64 disp; 297 kernel_insn_init(&insn, p->ainsn.insn);
282 int need_modrm;
283
284 /* Skip legacy instruction prefixes. */
285 while (1) {
286 switch (*insn) {
287 case 0x66:
288 case 0x67:
289 case 0x2e:
290 case 0x3e:
291 case 0x26:
292 case 0x64:
293 case 0x65:
294 case 0x36:
295 case 0xf0:
296 case 0xf3:
297 case 0xf2:
298 ++insn;
299 continue;
300 }
301 break;
302 }
303 298
304 /* Skip REX instruction prefix. */ 299 if (insn_rip_relative(&insn)) {
305 if (is_REX_prefix(insn)) 300 s64 newdisp;
306 ++insn; 301 u8 *disp;
307 302 insn_get_displacement(&insn);
308 if (*insn == 0x0f) { 303 /*
309 /* Two-byte opcode. */ 304 * The copied instruction uses the %rip-relative addressing
310 ++insn; 305 * mode. Adjust the displacement for the difference between
311 need_modrm = test_bit(*insn, 306 * the original location of this instruction and the location
312 (unsigned long *)twobyte_has_modrm); 307 * of the copy that will actually be run. The tricky bit here
313 } else 308 * is making sure that the sign extension happens correctly in
314 /* One-byte opcode. */ 309 * this calculation, since we need a signed 32-bit result to
315 need_modrm = test_bit(*insn, 310 * be sign-extended to 64 bits when it's added to the %rip
316 (unsigned long *)onebyte_has_modrm); 311 * value and yield the same 64-bit result that the sign-
317 312 * extension of the original signed 32-bit displacement would
318 if (need_modrm) { 313 * have given.
319 u8 modrm = *++insn; 314 */
320 if ((modrm & 0xc7) == 0x05) { 315 newdisp = (u8 *) p->addr + (s64) insn.displacement.value -
321 /* %rip+disp32 addressing mode */ 316 (u8 *) p->ainsn.insn;
322 /* Displacement follows ModRM byte. */ 317 BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */
323 ++insn; 318 disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn);
324 /* 319 *(s32 *) disp = (s32) newdisp;
325 * The copied instruction uses the %rip-relative
326 * addressing mode. Adjust the displacement for the
327 * difference between the original location of this
328 * instruction and the location of the copy that will
329 * actually be run. The tricky bit here is making sure
330 * that the sign extension happens correctly in this
331 * calculation, since we need a signed 32-bit result to
332 * be sign-extended to 64 bits when it's added to the
333 * %rip value and yield the same 64-bit result that the
334 * sign-extension of the original signed 32-bit
335 * displacement would have given.
336 */
337 disp = (u8 *) p->addr + *((s32 *) insn) -
338 (u8 *) p->ainsn.insn;
339 BUG_ON((s64) (s32) disp != disp); /* Sanity check. */
340 *(s32 *)insn = (s32) disp;
341 }
342 } 320 }
343#endif 321#endif
344} 322}
@@ -359,6 +337,8 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p)
359 337
360int __kprobes arch_prepare_kprobe(struct kprobe *p) 338int __kprobes arch_prepare_kprobe(struct kprobe *p)
361{ 339{
340 if (!can_probe((unsigned long)p->addr))
341 return -EILSEQ;
362 /* insn: must be on special executable page on x86. */ 342 /* insn: must be on special executable page on x86. */
363 p->ainsn.insn = get_insn_slot(); 343 p->ainsn.insn = get_insn_slot();
364 if (!p->ainsn.insn) 344 if (!p->ainsn.insn)
@@ -472,17 +452,6 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
472{ 452{
473 switch (kcb->kprobe_status) { 453 switch (kcb->kprobe_status) {
474 case KPROBE_HIT_SSDONE: 454 case KPROBE_HIT_SSDONE:
475#ifdef CONFIG_X86_64
476 /* TODO: Provide re-entrancy from post_kprobes_handler() and
477 * avoid exception stack corruption while single-stepping on
478 * the instruction of the new probe.
479 */
480 arch_disarm_kprobe(p);
481 regs->ip = (unsigned long)p->addr;
482 reset_current_kprobe();
483 preempt_enable_no_resched();
484 break;
485#endif
486 case KPROBE_HIT_ACTIVE: 455 case KPROBE_HIT_ACTIVE:
487 save_previous_kprobe(kcb); 456 save_previous_kprobe(kcb);
488 set_current_kprobe(p, regs, kcb); 457 set_current_kprobe(p, regs, kcb);
@@ -491,18 +460,16 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
491 kcb->kprobe_status = KPROBE_REENTER; 460 kcb->kprobe_status = KPROBE_REENTER;
492 break; 461 break;
493 case KPROBE_HIT_SS: 462 case KPROBE_HIT_SS:
494 if (p == kprobe_running()) { 463 /* A probe has been hit in the codepath leading up to, or just
495 regs->flags &= ~X86_EFLAGS_TF; 464 * after, single-stepping of a probed instruction. This entire
496 regs->flags |= kcb->kprobe_saved_flags; 465 * codepath should strictly reside in .kprobes.text section.
497 return 0; 466 * Raise a BUG or we'll continue in an endless reentering loop
498 } else { 467 * and eventually a stack overflow.
499 /* A probe has been hit in the codepath leading up 468 */
500 * to, or just after, single-stepping of a probed 469 printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n",
501 * instruction. This entire codepath should strictly 470 p->addr);
502 * reside in .kprobes.text section. Raise a warning 471 dump_kprobe(p);
503 * to highlight this peculiar case. 472 BUG();
504 */
505 }
506 default: 473 default:
507 /* impossible cases */ 474 /* impossible cases */
508 WARN_ON(1); 475 WARN_ON(1);
@@ -967,8 +934,14 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
967 ret = NOTIFY_STOP; 934 ret = NOTIFY_STOP;
968 break; 935 break;
969 case DIE_DEBUG: 936 case DIE_DEBUG:
970 if (post_kprobe_handler(args->regs)) 937 if (post_kprobe_handler(args->regs)) {
938 /*
939 * Reset the BS bit in dr6 (pointed by args->err) to
940 * denote completion of processing
941 */
942 (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
971 ret = NOTIFY_STOP; 943 ret = NOTIFY_STOP;
944 }
972 break; 945 break;
973 case DIE_GPF: 946 case DIE_GPF:
974 /* 947 /*
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index c1c429d00130..c843f8406da2 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -25,6 +25,7 @@
25#include <asm/desc.h> 25#include <asm/desc.h>
26#include <asm/system.h> 26#include <asm/system.h>
27#include <asm/cacheflush.h> 27#include <asm/cacheflush.h>
28#include <asm/debugreg.h>
28 29
29static void set_idt(void *newidt, __u16 limit) 30static void set_idt(void *newidt, __u16 limit)
30{ 31{
@@ -202,6 +203,7 @@ void machine_kexec(struct kimage *image)
202 203
203 /* Interrupts aren't acceptable while we reboot */ 204 /* Interrupts aren't acceptable while we reboot */
204 local_irq_disable(); 205 local_irq_disable();
206 hw_breakpoint_disable();
205 207
206 if (image->preserve_context) { 208 if (image->preserve_context) {
207#ifdef CONFIG_X86_IO_APIC 209#ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 84c3bf209e98..4a8bb82248ae 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -18,6 +18,7 @@
18#include <asm/pgtable.h> 18#include <asm/pgtable.h>
19#include <asm/tlbflush.h> 19#include <asm/tlbflush.h>
20#include <asm/mmu_context.h> 20#include <asm/mmu_context.h>
21#include <asm/debugreg.h>
21 22
22static int init_one_level2_page(struct kimage *image, pgd_t *pgd, 23static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
23 unsigned long addr) 24 unsigned long addr)
@@ -282,6 +283,7 @@ void machine_kexec(struct kimage *image)
282 283
283 /* Interrupts aren't acceptable while we reboot */ 284 /* Interrupts aren't acceptable while we reboot */
284 local_irq_disable(); 285 local_irq_disable();
286 hw_breakpoint_disable();
285 287
286 if (image->preserve_context) { 288 if (image->preserve_context) {
287#ifdef CONFIG_X86_IO_APIC 289#ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 378e9a8f1bf8..2bcad3926edb 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -73,7 +73,6 @@
73#include <linux/platform_device.h> 73#include <linux/platform_device.h>
74#include <linux/miscdevice.h> 74#include <linux/miscdevice.h>
75#include <linux/capability.h> 75#include <linux/capability.h>
76#include <linux/smp_lock.h>
77#include <linux/kernel.h> 76#include <linux/kernel.h>
78#include <linux/module.h> 77#include <linux/module.h>
79#include <linux/mutex.h> 78#include <linux/mutex.h>
@@ -201,7 +200,6 @@ static int do_microcode_update(const void __user *buf, size_t size)
201 200
202static int microcode_open(struct inode *unused1, struct file *unused2) 201static int microcode_open(struct inode *unused1, struct file *unused2)
203{ 202{
204 cycle_kernel_lock();
205 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; 203 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
206} 204}
207 205
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 6a3cefc7dda1..553449951b84 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -174,21 +174,17 @@ static int msr_open(struct inode *inode, struct file *file)
174{ 174{
175 unsigned int cpu = iminor(file->f_path.dentry->d_inode); 175 unsigned int cpu = iminor(file->f_path.dentry->d_inode);
176 struct cpuinfo_x86 *c = &cpu_data(cpu); 176 struct cpuinfo_x86 *c = &cpu_data(cpu);
177 int ret = 0;
178 177
179 lock_kernel();
180 cpu = iminor(file->f_path.dentry->d_inode); 178 cpu = iminor(file->f_path.dentry->d_inode);
181 179
182 if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { 180 if (cpu >= nr_cpu_ids || !cpu_online(cpu))
183 ret = -ENXIO; /* No such CPU */ 181 return -ENXIO; /* No such CPU */
184 goto out; 182
185 }
186 c = &cpu_data(cpu); 183 c = &cpu_data(cpu);
187 if (!cpu_has(c, X86_FEATURE_MSR)) 184 if (!cpu_has(c, X86_FEATURE_MSR))
188 ret = -EIO; /* MSR not supported */ 185 return -EIO; /* MSR not supported */
189out: 186
190 unlock_kernel(); 187 return 0;
191 return ret;
192} 188}
193 189
194/* 190/*
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 971a3bec47a8..c563e4c8ff39 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -46,6 +46,7 @@
46#include <asm/dma.h> 46#include <asm/dma.h>
47#include <asm/rio.h> 47#include <asm/rio.h>
48#include <asm/bios_ebda.h> 48#include <asm/bios_ebda.h>
49#include <asm/x86_init.h>
49 50
50#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT 51#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
51int use_calgary __read_mostly = 1; 52int use_calgary __read_mostly = 1;
@@ -244,7 +245,7 @@ static unsigned long iommu_range_alloc(struct device *dev,
244 if (panic_on_overflow) 245 if (panic_on_overflow)
245 panic("Calgary: fix the allocator.\n"); 246 panic("Calgary: fix the allocator.\n");
246 else 247 else
247 return bad_dma_address; 248 return DMA_ERROR_CODE;
248 } 249 }
249 } 250 }
250 251
@@ -260,12 +261,15 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
260 void *vaddr, unsigned int npages, int direction) 261 void *vaddr, unsigned int npages, int direction)
261{ 262{
262 unsigned long entry; 263 unsigned long entry;
263 dma_addr_t ret = bad_dma_address; 264 dma_addr_t ret;
264 265
265 entry = iommu_range_alloc(dev, tbl, npages); 266 entry = iommu_range_alloc(dev, tbl, npages);
266 267
267 if (unlikely(entry == bad_dma_address)) 268 if (unlikely(entry == DMA_ERROR_CODE)) {
268 goto error; 269 printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
270 "iommu %p\n", npages, tbl);
271 return DMA_ERROR_CODE;
272 }
269 273
270 /* set the return dma address */ 274 /* set the return dma address */
271 ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK); 275 ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK);
@@ -273,13 +277,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
273 /* put the TCEs in the HW table */ 277 /* put the TCEs in the HW table */
274 tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK, 278 tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK,
275 direction); 279 direction);
276
277 return ret; 280 return ret;
278
279error:
280 printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
281 "iommu %p\n", npages, tbl);
282 return bad_dma_address;
283} 281}
284 282
285static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, 283static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
@@ -290,8 +288,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
290 unsigned long flags; 288 unsigned long flags;
291 289
292 /* were we called with bad_dma_address? */ 290 /* were we called with bad_dma_address? */
293 badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); 291 badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE);
294 if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { 292 if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) {
295 WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " 293 WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA "
296 "address 0x%Lx\n", dma_addr); 294 "address 0x%Lx\n", dma_addr);
297 return; 295 return;
@@ -318,13 +316,15 @@ static inline struct iommu_table *find_iommu_table(struct device *dev)
318 316
319 pdev = to_pci_dev(dev); 317 pdev = to_pci_dev(dev);
320 318
319 /* search up the device tree for an iommu */
321 pbus = pdev->bus; 320 pbus = pdev->bus;
322 321 do {
323 /* is the device behind a bridge? Look for the root bus */ 322 tbl = pci_iommu(pbus);
324 while (pbus->parent) 323 if (tbl && tbl->it_busno == pbus->number)
324 break;
325 tbl = NULL;
325 pbus = pbus->parent; 326 pbus = pbus->parent;
326 327 } while (pbus);
327 tbl = pci_iommu(pbus);
328 328
329 BUG_ON(tbl && (tbl->it_busno != pbus->number)); 329 BUG_ON(tbl && (tbl->it_busno != pbus->number));
330 330
@@ -373,7 +373,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
373 npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE); 373 npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE);
374 374
375 entry = iommu_range_alloc(dev, tbl, npages); 375 entry = iommu_range_alloc(dev, tbl, npages);
376 if (entry == bad_dma_address) { 376 if (entry == DMA_ERROR_CODE) {
377 /* makes sure unmap knows to stop */ 377 /* makes sure unmap knows to stop */
378 s->dma_length = 0; 378 s->dma_length = 0;
379 goto error; 379 goto error;
@@ -391,7 +391,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
391error: 391error:
392 calgary_unmap_sg(dev, sg, nelems, dir, NULL); 392 calgary_unmap_sg(dev, sg, nelems, dir, NULL);
393 for_each_sg(sg, s, nelems, i) { 393 for_each_sg(sg, s, nelems, i) {
394 sg->dma_address = bad_dma_address; 394 sg->dma_address = DMA_ERROR_CODE;
395 sg->dma_length = 0; 395 sg->dma_length = 0;
396 } 396 }
397 return 0; 397 return 0;
@@ -446,7 +446,7 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
446 446
447 /* set up tces to cover the allocated range */ 447 /* set up tces to cover the allocated range */
448 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); 448 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
449 if (mapping == bad_dma_address) 449 if (mapping == DMA_ERROR_CODE)
450 goto free; 450 goto free;
451 *dma_handle = mapping; 451 *dma_handle = mapping;
452 return ret; 452 return ret;
@@ -727,7 +727,7 @@ static void __init calgary_reserve_regions(struct pci_dev *dev)
727 struct iommu_table *tbl = pci_iommu(dev->bus); 727 struct iommu_table *tbl = pci_iommu(dev->bus);
728 728
729 /* reserve EMERGENCY_PAGES from bad_dma_address and up */ 729 /* reserve EMERGENCY_PAGES from bad_dma_address and up */
730 iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES); 730 iommu_range_reserve(tbl, DMA_ERROR_CODE, EMERGENCY_PAGES);
731 731
732 /* avoid the BIOS/VGA first 640KB-1MB region */ 732 /* avoid the BIOS/VGA first 640KB-1MB region */
733 /* for CalIOC2 - avoid the entire first MB */ 733 /* for CalIOC2 - avoid the entire first MB */
@@ -1344,6 +1344,23 @@ static void __init get_tce_space_from_tar(void)
1344 return; 1344 return;
1345} 1345}
1346 1346
1347static int __init calgary_iommu_init(void)
1348{
1349 int ret;
1350
1351 /* ok, we're trying to use Calgary - let's roll */
1352 printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
1353
1354 ret = calgary_init();
1355 if (ret) {
1356 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
1357 "falling back to no_iommu\n", ret);
1358 return ret;
1359 }
1360
1361 return 0;
1362}
1363
1347void __init detect_calgary(void) 1364void __init detect_calgary(void)
1348{ 1365{
1349 int bus; 1366 int bus;
@@ -1357,7 +1374,7 @@ void __init detect_calgary(void)
1357 * if the user specified iommu=off or iommu=soft or we found 1374 * if the user specified iommu=off or iommu=soft or we found
1358 * another HW IOMMU already, bail out. 1375 * another HW IOMMU already, bail out.
1359 */ 1376 */
1360 if (swiotlb || no_iommu || iommu_detected) 1377 if (no_iommu || iommu_detected)
1361 return; 1378 return;
1362 1379
1363 if (!use_calgary) 1380 if (!use_calgary)
@@ -1442,9 +1459,7 @@ void __init detect_calgary(void)
1442 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", 1459 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
1443 specified_table_size); 1460 specified_table_size);
1444 1461
1445 /* swiotlb for devices that aren't behind the Calgary. */ 1462 x86_init.iommu.iommu_init = calgary_iommu_init;
1446 if (max_pfn > MAX_DMA32_PFN)
1447 swiotlb = 1;
1448 } 1463 }
1449 return; 1464 return;
1450 1465
@@ -1457,35 +1472,6 @@ cleanup:
1457 } 1472 }
1458} 1473}
1459 1474
1460int __init calgary_iommu_init(void)
1461{
1462 int ret;
1463
1464 if (no_iommu || (swiotlb && !calgary_detected))
1465 return -ENODEV;
1466
1467 if (!calgary_detected)
1468 return -ENODEV;
1469
1470 /* ok, we're trying to use Calgary - let's roll */
1471 printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
1472
1473 ret = calgary_init();
1474 if (ret) {
1475 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
1476 "falling back to no_iommu\n", ret);
1477 return ret;
1478 }
1479
1480 force_iommu = 1;
1481 bad_dma_address = 0x0;
1482 /* dma_ops is set to swiotlb or nommu */
1483 if (!dma_ops)
1484 dma_ops = &nommu_dma_ops;
1485
1486 return 0;
1487}
1488
1489static int __init calgary_parse_options(char *p) 1475static int __init calgary_parse_options(char *p)
1490{ 1476{
1491 unsigned int bridge; 1477 unsigned int bridge;
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index b2a71dca5642..afcc58b69c7c 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -11,10 +11,11 @@
11#include <asm/gart.h> 11#include <asm/gart.h>
12#include <asm/calgary.h> 12#include <asm/calgary.h>
13#include <asm/amd_iommu.h> 13#include <asm/amd_iommu.h>
14#include <asm/x86_init.h>
14 15
15static int forbid_dac __read_mostly; 16static int forbid_dac __read_mostly;
16 17
17struct dma_map_ops *dma_ops; 18struct dma_map_ops *dma_ops = &nommu_dma_ops;
18EXPORT_SYMBOL(dma_ops); 19EXPORT_SYMBOL(dma_ops);
19 20
20static int iommu_sac_force __read_mostly; 21static int iommu_sac_force __read_mostly;
@@ -42,15 +43,10 @@ int iommu_detected __read_mostly = 0;
42 */ 43 */
43int iommu_pass_through __read_mostly; 44int iommu_pass_through __read_mostly;
44 45
45dma_addr_t bad_dma_address __read_mostly = 0; 46/* Dummy device used for NULL arguments (normally ISA). */
46EXPORT_SYMBOL(bad_dma_address);
47
48/* Dummy device used for NULL arguments (normally ISA). Better would
49 be probably a smaller DMA mask, but this is bug-to-bug compatible
50 to older i386. */
51struct device x86_dma_fallback_dev = { 47struct device x86_dma_fallback_dev = {
52 .init_name = "fallback device", 48 .init_name = "fallback device",
53 .coherent_dma_mask = DMA_BIT_MASK(32), 49 .coherent_dma_mask = ISA_DMA_BIT_MASK,
54 .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask, 50 .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask,
55}; 51};
56EXPORT_SYMBOL(x86_dma_fallback_dev); 52EXPORT_SYMBOL(x86_dma_fallback_dev);
@@ -128,20 +124,17 @@ void __init pci_iommu_alloc(void)
128 /* free the range so iommu could get some range less than 4G */ 124 /* free the range so iommu could get some range less than 4G */
129 dma32_free_bootmem(); 125 dma32_free_bootmem();
130#endif 126#endif
127 if (pci_swiotlb_init())
128 return;
131 129
132 /*
133 * The order of these functions is important for
134 * fall-back/fail-over reasons
135 */
136 gart_iommu_hole_init(); 130 gart_iommu_hole_init();
137 131
138 detect_calgary(); 132 detect_calgary();
139 133
140 detect_intel_iommu(); 134 detect_intel_iommu();
141 135
136 /* needs to be called after gart_iommu_hole_init */
142 amd_iommu_detect(); 137 amd_iommu_detect();
143
144 pci_swiotlb_init();
145} 138}
146 139
147void *dma_generic_alloc_coherent(struct device *dev, size_t size, 140void *dma_generic_alloc_coherent(struct device *dev, size_t size,
@@ -216,7 +209,7 @@ static __init int iommu_setup(char *p)
216 if (!strncmp(p, "allowdac", 8)) 209 if (!strncmp(p, "allowdac", 8))
217 forbid_dac = 0; 210 forbid_dac = 0;
218 if (!strncmp(p, "nodac", 5)) 211 if (!strncmp(p, "nodac", 5))
219 forbid_dac = -1; 212 forbid_dac = 1;
220 if (!strncmp(p, "usedac", 6)) { 213 if (!strncmp(p, "usedac", 6)) {
221 forbid_dac = -1; 214 forbid_dac = -1;
222 return 1; 215 return 1;
@@ -291,25 +284,17 @@ static int __init pci_iommu_init(void)
291#ifdef CONFIG_PCI 284#ifdef CONFIG_PCI
292 dma_debug_add_bus(&pci_bus_type); 285 dma_debug_add_bus(&pci_bus_type);
293#endif 286#endif
287 x86_init.iommu.iommu_init();
294 288
295 calgary_iommu_init(); 289 if (swiotlb) {
296 290 printk(KERN_INFO "PCI-DMA: "
297 intel_iommu_init(); 291 "Using software bounce buffering for IO (SWIOTLB)\n");
292 swiotlb_print_info();
293 } else
294 swiotlb_free();
298 295
299 amd_iommu_init();
300
301 gart_iommu_init();
302
303 no_iommu_init();
304 return 0; 296 return 0;
305} 297}
306
307void pci_iommu_shutdown(void)
308{
309 gart_iommu_shutdown();
310
311 amd_iommu_shutdown();
312}
313/* Must execute after PCI subsystem */ 298/* Must execute after PCI subsystem */
314rootfs_initcall(pci_iommu_init); 299rootfs_initcall(pci_iommu_init);
315 300
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index a7f1b64f86e0..e6a0d402f171 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -39,6 +39,7 @@
39#include <asm/swiotlb.h> 39#include <asm/swiotlb.h>
40#include <asm/dma.h> 40#include <asm/dma.h>
41#include <asm/k8.h> 41#include <asm/k8.h>
42#include <asm/x86_init.h>
42 43
43static unsigned long iommu_bus_base; /* GART remapping area (physical) */ 44static unsigned long iommu_bus_base; /* GART remapping area (physical) */
44static unsigned long iommu_size; /* size of remapping area bytes */ 45static unsigned long iommu_size; /* size of remapping area bytes */
@@ -46,6 +47,8 @@ static unsigned long iommu_pages; /* .. and in pages */
46 47
47static u32 *iommu_gatt_base; /* Remapping table */ 48static u32 *iommu_gatt_base; /* Remapping table */
48 49
50static dma_addr_t bad_dma_addr;
51
49/* 52/*
50 * If this is disabled the IOMMU will use an optimized flushing strategy 53 * If this is disabled the IOMMU will use an optimized flushing strategy
51 * of only flushing when an mapping is reused. With it true the GART is 54 * of only flushing when an mapping is reused. With it true the GART is
@@ -92,7 +95,7 @@ static unsigned long alloc_iommu(struct device *dev, int size,
92 95
93 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), 96 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
94 PAGE_SIZE) >> PAGE_SHIFT; 97 PAGE_SIZE) >> PAGE_SHIFT;
95 boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1, 98 boundary_size = ALIGN((u64)dma_get_seg_boundary(dev) + 1,
96 PAGE_SIZE) >> PAGE_SHIFT; 99 PAGE_SIZE) >> PAGE_SHIFT;
97 100
98 spin_lock_irqsave(&iommu_bitmap_lock, flags); 101 spin_lock_irqsave(&iommu_bitmap_lock, flags);
@@ -216,7 +219,7 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
216 if (panic_on_overflow) 219 if (panic_on_overflow)
217 panic("dma_map_area overflow %lu bytes\n", size); 220 panic("dma_map_area overflow %lu bytes\n", size);
218 iommu_full(dev, size, dir); 221 iommu_full(dev, size, dir);
219 return bad_dma_address; 222 return bad_dma_addr;
220 } 223 }
221 224
222 for (i = 0; i < npages; i++) { 225 for (i = 0; i < npages; i++) {
@@ -294,7 +297,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
294 int i; 297 int i;
295 298
296#ifdef CONFIG_IOMMU_DEBUG 299#ifdef CONFIG_IOMMU_DEBUG
297 printk(KERN_DEBUG "dma_map_sg overflow\n"); 300 pr_debug("dma_map_sg overflow\n");
298#endif 301#endif
299 302
300 for_each_sg(sg, s, nents, i) { 303 for_each_sg(sg, s, nents, i) {
@@ -302,7 +305,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
302 305
303 if (nonforced_iommu(dev, addr, s->length)) { 306 if (nonforced_iommu(dev, addr, s->length)) {
304 addr = dma_map_area(dev, addr, s->length, dir, 0); 307 addr = dma_map_area(dev, addr, s->length, dir, 0);
305 if (addr == bad_dma_address) { 308 if (addr == bad_dma_addr) {
306 if (i > 0) 309 if (i > 0)
307 gart_unmap_sg(dev, sg, i, dir, NULL); 310 gart_unmap_sg(dev, sg, i, dir, NULL);
308 nents = 0; 311 nents = 0;
@@ -389,12 +392,14 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
389 if (!dev) 392 if (!dev)
390 dev = &x86_dma_fallback_dev; 393 dev = &x86_dma_fallback_dev;
391 394
392 out = 0; 395 out = 0;
393 start = 0; 396 start = 0;
394 start_sg = sgmap = sg; 397 start_sg = sg;
395 seg_size = 0; 398 sgmap = sg;
396 max_seg_size = dma_get_max_seg_size(dev); 399 seg_size = 0;
397 ps = NULL; /* shut up gcc */ 400 max_seg_size = dma_get_max_seg_size(dev);
401 ps = NULL; /* shut up gcc */
402
398 for_each_sg(sg, s, nents, i) { 403 for_each_sg(sg, s, nents, i) {
399 dma_addr_t addr = sg_phys(s); 404 dma_addr_t addr = sg_phys(s);
400 405
@@ -417,11 +422,12 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
417 sgmap, pages, need) < 0) 422 sgmap, pages, need) < 0)
418 goto error; 423 goto error;
419 out++; 424 out++;
420 seg_size = 0; 425
421 sgmap = sg_next(sgmap); 426 seg_size = 0;
422 pages = 0; 427 sgmap = sg_next(sgmap);
423 start = i; 428 pages = 0;
424 start_sg = s; 429 start = i;
430 start_sg = s;
425 } 431 }
426 } 432 }
427 433
@@ -455,7 +461,7 @@ error:
455 461
456 iommu_full(dev, pages << PAGE_SHIFT, dir); 462 iommu_full(dev, pages << PAGE_SHIFT, dir);
457 for_each_sg(sg, s, nents, i) 463 for_each_sg(sg, s, nents, i)
458 s->dma_address = bad_dma_address; 464 s->dma_address = bad_dma_addr;
459 return 0; 465 return 0;
460} 466}
461 467
@@ -479,7 +485,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
479 DMA_BIDIRECTIONAL, align_mask); 485 DMA_BIDIRECTIONAL, align_mask);
480 486
481 flush_gart(); 487 flush_gart();
482 if (paddr != bad_dma_address) { 488 if (paddr != bad_dma_addr) {
483 *dma_addr = paddr; 489 *dma_addr = paddr;
484 return page_address(page); 490 return page_address(page);
485 } 491 }
@@ -499,6 +505,11 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr,
499 free_pages((unsigned long)vaddr, get_order(size)); 505 free_pages((unsigned long)vaddr, get_order(size));
500} 506}
501 507
508static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr)
509{
510 return (dma_addr == bad_dma_addr);
511}
512
502static int no_agp; 513static int no_agp;
503 514
504static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) 515static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
@@ -515,7 +526,7 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
515 iommu_size -= round_up(a, PMD_PAGE_SIZE) - a; 526 iommu_size -= round_up(a, PMD_PAGE_SIZE) - a;
516 527
517 if (iommu_size < 64*1024*1024) { 528 if (iommu_size < 64*1024*1024) {
518 printk(KERN_WARNING 529 pr_warning(
519 "PCI-DMA: Warning: Small IOMMU %luMB." 530 "PCI-DMA: Warning: Small IOMMU %luMB."
520 " Consider increasing the AGP aperture in BIOS\n", 531 " Consider increasing the AGP aperture in BIOS\n",
521 iommu_size >> 20); 532 iommu_size >> 20);
@@ -570,28 +581,32 @@ void set_up_gart_resume(u32 aper_order, u32 aper_alloc)
570 aperture_alloc = aper_alloc; 581 aperture_alloc = aper_alloc;
571} 582}
572 583
573static int gart_resume(struct sys_device *dev) 584static void gart_fixup_northbridges(struct sys_device *dev)
574{ 585{
575 printk(KERN_INFO "PCI-DMA: Resuming GART IOMMU\n"); 586 int i;
576 587
577 if (fix_up_north_bridges) { 588 if (!fix_up_north_bridges)
578 int i; 589 return;
579 590
580 printk(KERN_INFO "PCI-DMA: Restoring GART aperture settings\n"); 591 pr_info("PCI-DMA: Restoring GART aperture settings\n");
581 592
582 for (i = 0; i < num_k8_northbridges; i++) { 593 for (i = 0; i < num_k8_northbridges; i++) {
583 struct pci_dev *dev = k8_northbridges[i]; 594 struct pci_dev *dev = k8_northbridges[i];
584 595
585 /* 596 /*
586 * Don't enable translations just yet. That is the next 597 * Don't enable translations just yet. That is the next
587 * step. Restore the pre-suspend aperture settings. 598 * step. Restore the pre-suspend aperture settings.
588 */ 599 */
589 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, 600 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1);
590 aperture_order << 1); 601 pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25);
591 pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE,
592 aperture_alloc >> 25);
593 }
594 } 602 }
603}
604
605static int gart_resume(struct sys_device *dev)
606{
607 pr_info("PCI-DMA: Resuming GART IOMMU\n");
608
609 gart_fixup_northbridges(dev);
595 610
596 enable_gart_translations(); 611 enable_gart_translations();
597 612
@@ -604,15 +619,14 @@ static int gart_suspend(struct sys_device *dev, pm_message_t state)
604} 619}
605 620
606static struct sysdev_class gart_sysdev_class = { 621static struct sysdev_class gart_sysdev_class = {
607 .name = "gart", 622 .name = "gart",
608 .suspend = gart_suspend, 623 .suspend = gart_suspend,
609 .resume = gart_resume, 624 .resume = gart_resume,
610 625
611}; 626};
612 627
613static struct sys_device device_gart = { 628static struct sys_device device_gart = {
614 .id = 0, 629 .cls = &gart_sysdev_class,
615 .cls = &gart_sysdev_class,
616}; 630};
617 631
618/* 632/*
@@ -627,7 +641,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
627 void *gatt; 641 void *gatt;
628 int i, error; 642 int i, error;
629 643
630 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); 644 pr_info("PCI-DMA: Disabling AGP.\n");
645
631 aper_size = aper_base = info->aper_size = 0; 646 aper_size = aper_base = info->aper_size = 0;
632 dev = NULL; 647 dev = NULL;
633 for (i = 0; i < num_k8_northbridges; i++) { 648 for (i = 0; i < num_k8_northbridges; i++) {
@@ -645,6 +660,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
645 } 660 }
646 if (!aper_base) 661 if (!aper_base)
647 goto nommu; 662 goto nommu;
663
648 info->aper_base = aper_base; 664 info->aper_base = aper_base;
649 info->aper_size = aper_size >> 20; 665 info->aper_size = aper_size >> 20;
650 666
@@ -667,14 +683,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
667 683
668 flush_gart(); 684 flush_gart();
669 685
670 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", 686 pr_info("PCI-DMA: aperture base @ %x size %u KB\n",
671 aper_base, aper_size>>10); 687 aper_base, aper_size>>10);
672 688
673 return 0; 689 return 0;
674 690
675 nommu: 691 nommu:
676 /* Should not happen anymore */ 692 /* Should not happen anymore */
677 printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n" 693 pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n"
678 "falling back to iommu=soft.\n"); 694 "falling back to iommu=soft.\n");
679 return -1; 695 return -1;
680} 696}
@@ -686,14 +702,15 @@ static struct dma_map_ops gart_dma_ops = {
686 .unmap_page = gart_unmap_page, 702 .unmap_page = gart_unmap_page,
687 .alloc_coherent = gart_alloc_coherent, 703 .alloc_coherent = gart_alloc_coherent,
688 .free_coherent = gart_free_coherent, 704 .free_coherent = gart_free_coherent,
705 .mapping_error = gart_mapping_error,
689}; 706};
690 707
691void gart_iommu_shutdown(void) 708static void gart_iommu_shutdown(void)
692{ 709{
693 struct pci_dev *dev; 710 struct pci_dev *dev;
694 int i; 711 int i;
695 712
696 if (no_agp && (dma_ops != &gart_dma_ops)) 713 if (no_agp)
697 return; 714 return;
698 715
699 for (i = 0; i < num_k8_northbridges; i++) { 716 for (i = 0; i < num_k8_northbridges; i++) {
@@ -708,7 +725,7 @@ void gart_iommu_shutdown(void)
708 } 725 }
709} 726}
710 727
711void __init gart_iommu_init(void) 728int __init gart_iommu_init(void)
712{ 729{
713 struct agp_kern_info info; 730 struct agp_kern_info info;
714 unsigned long iommu_start; 731 unsigned long iommu_start;
@@ -718,7 +735,7 @@ void __init gart_iommu_init(void)
718 long i; 735 long i;
719 736
720 if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) 737 if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0)
721 return; 738 return 0;
722 739
723#ifndef CONFIG_AGP_AMD64 740#ifndef CONFIG_AGP_AMD64
724 no_agp = 1; 741 no_agp = 1;
@@ -730,35 +747,28 @@ void __init gart_iommu_init(void)
730 (agp_copy_info(agp_bridge, &info) < 0); 747 (agp_copy_info(agp_bridge, &info) < 0);
731#endif 748#endif
732 749
733 if (swiotlb)
734 return;
735
736 /* Did we detect a different HW IOMMU? */
737 if (iommu_detected && !gart_iommu_aperture)
738 return;
739
740 if (no_iommu || 750 if (no_iommu ||
741 (!force_iommu && max_pfn <= MAX_DMA32_PFN) || 751 (!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
742 !gart_iommu_aperture || 752 !gart_iommu_aperture ||
743 (no_agp && init_k8_gatt(&info) < 0)) { 753 (no_agp && init_k8_gatt(&info) < 0)) {
744 if (max_pfn > MAX_DMA32_PFN) { 754 if (max_pfn > MAX_DMA32_PFN) {
745 printk(KERN_WARNING "More than 4GB of memory " 755 pr_warning("More than 4GB of memory but GART IOMMU not available.\n");
746 "but GART IOMMU not available.\n"); 756 pr_warning("falling back to iommu=soft.\n");
747 printk(KERN_WARNING "falling back to iommu=soft.\n");
748 } 757 }
749 return; 758 return 0;
750 } 759 }
751 760
752 /* need to map that range */ 761 /* need to map that range */
753 aper_size = info.aper_size << 20; 762 aper_size = info.aper_size << 20;
754 aper_base = info.aper_base; 763 aper_base = info.aper_base;
755 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); 764 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
765
756 if (end_pfn > max_low_pfn_mapped) { 766 if (end_pfn > max_low_pfn_mapped) {
757 start_pfn = (aper_base>>PAGE_SHIFT); 767 start_pfn = (aper_base>>PAGE_SHIFT);
758 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); 768 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
759 } 769 }
760 770
761 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); 771 pr_info("PCI-DMA: using GART IOMMU.\n");
762 iommu_size = check_iommu_size(info.aper_base, aper_size); 772 iommu_size = check_iommu_size(info.aper_base, aper_size);
763 iommu_pages = iommu_size >> PAGE_SHIFT; 773 iommu_pages = iommu_size >> PAGE_SHIFT;
764 774
@@ -773,8 +783,7 @@ void __init gart_iommu_init(void)
773 783
774 ret = dma_debug_resize_entries(iommu_pages); 784 ret = dma_debug_resize_entries(iommu_pages);
775 if (ret) 785 if (ret)
776 printk(KERN_DEBUG 786 pr_debug("PCI-DMA: Cannot trace all the entries\n");
777 "PCI-DMA: Cannot trace all the entries\n");
778 } 787 }
779#endif 788#endif
780 789
@@ -784,15 +793,14 @@ void __init gart_iommu_init(void)
784 */ 793 */
785 iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES); 794 iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
786 795
787 agp_memory_reserved = iommu_size; 796 pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
788 printk(KERN_INFO
789 "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
790 iommu_size >> 20); 797 iommu_size >> 20);
791 798
792 iommu_start = aper_size - iommu_size; 799 agp_memory_reserved = iommu_size;
793 iommu_bus_base = info.aper_base + iommu_start; 800 iommu_start = aper_size - iommu_size;
794 bad_dma_address = iommu_bus_base; 801 iommu_bus_base = info.aper_base + iommu_start;
795 iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); 802 bad_dma_addr = iommu_bus_base;
803 iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
796 804
797 /* 805 /*
798 * Unmap the IOMMU part of the GART. The alias of the page is 806 * Unmap the IOMMU part of the GART. The alias of the page is
@@ -814,7 +822,7 @@ void __init gart_iommu_init(void)
814 * the pages as Not-Present: 822 * the pages as Not-Present:
815 */ 823 */
816 wbinvd(); 824 wbinvd();
817 825
818 /* 826 /*
819 * Now all caches are flushed and we can safely enable 827 * Now all caches are flushed and we can safely enable
820 * GART hardware. Doing it early leaves the possibility 828 * GART hardware. Doing it early leaves the possibility
@@ -838,6 +846,10 @@ void __init gart_iommu_init(void)
838 846
839 flush_gart(); 847 flush_gart();
840 dma_ops = &gart_dma_ops; 848 dma_ops = &gart_dma_ops;
849 x86_platform.iommu_shutdown = gart_iommu_shutdown;
850 swiotlb = 0;
851
852 return 0;
841} 853}
842 854
843void __init gart_parse_options(char *p) 855void __init gart_parse_options(char *p)
@@ -856,7 +868,7 @@ void __init gart_parse_options(char *p)
856#endif 868#endif
857 if (isdigit(*p) && get_option(&p, &arg)) 869 if (isdigit(*p) && get_option(&p, &arg))
858 iommu_size = arg; 870 iommu_size = arg;
859 if (!strncmp(p, "fullflush", 8)) 871 if (!strncmp(p, "fullflush", 9))
860 iommu_fullflush = 1; 872 iommu_fullflush = 1;
861 if (!strncmp(p, "nofullflush", 11)) 873 if (!strncmp(p, "nofullflush", 11))
862 iommu_fullflush = 0; 874 iommu_fullflush = 0;
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index a3933d4330cd..22be12b60a8f 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -33,7 +33,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
33 dma_addr_t bus = page_to_phys(page) + offset; 33 dma_addr_t bus = page_to_phys(page) + offset;
34 WARN_ON(size == 0); 34 WARN_ON(size == 0);
35 if (!check_addr("map_single", dev, bus, size)) 35 if (!check_addr("map_single", dev, bus, size))
36 return bad_dma_address; 36 return DMA_ERROR_CODE;
37 flush_write_buffers(); 37 flush_write_buffers();
38 return bus; 38 return bus;
39} 39}
@@ -103,12 +103,3 @@ struct dma_map_ops nommu_dma_ops = {
103 .sync_sg_for_device = nommu_sync_sg_for_device, 103 .sync_sg_for_device = nommu_sync_sg_for_device,
104 .is_phys = 1, 104 .is_phys = 1,
105}; 105};
106
107void __init no_iommu_init(void)
108{
109 if (dma_ops)
110 return;
111
112 force_iommu = 0; /* no HW IOMMU */
113 dma_ops = &nommu_dma_ops;
114}
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index aaa6b7839f1e..e3c0a66b9e77 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -42,18 +42,28 @@ static struct dma_map_ops swiotlb_dma_ops = {
42 .dma_supported = NULL, 42 .dma_supported = NULL,
43}; 43};
44 44
45void __init pci_swiotlb_init(void) 45/*
46 * pci_swiotlb_init - initialize swiotlb if necessary
47 *
48 * This returns non-zero if we are forced to use swiotlb (by the boot
49 * option).
50 */
51int __init pci_swiotlb_init(void)
46{ 52{
53 int use_swiotlb = swiotlb | swiotlb_force;
54
47 /* don't initialize swiotlb if iommu=off (no_iommu=1) */ 55 /* don't initialize swiotlb if iommu=off (no_iommu=1) */
48#ifdef CONFIG_X86_64 56#ifdef CONFIG_X86_64
49 if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)) 57 if (!no_iommu && max_pfn > MAX_DMA32_PFN)
50 swiotlb = 1; 58 swiotlb = 1;
51#endif 59#endif
52 if (swiotlb_force) 60 if (swiotlb_force)
53 swiotlb = 1; 61 swiotlb = 1;
62
54 if (swiotlb) { 63 if (swiotlb) {
55 printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); 64 swiotlb_init(0);
56 swiotlb_init();
57 dma_ops = &swiotlb_dma_ops; 65 dma_ops = &swiotlb_dma_ops;
58 } 66 }
67
68 return use_swiotlb;
59} 69}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 5284cd2b5776..744508e7cfdd 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -10,6 +10,7 @@
10#include <linux/clockchips.h> 10#include <linux/clockchips.h>
11#include <linux/random.h> 11#include <linux/random.h>
12#include <trace/events/power.h> 12#include <trace/events/power.h>
13#include <linux/hw_breakpoint.h>
13#include <asm/system.h> 14#include <asm/system.h>
14#include <asm/apic.h> 15#include <asm/apic.h>
15#include <asm/syscalls.h> 16#include <asm/syscalls.h>
@@ -17,6 +18,7 @@
17#include <asm/uaccess.h> 18#include <asm/uaccess.h>
18#include <asm/i387.h> 19#include <asm/i387.h>
19#include <asm/ds.h> 20#include <asm/ds.h>
21#include <asm/debugreg.h>
20 22
21unsigned long idle_halt; 23unsigned long idle_halt;
22EXPORT_SYMBOL(idle_halt); 24EXPORT_SYMBOL(idle_halt);
@@ -103,14 +105,7 @@ void flush_thread(void)
103 } 105 }
104#endif 106#endif
105 107
106 clear_tsk_thread_flag(tsk, TIF_DEBUG); 108 flush_ptrace_hw_breakpoint(tsk);
107
108 tsk->thread.debugreg0 = 0;
109 tsk->thread.debugreg1 = 0;
110 tsk->thread.debugreg2 = 0;
111 tsk->thread.debugreg3 = 0;
112 tsk->thread.debugreg6 = 0;
113 tsk->thread.debugreg7 = 0;
114 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); 109 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
115 /* 110 /*
116 * Forget coprocessor state.. 111 * Forget coprocessor state..
@@ -192,16 +187,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
192 else if (next->debugctlmsr != prev->debugctlmsr) 187 else if (next->debugctlmsr != prev->debugctlmsr)
193 update_debugctlmsr(next->debugctlmsr); 188 update_debugctlmsr(next->debugctlmsr);
194 189
195 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
196 set_debugreg(next->debugreg0, 0);
197 set_debugreg(next->debugreg1, 1);
198 set_debugreg(next->debugreg2, 2);
199 set_debugreg(next->debugreg3, 3);
200 /* no 4 and 5 */
201 set_debugreg(next->debugreg6, 6);
202 set_debugreg(next->debugreg7, 7);
203 }
204
205 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ 190 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
206 test_tsk_thread_flag(next_p, TIF_NOTSC)) { 191 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
207 /* prev and next are different */ 192 /* prev and next are different */
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 4cf79567cdab..075580b35682 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -58,6 +58,7 @@
58#include <asm/idle.h> 58#include <asm/idle.h>
59#include <asm/syscalls.h> 59#include <asm/syscalls.h>
60#include <asm/ds.h> 60#include <asm/ds.h>
61#include <asm/debugreg.h>
61 62
62asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 63asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
63 64
@@ -134,7 +135,7 @@ void __show_regs(struct pt_regs *regs, int all)
134 ss = regs->ss & 0xffff; 135 ss = regs->ss & 0xffff;
135 gs = get_user_gs(regs); 136 gs = get_user_gs(regs);
136 } else { 137 } else {
137 sp = (unsigned long) (&regs->sp); 138 sp = kernel_stack_pointer(regs);
138 savesegment(ss, ss); 139 savesegment(ss, ss);
139 savesegment(gs, gs); 140 savesegment(gs, gs);
140 } 141 }
@@ -187,7 +188,7 @@ void __show_regs(struct pt_regs *regs, int all)
187 188
188void show_regs(struct pt_regs *regs) 189void show_regs(struct pt_regs *regs)
189{ 190{
190 __show_regs(regs, 1); 191 show_registers(regs);
191 show_trace(NULL, regs, &regs->sp, regs->bp); 192 show_trace(NULL, regs, &regs->sp, regs->bp);
192} 193}
193 194
@@ -259,7 +260,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
259 260
260 task_user_gs(p) = get_user_gs(regs); 261 task_user_gs(p) = get_user_gs(regs);
261 262
263 p->thread.io_bitmap_ptr = NULL;
262 tsk = current; 264 tsk = current;
265 err = -ENOMEM;
266
267 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
268
263 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { 269 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
264 p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, 270 p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
265 IO_BITMAP_BYTES, GFP_KERNEL); 271 IO_BITMAP_BYTES, GFP_KERNEL);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index eb62cbcaa490..a98fe88fab64 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -52,6 +52,7 @@
52#include <asm/idle.h> 52#include <asm/idle.h>
53#include <asm/syscalls.h> 53#include <asm/syscalls.h>
54#include <asm/ds.h> 54#include <asm/ds.h>
55#include <asm/debugreg.h>
55 56
56asmlinkage extern void ret_from_fork(void); 57asmlinkage extern void ret_from_fork(void);
57 58
@@ -226,8 +227,7 @@ void __show_regs(struct pt_regs *regs, int all)
226 227
227void show_regs(struct pt_regs *regs) 228void show_regs(struct pt_regs *regs)
228{ 229{
229 printk(KERN_INFO "CPU %d:", smp_processor_id()); 230 show_registers(regs);
230 __show_regs(regs, 1);
231 show_trace(NULL, regs, (void *)(regs + 1), regs->bp); 231 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
232} 232}
233 233
@@ -297,12 +297,16 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
297 297
298 p->thread.fs = me->thread.fs; 298 p->thread.fs = me->thread.fs;
299 p->thread.gs = me->thread.gs; 299 p->thread.gs = me->thread.gs;
300 p->thread.io_bitmap_ptr = NULL;
300 301
301 savesegment(gs, p->thread.gsindex); 302 savesegment(gs, p->thread.gsindex);
302 savesegment(fs, p->thread.fsindex); 303 savesegment(fs, p->thread.fsindex);
303 savesegment(es, p->thread.es); 304 savesegment(es, p->thread.es);
304 savesegment(ds, p->thread.ds); 305 savesegment(ds, p->thread.ds);
305 306
307 err = -ENOMEM;
308 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
309
306 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { 310 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
307 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); 311 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
308 if (!p->thread.io_bitmap_ptr) { 312 if (!p->thread.io_bitmap_ptr) {
@@ -341,6 +345,7 @@ out:
341 kfree(p->thread.io_bitmap_ptr); 345 kfree(p->thread.io_bitmap_ptr);
342 p->thread.io_bitmap_max = 0; 346 p->thread.io_bitmap_max = 0;
343 } 347 }
348
344 return err; 349 return err;
345} 350}
346 351
@@ -495,6 +500,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
495 */ 500 */
496 if (preload_fpu) 501 if (preload_fpu)
497 __math_state_restore(); 502 __math_state_restore();
503
498 return prev_p; 504 return prev_p;
499} 505}
500 506
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 7b058a2dc66a..04d182a7cfdb 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -22,6 +22,8 @@
22#include <linux/seccomp.h> 22#include <linux/seccomp.h>
23#include <linux/signal.h> 23#include <linux/signal.h>
24#include <linux/workqueue.h> 24#include <linux/workqueue.h>
25#include <linux/perf_event.h>
26#include <linux/hw_breakpoint.h>
25 27
26#include <asm/uaccess.h> 28#include <asm/uaccess.h>
27#include <asm/pgtable.h> 29#include <asm/pgtable.h>
@@ -34,6 +36,7 @@
34#include <asm/prctl.h> 36#include <asm/prctl.h>
35#include <asm/proto.h> 37#include <asm/proto.h>
36#include <asm/ds.h> 38#include <asm/ds.h>
39#include <asm/hw_breakpoint.h>
37 40
38#include "tls.h" 41#include "tls.h"
39 42
@@ -49,6 +52,118 @@ enum x86_regset {
49 REGSET_IOPERM32, 52 REGSET_IOPERM32,
50}; 53};
51 54
55struct pt_regs_offset {
56 const char *name;
57 int offset;
58};
59
60#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)}
61#define REG_OFFSET_END {.name = NULL, .offset = 0}
62
63static const struct pt_regs_offset regoffset_table[] = {
64#ifdef CONFIG_X86_64
65 REG_OFFSET_NAME(r15),
66 REG_OFFSET_NAME(r14),
67 REG_OFFSET_NAME(r13),
68 REG_OFFSET_NAME(r12),
69 REG_OFFSET_NAME(r11),
70 REG_OFFSET_NAME(r10),
71 REG_OFFSET_NAME(r9),
72 REG_OFFSET_NAME(r8),
73#endif
74 REG_OFFSET_NAME(bx),
75 REG_OFFSET_NAME(cx),
76 REG_OFFSET_NAME(dx),
77 REG_OFFSET_NAME(si),
78 REG_OFFSET_NAME(di),
79 REG_OFFSET_NAME(bp),
80 REG_OFFSET_NAME(ax),
81#ifdef CONFIG_X86_32
82 REG_OFFSET_NAME(ds),
83 REG_OFFSET_NAME(es),
84 REG_OFFSET_NAME(fs),
85 REG_OFFSET_NAME(gs),
86#endif
87 REG_OFFSET_NAME(orig_ax),
88 REG_OFFSET_NAME(ip),
89 REG_OFFSET_NAME(cs),
90 REG_OFFSET_NAME(flags),
91 REG_OFFSET_NAME(sp),
92 REG_OFFSET_NAME(ss),
93 REG_OFFSET_END,
94};
95
96/**
97 * regs_query_register_offset() - query register offset from its name
98 * @name: the name of a register
99 *
100 * regs_query_register_offset() returns the offset of a register in struct
101 * pt_regs from its name. If the name is invalid, this returns -EINVAL;
102 */
103int regs_query_register_offset(const char *name)
104{
105 const struct pt_regs_offset *roff;
106 for (roff = regoffset_table; roff->name != NULL; roff++)
107 if (!strcmp(roff->name, name))
108 return roff->offset;
109 return -EINVAL;
110}
111
112/**
113 * regs_query_register_name() - query register name from its offset
114 * @offset: the offset of a register in struct pt_regs.
115 *
116 * regs_query_register_name() returns the name of a register from its
117 * offset in struct pt_regs. If the @offset is invalid, this returns NULL;
118 */
119const char *regs_query_register_name(unsigned int offset)
120{
121 const struct pt_regs_offset *roff;
122 for (roff = regoffset_table; roff->name != NULL; roff++)
123 if (roff->offset == offset)
124 return roff->name;
125 return NULL;
126}
127
128static const int arg_offs_table[] = {
129#ifdef CONFIG_X86_32
130 [0] = offsetof(struct pt_regs, ax),
131 [1] = offsetof(struct pt_regs, dx),
132 [2] = offsetof(struct pt_regs, cx)
133#else /* CONFIG_X86_64 */
134 [0] = offsetof(struct pt_regs, di),
135 [1] = offsetof(struct pt_regs, si),
136 [2] = offsetof(struct pt_regs, dx),
137 [3] = offsetof(struct pt_regs, cx),
138 [4] = offsetof(struct pt_regs, r8),
139 [5] = offsetof(struct pt_regs, r9)
140#endif
141};
142
143/**
144 * regs_get_argument_nth() - get Nth argument at function call
145 * @regs: pt_regs which contains registers at function entry.
146 * @n: argument number.
147 *
148 * regs_get_argument_nth() returns @n th argument of a function call.
149 * Since usually the kernel stack will be changed right after function entry,
150 * you must use this at function entry. If the @n th entry is NOT in the
151 * kernel stack or pt_regs, this returns 0.
152 */
153unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned int n)
154{
155 if (n < ARRAY_SIZE(arg_offs_table))
156 return *(unsigned long *)((char *)regs + arg_offs_table[n]);
157 else {
158 /*
159 * The typical case: arg n is on the stack.
160 * (Note: stack[0] = return address, so skip it)
161 */
162 n -= ARRAY_SIZE(arg_offs_table);
163 return regs_get_kernel_stack_nth(regs, 1 + n);
164 }
165}
166
52/* 167/*
53 * does not yet catch signals sent when the child dies. 168 * does not yet catch signals sent when the child dies.
54 * in exit.c or in signal.c. 169 * in exit.c or in signal.c.
@@ -137,11 +252,6 @@ static int set_segment_reg(struct task_struct *task,
137 return 0; 252 return 0;
138} 253}
139 254
140static unsigned long debugreg_addr_limit(struct task_struct *task)
141{
142 return TASK_SIZE - 3;
143}
144
145#else /* CONFIG_X86_64 */ 255#else /* CONFIG_X86_64 */
146 256
147#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT) 257#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT)
@@ -266,15 +376,6 @@ static int set_segment_reg(struct task_struct *task,
266 return 0; 376 return 0;
267} 377}
268 378
269static unsigned long debugreg_addr_limit(struct task_struct *task)
270{
271#ifdef CONFIG_IA32_EMULATION
272 if (test_tsk_thread_flag(task, TIF_IA32))
273 return IA32_PAGE_OFFSET - 3;
274#endif
275 return TASK_SIZE_MAX - 7;
276}
277
278#endif /* CONFIG_X86_32 */ 379#endif /* CONFIG_X86_32 */
279 380
280static unsigned long get_flags(struct task_struct *task) 381static unsigned long get_flags(struct task_struct *task)
@@ -454,99 +555,239 @@ static int genregs_set(struct task_struct *target,
454 return ret; 555 return ret;
455} 556}
456 557
558static void ptrace_triggered(struct perf_event *bp, void *data)
559{
560 int i;
561 struct thread_struct *thread = &(current->thread);
562
563 /*
564 * Store in the virtual DR6 register the fact that the breakpoint
565 * was hit so the thread's debugger will see it.
566 */
567 for (i = 0; i < HBP_NUM; i++) {
568 if (thread->ptrace_bps[i] == bp)
569 break;
570 }
571
572 thread->debugreg6 |= (DR_TRAP0 << i);
573}
574
457/* 575/*
458 * This function is trivial and will be inlined by the compiler. 576 * Walk through every ptrace breakpoints for this thread and
459 * Having it separates the implementation details of debug 577 * build the dr7 value on top of their attributes.
460 * registers from the interface details of ptrace. 578 *
461 */ 579 */
462static unsigned long ptrace_get_debugreg(struct task_struct *child, int n) 580static unsigned long ptrace_get_dr7(struct perf_event *bp[])
463{ 581{
464 switch (n) { 582 int i;
465 case 0: return child->thread.debugreg0; 583 int dr7 = 0;
466 case 1: return child->thread.debugreg1; 584 struct arch_hw_breakpoint *info;
467 case 2: return child->thread.debugreg2; 585
468 case 3: return child->thread.debugreg3; 586 for (i = 0; i < HBP_NUM; i++) {
469 case 6: return child->thread.debugreg6; 587 if (bp[i] && !bp[i]->attr.disabled) {
470 case 7: return child->thread.debugreg7; 588 info = counter_arch_bp(bp[i]);
589 dr7 |= encode_dr7(i, info->len, info->type);
590 }
471 } 591 }
472 return 0; 592
593 return dr7;
473} 594}
474 595
475static int ptrace_set_debugreg(struct task_struct *child, 596static struct perf_event *
476 int n, unsigned long data) 597ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
598 struct task_struct *tsk, int disabled)
477{ 599{
478 int i; 600 int err;
601 int gen_len, gen_type;
602 DEFINE_BREAKPOINT_ATTR(attr);
479 603
480 if (unlikely(n == 4 || n == 5)) 604 /*
481 return -EIO; 605 * We shoud have at least an inactive breakpoint at this
606 * slot. It means the user is writing dr7 without having
607 * written the address register first
608 */
609 if (!bp)
610 return ERR_PTR(-EINVAL);
482 611
483 if (n < 4 && unlikely(data >= debugreg_addr_limit(child))) 612 err = arch_bp_generic_fields(len, type, &gen_len, &gen_type);
484 return -EIO; 613 if (err)
614 return ERR_PTR(err);
485 615
486 switch (n) { 616 attr = bp->attr;
487 case 0: child->thread.debugreg0 = data; break; 617 attr.bp_len = gen_len;
488 case 1: child->thread.debugreg1 = data; break; 618 attr.bp_type = gen_type;
489 case 2: child->thread.debugreg2 = data; break; 619 attr.disabled = disabled;
490 case 3: child->thread.debugreg3 = data; break;
491 620
492 case 6: 621 return modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk);
493 if ((data & ~0xffffffffUL) != 0) 622}
494 return -EIO; 623
495 child->thread.debugreg6 = data; 624/*
496 break; 625 * Handle ptrace writes to debug register 7.
626 */
627static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
628{
629 struct thread_struct *thread = &(tsk->thread);
630 unsigned long old_dr7;
631 int i, orig_ret = 0, rc = 0;
632 int enabled, second_pass = 0;
633 unsigned len, type;
634 struct perf_event *bp;
635
636 data &= ~DR_CONTROL_RESERVED;
637 old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
638restore:
639 /*
640 * Loop through all the hardware breakpoints, making the
641 * appropriate changes to each.
642 */
643 for (i = 0; i < HBP_NUM; i++) {
644 enabled = decode_dr7(data, i, &len, &type);
645 bp = thread->ptrace_bps[i];
646
647 if (!enabled) {
648 if (bp) {
649 /*
650 * Don't unregister the breakpoints right-away,
651 * unless all register_user_hw_breakpoint()
652 * requests have succeeded. This prevents
653 * any window of opportunity for debug
654 * register grabbing by other users.
655 */
656 if (!second_pass)
657 continue;
658
659 thread->ptrace_bps[i] = NULL;
660 bp = ptrace_modify_breakpoint(bp, len, type,
661 tsk, 1);
662 if (IS_ERR(bp)) {
663 rc = PTR_ERR(bp);
664 thread->ptrace_bps[i] = NULL;
665 break;
666 }
667 thread->ptrace_bps[i] = bp;
668 }
669 continue;
670 }
671
672 bp = ptrace_modify_breakpoint(bp, len, type, tsk, 0);
673
674 /* Incorrect bp, or we have a bug in bp API */
675 if (IS_ERR(bp)) {
676 rc = PTR_ERR(bp);
677 thread->ptrace_bps[i] = NULL;
678 break;
679 }
680 thread->ptrace_bps[i] = bp;
681 }
682 /*
683 * Make a second pass to free the remaining unused breakpoints
684 * or to restore the original breakpoints if an error occurred.
685 */
686 if (!second_pass) {
687 second_pass = 1;
688 if (rc < 0) {
689 orig_ret = rc;
690 data = old_dr7;
691 }
692 goto restore;
693 }
694 return ((orig_ret < 0) ? orig_ret : rc);
695}
696
697/*
698 * Handle PTRACE_PEEKUSR calls for the debug register area.
699 */
700static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
701{
702 struct thread_struct *thread = &(tsk->thread);
703 unsigned long val = 0;
497 704
498 case 7: 705 if (n < HBP_NUM) {
706 struct perf_event *bp;
707 bp = thread->ptrace_bps[n];
708 if (!bp)
709 return 0;
710 val = bp->hw.info.address;
711 } else if (n == 6) {
712 val = thread->debugreg6;
713 } else if (n == 7) {
714 val = ptrace_get_dr7(thread->ptrace_bps);
715 }
716 return val;
717}
718
719static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
720 unsigned long addr)
721{
722 struct perf_event *bp;
723 struct thread_struct *t = &tsk->thread;
724 DEFINE_BREAKPOINT_ATTR(attr);
725
726 if (!t->ptrace_bps[nr]) {
499 /* 727 /*
500 * Sanity-check data. Take one half-byte at once with 728 * Put stub len and type to register (reserve) an inactive but
501 * check = (val >> (16 + 4*i)) & 0xf. It contains the 729 * correct bp
502 * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
503 * 2 and 3 are LENi. Given a list of invalid values,
504 * we do mask |= 1 << invalid_value, so that
505 * (mask >> check) & 1 is a correct test for invalid
506 * values.
507 *
508 * R/Wi contains the type of the breakpoint /
509 * watchpoint, LENi contains the length of the watched
510 * data in the watchpoint case.
511 *
512 * The invalid values are:
513 * - LENi == 0x10 (undefined), so mask |= 0x0f00. [32-bit]
514 * - R/Wi == 0x10 (break on I/O reads or writes), so
515 * mask |= 0x4444.
516 * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
517 * 0x1110.
518 *
519 * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
520 *
521 * See the Intel Manual "System Programming Guide",
522 * 15.2.4
523 *
524 * Note that LENi == 0x10 is defined on x86_64 in long
525 * mode (i.e. even for 32-bit userspace software, but
526 * 64-bit kernel), so the x86_64 mask value is 0x5454.
527 * See the AMD manual no. 24593 (AMD64 System Programming)
528 */ 730 */
529#ifdef CONFIG_X86_32 731 attr.bp_addr = addr;
530#define DR7_MASK 0x5f54 732 attr.bp_len = HW_BREAKPOINT_LEN_1;
531#else 733 attr.bp_type = HW_BREAKPOINT_W;
532#define DR7_MASK 0x5554 734 attr.disabled = 1;
533#endif 735
534 data &= ~DR_CONTROL_RESERVED; 736 bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk);
535 for (i = 0; i < 4; i++) 737 } else {
536 if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1) 738 bp = t->ptrace_bps[nr];
537 return -EIO; 739 t->ptrace_bps[nr] = NULL;
538 child->thread.debugreg7 = data; 740
539 if (data) 741 attr = bp->attr;
540 set_tsk_thread_flag(child, TIF_DEBUG); 742 attr.bp_addr = addr;
541 else 743 bp = modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk);
542 clear_tsk_thread_flag(child, TIF_DEBUG);
543 break;
544 } 744 }
745 /*
746 * CHECKME: the previous code returned -EIO if the addr wasn't a
747 * valid task virtual addr. The new one will return -EINVAL in this
748 * case.
749 * -EINVAL may be what we want for in-kernel breakpoints users, but
750 * -EIO looks better for ptrace, since we refuse a register writing
751 * for the user. And anyway this is the previous behaviour.
752 */
753 if (IS_ERR(bp))
754 return PTR_ERR(bp);
755
756 t->ptrace_bps[nr] = bp;
545 757
546 return 0; 758 return 0;
547} 759}
548 760
549/* 761/*
762 * Handle PTRACE_POKEUSR calls for the debug register area.
763 */
764int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
765{
766 struct thread_struct *thread = &(tsk->thread);
767 int rc = 0;
768
769 /* There are no DR4 or DR5 registers */
770 if (n == 4 || n == 5)
771 return -EIO;
772
773 if (n == 6) {
774 thread->debugreg6 = val;
775 goto ret_path;
776 }
777 if (n < HBP_NUM) {
778 rc = ptrace_set_breakpoint_addr(tsk, n, val);
779 if (rc)
780 return rc;
781 }
782 /* All that's left is DR7 */
783 if (n == 7)
784 rc = ptrace_write_dr7(tsk, val);
785
786ret_path:
787 return rc;
788}
789
790/*
550 * These access the current or another (stopped) task's io permission 791 * These access the current or another (stopped) task's io permission
551 * bitmap for debugging or core dump. 792 * bitmap for debugging or core dump.
552 */ 793 */
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 6caf26010970..1545bc0c9845 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -23,7 +23,7 @@
23# include <linux/ctype.h> 23# include <linux/ctype.h>
24# include <linux/mc146818rtc.h> 24# include <linux/mc146818rtc.h>
25#else 25#else
26# include <asm/iommu.h> 26# include <asm/x86_init.h>
27#endif 27#endif
28 28
29/* 29/*
@@ -630,7 +630,7 @@ void native_machine_shutdown(void)
630#endif 630#endif
631 631
632#ifdef CONFIG_X86_64 632#ifdef CONFIG_X86_64
633 pci_iommu_shutdown(); 633 x86_platform.iommu_shutdown();
634#endif 634#endif
635} 635}
636 636
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 2a34f9c5be21..82e88cdda9bc 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -109,6 +109,7 @@
109#ifdef CONFIG_X86_64 109#ifdef CONFIG_X86_64
110#include <asm/numa_64.h> 110#include <asm/numa_64.h>
111#endif 111#endif
112#include <asm/mce.h>
112 113
113/* 114/*
114 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 115 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -247,7 +248,7 @@ EXPORT_SYMBOL(edd);
247 * from boot_params into a safe place. 248 * from boot_params into a safe place.
248 * 249 *
249 */ 250 */
250static inline void copy_edd(void) 251static inline void __init copy_edd(void)
251{ 252{
252 memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer, 253 memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
253 sizeof(edd.mbr_signature)); 254 sizeof(edd.mbr_signature));
@@ -256,7 +257,7 @@ static inline void copy_edd(void)
256 edd.edd_info_nr = boot_params.eddbuf_entries; 257 edd.edd_info_nr = boot_params.eddbuf_entries;
257} 258}
258#else 259#else
259static inline void copy_edd(void) 260static inline void __init copy_edd(void)
260{ 261{
261} 262}
262#endif 263#endif
@@ -1031,6 +1032,8 @@ void __init setup_arch(char **cmdline_p)
1031#endif 1032#endif
1032#endif 1033#endif
1033 x86_init.oem.banner(); 1034 x86_init.oem.banner();
1035
1036 mcheck_init();
1034} 1037}
1035 1038
1036#ifdef CONFIG_X86_32 1039#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 6a44a76055ad..fbf3b07c8567 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -799,15 +799,6 @@ static void do_signal(struct pt_regs *regs)
799 799
800 signr = get_signal_to_deliver(&info, &ka, regs, NULL); 800 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
801 if (signr > 0) { 801 if (signr > 0) {
802 /*
803 * Re-enable any watchpoints before delivering the
804 * signal to user space. The processor register will
805 * have been cleared if the watchpoint triggered
806 * inside the kernel.
807 */
808 if (current->thread.debugreg7)
809 set_debugreg(current->thread.debugreg7, 7);
810
811 /* Whee! Actually deliver the signal. */ 802 /* Whee! Actually deliver the signal. */
812 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { 803 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
813 /* 804 /*
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 565ebc65920e..324f2a44c221 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1250,16 +1250,7 @@ static void __ref remove_cpu_from_maps(int cpu)
1250void cpu_disable_common(void) 1250void cpu_disable_common(void)
1251{ 1251{
1252 int cpu = smp_processor_id(); 1252 int cpu = smp_processor_id();
1253 /*
1254 * HACK:
1255 * Allow any queued timer interrupts to get serviced
1256 * This is only a temporary solution until we cleanup
1257 * fixup_irqs as we do for IA64.
1258 */
1259 local_irq_enable();
1260 mdelay(1);
1261 1253
1262 local_irq_disable();
1263 remove_siblinginfo(cpu); 1254 remove_siblinginfo(cpu);
1264 1255
1265 /* It's now safe to remove this processor from the online map */ 1256 /* It's now safe to remove this processor from the online map */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 7e37dcee0cc3..33399176512a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -529,77 +529,56 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
529dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) 529dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
530{ 530{
531 struct task_struct *tsk = current; 531 struct task_struct *tsk = current;
532 unsigned long condition; 532 unsigned long dr6;
533 int si_code; 533 int si_code;
534 534
535 get_debugreg(condition, 6); 535 get_debugreg(dr6, 6);
536 536
537 /* Catch kmemcheck conditions first of all! */ 537 /* Catch kmemcheck conditions first of all! */
538 if (condition & DR_STEP && kmemcheck_trap(regs)) 538 if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
539 return; 539 return;
540 540
541 /* DR6 may or may not be cleared by the CPU */
542 set_debugreg(0, 6);
541 /* 543 /*
542 * The processor cleared BTF, so don't mark that we need it set. 544 * The processor cleared BTF, so don't mark that we need it set.
543 */ 545 */
544 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); 546 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
545 tsk->thread.debugctlmsr = 0; 547 tsk->thread.debugctlmsr = 0;
546 548
547 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, 549 /* Store the virtualized DR6 value */
548 SIGTRAP) == NOTIFY_STOP) 550 tsk->thread.debugreg6 = dr6;
551
552 if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code,
553 SIGTRAP) == NOTIFY_STOP)
549 return; 554 return;
550 555
551 /* It's safe to allow irq's after DR6 has been saved */ 556 /* It's safe to allow irq's after DR6 has been saved */
552 preempt_conditional_sti(regs); 557 preempt_conditional_sti(regs);
553 558
554 /* Mask out spurious debug traps due to lazy DR7 setting */ 559 if (regs->flags & X86_VM_MASK) {
555 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { 560 handle_vm86_trap((struct kernel_vm86_regs *) regs,
556 if (!tsk->thread.debugreg7) 561 error_code, 1);
557 goto clear_dr7; 562 return;
558 } 563 }
559 564
560#ifdef CONFIG_X86_32
561 if (regs->flags & X86_VM_MASK)
562 goto debug_vm86;
563#endif
564
565 /* Save debug status register where ptrace can see it */
566 tsk->thread.debugreg6 = condition;
567
568 /* 565 /*
569 * Single-stepping through TF: make sure we ignore any events in 566 * Single-stepping through system calls: ignore any exceptions in
570 * kernel space (but re-enable TF when returning to user mode). 567 * kernel space, but re-enable TF when returning to user mode.
568 *
569 * We already checked v86 mode above, so we can check for kernel mode
570 * by just checking the CPL of CS.
571 */ 571 */
572 if (condition & DR_STEP) { 572 if ((dr6 & DR_STEP) && !user_mode(regs)) {
573 if (!user_mode(regs)) 573 tsk->thread.debugreg6 &= ~DR_STEP;
574 goto clear_TF_reenable; 574 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
575 regs->flags &= ~X86_EFLAGS_TF;
575 } 576 }
576 577 si_code = get_si_code(tsk->thread.debugreg6);
577 si_code = get_si_code(condition); 578 if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS))
578 /* Ok, finally something we can handle */ 579 send_sigtrap(tsk, regs, error_code, si_code);
579 send_sigtrap(tsk, regs, error_code, si_code);
580
581 /*
582 * Disable additional traps. They'll be re-enabled when
583 * the signal is delivered.
584 */
585clear_dr7:
586 set_debugreg(0, 7);
587 preempt_conditional_cli(regs); 580 preempt_conditional_cli(regs);
588 return;
589 581
590#ifdef CONFIG_X86_32
591debug_vm86:
592 /* reenable preemption: handle_vm86_trap() might sleep */
593 dec_preempt_count();
594 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
595 conditional_cli(regs);
596 return;
597#endif
598
599clear_TF_reenable:
600 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
601 regs->flags &= ~X86_EFLAGS_TF;
602 preempt_conditional_cli(regs);
603 return; 582 return;
604} 583}
605 584
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index f37930954d15..eed156851f5d 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -114,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu)
114 return; 114 return;
115 115
116 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { 116 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
117 printk_once(KERN_INFO "Skipping synchronization checks as TSC is reliable.\n"); 117 if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING)
118 pr_info(
119 "Skipped synchronization checks as TSC is reliable.\n");
118 return; 120 return;
119 } 121 }
120 122
121 pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:",
122 smp_processor_id(), cpu);
123
124 /* 123 /*
125 * Reset it - in case this is a second bootup: 124 * Reset it - in case this is a second bootup:
126 */ 125 */
@@ -142,12 +141,14 @@ void __cpuinit check_tsc_sync_source(int cpu)
142 cpu_relax(); 141 cpu_relax();
143 142
144 if (nr_warps) { 143 if (nr_warps) {
145 printk("\n"); 144 pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n",
145 smp_processor_id(), cpu);
146 pr_warning("Measured %Ld cycles TSC warp between CPUs, " 146 pr_warning("Measured %Ld cycles TSC warp between CPUs, "
147 "turning off TSC clock.\n", max_warp); 147 "turning off TSC clock.\n", max_warp);
148 mark_tsc_unstable("check_tsc_sync_source failed"); 148 mark_tsc_unstable("check_tsc_sync_source failed");
149 } else { 149 } else {
150 printk(" passed.\n"); 150 pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n",
151 smp_processor_id(), cpu);
151 } 152 }
152 153
153 /* 154 /*
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
index aeef529917e4..61d805df4c91 100644
--- a/arch/x86/kernel/uv_irq.c
+++ b/arch/x86/kernel/uv_irq.c
@@ -9,10 +9,25 @@
9 */ 9 */
10 10
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/rbtree.h>
12#include <linux/irq.h> 13#include <linux/irq.h>
13 14
14#include <asm/apic.h> 15#include <asm/apic.h>
15#include <asm/uv/uv_irq.h> 16#include <asm/uv/uv_irq.h>
17#include <asm/uv/uv_hub.h>
18
19/* MMR offset and pnode of hub sourcing interrupts for a given irq */
20struct uv_irq_2_mmr_pnode{
21 struct rb_node list;
22 unsigned long offset;
23 int pnode;
24 int irq;
25};
26
27static spinlock_t uv_irq_lock;
28static struct rb_root uv_irq_root;
29
30static int uv_set_irq_affinity(unsigned int, const struct cpumask *);
16 31
17static void uv_noop(unsigned int irq) 32static void uv_noop(unsigned int irq)
18{ 33{
@@ -39,25 +54,214 @@ struct irq_chip uv_irq_chip = {
39 .unmask = uv_noop, 54 .unmask = uv_noop,
40 .eoi = uv_ack_apic, 55 .eoi = uv_ack_apic,
41 .end = uv_noop, 56 .end = uv_noop,
57 .set_affinity = uv_set_irq_affinity,
42}; 58};
43 59
44/* 60/*
61 * Add offset and pnode information of the hub sourcing interrupts to the
62 * rb tree for a specific irq.
63 */
64static int uv_set_irq_2_mmr_info(int irq, unsigned long offset, unsigned blade)
65{
66 struct rb_node **link = &uv_irq_root.rb_node;
67 struct rb_node *parent = NULL;
68 struct uv_irq_2_mmr_pnode *n;
69 struct uv_irq_2_mmr_pnode *e;
70 unsigned long irqflags;
71
72 n = kmalloc_node(sizeof(struct uv_irq_2_mmr_pnode), GFP_KERNEL,
73 uv_blade_to_memory_nid(blade));
74 if (!n)
75 return -ENOMEM;
76
77 n->irq = irq;
78 n->offset = offset;
79 n->pnode = uv_blade_to_pnode(blade);
80 spin_lock_irqsave(&uv_irq_lock, irqflags);
81 /* Find the right place in the rbtree: */
82 while (*link) {
83 parent = *link;
84 e = rb_entry(parent, struct uv_irq_2_mmr_pnode, list);
85
86 if (unlikely(irq == e->irq)) {
87 /* irq entry exists */
88 e->pnode = uv_blade_to_pnode(blade);
89 e->offset = offset;
90 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
91 kfree(n);
92 return 0;
93 }
94
95 if (irq < e->irq)
96 link = &(*link)->rb_left;
97 else
98 link = &(*link)->rb_right;
99 }
100
101 /* Insert the node into the rbtree. */
102 rb_link_node(&n->list, parent, link);
103 rb_insert_color(&n->list, &uv_irq_root);
104
105 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
106 return 0;
107}
108
109/* Retrieve offset and pnode information from the rb tree for a specific irq */
110int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode)
111{
112 struct uv_irq_2_mmr_pnode *e;
113 struct rb_node *n;
114 unsigned long irqflags;
115
116 spin_lock_irqsave(&uv_irq_lock, irqflags);
117 n = uv_irq_root.rb_node;
118 while (n) {
119 e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
120
121 if (e->irq == irq) {
122 *offset = e->offset;
123 *pnode = e->pnode;
124 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
125 return 0;
126 }
127
128 if (irq < e->irq)
129 n = n->rb_left;
130 else
131 n = n->rb_right;
132 }
133 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
134 return -1;
135}
136
137/*
138 * Re-target the irq to the specified CPU and enable the specified MMR located
139 * on the specified blade to allow the sending of MSIs to the specified CPU.
140 */
141static int
142arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
143 unsigned long mmr_offset, int restrict)
144{
145 const struct cpumask *eligible_cpu = cpumask_of(cpu);
146 struct irq_desc *desc = irq_to_desc(irq);
147 struct irq_cfg *cfg;
148 int mmr_pnode;
149 unsigned long mmr_value;
150 struct uv_IO_APIC_route_entry *entry;
151 int err;
152
153 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
154 sizeof(unsigned long));
155
156 cfg = irq_cfg(irq);
157
158 err = assign_irq_vector(irq, cfg, eligible_cpu);
159 if (err != 0)
160 return err;
161
162 if (restrict == UV_AFFINITY_CPU)
163 desc->status |= IRQ_NO_BALANCING;
164 else
165 desc->status |= IRQ_MOVE_PCNTXT;
166
167 set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
168 irq_name);
169
170 mmr_value = 0;
171 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
172 entry->vector = cfg->vector;
173 entry->delivery_mode = apic->irq_delivery_mode;
174 entry->dest_mode = apic->irq_dest_mode;
175 entry->polarity = 0;
176 entry->trigger = 0;
177 entry->mask = 0;
178 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
179
180 mmr_pnode = uv_blade_to_pnode(mmr_blade);
181 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
182
183 if (cfg->move_in_progress)
184 send_cleanup_vector(cfg);
185
186 return irq;
187}
188
189/*
190 * Disable the specified MMR located on the specified blade so that MSIs are
191 * longer allowed to be sent.
192 */
193static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset)
194{
195 unsigned long mmr_value;
196 struct uv_IO_APIC_route_entry *entry;
197
198 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
199 sizeof(unsigned long));
200
201 mmr_value = 0;
202 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
203 entry->mask = 1;
204
205 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
206}
207
208static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
209{
210 struct irq_desc *desc = irq_to_desc(irq);
211 struct irq_cfg *cfg = desc->chip_data;
212 unsigned int dest;
213 unsigned long mmr_value;
214 struct uv_IO_APIC_route_entry *entry;
215 unsigned long mmr_offset;
216 unsigned mmr_pnode;
217
218 dest = set_desc_affinity(desc, mask);
219 if (dest == BAD_APICID)
220 return -1;
221
222 mmr_value = 0;
223 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
224
225 entry->vector = cfg->vector;
226 entry->delivery_mode = apic->irq_delivery_mode;
227 entry->dest_mode = apic->irq_dest_mode;
228 entry->polarity = 0;
229 entry->trigger = 0;
230 entry->mask = 0;
231 entry->dest = dest;
232
233 /* Get previously stored MMR and pnode of hub sourcing interrupts */
234 if (uv_irq_2_mmr_info(irq, &mmr_offset, &mmr_pnode))
235 return -1;
236
237 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
238
239 if (cfg->move_in_progress)
240 send_cleanup_vector(cfg);
241
242 return 0;
243}
244
245/*
45 * Set up a mapping of an available irq and vector, and enable the specified 246 * Set up a mapping of an available irq and vector, and enable the specified
46 * MMR that defines the MSI that is to be sent to the specified CPU when an 247 * MMR that defines the MSI that is to be sent to the specified CPU when an
47 * interrupt is raised. 248 * interrupt is raised.
48 */ 249 */
49int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, 250int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
50 unsigned long mmr_offset) 251 unsigned long mmr_offset, int restrict)
51{ 252{
52 int irq; 253 int irq, ret;
53 int ret; 254
255 irq = create_irq_nr(NR_IRQS_LEGACY, uv_blade_to_memory_nid(mmr_blade));
54 256
55 irq = create_irq();
56 if (irq <= 0) 257 if (irq <= 0)
57 return -EBUSY; 258 return -EBUSY;
58 259
59 ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset); 260 ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset,
60 if (ret != irq) 261 restrict);
262 if (ret == irq)
263 uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade);
264 else
61 destroy_irq(irq); 265 destroy_irq(irq);
62 266
63 return ret; 267 return ret;
@@ -71,9 +275,28 @@ EXPORT_SYMBOL_GPL(uv_setup_irq);
71 * 275 *
72 * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq(). 276 * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq().
73 */ 277 */
74void uv_teardown_irq(unsigned int irq, int mmr_blade, unsigned long mmr_offset) 278void uv_teardown_irq(unsigned int irq)
75{ 279{
76 arch_disable_uv_irq(mmr_blade, mmr_offset); 280 struct uv_irq_2_mmr_pnode *e;
281 struct rb_node *n;
282 unsigned long irqflags;
283
284 spin_lock_irqsave(&uv_irq_lock, irqflags);
285 n = uv_irq_root.rb_node;
286 while (n) {
287 e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
288 if (e->irq == irq) {
289 arch_disable_uv_irq(e->pnode, e->offset);
290 rb_erase(n, &uv_irq_root);
291 kfree(e);
292 break;
293 }
294 if (irq < e->irq)
295 n = n->rb_left;
296 else
297 n = n->rb_right;
298 }
299 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
77 destroy_irq(irq); 300 destroy_irq(irq);
78} 301}
79EXPORT_SYMBOL_GPL(uv_teardown_irq); 302EXPORT_SYMBOL_GPL(uv_teardown_irq);
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index f068553a1b17..abda6f53e71e 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -183,7 +183,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
183 return; 183 return;
184 } 184 }
185 185
186 apic_cpus = apic->apicid_to_cpu_present(m->apicid); 186 apic->apicid_to_cpu_present(m->apicid, &apic_cpus);
187 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus); 187 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
188 /* 188 /*
189 * Validate version 189 * Validate version
@@ -486,7 +486,7 @@ static void end_cobalt_irq(unsigned int irq)
486} 486}
487 487
488static struct irq_chip cobalt_irq_type = { 488static struct irq_chip cobalt_irq_type = {
489 .typename = "Cobalt-APIC", 489 .name = "Cobalt-APIC",
490 .startup = startup_cobalt_irq, 490 .startup = startup_cobalt_irq,
491 .shutdown = disable_cobalt_irq, 491 .shutdown = disable_cobalt_irq,
492 .enable = enable_cobalt_irq, 492 .enable = enable_cobalt_irq,
@@ -523,7 +523,7 @@ static void end_piix4_master_irq(unsigned int irq)
523} 523}
524 524
525static struct irq_chip piix4_master_irq_type = { 525static struct irq_chip piix4_master_irq_type = {
526 .typename = "PIIX4-master", 526 .name = "PIIX4-master",
527 .startup = startup_piix4_master_irq, 527 .startup = startup_piix4_master_irq,
528 .ack = ack_cobalt_irq, 528 .ack = ack_cobalt_irq,
529 .end = end_piix4_master_irq, 529 .end = end_piix4_master_irq,
@@ -531,7 +531,7 @@ static struct irq_chip piix4_master_irq_type = {
531 531
532 532
533static struct irq_chip piix4_virtual_irq_type = { 533static struct irq_chip piix4_virtual_irq_type = {
534 .typename = "PIIX4-virtual", 534 .name = "PIIX4-virtual",
535 .shutdown = disable_8259A_irq, 535 .shutdown = disable_8259A_irq,
536 .enable = enable_8259A_irq, 536 .enable = enable_8259A_irq,
537 .disable = disable_8259A_irq, 537 .disable = disable_8259A_irq,
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 3909e3ba5ce3..a1029769b6f2 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -30,9 +30,8 @@ EXPORT_SYMBOL(__put_user_8);
30 30
31EXPORT_SYMBOL(copy_user_generic); 31EXPORT_SYMBOL(copy_user_generic);
32EXPORT_SYMBOL(__copy_user_nocache); 32EXPORT_SYMBOL(__copy_user_nocache);
33EXPORT_SYMBOL(copy_from_user); 33EXPORT_SYMBOL(_copy_from_user);
34EXPORT_SYMBOL(copy_to_user); 34EXPORT_SYMBOL(_copy_to_user);
35EXPORT_SYMBOL(__copy_from_user_inatomic);
36 35
37EXPORT_SYMBOL(copy_page); 36EXPORT_SYMBOL(copy_page);
38EXPORT_SYMBOL(clear_page); 37EXPORT_SYMBOL(clear_page);
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 4449a4a2c2ed..d11c5ff7c65e 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -14,10 +14,13 @@
14#include <asm/time.h> 14#include <asm/time.h>
15#include <asm/irq.h> 15#include <asm/irq.h>
16#include <asm/tsc.h> 16#include <asm/tsc.h>
17#include <asm/iommu.h>
17 18
18void __cpuinit x86_init_noop(void) { } 19void __cpuinit x86_init_noop(void) { }
19void __init x86_init_uint_noop(unsigned int unused) { } 20void __init x86_init_uint_noop(unsigned int unused) { }
20void __init x86_init_pgd_noop(pgd_t *unused) { } 21void __init x86_init_pgd_noop(pgd_t *unused) { }
22int __init iommu_init_noop(void) { return 0; }
23void iommu_shutdown_noop(void) { }
21 24
22/* 25/*
23 * The platform setup functions are preset with the default functions 26 * The platform setup functions are preset with the default functions
@@ -62,6 +65,10 @@ struct x86_init_ops x86_init __initdata = {
62 .tsc_pre_init = x86_init_noop, 65 .tsc_pre_init = x86_init_noop,
63 .timer_init = hpet_time_init, 66 .timer_init = hpet_time_init,
64 }, 67 },
68
69 .iommu = {
70 .iommu_init = iommu_init_noop,
71 },
65}; 72};
66 73
67struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { 74struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
@@ -72,4 +79,5 @@ struct x86_platform_ops x86_platform = {
72 .calibrate_tsc = native_calibrate_tsc, 79 .calibrate_tsc = native_calibrate_tsc,
73 .get_wallclock = mach_get_cmos_time, 80 .get_wallclock = mach_get_cmos_time,
74 .set_wallclock = mach_set_rtc_mmss, 81 .set_wallclock = mach_set_rtc_mmss,
82 .iommu_shutdown = iommu_shutdown_noop,
75}; 83};