aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
authorHaavard Skinnemoen <haavard.skinnemoen@atmel.com>2008-07-27 07:54:08 -0400
committerHaavard Skinnemoen <haavard.skinnemoen@atmel.com>2008-07-27 07:54:08 -0400
commiteda3d8f5604860aae1bb9996bb5efc4213778369 (patch)
tree9d3887d2665bcc5f5abf200758794545c7b2c69b /arch/x86/kernel
parent87a9f704658a40940e740b1d73d861667e9164d3 (diff)
parent8be1a6d6c77ab4532e4476fdb8177030ef48b52c (diff)
Merge commit 'upstream/master'
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile4
-rw-r--r--arch/x86/kernel/acpi/boot.c6
-rw-r--r--arch/x86/kernel/acpi/cstate.c3
-rw-r--r--arch/x86/kernel/acpi/sleep.c14
-rw-r--r--arch/x86/kernel/amd_iommu.c235
-rw-r--r--arch/x86/kernel/amd_iommu_init.c361
-rw-r--r--arch/x86/kernel/aperture_64.c1
-rw-r--r--arch/x86/kernel/apic_32.c175
-rw-r--r--arch/x86/kernel/apic_64.c26
-rw-r--r--arch/x86/kernel/apm_32.c1
-rw-r--r--arch/x86/kernel/asm-offsets_64.c11
-rw-r--r--arch/x86/kernel/bios_uv.c48
-rw-r--r--arch/x86/kernel/cpu/amd.c2
-rw-r--r--arch/x86/kernel/cpu/amd_64.c2
-rw-r--r--arch/x86/kernel/cpu/bugs.c23
-rw-r--r--arch/x86/kernel/cpu/common_64.c15
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c16
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c6
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.h1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c23
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c157
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c7
-rw-r--r--arch/x86/kernel/cpu/intel.c10
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c12
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c20
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd_64.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/p4.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c1
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c4
-rw-r--r--arch/x86/kernel/cpu/proc.c2
-rw-r--r--arch/x86/kernel/cpuid.c4
-rw-r--r--arch/x86/kernel/e820.c33
-rw-r--r--arch/x86/kernel/early-quirks.c5
-rw-r--r--arch/x86/kernel/entry_32.S79
-rw-r--r--arch/x86/kernel/entry_64.S175
-rw-r--r--arch/x86/kernel/genapic_flat_64.c2
-rw-r--r--arch/x86/kernel/genx2apic_uv_x.c27
-rw-r--r--arch/x86/kernel/head64.c11
-rw-r--r--arch/x86/kernel/head_64.S1
-rw-r--r--arch/x86/kernel/hpet.c10
-rw-r--r--arch/x86/kernel/io_apic_32.c53
-rw-r--r--arch/x86/kernel/io_apic_64.c53
-rw-r--r--arch/x86/kernel/io_delay.c3
-rw-r--r--arch/x86/kernel/ipi.c6
-rw-r--r--arch/x86/kernel/irq_32.c7
-rw-r--r--arch/x86/kernel/irqinit_64.c5
-rw-r--r--arch/x86/kernel/kdebugfs.c8
-rw-r--r--arch/x86/kernel/kprobes.c7
-rw-r--r--arch/x86/kernel/kvmclock.c2
-rw-r--r--arch/x86/kernel/ldt.c6
-rw-r--r--arch/x86/kernel/machine_kexec_32.c39
-rw-r--r--arch/x86/kernel/machine_kexec_64.c2
-rw-r--r--arch/x86/kernel/microcode.c23
-rw-r--r--arch/x86/kernel/module_64.c11
-rw-r--r--arch/x86/kernel/mpparse.c208
-rw-r--r--arch/x86/kernel/msr.c4
-rw-r--r--arch/x86/kernel/nmi.c11
-rw-r--r--arch/x86/kernel/numaq_32.c197
-rw-r--r--arch/x86/kernel/paravirt.c31
-rw-r--r--arch/x86/kernel/pci-calgary_64.c160
-rw-r--r--arch/x86/kernel/pci-dma.c50
-rw-r--r--arch/x86/kernel/pci-gart_64.c8
-rw-r--r--arch/x86/kernel/pci-nommu.c16
-rw-r--r--arch/x86/kernel/pci-swiotlb_64.c4
-rw-r--r--arch/x86/kernel/process.c5
-rw-r--r--arch/x86/kernel/process_32.c2
-rw-r--r--arch/x86/kernel/process_64.c58
-rw-r--r--arch/x86/kernel/ptrace.c151
-rw-r--r--arch/x86/kernel/reboot.c22
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S174
-rw-r--r--arch/x86/kernel/setup.c29
-rw-r--r--arch/x86/kernel/setup_percpu.c6
-rw-r--r--arch/x86/kernel/signal_32.c11
-rw-r--r--arch/x86/kernel/signal_64.c62
-rw-r--r--arch/x86/kernel/smpboot.c116
-rw-r--r--arch/x86/kernel/smpcommon_32.c1
-rw-r--r--arch/x86/kernel/step.c35
-rw-r--r--arch/x86/kernel/syscall_table_32.S6
-rw-r--r--arch/x86/kernel/time_32.c1
-rw-r--r--arch/x86/kernel/traps_32.c118
-rw-r--r--arch/x86/kernel/traps_64.c48
-rw-r--r--arch/x86/kernel/visws_quirks.c42
-rw-r--r--arch/x86/kernel/vmi_32.c1
83 files changed, 2147 insertions, 1196 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index da140611bb57..3db651fc8ec5 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -7,9 +7,10 @@ extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinu
7CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) 7CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
8 8
9ifdef CONFIG_FTRACE 9ifdef CONFIG_FTRACE
10# Do not profile debug utilities 10# Do not profile debug and lowlevel utilities
11CFLAGS_REMOVE_tsc.o = -pg 11CFLAGS_REMOVE_tsc.o = -pg
12CFLAGS_REMOVE_rtc.o = -pg 12CFLAGS_REMOVE_rtc.o = -pg
13CFLAGS_REMOVE_paravirt.o = -pg
13endif 14endif
14 15
15# 16#
@@ -102,6 +103,7 @@ obj-$(CONFIG_OLPC) += olpc.o
102# 64 bit specific files 103# 64 bit specific files
103ifeq ($(CONFIG_X86_64),y) 104ifeq ($(CONFIG_X86_64),y)
104 obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o 105 obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
106 obj-y += bios_uv.o
105 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o 107 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
106 obj-$(CONFIG_AUDIT) += audit_64.o 108 obj-$(CONFIG_AUDIT) += audit_64.o
107 109
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index f489d7a9be92..fa88a1d71290 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1021,7 +1021,7 @@ void __init mp_config_acpi_legacy_irqs(void)
1021 mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; 1021 mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
1022#endif 1022#endif
1023 set_bit(MP_ISA_BUS, mp_bus_not_pci); 1023 set_bit(MP_ISA_BUS, mp_bus_not_pci);
1024 Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); 1024 pr_debug("Bus #%d is ISA\n", MP_ISA_BUS);
1025 1025
1026#ifdef CONFIG_X86_ES7000 1026#ifdef CONFIG_X86_ES7000
1027 /* 1027 /*
@@ -1127,8 +1127,8 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
1127 return gsi; 1127 return gsi;
1128 } 1128 }
1129 if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { 1129 if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
1130 Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", 1130 pr_debug(KERN_DEBUG "Pin %d-%d already programmed\n",
1131 mp_ioapic_routing[ioapic].apic_id, ioapic_pin); 1131 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
1132#ifdef CONFIG_X86_32 1132#ifdef CONFIG_X86_32
1133 return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); 1133 return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
1134#else 1134#else
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index c2502eb9aa83..9220cf46aa10 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -73,6 +73,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
73 struct cpuinfo_x86 *c = &cpu_data(cpu); 73 struct cpuinfo_x86 *c = &cpu_data(cpu);
74 74
75 cpumask_t saved_mask; 75 cpumask_t saved_mask;
76 cpumask_of_cpu_ptr(new_mask, cpu);
76 int retval; 77 int retval;
77 unsigned int eax, ebx, ecx, edx; 78 unsigned int eax, ebx, ecx, edx;
78 unsigned int edx_part; 79 unsigned int edx_part;
@@ -91,7 +92,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
91 92
92 /* Make sure we are running on right CPU */ 93 /* Make sure we are running on right CPU */
93 saved_mask = current->cpus_allowed; 94 saved_mask = current->cpus_allowed;
94 retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 95 retval = set_cpus_allowed_ptr(current, new_mask);
95 if (retval) 96 if (retval)
96 return -1; 97 return -1;
97 98
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 868de3d5c39d..fa2161d5003b 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -9,6 +9,7 @@
9#include <linux/bootmem.h> 9#include <linux/bootmem.h>
10#include <linux/dmi.h> 10#include <linux/dmi.h>
11#include <linux/cpumask.h> 11#include <linux/cpumask.h>
12#include <asm/segment.h>
12 13
13#include "realmode/wakeup.h" 14#include "realmode/wakeup.h"
14#include "sleep.h" 15#include "sleep.h"
@@ -23,15 +24,6 @@ static unsigned long acpi_realmode;
23static char temp_stack[10240]; 24static char temp_stack[10240];
24#endif 25#endif
25 26
26/* XXX: this macro should move to asm-x86/segment.h and be shared with the
27 boot code... */
28#define GDT_ENTRY(flags, base, limit) \
29 (((u64)(base & 0xff000000) << 32) | \
30 ((u64)flags << 40) | \
31 ((u64)(limit & 0x00ff0000) << 32) | \
32 ((u64)(base & 0x00ffffff) << 16) | \
33 ((u64)(limit & 0x0000ffff)))
34
35/** 27/**
36 * acpi_save_state_mem - save kernel state 28 * acpi_save_state_mem - save kernel state
37 * 29 *
@@ -158,6 +150,10 @@ static int __init acpi_sleep_setup(char *str)
158 acpi_realmode_flags |= 2; 150 acpi_realmode_flags |= 2;
159 if (strncmp(str, "s3_beep", 7) == 0) 151 if (strncmp(str, "s3_beep", 7) == 0)
160 acpi_realmode_flags |= 4; 152 acpi_realmode_flags |= 4;
153#ifdef CONFIG_HIBERNATION
154 if (strncmp(str, "s4_nohwsig", 10) == 0)
155 acpi_no_s4_hw_signature();
156#endif
161 if (strncmp(str, "old_ordering", 12) == 0) 157 if (strncmp(str, "old_ordering", 12) == 0)
162 acpi_old_suspend_ordering(); 158 acpi_old_suspend_ordering();
163 str = strchr(str, ','); 159 str = strchr(str, ',');
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index f2766d84c7a0..74697408576f 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -23,7 +23,7 @@
23#include <linux/scatterlist.h> 23#include <linux/scatterlist.h>
24#include <linux/iommu-helper.h> 24#include <linux/iommu-helper.h>
25#include <asm/proto.h> 25#include <asm/proto.h>
26#include <asm/gart.h> 26#include <asm/iommu.h>
27#include <asm/amd_iommu_types.h> 27#include <asm/amd_iommu_types.h>
28#include <asm/amd_iommu.h> 28#include <asm/amd_iommu.h>
29 29
@@ -32,21 +32,37 @@
32#define to_pages(addr, size) \ 32#define to_pages(addr, size) \
33 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) 33 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
34 34
35#define EXIT_LOOP_COUNT 10000000
36
35static DEFINE_RWLOCK(amd_iommu_devtable_lock); 37static DEFINE_RWLOCK(amd_iommu_devtable_lock);
36 38
37struct command { 39/*
40 * general struct to manage commands send to an IOMMU
41 */
42struct iommu_cmd {
38 u32 data[4]; 43 u32 data[4];
39}; 44};
40 45
41static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, 46static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
42 struct unity_map_entry *e); 47 struct unity_map_entry *e);
43 48
49/* returns !0 if the IOMMU is caching non-present entries in its TLB */
44static int iommu_has_npcache(struct amd_iommu *iommu) 50static int iommu_has_npcache(struct amd_iommu *iommu)
45{ 51{
46 return iommu->cap & IOMMU_CAP_NPCACHE; 52 return iommu->cap & IOMMU_CAP_NPCACHE;
47} 53}
48 54
49static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) 55/****************************************************************************
56 *
57 * IOMMU command queuing functions
58 *
59 ****************************************************************************/
60
61/*
62 * Writes the command to the IOMMUs command buffer and informs the
63 * hardware about the new command. Must be called with iommu->lock held.
64 */
65static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
50{ 66{
51 u32 tail, head; 67 u32 tail, head;
52 u8 *target; 68 u8 *target;
@@ -63,7 +79,11 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
63 return 0; 79 return 0;
64} 80}
65 81
66static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) 82/*
83 * General queuing function for commands. Takes iommu->lock and calls
84 * __iommu_queue_command().
85 */
86static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
67{ 87{
68 unsigned long flags; 88 unsigned long flags;
69 int ret; 89 int ret;
@@ -75,16 +95,24 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
75 return ret; 95 return ret;
76} 96}
77 97
98/*
99 * This function is called whenever we need to ensure that the IOMMU has
100 * completed execution of all commands we sent. It sends a
101 * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs
102 * us about that by writing a value to a physical address we pass with
103 * the command.
104 */
78static int iommu_completion_wait(struct amd_iommu *iommu) 105static int iommu_completion_wait(struct amd_iommu *iommu)
79{ 106{
80 int ret; 107 int ret;
81 struct command cmd; 108 struct iommu_cmd cmd;
82 volatile u64 ready = 0; 109 volatile u64 ready = 0;
83 unsigned long ready_phys = virt_to_phys(&ready); 110 unsigned long ready_phys = virt_to_phys(&ready);
111 unsigned long i = 0;
84 112
85 memset(&cmd, 0, sizeof(cmd)); 113 memset(&cmd, 0, sizeof(cmd));
86 cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK; 114 cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK;
87 cmd.data[1] = HIGH_U32(ready_phys); 115 cmd.data[1] = upper_32_bits(ready_phys);
88 cmd.data[2] = 1; /* value written to 'ready' */ 116 cmd.data[2] = 1; /* value written to 'ready' */
89 CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT); 117 CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
90 118
@@ -95,15 +123,23 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
95 if (ret) 123 if (ret)
96 return ret; 124 return ret;
97 125
98 while (!ready) 126 while (!ready && (i < EXIT_LOOP_COUNT)) {
127 ++i;
99 cpu_relax(); 128 cpu_relax();
129 }
130
131 if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit()))
132 printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n");
100 133
101 return 0; 134 return 0;
102} 135}
103 136
137/*
138 * Command send function for invalidating a device table entry
139 */
104static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) 140static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
105{ 141{
106 struct command cmd; 142 struct iommu_cmd cmd;
107 143
108 BUG_ON(iommu == NULL); 144 BUG_ON(iommu == NULL);
109 145
@@ -116,20 +152,23 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
116 return iommu_queue_command(iommu, &cmd); 152 return iommu_queue_command(iommu, &cmd);
117} 153}
118 154
155/*
156 * Generic command send function for invalidaing TLB entries
157 */
119static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, 158static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
120 u64 address, u16 domid, int pde, int s) 159 u64 address, u16 domid, int pde, int s)
121{ 160{
122 struct command cmd; 161 struct iommu_cmd cmd;
123 162
124 memset(&cmd, 0, sizeof(cmd)); 163 memset(&cmd, 0, sizeof(cmd));
125 address &= PAGE_MASK; 164 address &= PAGE_MASK;
126 CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES); 165 CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES);
127 cmd.data[1] |= domid; 166 cmd.data[1] |= domid;
128 cmd.data[2] = LOW_U32(address); 167 cmd.data[2] = LOW_U32(address);
129 cmd.data[3] = HIGH_U32(address); 168 cmd.data[3] = upper_32_bits(address);
130 if (s) 169 if (s) /* size bit - we flush more than one 4kb page */
131 cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; 170 cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
132 if (pde) 171 if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
133 cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; 172 cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
134 173
135 iommu->need_sync = 1; 174 iommu->need_sync = 1;
@@ -137,6 +176,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
137 return iommu_queue_command(iommu, &cmd); 176 return iommu_queue_command(iommu, &cmd);
138} 177}
139 178
179/*
180 * TLB invalidation function which is called from the mapping functions.
181 * It invalidates a single PTE if the range to flush is within a single
182 * page. Otherwise it flushes the whole TLB of the IOMMU.
183 */
140static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, 184static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
141 u64 address, size_t size) 185 u64 address, size_t size)
142{ 186{
@@ -159,6 +203,20 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
159 return 0; 203 return 0;
160} 204}
161 205
206/****************************************************************************
207 *
208 * The functions below are used the create the page table mappings for
209 * unity mapped regions.
210 *
211 ****************************************************************************/
212
213/*
214 * Generic mapping functions. It maps a physical address into a DMA
215 * address space. It allocates the page table pages if necessary.
216 * In the future it can be extended to a generic mapping function
217 * supporting all features of AMD IOMMU page tables like level skipping
218 * and full 64 bit address spaces.
219 */
162static int iommu_map(struct protection_domain *dom, 220static int iommu_map(struct protection_domain *dom,
163 unsigned long bus_addr, 221 unsigned long bus_addr,
164 unsigned long phys_addr, 222 unsigned long phys_addr,
@@ -209,6 +267,10 @@ static int iommu_map(struct protection_domain *dom,
209 return 0; 267 return 0;
210} 268}
211 269
270/*
271 * This function checks if a specific unity mapping entry is needed for
272 * this specific IOMMU.
273 */
212static int iommu_for_unity_map(struct amd_iommu *iommu, 274static int iommu_for_unity_map(struct amd_iommu *iommu,
213 struct unity_map_entry *entry) 275 struct unity_map_entry *entry)
214{ 276{
@@ -223,6 +285,12 @@ static int iommu_for_unity_map(struct amd_iommu *iommu,
223 return 0; 285 return 0;
224} 286}
225 287
288/*
289 * Init the unity mappings for a specific IOMMU in the system
290 *
291 * Basically iterates over all unity mapping entries and applies them to
292 * the default domain DMA of that IOMMU if necessary.
293 */
226static int iommu_init_unity_mappings(struct amd_iommu *iommu) 294static int iommu_init_unity_mappings(struct amd_iommu *iommu)
227{ 295{
228 struct unity_map_entry *entry; 296 struct unity_map_entry *entry;
@@ -239,6 +307,10 @@ static int iommu_init_unity_mappings(struct amd_iommu *iommu)
239 return 0; 307 return 0;
240} 308}
241 309
310/*
311 * This function actually applies the mapping to the page table of the
312 * dma_ops domain.
313 */
242static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, 314static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
243 struct unity_map_entry *e) 315 struct unity_map_entry *e)
244{ 316{
@@ -261,6 +333,9 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
261 return 0; 333 return 0;
262} 334}
263 335
336/*
337 * Inits the unity mappings required for a specific device
338 */
264static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, 339static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
265 u16 devid) 340 u16 devid)
266{ 341{
@@ -278,12 +353,26 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
278 return 0; 353 return 0;
279} 354}
280 355
356/****************************************************************************
357 *
358 * The next functions belong to the address allocator for the dma_ops
359 * interface functions. They work like the allocators in the other IOMMU
360 * drivers. Its basically a bitmap which marks the allocated pages in
361 * the aperture. Maybe it could be enhanced in the future to a more
362 * efficient allocator.
363 *
364 ****************************************************************************/
281static unsigned long dma_mask_to_pages(unsigned long mask) 365static unsigned long dma_mask_to_pages(unsigned long mask)
282{ 366{
283 return (mask >> PAGE_SHIFT) + 367 return (mask >> PAGE_SHIFT) +
284 (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT); 368 (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT);
285} 369}
286 370
371/*
372 * The address allocator core function.
373 *
374 * called with domain->lock held
375 */
287static unsigned long dma_ops_alloc_addresses(struct device *dev, 376static unsigned long dma_ops_alloc_addresses(struct device *dev,
288 struct dma_ops_domain *dom, 377 struct dma_ops_domain *dom,
289 unsigned int pages) 378 unsigned int pages)
@@ -317,6 +406,11 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
317 return address; 406 return address;
318} 407}
319 408
409/*
410 * The address free function.
411 *
412 * called with domain->lock held
413 */
320static void dma_ops_free_addresses(struct dma_ops_domain *dom, 414static void dma_ops_free_addresses(struct dma_ops_domain *dom,
321 unsigned long address, 415 unsigned long address,
322 unsigned int pages) 416 unsigned int pages)
@@ -325,6 +419,16 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
325 iommu_area_free(dom->bitmap, address, pages); 419 iommu_area_free(dom->bitmap, address, pages);
326} 420}
327 421
422/****************************************************************************
423 *
424 * The next functions belong to the domain allocation. A domain is
425 * allocated for every IOMMU as the default domain. If device isolation
426 * is enabled, every device get its own domain. The most important thing
427 * about domains is the page table mapping the DMA address space they
428 * contain.
429 *
430 ****************************************************************************/
431
328static u16 domain_id_alloc(void) 432static u16 domain_id_alloc(void)
329{ 433{
330 unsigned long flags; 434 unsigned long flags;
@@ -342,6 +446,10 @@ static u16 domain_id_alloc(void)
342 return id; 446 return id;
343} 447}
344 448
449/*
450 * Used to reserve address ranges in the aperture (e.g. for exclusion
451 * ranges.
452 */
345static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, 453static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
346 unsigned long start_page, 454 unsigned long start_page,
347 unsigned int pages) 455 unsigned int pages)
@@ -382,6 +490,10 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
382 free_page((unsigned long)p1); 490 free_page((unsigned long)p1);
383} 491}
384 492
493/*
494 * Free a domain, only used if something went wrong in the
495 * allocation path and we need to free an already allocated page table
496 */
385static void dma_ops_domain_free(struct dma_ops_domain *dom) 497static void dma_ops_domain_free(struct dma_ops_domain *dom)
386{ 498{
387 if (!dom) 499 if (!dom)
@@ -396,6 +508,11 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
396 kfree(dom); 508 kfree(dom);
397} 509}
398 510
511/*
512 * Allocates a new protection domain usable for the dma_ops functions.
513 * It also intializes the page table and the address allocator data
514 * structures required for the dma_ops interface
515 */
399static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, 516static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
400 unsigned order) 517 unsigned order)
401{ 518{
@@ -436,6 +553,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
436 dma_dom->bitmap[0] = 1; 553 dma_dom->bitmap[0] = 1;
437 dma_dom->next_bit = 0; 554 dma_dom->next_bit = 0;
438 555
556 /* Intialize the exclusion range if necessary */
439 if (iommu->exclusion_start && 557 if (iommu->exclusion_start &&
440 iommu->exclusion_start < dma_dom->aperture_size) { 558 iommu->exclusion_start < dma_dom->aperture_size) {
441 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; 559 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
@@ -444,6 +562,11 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
444 dma_ops_reserve_addresses(dma_dom, startpage, pages); 562 dma_ops_reserve_addresses(dma_dom, startpage, pages);
445 } 563 }
446 564
565 /*
566 * At the last step, build the page tables so we don't need to
567 * allocate page table pages in the dma_ops mapping/unmapping
568 * path.
569 */
447 num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512); 570 num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512);
448 dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *), 571 dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *),
449 GFP_KERNEL); 572 GFP_KERNEL);
@@ -472,6 +595,10 @@ free_dma_dom:
472 return NULL; 595 return NULL;
473} 596}
474 597
598/*
599 * Find out the protection domain structure for a given PCI device. This
600 * will give us the pointer to the page table root for example.
601 */
475static struct protection_domain *domain_for_device(u16 devid) 602static struct protection_domain *domain_for_device(u16 devid)
476{ 603{
477 struct protection_domain *dom; 604 struct protection_domain *dom;
@@ -484,6 +611,10 @@ static struct protection_domain *domain_for_device(u16 devid)
484 return dom; 611 return dom;
485} 612}
486 613
614/*
615 * If a device is not yet associated with a domain, this function does
616 * assigns it visible for the hardware
617 */
487static void set_device_domain(struct amd_iommu *iommu, 618static void set_device_domain(struct amd_iommu *iommu,
488 struct protection_domain *domain, 619 struct protection_domain *domain,
489 u16 devid) 620 u16 devid)
@@ -508,6 +639,19 @@ static void set_device_domain(struct amd_iommu *iommu,
508 iommu->need_sync = 1; 639 iommu->need_sync = 1;
509} 640}
510 641
642/*****************************************************************************
643 *
644 * The next functions belong to the dma_ops mapping/unmapping code.
645 *
646 *****************************************************************************/
647
648/*
649 * In the dma_ops path we only have the struct device. This function
650 * finds the corresponding IOMMU, the protection domain and the
651 * requestor id for a given device.
652 * If the device is not yet associated with a domain this is also done
653 * in this function.
654 */
511static int get_device_resources(struct device *dev, 655static int get_device_resources(struct device *dev,
512 struct amd_iommu **iommu, 656 struct amd_iommu **iommu,
513 struct protection_domain **domain, 657 struct protection_domain **domain,
@@ -520,9 +664,10 @@ static int get_device_resources(struct device *dev,
520 BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask); 664 BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask);
521 665
522 pcidev = to_pci_dev(dev); 666 pcidev = to_pci_dev(dev);
523 _bdf = (pcidev->bus->number << 8) | pcidev->devfn; 667 _bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
524 668
525 if (_bdf >= amd_iommu_last_bdf) { 669 /* device not translated by any IOMMU in the system? */
670 if (_bdf > amd_iommu_last_bdf) {
526 *iommu = NULL; 671 *iommu = NULL;
527 *domain = NULL; 672 *domain = NULL;
528 *bdf = 0xffff; 673 *bdf = 0xffff;
@@ -547,6 +692,10 @@ static int get_device_resources(struct device *dev,
547 return 1; 692 return 1;
548} 693}
549 694
695/*
696 * This is the generic map function. It maps one 4kb page at paddr to
697 * the given address in the DMA address space for the domain.
698 */
550static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, 699static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
551 struct dma_ops_domain *dom, 700 struct dma_ops_domain *dom,
552 unsigned long address, 701 unsigned long address,
@@ -578,6 +727,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
578 return (dma_addr_t)address; 727 return (dma_addr_t)address;
579} 728}
580 729
730/*
731 * The generic unmapping function for on page in the DMA address space.
732 */
581static void dma_ops_domain_unmap(struct amd_iommu *iommu, 733static void dma_ops_domain_unmap(struct amd_iommu *iommu,
582 struct dma_ops_domain *dom, 734 struct dma_ops_domain *dom,
583 unsigned long address) 735 unsigned long address)
@@ -597,6 +749,12 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
597 *pte = 0ULL; 749 *pte = 0ULL;
598} 750}
599 751
752/*
753 * This function contains common code for mapping of a physically
754 * contiguous memory region into DMA address space. It is uses by all
755 * mapping functions provided by this IOMMU driver.
756 * Must be called with the domain lock held.
757 */
600static dma_addr_t __map_single(struct device *dev, 758static dma_addr_t __map_single(struct device *dev,
601 struct amd_iommu *iommu, 759 struct amd_iommu *iommu,
602 struct dma_ops_domain *dma_dom, 760 struct dma_ops_domain *dma_dom,
@@ -628,6 +786,10 @@ out:
628 return address; 786 return address;
629} 787}
630 788
789/*
790 * Does the reverse of the __map_single function. Must be called with
791 * the domain lock held too
792 */
631static void __unmap_single(struct amd_iommu *iommu, 793static void __unmap_single(struct amd_iommu *iommu,
632 struct dma_ops_domain *dma_dom, 794 struct dma_ops_domain *dma_dom,
633 dma_addr_t dma_addr, 795 dma_addr_t dma_addr,
@@ -652,6 +814,9 @@ static void __unmap_single(struct amd_iommu *iommu,
652 dma_ops_free_addresses(dma_dom, dma_addr, pages); 814 dma_ops_free_addresses(dma_dom, dma_addr, pages);
653} 815}
654 816
817/*
818 * The exported map_single function for dma_ops.
819 */
655static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, 820static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
656 size_t size, int dir) 821 size_t size, int dir)
657{ 822{
@@ -664,6 +829,7 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
664 get_device_resources(dev, &iommu, &domain, &devid); 829 get_device_resources(dev, &iommu, &domain, &devid);
665 830
666 if (iommu == NULL || domain == NULL) 831 if (iommu == NULL || domain == NULL)
832 /* device not handled by any AMD IOMMU */
667 return (dma_addr_t)paddr; 833 return (dma_addr_t)paddr;
668 834
669 spin_lock_irqsave(&domain->lock, flags); 835 spin_lock_irqsave(&domain->lock, flags);
@@ -683,6 +849,9 @@ out:
683 return addr; 849 return addr;
684} 850}
685 851
852/*
853 * The exported unmap_single function for dma_ops.
854 */
686static void unmap_single(struct device *dev, dma_addr_t dma_addr, 855static void unmap_single(struct device *dev, dma_addr_t dma_addr,
687 size_t size, int dir) 856 size_t size, int dir)
688{ 857{
@@ -692,6 +861,7 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
692 u16 devid; 861 u16 devid;
693 862
694 if (!get_device_resources(dev, &iommu, &domain, &devid)) 863 if (!get_device_resources(dev, &iommu, &domain, &devid))
864 /* device not handled by any AMD IOMMU */
695 return; 865 return;
696 866
697 spin_lock_irqsave(&domain->lock, flags); 867 spin_lock_irqsave(&domain->lock, flags);
@@ -706,6 +876,10 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
706 spin_unlock_irqrestore(&domain->lock, flags); 876 spin_unlock_irqrestore(&domain->lock, flags);
707} 877}
708 878
879/*
880 * This is a special map_sg function which is used if we should map a
881 * device which is not handled by an AMD IOMMU in the system.
882 */
709static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist, 883static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
710 int nelems, int dir) 884 int nelems, int dir)
711{ 885{
@@ -720,6 +894,10 @@ static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
720 return nelems; 894 return nelems;
721} 895}
722 896
897/*
898 * The exported map_sg function for dma_ops (handles scatter-gather
899 * lists).
900 */
723static int map_sg(struct device *dev, struct scatterlist *sglist, 901static int map_sg(struct device *dev, struct scatterlist *sglist,
724 int nelems, int dir) 902 int nelems, int dir)
725{ 903{
@@ -775,6 +953,10 @@ unmap:
775 goto out; 953 goto out;
776} 954}
777 955
956/*
957 * The exported map_sg function for dma_ops (handles scatter-gather
958 * lists).
959 */
778static void unmap_sg(struct device *dev, struct scatterlist *sglist, 960static void unmap_sg(struct device *dev, struct scatterlist *sglist,
779 int nelems, int dir) 961 int nelems, int dir)
780{ 962{
@@ -804,6 +986,9 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
804 spin_unlock_irqrestore(&domain->lock, flags); 986 spin_unlock_irqrestore(&domain->lock, flags);
805} 987}
806 988
989/*
990 * The exported alloc_coherent function for dma_ops.
991 */
807static void *alloc_coherent(struct device *dev, size_t size, 992static void *alloc_coherent(struct device *dev, size_t size,
808 dma_addr_t *dma_addr, gfp_t flag) 993 dma_addr_t *dma_addr, gfp_t flag)
809{ 994{
@@ -851,6 +1036,11 @@ out:
851 return virt_addr; 1036 return virt_addr;
852} 1037}
853 1038
1039/*
1040 * The exported free_coherent function for dma_ops.
1041 * FIXME: fix the generic x86 DMA layer so that it actually calls that
1042 * function.
1043 */
854static void free_coherent(struct device *dev, size_t size, 1044static void free_coherent(struct device *dev, size_t size,
855 void *virt_addr, dma_addr_t dma_addr) 1045 void *virt_addr, dma_addr_t dma_addr)
856{ 1046{
@@ -879,6 +1069,8 @@ free_mem:
879} 1069}
880 1070
881/* 1071/*
1072 * The function for pre-allocating protection domains.
1073 *
882 * If the driver core informs the DMA layer if a driver grabs a device 1074 * If the driver core informs the DMA layer if a driver grabs a device
883 * we don't need to preallocate the protection domains anymore. 1075 * we don't need to preallocate the protection domains anymore.
884 * For now we have to. 1076 * For now we have to.
@@ -893,7 +1085,7 @@ void prealloc_protection_domains(void)
893 1085
894 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 1086 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
895 devid = (dev->bus->number << 8) | dev->devfn; 1087 devid = (dev->bus->number << 8) | dev->devfn;
896 if (devid >= amd_iommu_last_bdf) 1088 if (devid > amd_iommu_last_bdf)
897 continue; 1089 continue;
898 devid = amd_iommu_alias_table[devid]; 1090 devid = amd_iommu_alias_table[devid];
899 if (domain_for_device(devid)) 1091 if (domain_for_device(devid))
@@ -921,12 +1113,20 @@ static struct dma_mapping_ops amd_iommu_dma_ops = {
921 .unmap_sg = unmap_sg, 1113 .unmap_sg = unmap_sg,
922}; 1114};
923 1115
1116/*
1117 * The function which clues the AMD IOMMU driver into dma_ops.
1118 */
924int __init amd_iommu_init_dma_ops(void) 1119int __init amd_iommu_init_dma_ops(void)
925{ 1120{
926 struct amd_iommu *iommu; 1121 struct amd_iommu *iommu;
927 int order = amd_iommu_aperture_order; 1122 int order = amd_iommu_aperture_order;
928 int ret; 1123 int ret;
929 1124
1125 /*
1126 * first allocate a default protection domain for every IOMMU we
1127 * found in the system. Devices not assigned to any other
1128 * protection domain will be assigned to the default one.
1129 */
930 list_for_each_entry(iommu, &amd_iommu_list, list) { 1130 list_for_each_entry(iommu, &amd_iommu_list, list) {
931 iommu->default_dom = dma_ops_domain_alloc(iommu, order); 1131 iommu->default_dom = dma_ops_domain_alloc(iommu, order);
932 if (iommu->default_dom == NULL) 1132 if (iommu->default_dom == NULL)
@@ -936,6 +1136,10 @@ int __init amd_iommu_init_dma_ops(void)
936 goto free_domains; 1136 goto free_domains;
937 } 1137 }
938 1138
1139 /*
1140 * If device isolation is enabled, pre-allocate the protection
1141 * domains for each device.
1142 */
939 if (amd_iommu_isolate) 1143 if (amd_iommu_isolate)
940 prealloc_protection_domains(); 1144 prealloc_protection_domains();
941 1145
@@ -947,6 +1151,7 @@ int __init amd_iommu_init_dma_ops(void)
947 gart_iommu_aperture = 0; 1151 gart_iommu_aperture = 0;
948#endif 1152#endif
949 1153
1154 /* Make the driver finally visible to the drivers */
950 dma_ops = &amd_iommu_dma_ops; 1155 dma_ops = &amd_iommu_dma_ops;
951 1156
952 return 0; 1157 return 0;
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 2a13e430437d..d9a9da597e79 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -25,20 +25,13 @@
25#include <asm/pci-direct.h> 25#include <asm/pci-direct.h>
26#include <asm/amd_iommu_types.h> 26#include <asm/amd_iommu_types.h>
27#include <asm/amd_iommu.h> 27#include <asm/amd_iommu.h>
28#include <asm/gart.h> 28#include <asm/iommu.h>
29 29
30/* 30/*
31 * definitions for the ACPI scanning code 31 * definitions for the ACPI scanning code
32 */ 32 */
33#define UPDATE_LAST_BDF(x) do {\
34 if ((x) > amd_iommu_last_bdf) \
35 amd_iommu_last_bdf = (x); \
36 } while (0);
37
38#define DEVID(bus, devfn) (((bus) << 8) | (devfn))
39#define PCI_BUS(x) (((x) >> 8) & 0xff) 33#define PCI_BUS(x) (((x) >> 8) & 0xff)
40#define IVRS_HEADER_LENGTH 48 34#define IVRS_HEADER_LENGTH 48
41#define TBL_SIZE(x) (1 << (PAGE_SHIFT + get_order(amd_iommu_last_bdf * (x))))
42 35
43#define ACPI_IVHD_TYPE 0x10 36#define ACPI_IVHD_TYPE 0x10
44#define ACPI_IVMD_TYPE_ALL 0x20 37#define ACPI_IVMD_TYPE_ALL 0x20
@@ -71,6 +64,17 @@
71#define ACPI_DEVFLAG_LINT1 0x80 64#define ACPI_DEVFLAG_LINT1 0x80
72#define ACPI_DEVFLAG_ATSDIS 0x10000000 65#define ACPI_DEVFLAG_ATSDIS 0x10000000
73 66
67/*
68 * ACPI table definitions
69 *
70 * These data structures are laid over the table to parse the important values
71 * out of it.
72 */
73
74/*
75 * structure describing one IOMMU in the ACPI table. Typically followed by one
76 * or more ivhd_entrys.
77 */
74struct ivhd_header { 78struct ivhd_header {
75 u8 type; 79 u8 type;
76 u8 flags; 80 u8 flags;
@@ -83,6 +87,10 @@ struct ivhd_header {
83 u32 reserved; 87 u32 reserved;
84} __attribute__((packed)); 88} __attribute__((packed));
85 89
90/*
91 * A device entry describing which devices a specific IOMMU translates and
92 * which requestor ids they use.
93 */
86struct ivhd_entry { 94struct ivhd_entry {
87 u8 type; 95 u8 type;
88 u16 devid; 96 u16 devid;
@@ -90,6 +98,10 @@ struct ivhd_entry {
90 u32 ext; 98 u32 ext;
91} __attribute__((packed)); 99} __attribute__((packed));
92 100
101/*
102 * An AMD IOMMU memory definition structure. It defines things like exclusion
103 * ranges for devices and regions that should be unity mapped.
104 */
93struct ivmd_header { 105struct ivmd_header {
94 u8 type; 106 u8 type;
95 u8 flags; 107 u8 flags;
@@ -103,22 +115,80 @@ struct ivmd_header {
103 115
104static int __initdata amd_iommu_detected; 116static int __initdata amd_iommu_detected;
105 117
106u16 amd_iommu_last_bdf; 118u16 amd_iommu_last_bdf; /* largest PCI device id we have
107struct list_head amd_iommu_unity_map; 119 to handle */
108unsigned amd_iommu_aperture_order = 26; 120LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
109int amd_iommu_isolate; 121 we find in ACPI */
122unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
123int amd_iommu_isolate; /* if 1, device isolation is enabled */
124
125LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
126 system */
110 127
111struct list_head amd_iommu_list; 128/*
129 * Pointer to the device table which is shared by all AMD IOMMUs
130 * it is indexed by the PCI device id or the HT unit id and contains
131 * information about the domain the device belongs to as well as the
132 * page table root pointer.
133 */
112struct dev_table_entry *amd_iommu_dev_table; 134struct dev_table_entry *amd_iommu_dev_table;
135
136/*
137 * The alias table is a driver specific data structure which contains the
138 * mappings of the PCI device ids to the actual requestor ids on the IOMMU.
139 * More than one device can share the same requestor id.
140 */
113u16 *amd_iommu_alias_table; 141u16 *amd_iommu_alias_table;
142
143/*
144 * The rlookup table is used to find the IOMMU which is responsible
145 * for a specific device. It is also indexed by the PCI device id.
146 */
114struct amd_iommu **amd_iommu_rlookup_table; 147struct amd_iommu **amd_iommu_rlookup_table;
148
149/*
150 * The pd table (protection domain table) is used to find the protection domain
151 * data structure a device belongs to. Indexed with the PCI device id too.
152 */
115struct protection_domain **amd_iommu_pd_table; 153struct protection_domain **amd_iommu_pd_table;
154
155/*
156 * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap
157 * to know which ones are already in use.
158 */
116unsigned long *amd_iommu_pd_alloc_bitmap; 159unsigned long *amd_iommu_pd_alloc_bitmap;
117 160
118static u32 dev_table_size; 161static u32 dev_table_size; /* size of the device table */
119static u32 alias_table_size; 162static u32 alias_table_size; /* size of the alias table */
120static u32 rlookup_table_size; 163static u32 rlookup_table_size; /* size if the rlookup table */
121 164
165static inline void update_last_devid(u16 devid)
166{
167 if (devid > amd_iommu_last_bdf)
168 amd_iommu_last_bdf = devid;
169}
170
171static inline unsigned long tbl_size(int entry_size)
172{
173 unsigned shift = PAGE_SHIFT +
174 get_order(amd_iommu_last_bdf * entry_size);
175
176 return 1UL << shift;
177}
178
179/****************************************************************************
180 *
181 * AMD IOMMU MMIO register space handling functions
182 *
183 * These functions are used to program the IOMMU device registers in
184 * MMIO space required for that driver.
185 *
186 ****************************************************************************/
187
188/*
189 * This function set the exclusion range in the IOMMU. DMA accesses to the
190 * exclusion range are passed through untranslated
191 */
122static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) 192static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
123{ 193{
124 u64 start = iommu->exclusion_start & PAGE_MASK; 194 u64 start = iommu->exclusion_start & PAGE_MASK;
@@ -137,6 +207,7 @@ static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
137 &entry, sizeof(entry)); 207 &entry, sizeof(entry));
138} 208}
139 209
210/* Programs the physical address of the device table into the IOMMU hardware */
140static void __init iommu_set_device_table(struct amd_iommu *iommu) 211static void __init iommu_set_device_table(struct amd_iommu *iommu)
141{ 212{
142 u32 entry; 213 u32 entry;
@@ -149,6 +220,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu)
149 &entry, sizeof(entry)); 220 &entry, sizeof(entry));
150} 221}
151 222
223/* Generic functions to enable/disable certain features of the IOMMU. */
152static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit) 224static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
153{ 225{
154 u32 ctrl; 226 u32 ctrl;
@@ -167,6 +239,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
167 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); 239 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
168} 240}
169 241
242/* Function to enable the hardware */
170void __init iommu_enable(struct amd_iommu *iommu) 243void __init iommu_enable(struct amd_iommu *iommu)
171{ 244{
172 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at "); 245 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at ");
@@ -176,6 +249,10 @@ void __init iommu_enable(struct amd_iommu *iommu)
176 iommu_feature_enable(iommu, CONTROL_IOMMU_EN); 249 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
177} 250}
178 251
252/*
253 * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
254 * the system has one.
255 */
179static u8 * __init iommu_map_mmio_space(u64 address) 256static u8 * __init iommu_map_mmio_space(u64 address)
180{ 257{
181 u8 *ret; 258 u8 *ret;
@@ -199,16 +276,33 @@ static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
199 release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH); 276 release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
200} 277}
201 278
279/****************************************************************************
280 *
281 * The functions below belong to the first pass of AMD IOMMU ACPI table
282 * parsing. In this pass we try to find out the highest device id this
283 * code has to handle. Upon this information the size of the shared data
284 * structures is determined later.
285 *
286 ****************************************************************************/
287
288/*
289 * This function reads the last device id the IOMMU has to handle from the PCI
290 * capability header for this IOMMU
291 */
202static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr) 292static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
203{ 293{
204 u32 cap; 294 u32 cap;
205 295
206 cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); 296 cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
207 UPDATE_LAST_BDF(DEVID(MMIO_GET_BUS(cap), MMIO_GET_LD(cap))); 297 update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
208 298
209 return 0; 299 return 0;
210} 300}
211 301
302/*
303 * After reading the highest device id from the IOMMU PCI capability header
304 * this function looks if there is a higher device id defined in the ACPI table
305 */
212static int __init find_last_devid_from_ivhd(struct ivhd_header *h) 306static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
213{ 307{
214 u8 *p = (void *)h, *end = (void *)h; 308 u8 *p = (void *)h, *end = (void *)h;
@@ -229,7 +323,8 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
229 case IVHD_DEV_RANGE_END: 323 case IVHD_DEV_RANGE_END:
230 case IVHD_DEV_ALIAS: 324 case IVHD_DEV_ALIAS:
231 case IVHD_DEV_EXT_SELECT: 325 case IVHD_DEV_EXT_SELECT:
232 UPDATE_LAST_BDF(dev->devid); 326 /* all the above subfield types refer to device ids */
327 update_last_devid(dev->devid);
233 break; 328 break;
234 default: 329 default:
235 break; 330 break;
@@ -242,6 +337,11 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
242 return 0; 337 return 0;
243} 338}
244 339
340/*
341 * Iterate over all IVHD entries in the ACPI table and find the highest device
342 * id which we need to handle. This is the first of three functions which parse
343 * the ACPI table. So we check the checksum here.
344 */
245static int __init find_last_devid_acpi(struct acpi_table_header *table) 345static int __init find_last_devid_acpi(struct acpi_table_header *table)
246{ 346{
247 int i; 347 int i;
@@ -277,19 +377,31 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table)
277 return 0; 377 return 0;
278} 378}
279 379
380/****************************************************************************
381 *
382 * The following functions belong the the code path which parses the ACPI table
383 * the second time. In this ACPI parsing iteration we allocate IOMMU specific
384 * data structures, initialize the device/alias/rlookup table and also
385 * basically initialize the hardware.
386 *
387 ****************************************************************************/
388
389/*
390 * Allocates the command buffer. This buffer is per AMD IOMMU. We can
391 * write commands to that buffer later and the IOMMU will execute them
392 * asynchronously
393 */
280static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) 394static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
281{ 395{
282 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL, 396 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
283 get_order(CMD_BUFFER_SIZE)); 397 get_order(CMD_BUFFER_SIZE));
284 u64 entry = 0; 398 u64 entry;
285 399
286 if (cmd_buf == NULL) 400 if (cmd_buf == NULL)
287 return NULL; 401 return NULL;
288 402
289 iommu->cmd_buf_size = CMD_BUFFER_SIZE; 403 iommu->cmd_buf_size = CMD_BUFFER_SIZE;
290 404
291 memset(cmd_buf, 0, CMD_BUFFER_SIZE);
292
293 entry = (u64)virt_to_phys(cmd_buf); 405 entry = (u64)virt_to_phys(cmd_buf);
294 entry |= MMIO_CMD_SIZE_512; 406 entry |= MMIO_CMD_SIZE_512;
295 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, 407 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
@@ -302,11 +414,10 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
302 414
303static void __init free_command_buffer(struct amd_iommu *iommu) 415static void __init free_command_buffer(struct amd_iommu *iommu)
304{ 416{
305 if (iommu->cmd_buf) 417 free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE));
306 free_pages((unsigned long)iommu->cmd_buf,
307 get_order(CMD_BUFFER_SIZE));
308} 418}
309 419
420/* sets a specific bit in the device table entry. */
310static void set_dev_entry_bit(u16 devid, u8 bit) 421static void set_dev_entry_bit(u16 devid, u8 bit)
311{ 422{
312 int i = (bit >> 5) & 0x07; 423 int i = (bit >> 5) & 0x07;
@@ -315,7 +426,18 @@ static void set_dev_entry_bit(u16 devid, u8 bit)
315 amd_iommu_dev_table[devid].data[i] |= (1 << _bit); 426 amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
316} 427}
317 428
318static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags) 429/* Writes the specific IOMMU for a device into the rlookup table */
430static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
431{
432 amd_iommu_rlookup_table[devid] = iommu;
433}
434
435/*
436 * This function takes the device specific flags read from the ACPI
437 * table and sets up the device table entry with that information
438 */
439static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
440 u16 devid, u32 flags, u32 ext_flags)
319{ 441{
320 if (flags & ACPI_DEVFLAG_INITPASS) 442 if (flags & ACPI_DEVFLAG_INITPASS)
321 set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS); 443 set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
@@ -331,13 +453,14 @@ static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags)
331 set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS); 453 set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
332 if (flags & ACPI_DEVFLAG_LINT1) 454 if (flags & ACPI_DEVFLAG_LINT1)
333 set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS); 455 set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
334}
335 456
336static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) 457 set_iommu_for_device(iommu, devid);
337{
338 amd_iommu_rlookup_table[devid] = iommu;
339} 458}
340 459
460/*
461 * Reads the device exclusion range from ACPI and initialize IOMMU with
462 * it
463 */
341static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m) 464static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
342{ 465{
343 struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; 466 struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
@@ -346,12 +469,22 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
346 return; 469 return;
347 470
348 if (iommu) { 471 if (iommu) {
472 /*
473 * We only can configure exclusion ranges per IOMMU, not
474 * per device. But we can enable the exclusion range per
475 * device. This is done here
476 */
349 set_dev_entry_bit(m->devid, DEV_ENTRY_EX); 477 set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
350 iommu->exclusion_start = m->range_start; 478 iommu->exclusion_start = m->range_start;
351 iommu->exclusion_length = m->range_length; 479 iommu->exclusion_length = m->range_length;
352 } 480 }
353} 481}
354 482
483/*
484 * This function reads some important data from the IOMMU PCI space and
485 * initializes the driver data structure with it. It reads the hardware
486 * capabilities and the first/last device entries
487 */
355static void __init init_iommu_from_pci(struct amd_iommu *iommu) 488static void __init init_iommu_from_pci(struct amd_iommu *iommu)
356{ 489{
357 int bus = PCI_BUS(iommu->devid); 490 int bus = PCI_BUS(iommu->devid);
@@ -363,10 +496,16 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
363 iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET); 496 iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET);
364 497
365 range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); 498 range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
366 iommu->first_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_FD(range)); 499 iommu->first_device = calc_devid(MMIO_GET_BUS(range),
367 iommu->last_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_LD(range)); 500 MMIO_GET_FD(range));
501 iommu->last_device = calc_devid(MMIO_GET_BUS(range),
502 MMIO_GET_LD(range));
368} 503}
369 504
505/*
506 * Takes a pointer to an AMD IOMMU entry in the ACPI table and
507 * initializes the hardware and our data structures with it.
508 */
370static void __init init_iommu_from_acpi(struct amd_iommu *iommu, 509static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
371 struct ivhd_header *h) 510 struct ivhd_header *h)
372{ 511{
@@ -374,7 +513,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
374 u8 *end = p, flags = 0; 513 u8 *end = p, flags = 0;
375 u16 dev_i, devid = 0, devid_start = 0, devid_to = 0; 514 u16 dev_i, devid = 0, devid_start = 0, devid_to = 0;
376 u32 ext_flags = 0; 515 u32 ext_flags = 0;
377 bool alias = 0; 516 bool alias = false;
378 struct ivhd_entry *e; 517 struct ivhd_entry *e;
379 518
380 /* 519 /*
@@ -414,22 +553,23 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
414 case IVHD_DEV_ALL: 553 case IVHD_DEV_ALL:
415 for (dev_i = iommu->first_device; 554 for (dev_i = iommu->first_device;
416 dev_i <= iommu->last_device; ++dev_i) 555 dev_i <= iommu->last_device; ++dev_i)
417 set_dev_entry_from_acpi(dev_i, e->flags, 0); 556 set_dev_entry_from_acpi(iommu, dev_i,
557 e->flags, 0);
418 break; 558 break;
419 case IVHD_DEV_SELECT: 559 case IVHD_DEV_SELECT:
420 devid = e->devid; 560 devid = e->devid;
421 set_dev_entry_from_acpi(devid, e->flags, 0); 561 set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
422 break; 562 break;
423 case IVHD_DEV_SELECT_RANGE_START: 563 case IVHD_DEV_SELECT_RANGE_START:
424 devid_start = e->devid; 564 devid_start = e->devid;
425 flags = e->flags; 565 flags = e->flags;
426 ext_flags = 0; 566 ext_flags = 0;
427 alias = 0; 567 alias = false;
428 break; 568 break;
429 case IVHD_DEV_ALIAS: 569 case IVHD_DEV_ALIAS:
430 devid = e->devid; 570 devid = e->devid;
431 devid_to = e->ext >> 8; 571 devid_to = e->ext >> 8;
432 set_dev_entry_from_acpi(devid, e->flags, 0); 572 set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
433 amd_iommu_alias_table[devid] = devid_to; 573 amd_iommu_alias_table[devid] = devid_to;
434 break; 574 break;
435 case IVHD_DEV_ALIAS_RANGE: 575 case IVHD_DEV_ALIAS_RANGE:
@@ -437,24 +577,25 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
437 flags = e->flags; 577 flags = e->flags;
438 devid_to = e->ext >> 8; 578 devid_to = e->ext >> 8;
439 ext_flags = 0; 579 ext_flags = 0;
440 alias = 1; 580 alias = true;
441 break; 581 break;
442 case IVHD_DEV_EXT_SELECT: 582 case IVHD_DEV_EXT_SELECT:
443 devid = e->devid; 583 devid = e->devid;
444 set_dev_entry_from_acpi(devid, e->flags, e->ext); 584 set_dev_entry_from_acpi(iommu, devid, e->flags,
585 e->ext);
445 break; 586 break;
446 case IVHD_DEV_EXT_SELECT_RANGE: 587 case IVHD_DEV_EXT_SELECT_RANGE:
447 devid_start = e->devid; 588 devid_start = e->devid;
448 flags = e->flags; 589 flags = e->flags;
449 ext_flags = e->ext; 590 ext_flags = e->ext;
450 alias = 0; 591 alias = false;
451 break; 592 break;
452 case IVHD_DEV_RANGE_END: 593 case IVHD_DEV_RANGE_END:
453 devid = e->devid; 594 devid = e->devid;
454 for (dev_i = devid_start; dev_i <= devid; ++dev_i) { 595 for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
455 if (alias) 596 if (alias)
456 amd_iommu_alias_table[dev_i] = devid_to; 597 amd_iommu_alias_table[dev_i] = devid_to;
457 set_dev_entry_from_acpi( 598 set_dev_entry_from_acpi(iommu,
458 amd_iommu_alias_table[dev_i], 599 amd_iommu_alias_table[dev_i],
459 flags, ext_flags); 600 flags, ext_flags);
460 } 601 }
@@ -467,6 +608,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
467 } 608 }
468} 609}
469 610
611/* Initializes the device->iommu mapping for the driver */
470static int __init init_iommu_devices(struct amd_iommu *iommu) 612static int __init init_iommu_devices(struct amd_iommu *iommu)
471{ 613{
472 u16 i; 614 u16 i;
@@ -494,6 +636,11 @@ static void __init free_iommu_all(void)
494 } 636 }
495} 637}
496 638
639/*
640 * This function clues the initialization function for one IOMMU
641 * together and also allocates the command buffer and programs the
642 * hardware. It does NOT enable the IOMMU. This is done afterwards.
643 */
497static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) 644static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
498{ 645{
499 spin_lock_init(&iommu->lock); 646 spin_lock_init(&iommu->lock);
@@ -521,6 +668,10 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
521 return 0; 668 return 0;
522} 669}
523 670
671/*
672 * Iterates over all IOMMU entries in the ACPI table, allocates the
673 * IOMMU structure and initializes it with init_iommu_one()
674 */
524static int __init init_iommu_all(struct acpi_table_header *table) 675static int __init init_iommu_all(struct acpi_table_header *table)
525{ 676{
526 u8 *p = (u8 *)table, *end = (u8 *)table; 677 u8 *p = (u8 *)table, *end = (u8 *)table;
@@ -528,8 +679,6 @@ static int __init init_iommu_all(struct acpi_table_header *table)
528 struct amd_iommu *iommu; 679 struct amd_iommu *iommu;
529 int ret; 680 int ret;
530 681
531 INIT_LIST_HEAD(&amd_iommu_list);
532
533 end += table->length; 682 end += table->length;
534 p += IVRS_HEADER_LENGTH; 683 p += IVRS_HEADER_LENGTH;
535 684
@@ -555,6 +704,14 @@ static int __init init_iommu_all(struct acpi_table_header *table)
555 return 0; 704 return 0;
556} 705}
557 706
707/****************************************************************************
708 *
709 * The next functions belong to the third pass of parsing the ACPI
710 * table. In this last pass the memory mapping requirements are
711 * gathered (like exclusion and unity mapping reanges).
712 *
713 ****************************************************************************/
714
558static void __init free_unity_maps(void) 715static void __init free_unity_maps(void)
559{ 716{
560 struct unity_map_entry *entry, *next; 717 struct unity_map_entry *entry, *next;
@@ -565,6 +722,7 @@ static void __init free_unity_maps(void)
565 } 722 }
566} 723}
567 724
725/* called when we find an exclusion range definition in ACPI */
568static int __init init_exclusion_range(struct ivmd_header *m) 726static int __init init_exclusion_range(struct ivmd_header *m)
569{ 727{
570 int i; 728 int i;
@@ -574,7 +732,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
574 set_device_exclusion_range(m->devid, m); 732 set_device_exclusion_range(m->devid, m);
575 break; 733 break;
576 case ACPI_IVMD_TYPE_ALL: 734 case ACPI_IVMD_TYPE_ALL:
577 for (i = 0; i < amd_iommu_last_bdf; ++i) 735 for (i = 0; i <= amd_iommu_last_bdf; ++i)
578 set_device_exclusion_range(i, m); 736 set_device_exclusion_range(i, m);
579 break; 737 break;
580 case ACPI_IVMD_TYPE_RANGE: 738 case ACPI_IVMD_TYPE_RANGE:
@@ -588,6 +746,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
588 return 0; 746 return 0;
589} 747}
590 748
749/* called for unity map ACPI definition */
591static int __init init_unity_map_range(struct ivmd_header *m) 750static int __init init_unity_map_range(struct ivmd_header *m)
592{ 751{
593 struct unity_map_entry *e = 0; 752 struct unity_map_entry *e = 0;
@@ -619,13 +778,12 @@ static int __init init_unity_map_range(struct ivmd_header *m)
619 return 0; 778 return 0;
620} 779}
621 780
781/* iterates over all memory definitions we find in the ACPI table */
622static int __init init_memory_definitions(struct acpi_table_header *table) 782static int __init init_memory_definitions(struct acpi_table_header *table)
623{ 783{
624 u8 *p = (u8 *)table, *end = (u8 *)table; 784 u8 *p = (u8 *)table, *end = (u8 *)table;
625 struct ivmd_header *m; 785 struct ivmd_header *m;
626 786
627 INIT_LIST_HEAD(&amd_iommu_unity_map);
628
629 end += table->length; 787 end += table->length;
630 p += IVRS_HEADER_LENGTH; 788 p += IVRS_HEADER_LENGTH;
631 789
@@ -642,6 +800,10 @@ static int __init init_memory_definitions(struct acpi_table_header *table)
642 return 0; 800 return 0;
643} 801}
644 802
803/*
804 * This function finally enables all IOMMUs found in the system after
805 * they have been initialized
806 */
645static void __init enable_iommus(void) 807static void __init enable_iommus(void)
646{ 808{
647 struct amd_iommu *iommu; 809 struct amd_iommu *iommu;
@@ -678,6 +840,34 @@ static struct sys_device device_amd_iommu = {
678 .cls = &amd_iommu_sysdev_class, 840 .cls = &amd_iommu_sysdev_class,
679}; 841};
680 842
843/*
844 * This is the core init function for AMD IOMMU hardware in the system.
845 * This function is called from the generic x86 DMA layer initialization
846 * code.
847 *
848 * This function basically parses the ACPI table for AMD IOMMU (IVRS)
849 * three times:
850 *
851 * 1 pass) Find the highest PCI device id the driver has to handle.
852 * Upon this information the size of the data structures is
853 * determined that needs to be allocated.
854 *
855 * 2 pass) Initialize the data structures just allocated with the
856 * information in the ACPI table about available AMD IOMMUs
857 * in the system. It also maps the PCI devices in the
858 * system to specific IOMMUs
859 *
860 * 3 pass) After the basic data structures are allocated and
861 * initialized we update them with information about memory
862 * remapping requirements parsed out of the ACPI table in
863 * this last pass.
864 *
865 * After that the hardware is initialized and ready to go. In the last
866 * step we do some Linux specific things like registering the driver in
867 * the dma_ops interface and initializing the suspend/resume support
868 * functions. Finally it prints some information about AMD IOMMUs and
869 * the driver state and enables the hardware.
870 */
681int __init amd_iommu_init(void) 871int __init amd_iommu_init(void)
682{ 872{
683 int i, ret = 0; 873 int i, ret = 0;
@@ -699,14 +889,14 @@ int __init amd_iommu_init(void)
699 if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0) 889 if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
700 return -ENODEV; 890 return -ENODEV;
701 891
702 dev_table_size = TBL_SIZE(DEV_TABLE_ENTRY_SIZE); 892 dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE);
703 alias_table_size = TBL_SIZE(ALIAS_TABLE_ENTRY_SIZE); 893 alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
704 rlookup_table_size = TBL_SIZE(RLOOKUP_TABLE_ENTRY_SIZE); 894 rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
705 895
706 ret = -ENOMEM; 896 ret = -ENOMEM;
707 897
708 /* Device table - directly used by all IOMMUs */ 898 /* Device table - directly used by all IOMMUs */
709 amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL, 899 amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
710 get_order(dev_table_size)); 900 get_order(dev_table_size));
711 if (amd_iommu_dev_table == NULL) 901 if (amd_iommu_dev_table == NULL)
712 goto out; 902 goto out;
@@ -730,27 +920,23 @@ int __init amd_iommu_init(void)
730 * Protection Domain table - maps devices to protection domains 920 * Protection Domain table - maps devices to protection domains
731 * This table has the same size as the rlookup_table 921 * This table has the same size as the rlookup_table
732 */ 922 */
733 amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL, 923 amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
734 get_order(rlookup_table_size)); 924 get_order(rlookup_table_size));
735 if (amd_iommu_pd_table == NULL) 925 if (amd_iommu_pd_table == NULL)
736 goto free; 926 goto free;
737 927
738 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(GFP_KERNEL, 928 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
929 GFP_KERNEL | __GFP_ZERO,
739 get_order(MAX_DOMAIN_ID/8)); 930 get_order(MAX_DOMAIN_ID/8));
740 if (amd_iommu_pd_alloc_bitmap == NULL) 931 if (amd_iommu_pd_alloc_bitmap == NULL)
741 goto free; 932 goto free;
742 933
743 /* 934 /*
744 * memory is allocated now; initialize the device table with all zeroes 935 * let all alias entries point to itself
745 * and let all alias entries point to itself
746 */ 936 */
747 memset(amd_iommu_dev_table, 0, dev_table_size); 937 for (i = 0; i <= amd_iommu_last_bdf; ++i)
748 for (i = 0; i < amd_iommu_last_bdf; ++i)
749 amd_iommu_alias_table[i] = i; 938 amd_iommu_alias_table[i] = i;
750 939
751 memset(amd_iommu_pd_table, 0, rlookup_table_size);
752 memset(amd_iommu_pd_alloc_bitmap, 0, MAX_DOMAIN_ID / 8);
753
754 /* 940 /*
755 * never allocate domain 0 because its used as the non-allocated and 941 * never allocate domain 0 because its used as the non-allocated and
756 * error value placeholder 942 * error value placeholder
@@ -795,24 +981,19 @@ out:
795 return ret; 981 return ret;
796 982
797free: 983free:
798 if (amd_iommu_pd_alloc_bitmap) 984 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1);
799 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1);
800 985
801 if (amd_iommu_pd_table) 986 free_pages((unsigned long)amd_iommu_pd_table,
802 free_pages((unsigned long)amd_iommu_pd_table, 987 get_order(rlookup_table_size));
803 get_order(rlookup_table_size));
804 988
805 if (amd_iommu_rlookup_table) 989 free_pages((unsigned long)amd_iommu_rlookup_table,
806 free_pages((unsigned long)amd_iommu_rlookup_table, 990 get_order(rlookup_table_size));
807 get_order(rlookup_table_size));
808 991
809 if (amd_iommu_alias_table) 992 free_pages((unsigned long)amd_iommu_alias_table,
810 free_pages((unsigned long)amd_iommu_alias_table, 993 get_order(alias_table_size));
811 get_order(alias_table_size));
812 994
813 if (amd_iommu_dev_table) 995 free_pages((unsigned long)amd_iommu_dev_table,
814 free_pages((unsigned long)amd_iommu_dev_table, 996 get_order(dev_table_size));
815 get_order(dev_table_size));
816 997
817 free_iommu_all(); 998 free_iommu_all();
818 999
@@ -821,6 +1002,13 @@ free:
821 goto out; 1002 goto out;
822} 1003}
823 1004
1005/****************************************************************************
1006 *
1007 * Early detect code. This code runs at IOMMU detection time in the DMA
1008 * layer. It just looks if there is an IVRS ACPI table to detect AMD
1009 * IOMMUs
1010 *
1011 ****************************************************************************/
824static int __init early_amd_iommu_detect(struct acpi_table_header *table) 1012static int __init early_amd_iommu_detect(struct acpi_table_header *table)
825{ 1013{
826 return 0; 1014 return 0;
@@ -828,7 +1016,7 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table)
828 1016
829void __init amd_iommu_detect(void) 1017void __init amd_iommu_detect(void)
830{ 1018{
831 if (swiotlb || no_iommu || iommu_detected) 1019 if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture))
832 return; 1020 return;
833 1021
834 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { 1022 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
@@ -841,6 +1029,13 @@ void __init amd_iommu_detect(void)
841 } 1029 }
842} 1030}
843 1031
1032/****************************************************************************
1033 *
1034 * Parsing functions for the AMD IOMMU specific kernel command line
1035 * options.
1036 *
1037 ****************************************************************************/
1038
844static int __init parse_amd_iommu_options(char *str) 1039static int __init parse_amd_iommu_options(char *str)
845{ 1040{
846 for (; *str; ++str) { 1041 for (; *str; ++str) {
@@ -853,20 +1048,10 @@ static int __init parse_amd_iommu_options(char *str)
853 1048
854static int __init parse_amd_iommu_size_options(char *str) 1049static int __init parse_amd_iommu_size_options(char *str)
855{ 1050{
856 for (; *str; ++str) { 1051 unsigned order = PAGE_SHIFT + get_order(memparse(str, &str));
857 if (strcmp(str, "32M") == 0) 1052
858 amd_iommu_aperture_order = 25; 1053 if ((order > 24) && (order < 31))
859 if (strcmp(str, "64M") == 0) 1054 amd_iommu_aperture_order = order;
860 amd_iommu_aperture_order = 26;
861 if (strcmp(str, "128M") == 0)
862 amd_iommu_aperture_order = 27;
863 if (strcmp(str, "256M") == 0)
864 amd_iommu_aperture_order = 28;
865 if (strcmp(str, "512M") == 0)
866 amd_iommu_aperture_order = 29;
867 if (strcmp(str, "1G") == 0)
868 amd_iommu_aperture_order = 30;
869 }
870 1055
871 return 1; 1056 return 1;
872} 1057}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 9f907806c1a5..44e21826db11 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -21,6 +21,7 @@
21#include <linux/suspend.h> 21#include <linux/suspend.h>
22#include <asm/e820.h> 22#include <asm/e820.h>
23#include <asm/io.h> 23#include <asm/io.h>
24#include <asm/iommu.h>
24#include <asm/gart.h> 25#include <asm/gart.h>
25#include <asm/pci-direct.h> 26#include <asm/pci-direct.h>
26#include <asm/dma.h> 27#include <asm/dma.h>
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index a437d027f20b..d6c898358371 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -75,7 +75,7 @@ char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
75/* 75/*
76 * Debug level, exported for io_apic.c 76 * Debug level, exported for io_apic.c
77 */ 77 */
78int apic_verbosity; 78unsigned int apic_verbosity;
79 79
80int pic_mode; 80int pic_mode;
81 81
@@ -177,7 +177,7 @@ void __cpuinit enable_NMI_through_LVT0(void)
177 /* Level triggered for 82489DX */ 177 /* Level triggered for 82489DX */
178 if (!lapic_is_integrated()) 178 if (!lapic_is_integrated())
179 v |= APIC_LVT_LEVEL_TRIGGER; 179 v |= APIC_LVT_LEVEL_TRIGGER;
180 apic_write_around(APIC_LVT0, v); 180 apic_write(APIC_LVT0, v);
181} 181}
182 182
183/** 183/**
@@ -212,9 +212,6 @@ int lapic_get_maxlvt(void)
212 * this function twice on the boot CPU, once with a bogus timeout 212 * this function twice on the boot CPU, once with a bogus timeout
213 * value, second time for real. The other (noncalibrating) CPUs 213 * value, second time for real. The other (noncalibrating) CPUs
214 * call this function only once, with the real, calibrated value. 214 * call this function only once, with the real, calibrated value.
215 *
216 * We do reads before writes even if unnecessary, to get around the
217 * P5 APIC double write bug.
218 */ 215 */
219static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) 216static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
220{ 217{
@@ -229,18 +226,18 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
229 if (!irqen) 226 if (!irqen)
230 lvtt_value |= APIC_LVT_MASKED; 227 lvtt_value |= APIC_LVT_MASKED;
231 228
232 apic_write_around(APIC_LVTT, lvtt_value); 229 apic_write(APIC_LVTT, lvtt_value);
233 230
234 /* 231 /*
235 * Divide PICLK by 16 232 * Divide PICLK by 16
236 */ 233 */
237 tmp_value = apic_read(APIC_TDCR); 234 tmp_value = apic_read(APIC_TDCR);
238 apic_write_around(APIC_TDCR, (tmp_value 235 apic_write(APIC_TDCR,
239 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) 236 (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
240 | APIC_TDR_DIV_16); 237 APIC_TDR_DIV_16);
241 238
242 if (!oneshot) 239 if (!oneshot)
243 apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); 240 apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
244} 241}
245 242
246/* 243/*
@@ -249,7 +246,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
249static int lapic_next_event(unsigned long delta, 246static int lapic_next_event(unsigned long delta,
250 struct clock_event_device *evt) 247 struct clock_event_device *evt)
251{ 248{
252 apic_write_around(APIC_TMICT, delta); 249 apic_write(APIC_TMICT, delta);
253 return 0; 250 return 0;
254} 251}
255 252
@@ -278,7 +275,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
278 case CLOCK_EVT_MODE_SHUTDOWN: 275 case CLOCK_EVT_MODE_SHUTDOWN:
279 v = apic_read(APIC_LVTT); 276 v = apic_read(APIC_LVTT);
280 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 277 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
281 apic_write_around(APIC_LVTT, v); 278 apic_write(APIC_LVTT, v);
282 break; 279 break;
283 case CLOCK_EVT_MODE_RESUME: 280 case CLOCK_EVT_MODE_RESUME:
284 /* Nothing to do here */ 281 /* Nothing to do here */
@@ -372,12 +369,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
372 } 369 }
373} 370}
374 371
375/* 372static int __init calibrate_APIC_clock(void)
376 * Setup the boot APIC
377 *
378 * Calibrate and verify the result.
379 */
380void __init setup_boot_APIC_clock(void)
381{ 373{
382 struct clock_event_device *levt = &__get_cpu_var(lapic_events); 374 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
383 const long pm_100ms = PMTMR_TICKS_PER_SEC/10; 375 const long pm_100ms = PMTMR_TICKS_PER_SEC/10;
@@ -387,24 +379,6 @@ void __init setup_boot_APIC_clock(void)
387 long delta, deltapm; 379 long delta, deltapm;
388 int pm_referenced = 0; 380 int pm_referenced = 0;
389 381
390 /*
391 * The local apic timer can be disabled via the kernel
392 * commandline or from the CPU detection code. Register the lapic
393 * timer as a dummy clock event source on SMP systems, so the
394 * broadcast mechanism is used. On UP systems simply ignore it.
395 */
396 if (local_apic_timer_disabled) {
397 /* No broadcast on UP ! */
398 if (num_possible_cpus() > 1) {
399 lapic_clockevent.mult = 1;
400 setup_APIC_timer();
401 }
402 return;
403 }
404
405 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
406 "calibrating APIC timer ...\n");
407
408 local_irq_disable(); 382 local_irq_disable();
409 383
410 /* Replace the global interrupt handler */ 384 /* Replace the global interrupt handler */
@@ -489,8 +463,6 @@ void __init setup_boot_APIC_clock(void)
489 calibration_result / (1000000 / HZ), 463 calibration_result / (1000000 / HZ),
490 calibration_result % (1000000 / HZ)); 464 calibration_result % (1000000 / HZ));
491 465
492 local_apic_timer_verify_ok = 1;
493
494 /* 466 /*
495 * Do a sanity check on the APIC calibration result 467 * Do a sanity check on the APIC calibration result
496 */ 468 */
@@ -498,12 +470,11 @@ void __init setup_boot_APIC_clock(void)
498 local_irq_enable(); 470 local_irq_enable();
499 printk(KERN_WARNING 471 printk(KERN_WARNING
500 "APIC frequency too slow, disabling apic timer\n"); 472 "APIC frequency too slow, disabling apic timer\n");
501 /* No broadcast on UP ! */ 473 return -1;
502 if (num_possible_cpus() > 1)
503 setup_APIC_timer();
504 return;
505 } 474 }
506 475
476 local_apic_timer_verify_ok = 1;
477
507 /* We trust the pm timer based calibration */ 478 /* We trust the pm timer based calibration */
508 if (!pm_referenced) { 479 if (!pm_referenced) {
509 apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); 480 apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
@@ -543,22 +514,55 @@ void __init setup_boot_APIC_clock(void)
543 if (!local_apic_timer_verify_ok) { 514 if (!local_apic_timer_verify_ok) {
544 printk(KERN_WARNING 515 printk(KERN_WARNING
545 "APIC timer disabled due to verification failure.\n"); 516 "APIC timer disabled due to verification failure.\n");
517 return -1;
518 }
519
520 return 0;
521}
522
523/*
524 * Setup the boot APIC
525 *
526 * Calibrate and verify the result.
527 */
528void __init setup_boot_APIC_clock(void)
529{
530 /*
531 * The local apic timer can be disabled via the kernel
532 * commandline or from the CPU detection code. Register the lapic
533 * timer as a dummy clock event source on SMP systems, so the
534 * broadcast mechanism is used. On UP systems simply ignore it.
535 */
536 if (local_apic_timer_disabled) {
546 /* No broadcast on UP ! */ 537 /* No broadcast on UP ! */
547 if (num_possible_cpus() == 1) 538 if (num_possible_cpus() > 1) {
548 return; 539 lapic_clockevent.mult = 1;
549 } else { 540 setup_APIC_timer();
550 /* 541 }
551 * If nmi_watchdog is set to IO_APIC, we need the 542 return;
552 * PIT/HPET going. Otherwise register lapic as a dummy
553 * device.
554 */
555 if (nmi_watchdog != NMI_IO_APIC)
556 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
557 else
558 printk(KERN_WARNING "APIC timer registered as dummy,"
559 " due to nmi_watchdog=%d!\n", nmi_watchdog);
560 } 543 }
561 544
545 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
546 "calibrating APIC timer ...\n");
547
548 if (calibrate_APIC_clock()) {
549 /* No broadcast on UP ! */
550 if (num_possible_cpus() > 1)
551 setup_APIC_timer();
552 return;
553 }
554
555 /*
556 * If nmi_watchdog is set to IO_APIC, we need the
557 * PIT/HPET going. Otherwise register lapic as a dummy
558 * device.
559 */
560 if (nmi_watchdog != NMI_IO_APIC)
561 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
562 else
563 printk(KERN_WARNING "APIC timer registered as dummy,"
564 " due to nmi_watchdog=%d!\n", nmi_watchdog);
565
562 /* Setup the lapic or request the broadcast */ 566 /* Setup the lapic or request the broadcast */
563 setup_APIC_timer(); 567 setup_APIC_timer();
564} 568}
@@ -693,44 +697,44 @@ void clear_local_APIC(void)
693 */ 697 */
694 if (maxlvt >= 3) { 698 if (maxlvt >= 3) {
695 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ 699 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
696 apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED); 700 apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
697 } 701 }
698 /* 702 /*
699 * Careful: we have to set masks only first to deassert 703 * Careful: we have to set masks only first to deassert
700 * any level-triggered sources. 704 * any level-triggered sources.
701 */ 705 */
702 v = apic_read(APIC_LVTT); 706 v = apic_read(APIC_LVTT);
703 apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); 707 apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
704 v = apic_read(APIC_LVT0); 708 v = apic_read(APIC_LVT0);
705 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); 709 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
706 v = apic_read(APIC_LVT1); 710 v = apic_read(APIC_LVT1);
707 apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED); 711 apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
708 if (maxlvt >= 4) { 712 if (maxlvt >= 4) {
709 v = apic_read(APIC_LVTPC); 713 v = apic_read(APIC_LVTPC);
710 apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); 714 apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
711 } 715 }
712 716
713 /* lets not touch this if we didn't frob it */ 717 /* lets not touch this if we didn't frob it */
714#ifdef CONFIG_X86_MCE_P4THERMAL 718#ifdef CONFIG_X86_MCE_P4THERMAL
715 if (maxlvt >= 5) { 719 if (maxlvt >= 5) {
716 v = apic_read(APIC_LVTTHMR); 720 v = apic_read(APIC_LVTTHMR);
717 apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED); 721 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
718 } 722 }
719#endif 723#endif
720 /* 724 /*
721 * Clean APIC state for other OSs: 725 * Clean APIC state for other OSs:
722 */ 726 */
723 apic_write_around(APIC_LVTT, APIC_LVT_MASKED); 727 apic_write(APIC_LVTT, APIC_LVT_MASKED);
724 apic_write_around(APIC_LVT0, APIC_LVT_MASKED); 728 apic_write(APIC_LVT0, APIC_LVT_MASKED);
725 apic_write_around(APIC_LVT1, APIC_LVT_MASKED); 729 apic_write(APIC_LVT1, APIC_LVT_MASKED);
726 if (maxlvt >= 3) 730 if (maxlvt >= 3)
727 apic_write_around(APIC_LVTERR, APIC_LVT_MASKED); 731 apic_write(APIC_LVTERR, APIC_LVT_MASKED);
728 if (maxlvt >= 4) 732 if (maxlvt >= 4)
729 apic_write_around(APIC_LVTPC, APIC_LVT_MASKED); 733 apic_write(APIC_LVTPC, APIC_LVT_MASKED);
730 734
731#ifdef CONFIG_X86_MCE_P4THERMAL 735#ifdef CONFIG_X86_MCE_P4THERMAL
732 if (maxlvt >= 5) 736 if (maxlvt >= 5)
733 apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED); 737 apic_write(APIC_LVTTHMR, APIC_LVT_MASKED);
734#endif 738#endif
735 /* Integrated APIC (!82489DX) ? */ 739 /* Integrated APIC (!82489DX) ? */
736 if (lapic_is_integrated()) { 740 if (lapic_is_integrated()) {
@@ -756,7 +760,7 @@ void disable_local_APIC(void)
756 */ 760 */
757 value = apic_read(APIC_SPIV); 761 value = apic_read(APIC_SPIV);
758 value &= ~APIC_SPIV_APIC_ENABLED; 762 value &= ~APIC_SPIV_APIC_ENABLED;
759 apic_write_around(APIC_SPIV, value); 763 apic_write(APIC_SPIV, value);
760 764
761 /* 765 /*
762 * When LAPIC was disabled by the BIOS and enabled by the kernel, 766 * When LAPIC was disabled by the BIOS and enabled by the kernel,
@@ -865,8 +869,8 @@ void __init sync_Arb_IDs(void)
865 apic_wait_icr_idle(); 869 apic_wait_icr_idle();
866 870
867 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); 871 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
868 apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG 872 apic_write(APIC_ICR,
869 | APIC_DM_INIT); 873 APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT);
870} 874}
871 875
872/* 876/*
@@ -902,16 +906,16 @@ void __init init_bsp_APIC(void)
902 else 906 else
903 value |= APIC_SPIV_FOCUS_DISABLED; 907 value |= APIC_SPIV_FOCUS_DISABLED;
904 value |= SPURIOUS_APIC_VECTOR; 908 value |= SPURIOUS_APIC_VECTOR;
905 apic_write_around(APIC_SPIV, value); 909 apic_write(APIC_SPIV, value);
906 910
907 /* 911 /*
908 * Set up the virtual wire mode. 912 * Set up the virtual wire mode.
909 */ 913 */
910 apic_write_around(APIC_LVT0, APIC_DM_EXTINT); 914 apic_write(APIC_LVT0, APIC_DM_EXTINT);
911 value = APIC_DM_NMI; 915 value = APIC_DM_NMI;
912 if (!lapic_is_integrated()) /* 82489DX */ 916 if (!lapic_is_integrated()) /* 82489DX */
913 value |= APIC_LVT_LEVEL_TRIGGER; 917 value |= APIC_LVT_LEVEL_TRIGGER;
914 apic_write_around(APIC_LVT1, value); 918 apic_write(APIC_LVT1, value);
915} 919}
916 920
917static void __cpuinit lapic_setup_esr(void) 921static void __cpuinit lapic_setup_esr(void)
@@ -926,7 +930,7 @@ static void __cpuinit lapic_setup_esr(void)
926 930
927 /* enables sending errors */ 931 /* enables sending errors */
928 value = ERROR_APIC_VECTOR; 932 value = ERROR_APIC_VECTOR;
929 apic_write_around(APIC_LVTERR, value); 933 apic_write(APIC_LVTERR, value);
930 /* 934 /*
931 * spec says clear errors after enabling vector. 935 * spec says clear errors after enabling vector.
932 */ 936 */
@@ -989,7 +993,7 @@ void __cpuinit setup_local_APIC(void)
989 */ 993 */
990 value = apic_read(APIC_TASKPRI); 994 value = apic_read(APIC_TASKPRI);
991 value &= ~APIC_TPRI_MASK; 995 value &= ~APIC_TPRI_MASK;
992 apic_write_around(APIC_TASKPRI, value); 996 apic_write(APIC_TASKPRI, value);
993 997
994 /* 998 /*
995 * After a crash, we no longer service the interrupts and a pending 999 * After a crash, we no longer service the interrupts and a pending
@@ -1047,7 +1051,7 @@ void __cpuinit setup_local_APIC(void)
1047 * Set spurious IRQ vector 1051 * Set spurious IRQ vector
1048 */ 1052 */
1049 value |= SPURIOUS_APIC_VECTOR; 1053 value |= SPURIOUS_APIC_VECTOR;
1050 apic_write_around(APIC_SPIV, value); 1054 apic_write(APIC_SPIV, value);
1051 1055
1052 /* 1056 /*
1053 * Set up LVT0, LVT1: 1057 * Set up LVT0, LVT1:
@@ -1069,7 +1073,7 @@ void __cpuinit setup_local_APIC(void)
1069 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", 1073 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
1070 smp_processor_id()); 1074 smp_processor_id());
1071 } 1075 }
1072 apic_write_around(APIC_LVT0, value); 1076 apic_write(APIC_LVT0, value);
1073 1077
1074 /* 1078 /*
1075 * only the BP should see the LINT1 NMI signal, obviously. 1079 * only the BP should see the LINT1 NMI signal, obviously.
@@ -1080,7 +1084,7 @@ void __cpuinit setup_local_APIC(void)
1080 value = APIC_DM_NMI | APIC_LVT_MASKED; 1084 value = APIC_DM_NMI | APIC_LVT_MASKED;
1081 if (!integrated) /* 82489DX */ 1085 if (!integrated) /* 82489DX */
1082 value |= APIC_LVT_LEVEL_TRIGGER; 1086 value |= APIC_LVT_LEVEL_TRIGGER;
1083 apic_write_around(APIC_LVT1, value); 1087 apic_write(APIC_LVT1, value);
1084} 1088}
1085 1089
1086void __cpuinit end_local_APIC_setup(void) 1090void __cpuinit end_local_APIC_setup(void)
@@ -1091,7 +1095,7 @@ void __cpuinit end_local_APIC_setup(void)
1091 /* Disable the local apic timer */ 1095 /* Disable the local apic timer */
1092 value = apic_read(APIC_LVTT); 1096 value = apic_read(APIC_LVTT);
1093 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 1097 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
1094 apic_write_around(APIC_LVTT, value); 1098 apic_write(APIC_LVTT, value);
1095 1099
1096 setup_apic_nmi_watchdog(NULL); 1100 setup_apic_nmi_watchdog(NULL);
1097 apic_pm_activate(); 1101 apic_pm_activate();
@@ -1214,9 +1218,6 @@ int apic_version[MAX_APICS];
1214 1218
1215int __init APIC_init_uniprocessor(void) 1219int __init APIC_init_uniprocessor(void)
1216{ 1220{
1217 if (disable_apic)
1218 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1219
1220 if (!smp_found_config && !cpu_has_apic) 1221 if (!smp_found_config && !cpu_has_apic)
1221 return -1; 1222 return -1;
1222 1223
@@ -1419,7 +1420,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1419 value &= ~APIC_VECTOR_MASK; 1420 value &= ~APIC_VECTOR_MASK;
1420 value |= APIC_SPIV_APIC_ENABLED; 1421 value |= APIC_SPIV_APIC_ENABLED;
1421 value |= 0xf; 1422 value |= 0xf;
1422 apic_write_around(APIC_SPIV, value); 1423 apic_write(APIC_SPIV, value);
1423 1424
1424 if (!virt_wire_setup) { 1425 if (!virt_wire_setup) {
1425 /* 1426 /*
@@ -1432,10 +1433,10 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1432 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); 1433 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1433 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; 1434 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1434 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); 1435 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1435 apic_write_around(APIC_LVT0, value); 1436 apic_write(APIC_LVT0, value);
1436 } else { 1437 } else {
1437 /* Disable LVT0 */ 1438 /* Disable LVT0 */
1438 apic_write_around(APIC_LVT0, APIC_LVT_MASKED); 1439 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1439 } 1440 }
1440 1441
1441 /* 1442 /*
@@ -1449,7 +1450,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1449 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); 1450 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1450 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; 1451 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1451 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); 1452 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
1452 apic_write_around(APIC_LVT1, value); 1453 apic_write(APIC_LVT1, value);
1453 } 1454 }
1454} 1455}
1455 1456
@@ -1700,7 +1701,7 @@ early_param("lapic", parse_lapic);
1700static int __init parse_nolapic(char *arg) 1701static int __init parse_nolapic(char *arg)
1701{ 1702{
1702 disable_apic = 1; 1703 disable_apic = 1;
1703 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 1704 setup_clear_cpu_cap(X86_FEATURE_APIC);
1704 return 0; 1705 return 0;
1705} 1706}
1706early_param("nolapic", parse_nolapic); 1707early_param("nolapic", parse_nolapic);
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index 1e3d32e27c14..7f1f030da7ee 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -54,7 +54,7 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
54/* 54/*
55 * Debug level, exported for io_apic.c 55 * Debug level, exported for io_apic.c
56 */ 56 */
57int apic_verbosity; 57unsigned int apic_verbosity;
58 58
59/* Have we found an MP table */ 59/* Have we found an MP table */
60int smp_found_config; 60int smp_found_config;
@@ -314,7 +314,7 @@ static void setup_APIC_timer(void)
314 314
315#define TICK_COUNT 100000000 315#define TICK_COUNT 100000000
316 316
317static void __init calibrate_APIC_clock(void) 317static int __init calibrate_APIC_clock(void)
318{ 318{
319 unsigned apic, apic_start; 319 unsigned apic, apic_start;
320 unsigned long tsc, tsc_start; 320 unsigned long tsc, tsc_start;
@@ -368,6 +368,17 @@ static void __init calibrate_APIC_clock(void)
368 clockevent_delta2ns(0xF, &lapic_clockevent); 368 clockevent_delta2ns(0xF, &lapic_clockevent);
369 369
370 calibration_result = result / HZ; 370 calibration_result = result / HZ;
371
372 /*
373 * Do a sanity check on the APIC calibration result
374 */
375 if (calibration_result < (1000000 / HZ)) {
376 printk(KERN_WARNING
377 "APIC frequency too slow, disabling apic timer\n");
378 return -1;
379 }
380
381 return 0;
371} 382}
372 383
373/* 384/*
@@ -394,14 +405,7 @@ void __init setup_boot_APIC_clock(void)
394 } 405 }
395 406
396 printk(KERN_INFO "Using local APIC timer interrupts.\n"); 407 printk(KERN_INFO "Using local APIC timer interrupts.\n");
397 calibrate_APIC_clock(); 408 if (calibrate_APIC_clock()) {
398
399 /*
400 * Do a sanity check on the APIC calibration result
401 */
402 if (calibration_result < (1000000 / HZ)) {
403 printk(KERN_WARNING
404 "APIC frequency too slow, disabling apic timer\n");
405 /* No broadcast on UP ! */ 409 /* No broadcast on UP ! */
406 if (num_possible_cpus() > 1) 410 if (num_possible_cpus() > 1)
407 setup_APIC_timer(); 411 setup_APIC_timer();
@@ -1337,7 +1341,7 @@ early_param("apic", apic_set_verbosity);
1337static __init int setup_disableapic(char *str) 1341static __init int setup_disableapic(char *str)
1338{ 1342{
1339 disable_apic = 1; 1343 disable_apic = 1;
1340 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 1344 setup_clear_cpu_cap(X86_FEATURE_APIC);
1341 return 0; 1345 return 0;
1342} 1346}
1343early_param("disableapic", setup_disableapic); 1347early_param("disableapic", setup_disableapic);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index bf9b441331e9..9ee24e6bc4b0 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -219,7 +219,6 @@
219#include <linux/time.h> 219#include <linux/time.h>
220#include <linux/sched.h> 220#include <linux/sched.h>
221#include <linux/pm.h> 221#include <linux/pm.h>
222#include <linux/pm_legacy.h>
223#include <linux/capability.h> 222#include <linux/capability.h>
224#include <linux/device.h> 223#include <linux/device.h>
225#include <linux/kernel.h> 224#include <linux/kernel.h>
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index bacf5deeec2d..aa89387006fe 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -18,6 +18,8 @@
18#include <asm/ia32.h> 18#include <asm/ia32.h>
19#include <asm/bootparam.h> 19#include <asm/bootparam.h>
20 20
21#include <xen/interface/xen.h>
22
21#define __NO_STUBS 1 23#define __NO_STUBS 1
22#undef __SYSCALL 24#undef __SYSCALL
23#undef _ASM_X86_64_UNISTD_H_ 25#undef _ASM_X86_64_UNISTD_H_
@@ -131,5 +133,14 @@ int main(void)
131 OFFSET(BP_loadflags, boot_params, hdr.loadflags); 133 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
132 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); 134 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
133 OFFSET(BP_version, boot_params, hdr.version); 135 OFFSET(BP_version, boot_params, hdr.version);
136
137 BLANK();
138 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
139#ifdef CONFIG_XEN
140 BLANK();
141 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
142 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
143#undef ENTRY
144#endif
134 return 0; 145 return 0;
135} 146}
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
new file mode 100644
index 000000000000..c639bd55391c
--- /dev/null
+++ b/arch/x86/kernel/bios_uv.c
@@ -0,0 +1,48 @@
1/*
2 * BIOS run time interface routines.
3 *
4 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20
21#include <asm/uv/bios.h>
22
23const char *
24x86_bios_strerror(long status)
25{
26 const char *str;
27 switch (status) {
28 case 0: str = "Call completed without error"; break;
29 case -1: str = "Not implemented"; break;
30 case -2: str = "Invalid argument"; break;
31 case -3: str = "Call completed with error"; break;
32 default: str = "Unknown BIOS status code"; break;
33 }
34 return str;
35}
36
37long
38x86_bios_freq_base(unsigned long which, unsigned long *ticks_per_second,
39 unsigned long *drift_info)
40{
41 struct uv_bios_retval isrv;
42
43 BIOS_CALL(isrv, BIOS_FREQ_BASE, which, 0, 0, 0, 0, 0, 0);
44 *ticks_per_second = isrv.v0;
45 *drift_info = isrv.v1;
46 return isrv.status;
47}
48EXPORT_SYMBOL_GPL(x86_bios_freq_base);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 81a07ca65d44..cae9cabc3031 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -24,8 +24,6 @@
24extern void vide(void); 24extern void vide(void);
25__asm__(".align 4\nvide: ret"); 25__asm__(".align 4\nvide: ret");
26 26
27int force_mwait __cpuinitdata;
28
29static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) 27static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
30{ 28{
31 if (cpuid_eax(0x80000000) >= 0x80000007) { 29 if (cpuid_eax(0x80000000) >= 0x80000007) {
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
index 7c36fb8a28d4..d1692b2a41ff 100644
--- a/arch/x86/kernel/cpu/amd_64.c
+++ b/arch/x86/kernel/cpu/amd_64.c
@@ -115,6 +115,8 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
115 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ 115 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
116 if (c->x86_power & (1<<8)) 116 if (c->x86_power & (1<<8))
117 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 117 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
118
119 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
118} 120}
119 121
120static void __cpuinit init_amd(struct cpuinfo_x86 *c) 122static void __cpuinit init_amd(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 1b1c56bb338f..c9b58a806e85 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -131,13 +131,7 @@ static void __init check_popad(void)
131 * (for due to lack of "invlpg" and working WP on a i386) 131 * (for due to lack of "invlpg" and working WP on a i386)
132 * - In order to run on anything without a TSC, we need to be 132 * - In order to run on anything without a TSC, we need to be
133 * compiled for a i486. 133 * compiled for a i486.
134 * - In order to support the local APIC on a buggy Pentium machine, 134 */
135 * we need to be compiled with CONFIG_X86_GOOD_APIC disabled,
136 * which happens implicitly if compiled for a Pentium or lower
137 * (unless an advanced selection of CPU features is used) as an
138 * otherwise config implies a properly working local APIC without
139 * the need to do extra reads from the APIC.
140*/
141 135
142static void __init check_config(void) 136static void __init check_config(void)
143{ 137{
@@ -151,21 +145,6 @@ static void __init check_config(void)
151 if (boot_cpu_data.x86 == 3) 145 if (boot_cpu_data.x86 == 3)
152 panic("Kernel requires i486+ for 'invlpg' and other features"); 146 panic("Kernel requires i486+ for 'invlpg' and other features");
153#endif 147#endif
154
155/*
156 * If we were told we had a good local APIC, check for buggy Pentia,
157 * i.e. all B steppings and the C2 stepping of P54C when using their
158 * integrated APIC (see 11AP erratum in "Pentium Processor
159 * Specification Update").
160 */
161#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_GOOD_APIC)
162 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL
163 && cpu_has_apic
164 && boot_cpu_data.x86 == 5
165 && boot_cpu_data.x86_model == 2
166 && (boot_cpu_data.x86_mask < 6 || boot_cpu_data.x86_mask == 11))
167 panic("Kernel compiled for PMMX+, assumes a local APIC without the read-before-write bug!");
168#endif
169} 148}
170 149
171 150
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
index 7b8cc72feb40..dd6e3f15017e 100644
--- a/arch/x86/kernel/cpu/common_64.c
+++ b/arch/x86/kernel/cpu/common_64.c
@@ -7,15 +7,13 @@
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/kgdb.h> 8#include <linux/kgdb.h>
9#include <linux/topology.h> 9#include <linux/topology.h>
10#include <linux/string.h>
11#include <linux/delay.h> 10#include <linux/delay.h>
12#include <linux/smp.h> 11#include <linux/smp.h>
13#include <linux/module.h>
14#include <linux/percpu.h> 12#include <linux/percpu.h>
15#include <asm/processor.h>
16#include <asm/i387.h> 13#include <asm/i387.h>
17#include <asm/msr.h> 14#include <asm/msr.h>
18#include <asm/io.h> 15#include <asm/io.h>
16#include <asm/linkage.h>
19#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
20#include <asm/mtrr.h> 18#include <asm/mtrr.h>
21#include <asm/mce.h> 19#include <asm/mce.h>
@@ -305,7 +303,6 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
305 c->x86_capability[2] = cpuid_edx(0x80860001); 303 c->x86_capability[2] = cpuid_edx(0x80860001);
306 } 304 }
307 305
308 c->extended_cpuid_level = cpuid_eax(0x80000000);
309 if (c->extended_cpuid_level >= 0x80000007) 306 if (c->extended_cpuid_level >= 0x80000007)
310 c->x86_power = cpuid_edx(0x80000007); 307 c->x86_power = cpuid_edx(0x80000007);
311 308
@@ -316,18 +313,11 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
316 c->x86_phys_bits = eax & 0xff; 313 c->x86_phys_bits = eax & 0xff;
317 } 314 }
318 315
319 /* Assume all 64-bit CPUs support 32-bit syscall */
320 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
321
322 if (c->x86_vendor != X86_VENDOR_UNKNOWN && 316 if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
323 cpu_devs[c->x86_vendor]->c_early_init) 317 cpu_devs[c->x86_vendor]->c_early_init)
324 cpu_devs[c->x86_vendor]->c_early_init(c); 318 cpu_devs[c->x86_vendor]->c_early_init(c);
325 319
326 validate_pat_support(c); 320 validate_pat_support(c);
327
328 /* early_param could clear that, but recall get it set again */
329 if (disable_apic)
330 clear_cpu_cap(c, X86_FEATURE_APIC);
331} 321}
332 322
333/* 323/*
@@ -517,8 +507,7 @@ void pda_init(int cpu)
517} 507}
518 508
519char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + 509char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
520 DEBUG_STKSZ] 510 DEBUG_STKSZ] __page_aligned_bss;
521__attribute__((section(".bss.page_aligned")));
522 511
523extern asmlinkage void ignore_sysret(void); 512extern asmlinkage void ignore_sysret(void);
524 513
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index b0c8208df9fa..ff2fff56f0a8 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -200,10 +200,12 @@ static void drv_read(struct drv_cmd *cmd)
200static void drv_write(struct drv_cmd *cmd) 200static void drv_write(struct drv_cmd *cmd)
201{ 201{
202 cpumask_t saved_mask = current->cpus_allowed; 202 cpumask_t saved_mask = current->cpus_allowed;
203 cpumask_of_cpu_ptr_declare(cpu_mask);
203 unsigned int i; 204 unsigned int i;
204 205
205 for_each_cpu_mask(i, cmd->mask) { 206 for_each_cpu_mask_nr(i, cmd->mask) {
206 set_cpus_allowed_ptr(current, &cpumask_of_cpu(i)); 207 cpumask_of_cpu_ptr_next(cpu_mask, i);
208 set_cpus_allowed_ptr(current, cpu_mask);
207 do_drv_write(cmd); 209 do_drv_write(cmd);
208 } 210 }
209 211
@@ -267,11 +269,12 @@ static unsigned int get_measured_perf(unsigned int cpu)
267 } aperf_cur, mperf_cur; 269 } aperf_cur, mperf_cur;
268 270
269 cpumask_t saved_mask; 271 cpumask_t saved_mask;
272 cpumask_of_cpu_ptr(cpu_mask, cpu);
270 unsigned int perf_percent; 273 unsigned int perf_percent;
271 unsigned int retval; 274 unsigned int retval;
272 275
273 saved_mask = current->cpus_allowed; 276 saved_mask = current->cpus_allowed;
274 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 277 set_cpus_allowed_ptr(current, cpu_mask);
275 if (get_cpu() != cpu) { 278 if (get_cpu() != cpu) {
276 /* We were not able to run on requested processor */ 279 /* We were not able to run on requested processor */
277 put_cpu(); 280 put_cpu();
@@ -337,6 +340,7 @@ static unsigned int get_measured_perf(unsigned int cpu)
337 340
338static unsigned int get_cur_freq_on_cpu(unsigned int cpu) 341static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
339{ 342{
343 cpumask_of_cpu_ptr(cpu_mask, cpu);
340 struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu); 344 struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu);
341 unsigned int freq; 345 unsigned int freq;
342 unsigned int cached_freq; 346 unsigned int cached_freq;
@@ -349,7 +353,7 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
349 } 353 }
350 354
351 cached_freq = data->freq_table[data->acpi_data->state].frequency; 355 cached_freq = data->freq_table[data->acpi_data->state].frequency;
352 freq = extract_freq(get_cur_val(&cpumask_of_cpu(cpu)), data); 356 freq = extract_freq(get_cur_val(cpu_mask), data);
353 if (freq != cached_freq) { 357 if (freq != cached_freq) {
354 /* 358 /*
355 * The dreaded BIOS frequency change behind our back. 359 * The dreaded BIOS frequency change behind our back.
@@ -451,7 +455,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
451 455
452 freqs.old = perf->states[perf->state].core_frequency * 1000; 456 freqs.old = perf->states[perf->state].core_frequency * 1000;
453 freqs.new = data->freq_table[next_state].frequency; 457 freqs.new = data->freq_table[next_state].frequency;
454 for_each_cpu_mask(i, cmd.mask) { 458 for_each_cpu_mask_nr(i, cmd.mask) {
455 freqs.cpu = i; 459 freqs.cpu = i;
456 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 460 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
457 } 461 }
@@ -466,7 +470,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
466 } 470 }
467 } 471 }
468 472
469 for_each_cpu_mask(i, cmd.mask) { 473 for_each_cpu_mask_nr(i, cmd.mask) {
470 freqs.cpu = i; 474 freqs.cpu = i;
471 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 475 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
472 } 476 }
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index 199e4e05e5dc..f1685fb91fbd 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -122,7 +122,7 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy,
122 return 0; 122 return 0;
123 123
124 /* notifiers */ 124 /* notifiers */
125 for_each_cpu_mask(i, policy->cpus) { 125 for_each_cpu_mask_nr(i, policy->cpus) {
126 freqs.cpu = i; 126 freqs.cpu = i;
127 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 127 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
128 } 128 }
@@ -130,11 +130,11 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy,
130 /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software 130 /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software
131 * Developer's Manual, Volume 3 131 * Developer's Manual, Volume 3
132 */ 132 */
133 for_each_cpu_mask(i, policy->cpus) 133 for_each_cpu_mask_nr(i, policy->cpus)
134 cpufreq_p4_setdc(i, p4clockmod_table[newstate].index); 134 cpufreq_p4_setdc(i, p4clockmod_table[newstate].index);
135 135
136 /* notifiers */ 136 /* notifiers */
137 for_each_cpu_mask(i, policy->cpus) { 137 for_each_cpu_mask_nr(i, policy->cpus) {
138 freqs.cpu = i; 138 freqs.cpu = i;
139 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 139 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
140 } 140 }
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
index f8a63b3664e3..35fb4eaf6e1c 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
@@ -1,5 +1,4 @@
1/* 1/*
2 * $Id: powernow-k7.h,v 1.2 2003/02/10 18:26:01 davej Exp $
3 * (C) 2003 Dave Jones. 2 * (C) 2003 Dave Jones.
4 * 3 *
5 * Licensed under the terms of the GNU GPL License version 2. 4 * Licensed under the terms of the GNU GPL License version 2.
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 206791eb46e3..53c7b6936973 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -479,11 +479,12 @@ static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvi
479static int check_supported_cpu(unsigned int cpu) 479static int check_supported_cpu(unsigned int cpu)
480{ 480{
481 cpumask_t oldmask; 481 cpumask_t oldmask;
482 cpumask_of_cpu_ptr(cpu_mask, cpu);
482 u32 eax, ebx, ecx, edx; 483 u32 eax, ebx, ecx, edx;
483 unsigned int rc = 0; 484 unsigned int rc = 0;
484 485
485 oldmask = current->cpus_allowed; 486 oldmask = current->cpus_allowed;
486 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 487 set_cpus_allowed_ptr(current, cpu_mask);
487 488
488 if (smp_processor_id() != cpu) { 489 if (smp_processor_id() != cpu) {
489 printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu); 490 printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu);
@@ -966,7 +967,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i
966 freqs.old = find_khz_freq_from_fid(data->currfid); 967 freqs.old = find_khz_freq_from_fid(data->currfid);
967 freqs.new = find_khz_freq_from_fid(fid); 968 freqs.new = find_khz_freq_from_fid(fid);
968 969
969 for_each_cpu_mask(i, *(data->available_cores)) { 970 for_each_cpu_mask_nr(i, *(data->available_cores)) {
970 freqs.cpu = i; 971 freqs.cpu = i;
971 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 972 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
972 } 973 }
@@ -974,7 +975,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i
974 res = transition_fid_vid(data, fid, vid); 975 res = transition_fid_vid(data, fid, vid);
975 freqs.new = find_khz_freq_from_fid(data->currfid); 976 freqs.new = find_khz_freq_from_fid(data->currfid);
976 977
977 for_each_cpu_mask(i, *(data->available_cores)) { 978 for_each_cpu_mask_nr(i, *(data->available_cores)) {
978 freqs.cpu = i; 979 freqs.cpu = i;
979 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 980 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
980 } 981 }
@@ -997,7 +998,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
997 freqs.old = find_khz_freq_from_pstate(data->powernow_table, data->currpstate); 998 freqs.old = find_khz_freq_from_pstate(data->powernow_table, data->currpstate);
998 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); 999 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
999 1000
1000 for_each_cpu_mask(i, *(data->available_cores)) { 1001 for_each_cpu_mask_nr(i, *(data->available_cores)) {
1001 freqs.cpu = i; 1002 freqs.cpu = i;
1002 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 1003 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
1003 } 1004 }
@@ -1005,7 +1006,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
1005 res = transition_pstate(data, pstate); 1006 res = transition_pstate(data, pstate);
1006 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); 1007 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
1007 1008
1008 for_each_cpu_mask(i, *(data->available_cores)) { 1009 for_each_cpu_mask_nr(i, *(data->available_cores)) {
1009 freqs.cpu = i; 1010 freqs.cpu = i;
1010 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 1011 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
1011 } 1012 }
@@ -1016,6 +1017,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
1016static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation) 1017static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation)
1017{ 1018{
1018 cpumask_t oldmask; 1019 cpumask_t oldmask;
1020 cpumask_of_cpu_ptr(cpu_mask, pol->cpu);
1019 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); 1021 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1020 u32 checkfid; 1022 u32 checkfid;
1021 u32 checkvid; 1023 u32 checkvid;
@@ -1030,7 +1032,7 @@ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsi
1030 1032
1031 /* only run on specific CPU from here on */ 1033 /* only run on specific CPU from here on */
1032 oldmask = current->cpus_allowed; 1034 oldmask = current->cpus_allowed;
1033 set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); 1035 set_cpus_allowed_ptr(current, cpu_mask);
1034 1036
1035 if (smp_processor_id() != pol->cpu) { 1037 if (smp_processor_id() != pol->cpu) {
1036 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); 1038 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
@@ -1105,6 +1107,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1105{ 1107{
1106 struct powernow_k8_data *data; 1108 struct powernow_k8_data *data;
1107 cpumask_t oldmask; 1109 cpumask_t oldmask;
1110 cpumask_of_cpu_ptr_declare(newmask);
1108 int rc; 1111 int rc;
1109 1112
1110 if (!cpu_online(pol->cpu)) 1113 if (!cpu_online(pol->cpu))
@@ -1156,7 +1159,8 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1156 1159
1157 /* only run on specific CPU from here on */ 1160 /* only run on specific CPU from here on */
1158 oldmask = current->cpus_allowed; 1161 oldmask = current->cpus_allowed;
1159 set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); 1162 cpumask_of_cpu_ptr_next(newmask, pol->cpu);
1163 set_cpus_allowed_ptr(current, newmask);
1160 1164
1161 if (smp_processor_id() != pol->cpu) { 1165 if (smp_processor_id() != pol->cpu) {
1162 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); 1166 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
@@ -1178,7 +1182,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1178 set_cpus_allowed_ptr(current, &oldmask); 1182 set_cpus_allowed_ptr(current, &oldmask);
1179 1183
1180 if (cpu_family == CPU_HW_PSTATE) 1184 if (cpu_family == CPU_HW_PSTATE)
1181 pol->cpus = cpumask_of_cpu(pol->cpu); 1185 pol->cpus = *newmask;
1182 else 1186 else
1183 pol->cpus = per_cpu(cpu_core_map, pol->cpu); 1187 pol->cpus = per_cpu(cpu_core_map, pol->cpu);
1184 data->available_cores = &(pol->cpus); 1188 data->available_cores = &(pol->cpus);
@@ -1244,6 +1248,7 @@ static unsigned int powernowk8_get (unsigned int cpu)
1244{ 1248{
1245 struct powernow_k8_data *data; 1249 struct powernow_k8_data *data;
1246 cpumask_t oldmask = current->cpus_allowed; 1250 cpumask_t oldmask = current->cpus_allowed;
1251 cpumask_of_cpu_ptr(newmask, cpu);
1247 unsigned int khz = 0; 1252 unsigned int khz = 0;
1248 unsigned int first; 1253 unsigned int first;
1249 1254
@@ -1253,7 +1258,7 @@ static unsigned int powernowk8_get (unsigned int cpu)
1253 if (!data) 1258 if (!data)
1254 return -EINVAL; 1259 return -EINVAL;
1255 1260
1256 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 1261 set_cpus_allowed_ptr(current, newmask);
1257 if (smp_processor_id() != cpu) { 1262 if (smp_processor_id() != cpu) {
1258 printk(KERN_ERR PFX 1263 printk(KERN_ERR PFX
1259 "limiting to CPU %d failed in powernowk8_get\n", cpu); 1264 "limiting to CPU %d failed in powernowk8_get\n", cpu);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index 908dd347c67e..ca2ac13b7af2 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -28,7 +28,8 @@
28#define PFX "speedstep-centrino: " 28#define PFX "speedstep-centrino: "
29#define MAINTAINER "cpufreq@lists.linux.org.uk" 29#define MAINTAINER "cpufreq@lists.linux.org.uk"
30 30
31#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) 31#define dprintk(msg...) \
32 cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
32 33
33#define INTEL_MSR_RANGE (0xffff) 34#define INTEL_MSR_RANGE (0xffff)
34 35
@@ -66,11 +67,12 @@ struct cpu_model
66 67
67 struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */ 68 struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */
68}; 69};
69static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x); 70static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
71 const struct cpu_id *x);
70 72
71/* Operating points for current CPU */ 73/* Operating points for current CPU */
72static struct cpu_model *centrino_model[NR_CPUS]; 74static DEFINE_PER_CPU(struct cpu_model *, centrino_model);
73static const struct cpu_id *centrino_cpu[NR_CPUS]; 75static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu);
74 76
75static struct cpufreq_driver centrino_driver; 77static struct cpufreq_driver centrino_driver;
76 78
@@ -255,7 +257,7 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy)
255 return -ENOENT; 257 return -ENOENT;
256 } 258 }
257 259
258 centrino_model[policy->cpu] = model; 260 per_cpu(centrino_model, policy->cpu) = model;
259 261
260 dprintk("found \"%s\": max frequency: %dkHz\n", 262 dprintk("found \"%s\": max frequency: %dkHz\n",
261 model->model_name, model->max_freq); 263 model->model_name, model->max_freq);
@@ -264,10 +266,14 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy)
264} 266}
265 267
266#else 268#else
267static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) { return -ENODEV; } 269static inline int centrino_cpu_init_table(struct cpufreq_policy *policy)
270{
271 return -ENODEV;
272}
268#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */ 273#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */
269 274
270static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x) 275static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
276 const struct cpu_id *x)
271{ 277{
272 if ((c->x86 == x->x86) && 278 if ((c->x86 == x->x86) &&
273 (c->x86_model == x->x86_model) && 279 (c->x86_model == x->x86_model) &&
@@ -286,23 +292,28 @@ static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe)
286 * for centrino, as some DSDTs are buggy. 292 * for centrino, as some DSDTs are buggy.
287 * Ideally, this can be done using the acpi_data structure. 293 * Ideally, this can be done using the acpi_data structure.
288 */ 294 */
289 if ((centrino_cpu[cpu] == &cpu_ids[CPU_BANIAS]) || 295 if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) ||
290 (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_A1]) || 296 (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) ||
291 (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_B0])) { 297 (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) {
292 msr = (msr >> 8) & 0xff; 298 msr = (msr >> 8) & 0xff;
293 return msr * 100000; 299 return msr * 100000;
294 } 300 }
295 301
296 if ((!centrino_model[cpu]) || (!centrino_model[cpu]->op_points)) 302 if ((!per_cpu(centrino_model, cpu)) ||
303 (!per_cpu(centrino_model, cpu)->op_points))
297 return 0; 304 return 0;
298 305
299 msr &= 0xffff; 306 msr &= 0xffff;
300 for (i=0;centrino_model[cpu]->op_points[i].frequency != CPUFREQ_TABLE_END; i++) { 307 for (i = 0;
301 if (msr == centrino_model[cpu]->op_points[i].index) 308 per_cpu(centrino_model, cpu)->op_points[i].frequency
302 return centrino_model[cpu]->op_points[i].frequency; 309 != CPUFREQ_TABLE_END;
310 i++) {
311 if (msr == per_cpu(centrino_model, cpu)->op_points[i].index)
312 return per_cpu(centrino_model, cpu)->
313 op_points[i].frequency;
303 } 314 }
304 if (failsafe) 315 if (failsafe)
305 return centrino_model[cpu]->op_points[i-1].frequency; 316 return per_cpu(centrino_model, cpu)->op_points[i-1].frequency;
306 else 317 else
307 return 0; 318 return 0;
308} 319}
@@ -313,9 +324,10 @@ static unsigned int get_cur_freq(unsigned int cpu)
313 unsigned l, h; 324 unsigned l, h;
314 unsigned clock_freq; 325 unsigned clock_freq;
315 cpumask_t saved_mask; 326 cpumask_t saved_mask;
327 cpumask_of_cpu_ptr(new_mask, cpu);
316 328
317 saved_mask = current->cpus_allowed; 329 saved_mask = current->cpus_allowed;
318 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 330 set_cpus_allowed_ptr(current, new_mask);
319 if (smp_processor_id() != cpu) 331 if (smp_processor_id() != cpu)
320 return 0; 332 return 0;
321 333
@@ -347,7 +359,8 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
347 int i; 359 int i;
348 360
349 /* Only Intel makes Enhanced Speedstep-capable CPUs */ 361 /* Only Intel makes Enhanced Speedstep-capable CPUs */
350 if (cpu->x86_vendor != X86_VENDOR_INTEL || !cpu_has(cpu, X86_FEATURE_EST)) 362 if (cpu->x86_vendor != X86_VENDOR_INTEL ||
363 !cpu_has(cpu, X86_FEATURE_EST))
351 return -ENODEV; 364 return -ENODEV;
352 365
353 if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC)) 366 if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC))
@@ -361,9 +374,9 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
361 break; 374 break;
362 375
363 if (i != N_IDS) 376 if (i != N_IDS)
364 centrino_cpu[policy->cpu] = &cpu_ids[i]; 377 per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i];
365 378
366 if (!centrino_cpu[policy->cpu]) { 379 if (!per_cpu(centrino_cpu, policy->cpu)) {
367 dprintk("found unsupported CPU with " 380 dprintk("found unsupported CPU with "
368 "Enhanced SpeedStep: send /proc/cpuinfo to " 381 "Enhanced SpeedStep: send /proc/cpuinfo to "
369 MAINTAINER "\n"); 382 MAINTAINER "\n");
@@ -386,23 +399,26 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
386 /* check to see if it stuck */ 399 /* check to see if it stuck */
387 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 400 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
388 if (!(l & (1<<16))) { 401 if (!(l & (1<<16))) {
389 printk(KERN_INFO PFX "couldn't enable Enhanced SpeedStep\n"); 402 printk(KERN_INFO PFX
403 "couldn't enable Enhanced SpeedStep\n");
390 return -ENODEV; 404 return -ENODEV;
391 } 405 }
392 } 406 }
393 407
394 freq = get_cur_freq(policy->cpu); 408 freq = get_cur_freq(policy->cpu);
395 409 policy->cpuinfo.transition_latency = 10000;
396 policy->cpuinfo.transition_latency = 10000; /* 10uS transition latency */ 410 /* 10uS transition latency */
397 policy->cur = freq; 411 policy->cur = freq;
398 412
399 dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur); 413 dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur);
400 414
401 ret = cpufreq_frequency_table_cpuinfo(policy, centrino_model[policy->cpu]->op_points); 415 ret = cpufreq_frequency_table_cpuinfo(policy,
416 per_cpu(centrino_model, policy->cpu)->op_points);
402 if (ret) 417 if (ret)
403 return (ret); 418 return (ret);
404 419
405 cpufreq_frequency_table_get_attr(centrino_model[policy->cpu]->op_points, policy->cpu); 420 cpufreq_frequency_table_get_attr(
421 per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu);
406 422
407 return 0; 423 return 0;
408} 424}
@@ -411,12 +427,12 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy)
411{ 427{
412 unsigned int cpu = policy->cpu; 428 unsigned int cpu = policy->cpu;
413 429
414 if (!centrino_model[cpu]) 430 if (!per_cpu(centrino_model, cpu))
415 return -ENODEV; 431 return -ENODEV;
416 432
417 cpufreq_frequency_table_put_attr(cpu); 433 cpufreq_frequency_table_put_attr(cpu);
418 434
419 centrino_model[cpu] = NULL; 435 per_cpu(centrino_model, cpu) = NULL;
420 436
421 return 0; 437 return 0;
422} 438}
@@ -430,17 +446,26 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy)
430 */ 446 */
431static int centrino_verify (struct cpufreq_policy *policy) 447static int centrino_verify (struct cpufreq_policy *policy)
432{ 448{
433 return cpufreq_frequency_table_verify(policy, centrino_model[policy->cpu]->op_points); 449 return cpufreq_frequency_table_verify(policy,
450 per_cpu(centrino_model, policy->cpu)->op_points);
434} 451}
435 452
436/** 453/**
437 * centrino_setpolicy - set a new CPUFreq policy 454 * centrino_setpolicy - set a new CPUFreq policy
438 * @policy: new policy 455 * @policy: new policy
439 * @target_freq: the target frequency 456 * @target_freq: the target frequency
440 * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) 457 * @relation: how that frequency relates to achieved frequency
458 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
441 * 459 *
442 * Sets a new CPUFreq policy. 460 * Sets a new CPUFreq policy.
443 */ 461 */
462struct allmasks {
463 cpumask_t online_policy_cpus;
464 cpumask_t saved_mask;
465 cpumask_t set_mask;
466 cpumask_t covered_cpus;
467};
468
444static int centrino_target (struct cpufreq_policy *policy, 469static int centrino_target (struct cpufreq_policy *policy,
445 unsigned int target_freq, 470 unsigned int target_freq,
446 unsigned int relation) 471 unsigned int relation)
@@ -448,48 +473,55 @@ static int centrino_target (struct cpufreq_policy *policy,
448 unsigned int newstate = 0; 473 unsigned int newstate = 0;
449 unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu; 474 unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu;
450 struct cpufreq_freqs freqs; 475 struct cpufreq_freqs freqs;
451 cpumask_t online_policy_cpus;
452 cpumask_t saved_mask;
453 cpumask_t set_mask;
454 cpumask_t covered_cpus;
455 int retval = 0; 476 int retval = 0;
456 unsigned int j, k, first_cpu, tmp; 477 unsigned int j, k, first_cpu, tmp;
457 478 CPUMASK_ALLOC(allmasks);
458 if (unlikely(centrino_model[cpu] == NULL)) 479 CPUMASK_PTR(online_policy_cpus, allmasks);
459 return -ENODEV; 480 CPUMASK_PTR(saved_mask, allmasks);
481 CPUMASK_PTR(set_mask, allmasks);
482 CPUMASK_PTR(covered_cpus, allmasks);
483
484 if (unlikely(allmasks == NULL))
485 return -ENOMEM;
486
487 if (unlikely(per_cpu(centrino_model, cpu) == NULL)) {
488 retval = -ENODEV;
489 goto out;
490 }
460 491
461 if (unlikely(cpufreq_frequency_table_target(policy, 492 if (unlikely(cpufreq_frequency_table_target(policy,
462 centrino_model[cpu]->op_points, 493 per_cpu(centrino_model, cpu)->op_points,
463 target_freq, 494 target_freq,
464 relation, 495 relation,
465 &newstate))) { 496 &newstate))) {
466 return -EINVAL; 497 retval = -EINVAL;
498 goto out;
467 } 499 }
468 500
469#ifdef CONFIG_HOTPLUG_CPU 501#ifdef CONFIG_HOTPLUG_CPU
470 /* cpufreq holds the hotplug lock, so we are safe from here on */ 502 /* cpufreq holds the hotplug lock, so we are safe from here on */
471 cpus_and(online_policy_cpus, cpu_online_map, policy->cpus); 503 cpus_and(*online_policy_cpus, cpu_online_map, policy->cpus);
472#else 504#else
473 online_policy_cpus = policy->cpus; 505 *online_policy_cpus = policy->cpus;
474#endif 506#endif
475 507
476 saved_mask = current->cpus_allowed; 508 *saved_mask = current->cpus_allowed;
477 first_cpu = 1; 509 first_cpu = 1;
478 cpus_clear(covered_cpus); 510 cpus_clear(*covered_cpus);
479 for_each_cpu_mask(j, online_policy_cpus) { 511 for_each_cpu_mask_nr(j, *online_policy_cpus) {
480 /* 512 /*
481 * Support for SMP systems. 513 * Support for SMP systems.
482 * Make sure we are running on CPU that wants to change freq 514 * Make sure we are running on CPU that wants to change freq
483 */ 515 */
484 cpus_clear(set_mask); 516 cpus_clear(*set_mask);
485 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) 517 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
486 cpus_or(set_mask, set_mask, online_policy_cpus); 518 cpus_or(*set_mask, *set_mask, *online_policy_cpus);
487 else 519 else
488 cpu_set(j, set_mask); 520 cpu_set(j, *set_mask);
489 521
490 set_cpus_allowed_ptr(current, &set_mask); 522 set_cpus_allowed_ptr(current, set_mask);
491 preempt_disable(); 523 preempt_disable();
492 if (unlikely(!cpu_isset(smp_processor_id(), set_mask))) { 524 if (unlikely(!cpu_isset(smp_processor_id(), *set_mask))) {
493 dprintk("couldn't limit to CPUs in this domain\n"); 525 dprintk("couldn't limit to CPUs in this domain\n");
494 retval = -EAGAIN; 526 retval = -EAGAIN;
495 if (first_cpu) { 527 if (first_cpu) {
@@ -500,7 +532,7 @@ static int centrino_target (struct cpufreq_policy *policy,
500 break; 532 break;
501 } 533 }
502 534
503 msr = centrino_model[cpu]->op_points[newstate].index; 535 msr = per_cpu(centrino_model, cpu)->op_points[newstate].index;
504 536
505 if (first_cpu) { 537 if (first_cpu) {
506 rdmsr(MSR_IA32_PERF_CTL, oldmsr, h); 538 rdmsr(MSR_IA32_PERF_CTL, oldmsr, h);
@@ -517,7 +549,7 @@ static int centrino_target (struct cpufreq_policy *policy,
517 dprintk("target=%dkHz old=%d new=%d msr=%04x\n", 549 dprintk("target=%dkHz old=%d new=%d msr=%04x\n",
518 target_freq, freqs.old, freqs.new, msr); 550 target_freq, freqs.old, freqs.new, msr);
519 551
520 for_each_cpu_mask(k, online_policy_cpus) { 552 for_each_cpu_mask_nr(k, *online_policy_cpus) {
521 freqs.cpu = k; 553 freqs.cpu = k;
522 cpufreq_notify_transition(&freqs, 554 cpufreq_notify_transition(&freqs,
523 CPUFREQ_PRECHANGE); 555 CPUFREQ_PRECHANGE);
@@ -536,11 +568,11 @@ static int centrino_target (struct cpufreq_policy *policy,
536 break; 568 break;
537 } 569 }
538 570
539 cpu_set(j, covered_cpus); 571 cpu_set(j, *covered_cpus);
540 preempt_enable(); 572 preempt_enable();
541 } 573 }
542 574
543 for_each_cpu_mask(k, online_policy_cpus) { 575 for_each_cpu_mask_nr(k, *online_policy_cpus) {
544 freqs.cpu = k; 576 freqs.cpu = k;
545 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 577 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
546 } 578 }
@@ -553,10 +585,12 @@ static int centrino_target (struct cpufreq_policy *policy,
553 * Best effort undo.. 585 * Best effort undo..
554 */ 586 */
555 587
556 if (!cpus_empty(covered_cpus)) { 588 if (!cpus_empty(*covered_cpus)) {
557 for_each_cpu_mask(j, covered_cpus) { 589 cpumask_of_cpu_ptr_declare(new_mask);
558 set_cpus_allowed_ptr(current, 590
559 &cpumask_of_cpu(j)); 591 for_each_cpu_mask_nr(j, *covered_cpus) {
592 cpumask_of_cpu_ptr_next(new_mask, j);
593 set_cpus_allowed_ptr(current, new_mask);
560 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); 594 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
561 } 595 }
562 } 596 }
@@ -564,19 +598,22 @@ static int centrino_target (struct cpufreq_policy *policy,
564 tmp = freqs.new; 598 tmp = freqs.new;
565 freqs.new = freqs.old; 599 freqs.new = freqs.old;
566 freqs.old = tmp; 600 freqs.old = tmp;
567 for_each_cpu_mask(j, online_policy_cpus) { 601 for_each_cpu_mask_nr(j, *online_policy_cpus) {
568 freqs.cpu = j; 602 freqs.cpu = j;
569 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 603 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
570 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 604 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
571 } 605 }
572 } 606 }
573 set_cpus_allowed_ptr(current, &saved_mask); 607 set_cpus_allowed_ptr(current, saved_mask);
574 return 0; 608 retval = 0;
609 goto out;
575 610
576migrate_end: 611migrate_end:
577 preempt_enable(); 612 preempt_enable();
578 set_cpus_allowed_ptr(current, &saved_mask); 613 set_cpus_allowed_ptr(current, saved_mask);
579 return 0; 614out:
615 CPUMASK_FREE(allmasks);
616 return retval;
580} 617}
581 618
582static struct freq_attr* centrino_attr[] = { 619static struct freq_attr* centrino_attr[] = {
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 1b50244b1fdf..2f3728dc24f6 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -244,7 +244,8 @@ static unsigned int _speedstep_get(const cpumask_t *cpus)
244 244
245static unsigned int speedstep_get(unsigned int cpu) 245static unsigned int speedstep_get(unsigned int cpu)
246{ 246{
247 return _speedstep_get(&cpumask_of_cpu(cpu)); 247 cpumask_of_cpu_ptr(newmask, cpu);
248 return _speedstep_get(newmask);
248} 249}
249 250
250/** 251/**
@@ -279,7 +280,7 @@ static int speedstep_target (struct cpufreq_policy *policy,
279 280
280 cpus_allowed = current->cpus_allowed; 281 cpus_allowed = current->cpus_allowed;
281 282
282 for_each_cpu_mask(i, policy->cpus) { 283 for_each_cpu_mask_nr(i, policy->cpus) {
283 freqs.cpu = i; 284 freqs.cpu = i;
284 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 285 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
285 } 286 }
@@ -292,7 +293,7 @@ static int speedstep_target (struct cpufreq_policy *policy,
292 /* allow to be run on all CPUs */ 293 /* allow to be run on all CPUs */
293 set_cpus_allowed_ptr(current, &cpus_allowed); 294 set_cpus_allowed_ptr(current, &cpus_allowed);
294 295
295 for_each_cpu_mask(i, policy->cpus) { 296 for_each_cpu_mask_nr(i, policy->cpus) {
296 freqs.cpu = i; 297 freqs.cpu = i;
297 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 298 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
298 } 299 }
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 70609efdf1da..b75f2569b8f8 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -227,6 +227,16 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
227 if (cpu_has_bts) 227 if (cpu_has_bts)
228 ds_init_intel(c); 228 ds_init_intel(c);
229 229
230 /*
231 * See if we have a good local APIC by checking for buggy Pentia,
232 * i.e. all B steppings and the C2 stepping of P54C when using their
233 * integrated APIC (see 11AP erratum in "Pentium Processor
234 * Specification Update").
235 */
236 if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
237 (c->x86_mask < 0x6 || c->x86_mask == 0xb))
238 set_cpu_cap(c, X86_FEATURE_11AP);
239
230#ifdef CONFIG_X86_NUMAQ 240#ifdef CONFIG_X86_NUMAQ
231 numaq_tsc_disable(); 241 numaq_tsc_disable();
232#endif 242#endif
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 2c8afafa18e8..650d40f7912b 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -489,7 +489,7 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
489 int sibling; 489 int sibling;
490 490
491 this_leaf = CPUID4_INFO_IDX(cpu, index); 491 this_leaf = CPUID4_INFO_IDX(cpu, index);
492 for_each_cpu_mask(sibling, this_leaf->shared_cpu_map) { 492 for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) {
493 sibling_leaf = CPUID4_INFO_IDX(sibling, index); 493 sibling_leaf = CPUID4_INFO_IDX(sibling, index);
494 cpu_clear(cpu, sibling_leaf->shared_cpu_map); 494 cpu_clear(cpu, sibling_leaf->shared_cpu_map);
495 } 495 }
@@ -516,6 +516,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
516 unsigned long j; 516 unsigned long j;
517 int retval; 517 int retval;
518 cpumask_t oldmask; 518 cpumask_t oldmask;
519 cpumask_of_cpu_ptr(newmask, cpu);
519 520
520 if (num_cache_leaves == 0) 521 if (num_cache_leaves == 0)
521 return -ENOENT; 522 return -ENOENT;
@@ -526,7 +527,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
526 return -ENOMEM; 527 return -ENOMEM;
527 528
528 oldmask = current->cpus_allowed; 529 oldmask = current->cpus_allowed;
529 retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 530 retval = set_cpus_allowed_ptr(current, newmask);
530 if (retval) 531 if (retval)
531 goto out; 532 goto out;
532 533
@@ -780,15 +781,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
780 } 781 }
781 kobject_put(per_cpu(cache_kobject, cpu)); 782 kobject_put(per_cpu(cache_kobject, cpu));
782 cpuid4_cache_sysfs_exit(cpu); 783 cpuid4_cache_sysfs_exit(cpu);
783 break; 784 return retval;
784 } 785 }
785 kobject_uevent(&(this_object->kobj), KOBJ_ADD); 786 kobject_uevent(&(this_object->kobj), KOBJ_ADD);
786 } 787 }
787 if (!retval) 788 cpu_set(cpu, cache_dev_map);
788 cpu_set(cpu, cache_dev_map);
789 789
790 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD); 790 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD);
791 return retval; 791 return 0;
792} 792}
793 793
794static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) 794static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index c4a7ec31394c..65a339678ece 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -580,7 +580,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
580 char __user *buf = ubuf; 580 char __user *buf = ubuf;
581 int i, err; 581 int i, err;
582 582
583 cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL); 583 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
584 if (!cpu_tsc) 584 if (!cpu_tsc)
585 return -ENOMEM; 585 return -ENOMEM;
586 586
@@ -762,10 +762,14 @@ DEFINE_PER_CPU(struct sys_device, device_mce);
762 762
763/* Why are there no generic functions for this? */ 763/* Why are there no generic functions for this? */
764#define ACCESSOR(name, var, start) \ 764#define ACCESSOR(name, var, start) \
765 static ssize_t show_ ## name(struct sys_device *s, char *buf) { \ 765 static ssize_t show_ ## name(struct sys_device *s, \
766 struct sysdev_attribute *attr, \
767 char *buf) { \
766 return sprintf(buf, "%lx\n", (unsigned long)var); \ 768 return sprintf(buf, "%lx\n", (unsigned long)var); \
767 } \ 769 } \
768 static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \ 770 static ssize_t set_ ## name(struct sys_device *s, \
771 struct sysdev_attribute *attr, \
772 const char *buf, size_t siz) { \
769 char *end; \ 773 char *end; \
770 unsigned long new = simple_strtoul(buf, &end, 0); \ 774 unsigned long new = simple_strtoul(buf, &end, 0); \
771 if (end == buf) return -EINVAL; \ 775 if (end == buf) return -EINVAL; \
@@ -786,14 +790,16 @@ ACCESSOR(bank3ctl,bank[3],mce_restart())
786ACCESSOR(bank4ctl,bank[4],mce_restart()) 790ACCESSOR(bank4ctl,bank[4],mce_restart())
787ACCESSOR(bank5ctl,bank[5],mce_restart()) 791ACCESSOR(bank5ctl,bank[5],mce_restart())
788 792
789static ssize_t show_trigger(struct sys_device *s, char *buf) 793static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
794 char *buf)
790{ 795{
791 strcpy(buf, trigger); 796 strcpy(buf, trigger);
792 strcat(buf, "\n"); 797 strcat(buf, "\n");
793 return strlen(trigger) + 1; 798 return strlen(trigger) + 1;
794} 799}
795 800
796static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz) 801static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
802 const char *buf,size_t siz)
797{ 803{
798 char *p; 804 char *p;
799 int len; 805 int len;
@@ -806,12 +812,12 @@ static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
806} 812}
807 813
808static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); 814static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
809ACCESSOR(tolerant,tolerant,) 815static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
810ACCESSOR(check_interval,check_interval,mce_restart()) 816ACCESSOR(check_interval,check_interval,mce_restart())
811static struct sysdev_attribute *mce_attributes[] = { 817static struct sysdev_attribute *mce_attributes[] = {
812 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, 818 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
813 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, 819 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
814 &attr_tolerant, &attr_check_interval, &attr_trigger, 820 &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
815 NULL 821 NULL
816}; 822};
817 823
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 7c9a813e1193..88736cadbaa6 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -527,7 +527,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
527 if (err) 527 if (err)
528 goto out_free; 528 goto out_free;
529 529
530 for_each_cpu_mask(i, b->cpus) { 530 for_each_cpu_mask_nr(i, b->cpus) {
531 if (i == cpu) 531 if (i == cpu)
532 continue; 532 continue;
533 533
@@ -617,7 +617,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
617#endif 617#endif
618 618
619 /* remove all sibling symlinks before unregistering */ 619 /* remove all sibling symlinks before unregistering */
620 for_each_cpu_mask(i, b->cpus) { 620 for_each_cpu_mask_nr(i, b->cpus) {
621 if (i == cpu) 621 if (i == cpu)
622 continue; 622 continue;
623 623
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index eef001ad3bde..9b60fce09f75 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -102,7 +102,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
102 /* The temperature transition interrupt handler setup */ 102 /* The temperature transition interrupt handler setup */
103 h = THERMAL_APIC_VECTOR; /* our delivery vector */ 103 h = THERMAL_APIC_VECTOR; /* our delivery vector */
104 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ 104 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
105 apic_write_around(APIC_LVTTHMR, h); 105 apic_write(APIC_LVTTHMR, h);
106 106
107 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); 107 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
108 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); 108 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
@@ -114,7 +114,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
114 wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h); 114 wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h);
115 115
116 l = apic_read(APIC_LVTTHMR); 116 l = apic_read(APIC_LVTTHMR);
117 apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); 117 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
118 printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); 118 printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
119 119
120 /* enable thermal throttle processing */ 120 /* enable thermal throttle processing */
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 1f4cc48c14c6..d5ae2243f0b9 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -35,6 +35,7 @@ atomic_t therm_throt_en = ATOMIC_INIT(0);
35 35
36#define define_therm_throt_sysdev_show_func(name) \ 36#define define_therm_throt_sysdev_show_func(name) \
37static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ 37static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
38 struct sysdev_attribute *attr, \
38 char *buf) \ 39 char *buf) \
39{ \ 40{ \
40 unsigned int cpu = dev->id; \ 41 unsigned int cpu = dev->id; \
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 6d4bdc02388a..de7439f82b92 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -250,7 +250,7 @@ static void write_watchdog_counter(unsigned int perfctr_msr,
250 250
251 do_div(count, nmi_hz); 251 do_div(count, nmi_hz);
252 if(descr) 252 if(descr)
253 Dprintk("setting %s to -0x%08Lx\n", descr, count); 253 pr_debug("setting %s to -0x%08Lx\n", descr, count);
254 wrmsrl(perfctr_msr, 0 - count); 254 wrmsrl(perfctr_msr, 0 - count);
255} 255}
256 256
@@ -261,7 +261,7 @@ static void write_watchdog_counter32(unsigned int perfctr_msr,
261 261
262 do_div(count, nmi_hz); 262 do_div(count, nmi_hz);
263 if(descr) 263 if(descr)
264 Dprintk("setting %s to -0x%08Lx\n", descr, count); 264 pr_debug("setting %s to -0x%08Lx\n", descr, count);
265 wrmsr(perfctr_msr, (u32)(-count), 0); 265 wrmsr(perfctr_msr, (u32)(-count), 0);
266} 266}
267 267
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 0d0d9057e7c0..a26c480b9491 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -160,7 +160,7 @@ static void *c_start(struct seq_file *m, loff_t *pos)
160{ 160{
161 if (*pos == 0) /* just in case, cpu 0 is not the first */ 161 if (*pos == 0) /* just in case, cpu 0 is not the first */
162 *pos = first_cpu(cpu_online_map); 162 *pos = first_cpu(cpu_online_map);
163 if ((*pos) < NR_CPUS && cpu_online(*pos)) 163 if ((*pos) < nr_cpu_ids && cpu_online(*pos))
164 return &cpu_data(*pos); 164 return &cpu_data(*pos);
165 return NULL; 165 return NULL;
166} 166}
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 2de5fa2bbf77..14b11b3be31c 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -141,8 +141,8 @@ static __cpuinit int cpuid_device_create(int cpu)
141{ 141{
142 struct device *dev; 142 struct device *dev;
143 143
144 dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), 144 dev = device_create_drvdata(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu),
145 "cpu%d", cpu); 145 NULL, "cpu%d", cpu);
146 return IS_ERR(dev) ? PTR_ERR(dev) : 0; 146 return IS_ERR(dev) ? PTR_ERR(dev) : 0;
147} 147}
148 148
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 28c29180b380..9af89078f7bb 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -877,7 +877,8 @@ void __init early_res_to_bootmem(u64 start, u64 end)
877 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) 877 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
878 count++; 878 count++;
879 879
880 printk(KERN_INFO "(%d early reservations) ==> bootmem\n", count); 880 printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
881 count, start, end);
881 for (i = 0; i < count; i++) { 882 for (i = 0; i < count; i++) {
882 struct early_res *r = &early_res[i]; 883 struct early_res *r = &early_res[i];
883 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, 884 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
@@ -1298,11 +1299,6 @@ void __init e820_reserve_resources(void)
1298 } 1299 }
1299} 1300}
1300 1301
1301/*
1302 * Non-standard memory setup can be specified via this quirk:
1303 */
1304char * (*arch_memory_setup_quirk)(void);
1305
1306char *__init default_machine_specific_memory_setup(void) 1302char *__init default_machine_specific_memory_setup(void)
1307{ 1303{
1308 char *who = "BIOS-e820"; 1304 char *who = "BIOS-e820";
@@ -1343,8 +1339,8 @@ char *__init default_machine_specific_memory_setup(void)
1343 1339
1344char *__init __attribute__((weak)) machine_specific_memory_setup(void) 1340char *__init __attribute__((weak)) machine_specific_memory_setup(void)
1345{ 1341{
1346 if (arch_memory_setup_quirk) { 1342 if (x86_quirks->arch_memory_setup) {
1347 char *who = arch_memory_setup_quirk(); 1343 char *who = x86_quirks->arch_memory_setup();
1348 1344
1349 if (who) 1345 if (who)
1350 return who; 1346 return who;
@@ -1367,24 +1363,3 @@ void __init setup_memory_map(void)
1367 printk(KERN_INFO "BIOS-provided physical RAM map:\n"); 1363 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
1368 e820_print_map(who); 1364 e820_print_map(who);
1369} 1365}
1370
1371#ifdef CONFIG_X86_64
1372int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
1373{
1374 int i;
1375
1376 if (slot < 0 || slot >= e820.nr_map)
1377 return -1;
1378 for (i = slot; i < e820.nr_map; i++) {
1379 if (e820.map[i].type != E820_RAM)
1380 continue;
1381 break;
1382 }
1383 if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT))
1384 return -1;
1385 *addr = e820.map[i].addr;
1386 *size = min_t(u64, e820.map[i].size + e820.map[i].addr,
1387 max_pfn << PAGE_SHIFT) - *addr;
1388 return i + 1;
1389}
1390#endif
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index a0e11c0cc872..4353cf5e6fac 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -16,10 +16,7 @@
16#include <asm/dma.h> 16#include <asm/dma.h>
17#include <asm/io_apic.h> 17#include <asm/io_apic.h>
18#include <asm/apic.h> 18#include <asm/apic.h>
19 19#include <asm/iommu.h>
20#ifdef CONFIG_GART_IOMMU
21#include <asm/gart.h>
22#endif
23 20
24static void __init fix_hypertransport_config(int num, int slot, int func) 21static void __init fix_hypertransport_config(int num, int slot, int func)
25{ 22{
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 6bc07f0f1202..109792bc7cfa 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -54,6 +54,16 @@
54#include <asm/ftrace.h> 54#include <asm/ftrace.h>
55#include <asm/irq_vectors.h> 55#include <asm/irq_vectors.h>
56 56
57/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
58#include <linux/elf-em.h>
59#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
60#define __AUDIT_ARCH_LE 0x40000000
61
62#ifndef CONFIG_AUDITSYSCALL
63#define sysenter_audit syscall_trace_entry
64#define sysexit_audit syscall_exit_work
65#endif
66
57/* 67/*
58 * We use macros for low-level operations which need to be overridden 68 * We use macros for low-level operations which need to be overridden
59 * for paravirtualization. The following will never clobber any registers: 69 * for paravirtualization. The following will never clobber any registers:
@@ -332,8 +342,9 @@ sysenter_past_esp:
332 GET_THREAD_INFO(%ebp) 342 GET_THREAD_INFO(%ebp)
333 343
334 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ 344 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
335 testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) 345 testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
336 jnz syscall_trace_entry 346 jnz sysenter_audit
347sysenter_do_call:
337 cmpl $(nr_syscalls), %eax 348 cmpl $(nr_syscalls), %eax
338 jae syscall_badsys 349 jae syscall_badsys
339 call *sys_call_table(,%eax,4) 350 call *sys_call_table(,%eax,4)
@@ -343,7 +354,8 @@ sysenter_past_esp:
343 TRACE_IRQS_OFF 354 TRACE_IRQS_OFF
344 movl TI_flags(%ebp), %ecx 355 movl TI_flags(%ebp), %ecx
345 testw $_TIF_ALLWORK_MASK, %cx 356 testw $_TIF_ALLWORK_MASK, %cx
346 jne syscall_exit_work 357 jne sysexit_audit
358sysenter_exit:
347/* if something modifies registers it must also disable sysexit */ 359/* if something modifies registers it must also disable sysexit */
348 movl PT_EIP(%esp), %edx 360 movl PT_EIP(%esp), %edx
349 movl PT_OLDESP(%esp), %ecx 361 movl PT_OLDESP(%esp), %ecx
@@ -351,6 +363,45 @@ sysenter_past_esp:
351 TRACE_IRQS_ON 363 TRACE_IRQS_ON
3521: mov PT_FS(%esp), %fs 3641: mov PT_FS(%esp), %fs
353 ENABLE_INTERRUPTS_SYSEXIT 365 ENABLE_INTERRUPTS_SYSEXIT
366
367#ifdef CONFIG_AUDITSYSCALL
368sysenter_audit:
369 testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
370 jnz syscall_trace_entry
371 addl $4,%esp
372 CFI_ADJUST_CFA_OFFSET -4
373 /* %esi already in 8(%esp) 6th arg: 4th syscall arg */
374 /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */
375 /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */
376 movl %ebx,%ecx /* 3rd arg: 1st syscall arg */
377 movl %eax,%edx /* 2nd arg: syscall number */
378 movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
379 call audit_syscall_entry
380 pushl %ebx
381 CFI_ADJUST_CFA_OFFSET 4
382 movl PT_EAX(%esp),%eax /* reload syscall number */
383 jmp sysenter_do_call
384
385sysexit_audit:
386 testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
387 jne syscall_exit_work
388 TRACE_IRQS_ON
389 ENABLE_INTERRUPTS(CLBR_ANY)
390 movl %eax,%edx /* second arg, syscall return value */
391 cmpl $0,%eax /* is it < 0? */
392 setl %al /* 1 if so, 0 if not */
393 movzbl %al,%eax /* zero-extend that */
394 inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
395 call audit_syscall_exit
396 DISABLE_INTERRUPTS(CLBR_ANY)
397 TRACE_IRQS_OFF
398 movl TI_flags(%ebp), %ecx
399 testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
400 jne syscall_exit_work
401 movl PT_EAX(%esp),%eax /* reload syscall return value */
402 jmp sysenter_exit
403#endif
404
354 CFI_ENDPROC 405 CFI_ENDPROC
355.pushsection .fixup,"ax" 406.pushsection .fixup,"ax"
3562: movl $0,PT_FS(%esp) 4072: movl $0,PT_FS(%esp)
@@ -370,7 +421,7 @@ ENTRY(system_call)
370 GET_THREAD_INFO(%ebp) 421 GET_THREAD_INFO(%ebp)
371 # system call tracing in operation / emulation 422 # system call tracing in operation / emulation
372 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ 423 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
373 testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) 424 testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
374 jnz syscall_trace_entry 425 jnz syscall_trace_entry
375 cmpl $(nr_syscalls), %eax 426 cmpl $(nr_syscalls), %eax
376 jae syscall_badsys 427 jae syscall_badsys
@@ -383,10 +434,6 @@ syscall_exit:
383 # setting need_resched or sigpending 434 # setting need_resched or sigpending
384 # between sampling and the iret 435 # between sampling and the iret
385 TRACE_IRQS_OFF 436 TRACE_IRQS_OFF
386 testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
387 jz no_singlestep
388 orl $_TIF_SINGLESTEP,TI_flags(%ebp)
389no_singlestep:
390 movl TI_flags(%ebp), %ecx 437 movl TI_flags(%ebp), %ecx
391 testw $_TIF_ALLWORK_MASK, %cx # current->work 438 testw $_TIF_ALLWORK_MASK, %cx # current->work
392 jne syscall_exit_work 439 jne syscall_exit_work
@@ -514,12 +561,8 @@ END(work_pending)
514syscall_trace_entry: 561syscall_trace_entry:
515 movl $-ENOSYS,PT_EAX(%esp) 562 movl $-ENOSYS,PT_EAX(%esp)
516 movl %esp, %eax 563 movl %esp, %eax
517 xorl %edx,%edx 564 call syscall_trace_enter
518 call do_syscall_trace 565 /* What it returned is what we'll actually use. */
519 cmpl $0, %eax
520 jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
521 # so must skip actual syscall
522 movl PT_ORIG_EAX(%esp), %eax
523 cmpl $(nr_syscalls), %eax 566 cmpl $(nr_syscalls), %eax
524 jnae syscall_call 567 jnae syscall_call
525 jmp syscall_exit 568 jmp syscall_exit
@@ -528,14 +571,13 @@ END(syscall_trace_entry)
528 # perform syscall exit tracing 571 # perform syscall exit tracing
529 ALIGN 572 ALIGN
530syscall_exit_work: 573syscall_exit_work:
531 testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl 574 testb $_TIF_WORK_SYSCALL_EXIT, %cl
532 jz work_pending 575 jz work_pending
533 TRACE_IRQS_ON 576 TRACE_IRQS_ON
534 ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call 577 ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
535 # schedule() instead 578 # schedule() instead
536 movl %esp, %eax 579 movl %esp, %eax
537 movl $1, %edx 580 call syscall_trace_leave
538 call do_syscall_trace
539 jmp resume_userspace 581 jmp resume_userspace
540END(syscall_exit_work) 582END(syscall_exit_work)
541 CFI_ENDPROC 583 CFI_ENDPROC
@@ -1024,6 +1066,7 @@ ENDPROC(kernel_thread_helper)
1024ENTRY(xen_sysenter_target) 1066ENTRY(xen_sysenter_target)
1025 RING0_INT_FRAME 1067 RING0_INT_FRAME
1026 addl $5*4, %esp /* remove xen-provided frame */ 1068 addl $5*4, %esp /* remove xen-provided frame */
1069 CFI_ADJUST_CFA_OFFSET -5*4
1027 jmp sysenter_past_esp 1070 jmp sysenter_past_esp
1028 CFI_ENDPROC 1071 CFI_ENDPROC
1029 1072
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index ae63e584c340..89434d439605 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -53,6 +53,12 @@
53#include <asm/paravirt.h> 53#include <asm/paravirt.h>
54#include <asm/ftrace.h> 54#include <asm/ftrace.h>
55 55
56/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
57#include <linux/elf-em.h>
58#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
59#define __AUDIT_ARCH_64BIT 0x80000000
60#define __AUDIT_ARCH_LE 0x40000000
61
56 .code64 62 .code64
57 63
58#ifdef CONFIG_FTRACE 64#ifdef CONFIG_FTRACE
@@ -349,9 +355,9 @@ ENTRY(system_call_after_swapgs)
349 movq %rcx,RIP-ARGOFFSET(%rsp) 355 movq %rcx,RIP-ARGOFFSET(%rsp)
350 CFI_REL_OFFSET rip,RIP-ARGOFFSET 356 CFI_REL_OFFSET rip,RIP-ARGOFFSET
351 GET_THREAD_INFO(%rcx) 357 GET_THREAD_INFO(%rcx)
352 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ 358 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
353 TI_flags(%rcx)
354 jnz tracesys 359 jnz tracesys
360system_call_fastpath:
355 cmpq $__NR_syscall_max,%rax 361 cmpq $__NR_syscall_max,%rax
356 ja badsys 362 ja badsys
357 movq %r10,%rcx 363 movq %r10,%rcx
@@ -403,16 +409,16 @@ sysret_careful:
403sysret_signal: 409sysret_signal:
404 TRACE_IRQS_ON 410 TRACE_IRQS_ON
405 ENABLE_INTERRUPTS(CLBR_NONE) 411 ENABLE_INTERRUPTS(CLBR_NONE)
406 testl $_TIF_DO_NOTIFY_MASK,%edx 412#ifdef CONFIG_AUDITSYSCALL
407 jz 1f 413 bt $TIF_SYSCALL_AUDIT,%edx
408 414 jc sysret_audit
409 /* Really a signal */ 415#endif
410 /* edx: work flags (arg3) */ 416 /* edx: work flags (arg3) */
411 leaq do_notify_resume(%rip),%rax 417 leaq do_notify_resume(%rip),%rax
412 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 418 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
413 xorl %esi,%esi # oldset -> arg2 419 xorl %esi,%esi # oldset -> arg2
414 call ptregscall_common 420 call ptregscall_common
4151: movl $_TIF_WORK_MASK,%edi 421 movl $_TIF_WORK_MASK,%edi
416 /* Use IRET because user could have changed frame. This 422 /* Use IRET because user could have changed frame. This
417 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ 423 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
418 DISABLE_INTERRUPTS(CLBR_NONE) 424 DISABLE_INTERRUPTS(CLBR_NONE)
@@ -423,14 +429,56 @@ badsys:
423 movq $-ENOSYS,RAX-ARGOFFSET(%rsp) 429 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
424 jmp ret_from_sys_call 430 jmp ret_from_sys_call
425 431
432#ifdef CONFIG_AUDITSYSCALL
433 /*
434 * Fast path for syscall audit without full syscall trace.
435 * We just call audit_syscall_entry() directly, and then
436 * jump back to the normal fast path.
437 */
438auditsys:
439 movq %r10,%r9 /* 6th arg: 4th syscall arg */
440 movq %rdx,%r8 /* 5th arg: 3rd syscall arg */
441 movq %rsi,%rcx /* 4th arg: 2nd syscall arg */
442 movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
443 movq %rax,%rsi /* 2nd arg: syscall number */
444 movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
445 call audit_syscall_entry
446 LOAD_ARGS 0 /* reload call-clobbered registers */
447 jmp system_call_fastpath
448
449 /*
450 * Return fast path for syscall audit. Call audit_syscall_exit()
451 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
452 * masked off.
453 */
454sysret_audit:
455 movq %rax,%rsi /* second arg, syscall return value */
456 cmpq $0,%rax /* is it < 0? */
457 setl %al /* 1 if so, 0 if not */
458 movzbl %al,%edi /* zero-extend that into %edi */
459 inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
460 call audit_syscall_exit
461 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
462 jmp sysret_check
463#endif /* CONFIG_AUDITSYSCALL */
464
426 /* Do syscall tracing */ 465 /* Do syscall tracing */
427tracesys: 466tracesys:
467#ifdef CONFIG_AUDITSYSCALL
468 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
469 jz auditsys
470#endif
428 SAVE_REST 471 SAVE_REST
429 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ 472 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
430 FIXUP_TOP_OF_STACK %rdi 473 FIXUP_TOP_OF_STACK %rdi
431 movq %rsp,%rdi 474 movq %rsp,%rdi
432 call syscall_trace_enter 475 call syscall_trace_enter
433 LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ 476 /*
477 * Reload arg registers from stack in case ptrace changed them.
478 * We don't reload %rax because syscall_trace_enter() returned
479 * the value it wants us to use in the table lookup.
480 */
481 LOAD_ARGS ARGOFFSET, 1
434 RESTORE_REST 482 RESTORE_REST
435 cmpq $__NR_syscall_max,%rax 483 cmpq $__NR_syscall_max,%rax
436 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */ 484 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
@@ -444,6 +492,7 @@ tracesys:
444 * Has correct top of stack, but partial stack frame. 492 * Has correct top of stack, but partial stack frame.
445 */ 493 */
446 .globl int_ret_from_sys_call 494 .globl int_ret_from_sys_call
495 .globl int_with_check
447int_ret_from_sys_call: 496int_ret_from_sys_call:
448 DISABLE_INTERRUPTS(CLBR_NONE) 497 DISABLE_INTERRUPTS(CLBR_NONE)
449 TRACE_IRQS_OFF 498 TRACE_IRQS_OFF
@@ -483,7 +532,7 @@ int_very_careful:
483 ENABLE_INTERRUPTS(CLBR_NONE) 532 ENABLE_INTERRUPTS(CLBR_NONE)
484 SAVE_REST 533 SAVE_REST
485 /* Check for syscall exit trace */ 534 /* Check for syscall exit trace */
486 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx 535 testl $_TIF_WORK_SYSCALL_EXIT,%edx
487 jz int_signal 536 jz int_signal
488 pushq %rdi 537 pushq %rdi
489 CFI_ADJUST_CFA_OFFSET 8 538 CFI_ADJUST_CFA_OFFSET 8
@@ -491,7 +540,7 @@ int_very_careful:
491 call syscall_trace_leave 540 call syscall_trace_leave
492 popq %rdi 541 popq %rdi
493 CFI_ADJUST_CFA_OFFSET -8 542 CFI_ADJUST_CFA_OFFSET -8
494 andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi 543 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
495 jmp int_restore_rest 544 jmp int_restore_rest
496 545
497int_signal: 546int_signal:
@@ -1189,6 +1238,7 @@ END(device_not_available)
1189 /* runs on exception stack */ 1238 /* runs on exception stack */
1190KPROBE_ENTRY(debug) 1239KPROBE_ENTRY(debug)
1191 INTR_FRAME 1240 INTR_FRAME
1241 PARAVIRT_ADJUST_EXCEPTION_FRAME
1192 pushq $0 1242 pushq $0
1193 CFI_ADJUST_CFA_OFFSET 8 1243 CFI_ADJUST_CFA_OFFSET 8
1194 paranoidentry do_debug, DEBUG_STACK 1244 paranoidentry do_debug, DEBUG_STACK
@@ -1198,6 +1248,7 @@ KPROBE_END(debug)
1198 /* runs on exception stack */ 1248 /* runs on exception stack */
1199KPROBE_ENTRY(nmi) 1249KPROBE_ENTRY(nmi)
1200 INTR_FRAME 1250 INTR_FRAME
1251 PARAVIRT_ADJUST_EXCEPTION_FRAME
1201 pushq $-1 1252 pushq $-1
1202 CFI_ADJUST_CFA_OFFSET 8 1253 CFI_ADJUST_CFA_OFFSET 8
1203 paranoidentry do_nmi, 0, 0 1254 paranoidentry do_nmi, 0, 0
@@ -1211,6 +1262,7 @@ KPROBE_END(nmi)
1211 1262
1212KPROBE_ENTRY(int3) 1263KPROBE_ENTRY(int3)
1213 INTR_FRAME 1264 INTR_FRAME
1265 PARAVIRT_ADJUST_EXCEPTION_FRAME
1214 pushq $0 1266 pushq $0
1215 CFI_ADJUST_CFA_OFFSET 8 1267 CFI_ADJUST_CFA_OFFSET 8
1216 paranoidentry do_int3, DEBUG_STACK 1268 paranoidentry do_int3, DEBUG_STACK
@@ -1237,6 +1289,7 @@ END(coprocessor_segment_overrun)
1237 /* runs on exception stack */ 1289 /* runs on exception stack */
1238ENTRY(double_fault) 1290ENTRY(double_fault)
1239 XCPT_FRAME 1291 XCPT_FRAME
1292 PARAVIRT_ADJUST_EXCEPTION_FRAME
1240 paranoidentry do_double_fault 1293 paranoidentry do_double_fault
1241 jmp paranoid_exit1 1294 jmp paranoid_exit1
1242 CFI_ENDPROC 1295 CFI_ENDPROC
@@ -1253,6 +1306,7 @@ END(segment_not_present)
1253 /* runs on exception stack */ 1306 /* runs on exception stack */
1254ENTRY(stack_segment) 1307ENTRY(stack_segment)
1255 XCPT_FRAME 1308 XCPT_FRAME
1309 PARAVIRT_ADJUST_EXCEPTION_FRAME
1256 paranoidentry do_stack_segment 1310 paranoidentry do_stack_segment
1257 jmp paranoid_exit1 1311 jmp paranoid_exit1
1258 CFI_ENDPROC 1312 CFI_ENDPROC
@@ -1278,6 +1332,7 @@ END(spurious_interrupt_bug)
1278 /* runs on exception stack */ 1332 /* runs on exception stack */
1279ENTRY(machine_check) 1333ENTRY(machine_check)
1280 INTR_FRAME 1334 INTR_FRAME
1335 PARAVIRT_ADJUST_EXCEPTION_FRAME
1281 pushq $0 1336 pushq $0
1282 CFI_ADJUST_CFA_OFFSET 8 1337 CFI_ADJUST_CFA_OFFSET 8
1283 paranoidentry do_machine_check 1338 paranoidentry do_machine_check
@@ -1312,3 +1367,103 @@ KPROBE_ENTRY(ignore_sysret)
1312 sysret 1367 sysret
1313 CFI_ENDPROC 1368 CFI_ENDPROC
1314ENDPROC(ignore_sysret) 1369ENDPROC(ignore_sysret)
1370
1371#ifdef CONFIG_XEN
1372ENTRY(xen_hypervisor_callback)
1373 zeroentry xen_do_hypervisor_callback
1374END(xen_hypervisor_callback)
1375
1376/*
1377# A note on the "critical region" in our callback handler.
1378# We want to avoid stacking callback handlers due to events occurring
1379# during handling of the last event. To do this, we keep events disabled
1380# until we've done all processing. HOWEVER, we must enable events before
1381# popping the stack frame (can't be done atomically) and so it would still
1382# be possible to get enough handler activations to overflow the stack.
1383# Although unlikely, bugs of that kind are hard to track down, so we'd
1384# like to avoid the possibility.
1385# So, on entry to the handler we detect whether we interrupted an
1386# existing activation in its critical region -- if so, we pop the current
1387# activation and restart the handler using the previous one.
1388*/
1389ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1390 CFI_STARTPROC
1391/* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
1392 see the correct pointer to the pt_regs */
1393 movq %rdi, %rsp # we don't return, adjust the stack frame
1394 CFI_ENDPROC
1395 CFI_DEFAULT_STACK
139611: incl %gs:pda_irqcount
1397 movq %rsp,%rbp
1398 CFI_DEF_CFA_REGISTER rbp
1399 cmovzq %gs:pda_irqstackptr,%rsp
1400 pushq %rbp # backlink for old unwinder
1401 call xen_evtchn_do_upcall
1402 popq %rsp
1403 CFI_DEF_CFA_REGISTER rsp
1404 decl %gs:pda_irqcount
1405 jmp error_exit
1406 CFI_ENDPROC
1407END(do_hypervisor_callback)
1408
1409/*
1410# Hypervisor uses this for application faults while it executes.
1411# We get here for two reasons:
1412# 1. Fault while reloading DS, ES, FS or GS
1413# 2. Fault while executing IRET
1414# Category 1 we do not need to fix up as Xen has already reloaded all segment
1415# registers that could be reloaded and zeroed the others.
1416# Category 2 we fix up by killing the current process. We cannot use the
1417# normal Linux return path in this case because if we use the IRET hypercall
1418# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
1419# We distinguish between categories by comparing each saved segment register
1420# with its current contents: any discrepancy means we in category 1.
1421*/
1422ENTRY(xen_failsafe_callback)
1423 framesz = (RIP-0x30) /* workaround buggy gas */
1424 _frame framesz
1425 CFI_REL_OFFSET rcx, 0
1426 CFI_REL_OFFSET r11, 8
1427 movw %ds,%cx
1428 cmpw %cx,0x10(%rsp)
1429 CFI_REMEMBER_STATE
1430 jne 1f
1431 movw %es,%cx
1432 cmpw %cx,0x18(%rsp)
1433 jne 1f
1434 movw %fs,%cx
1435 cmpw %cx,0x20(%rsp)
1436 jne 1f
1437 movw %gs,%cx
1438 cmpw %cx,0x28(%rsp)
1439 jne 1f
1440 /* All segments match their saved values => Category 2 (Bad IRET). */
1441 movq (%rsp),%rcx
1442 CFI_RESTORE rcx
1443 movq 8(%rsp),%r11
1444 CFI_RESTORE r11
1445 addq $0x30,%rsp
1446 CFI_ADJUST_CFA_OFFSET -0x30
1447 pushq $0
1448 CFI_ADJUST_CFA_OFFSET 8
1449 pushq %r11
1450 CFI_ADJUST_CFA_OFFSET 8
1451 pushq %rcx
1452 CFI_ADJUST_CFA_OFFSET 8
1453 jmp general_protection
1454 CFI_RESTORE_STATE
14551: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
1456 movq (%rsp),%rcx
1457 CFI_RESTORE rcx
1458 movq 8(%rsp),%r11
1459 CFI_RESTORE r11
1460 addq $0x30,%rsp
1461 CFI_ADJUST_CFA_OFFSET -0x30
1462 pushq $0
1463 CFI_ADJUST_CFA_OFFSET 8
1464 SAVE_ALL
1465 jmp error_exit
1466 CFI_ENDPROC
1467END(xen_failsafe_callback)
1468
1469#endif /* CONFIG_XEN */
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c
index 1a9c68845ee8..786548a62d38 100644
--- a/arch/x86/kernel/genapic_flat_64.c
+++ b/arch/x86/kernel/genapic_flat_64.c
@@ -168,7 +168,7 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
168 * May as well be the first. 168 * May as well be the first.
169 */ 169 */
170 cpu = first_cpu(cpumask); 170 cpu = first_cpu(cpumask);
171 if ((unsigned)cpu < NR_CPUS) 171 if ((unsigned)cpu < nr_cpu_ids)
172 return per_cpu(x86_cpu_to_apicid, cpu); 172 return per_cpu(x86_cpu_to_apicid, cpu);
173 else 173 else
174 return BAD_APICID; 174 return BAD_APICID;
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index 711f11c30b06..2cfcbded888a 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -24,6 +24,7 @@
24#include <asm/pgtable.h> 24#include <asm/pgtable.h>
25#include <asm/uv/uv_mmrs.h> 25#include <asm/uv/uv_mmrs.h>
26#include <asm/uv/uv_hub.h> 26#include <asm/uv/uv_hub.h>
27#include <asm/uv/bios.h>
27 28
28DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); 29DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
29EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info); 30EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info);
@@ -40,6 +41,9 @@ EXPORT_SYMBOL_GPL(uv_cpu_to_blade);
40short uv_possible_blades; 41short uv_possible_blades;
41EXPORT_SYMBOL_GPL(uv_possible_blades); 42EXPORT_SYMBOL_GPL(uv_possible_blades);
42 43
44unsigned long sn_rtc_cycles_per_second;
45EXPORT_SYMBOL(sn_rtc_cycles_per_second);
46
43/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ 47/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
44 48
45static cpumask_t uv_target_cpus(void) 49static cpumask_t uv_target_cpus(void)
@@ -94,7 +98,7 @@ static void uv_send_IPI_mask(cpumask_t mask, int vector)
94{ 98{
95 unsigned int cpu; 99 unsigned int cpu;
96 100
97 for (cpu = 0; cpu < NR_CPUS; ++cpu) 101 for_each_possible_cpu(cpu)
98 if (cpu_isset(cpu, mask)) 102 if (cpu_isset(cpu, mask))
99 uv_send_IPI_one(cpu, vector); 103 uv_send_IPI_one(cpu, vector);
100} 104}
@@ -128,7 +132,7 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
128 * May as well be the first. 132 * May as well be the first.
129 */ 133 */
130 cpu = first_cpu(cpumask); 134 cpu = first_cpu(cpumask);
131 if ((unsigned)cpu < NR_CPUS) 135 if ((unsigned)cpu < nr_cpu_ids)
132 return per_cpu(x86_cpu_to_apicid, cpu); 136 return per_cpu(x86_cpu_to_apicid, cpu);
133 else 137 else
134 return BAD_APICID; 138 return BAD_APICID;
@@ -272,6 +276,23 @@ static __init void map_mmioh_high(int max_pnode)
272 map_high("MMIOH", mmioh.s.base, shift, map_uc); 276 map_high("MMIOH", mmioh.s.base, shift, map_uc);
273} 277}
274 278
279static __init void uv_rtc_init(void)
280{
281 long status, ticks_per_sec, drift;
282
283 status =
284 x86_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec,
285 &drift);
286 if (status != 0 || ticks_per_sec < 100000) {
287 printk(KERN_WARNING
288 "unable to determine platform RTC clock frequency, "
289 "guessing.\n");
290 /* BIOS gives wrong value for clock freq. so guess */
291 sn_rtc_cycles_per_second = 1000000000000UL / 30000UL;
292 } else
293 sn_rtc_cycles_per_second = ticks_per_sec;
294}
295
275static __init void uv_system_init(void) 296static __init void uv_system_init(void)
276{ 297{
277 union uvh_si_addr_map_config_u m_n_config; 298 union uvh_si_addr_map_config_u m_n_config;
@@ -326,6 +347,8 @@ static __init void uv_system_init(void)
326 gnode_upper = (((unsigned long)node_id.s.node_id) & 347 gnode_upper = (((unsigned long)node_id.s.node_id) &
327 ~((1 << n_val) - 1)) << m_val; 348 ~((1 << n_val) - 1)) << m_val;
328 349
350 uv_rtc_init();
351
329 for_each_present_cpu(cpu) { 352 for_each_present_cpu(cpu) {
330 nid = cpu_to_node(cpu); 353 nid = cpu_to_node(cpu);
331 pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); 354 pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu));
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index c97819829146..1b318e903bf6 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -39,6 +39,13 @@ static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
39static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly; 39static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
40#endif 40#endif
41 41
42void __init x86_64_init_pda(void)
43{
44 _cpu_pda = __cpu_pda;
45 cpu_pda(0) = &_boot_cpu_pda;
46 pda_init(0);
47}
48
42static void __init zap_identity_mappings(void) 49static void __init zap_identity_mappings(void)
43{ 50{
44 pgd_t *pgd = pgd_offset_k(0UL); 51 pgd_t *pgd = pgd_offset_k(0UL);
@@ -102,9 +109,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
102 109
103 early_printk("Kernel alive\n"); 110 early_printk("Kernel alive\n");
104 111
105 _cpu_pda = __cpu_pda; 112 x86_64_init_pda();
106 cpu_pda(0) = &_boot_cpu_pda;
107 pda_init(0);
108 113
109 early_printk("Kernel really alive\n"); 114 early_printk("Kernel really alive\n");
110 115
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b07ac7b217cb..db3280afe886 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -407,6 +407,7 @@ ENTRY(phys_base)
407 /* This must match the first entry in level2_kernel_pgt */ 407 /* This must match the first entry in level2_kernel_pgt */
408 .quad 0x0000000000000000 408 .quad 0x0000000000000000
409 409
410#include "../../x86/xen/xen-head.S"
410 411
411 .section .bss, "aw", @nobits 412 .section .bss, "aw", @nobits
412 .align L1_CACHE_BYTES 413 .align L1_CACHE_BYTES
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 0ea6a19bfdfe..ad2b15a1334d 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -468,7 +468,7 @@ void hpet_disable(void)
468#define RTC_NUM_INTS 1 468#define RTC_NUM_INTS 1
469 469
470static unsigned long hpet_rtc_flags; 470static unsigned long hpet_rtc_flags;
471static unsigned long hpet_prev_update_sec; 471static int hpet_prev_update_sec;
472static struct rtc_time hpet_alarm_time; 472static struct rtc_time hpet_alarm_time;
473static unsigned long hpet_pie_count; 473static unsigned long hpet_pie_count;
474static unsigned long hpet_t1_cmp; 474static unsigned long hpet_t1_cmp;
@@ -575,6 +575,9 @@ int hpet_set_rtc_irq_bit(unsigned long bit_mask)
575 575
576 hpet_rtc_flags |= bit_mask; 576 hpet_rtc_flags |= bit_mask;
577 577
578 if ((bit_mask & RTC_UIE) && !(oldbits & RTC_UIE))
579 hpet_prev_update_sec = -1;
580
578 if (!oldbits) 581 if (!oldbits)
579 hpet_rtc_timer_init(); 582 hpet_rtc_timer_init();
580 583
@@ -652,7 +655,7 @@ static void hpet_rtc_timer_reinit(void)
652 if (hpet_rtc_flags & RTC_PIE) 655 if (hpet_rtc_flags & RTC_PIE)
653 hpet_pie_count += lost_ints; 656 hpet_pie_count += lost_ints;
654 if (printk_ratelimit()) 657 if (printk_ratelimit())
655 printk(KERN_WARNING "rtc: lost %d interrupts\n", 658 printk(KERN_WARNING "hpet1: lost %d rtc interrupts\n",
656 lost_ints); 659 lost_ints);
657 } 660 }
658} 661}
@@ -670,7 +673,8 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
670 673
671 if (hpet_rtc_flags & RTC_UIE && 674 if (hpet_rtc_flags & RTC_UIE &&
672 curr_time.tm_sec != hpet_prev_update_sec) { 675 curr_time.tm_sec != hpet_prev_update_sec) {
673 rtc_int_flag = RTC_UF; 676 if (hpet_prev_update_sec >= 0)
677 rtc_int_flag = RTC_UF;
674 hpet_prev_update_sec = curr_time.tm_sec; 678 hpet_prev_update_sec = curr_time.tm_sec;
675 } 679 }
676 680
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 558abf4c796a..de9aa0e3a9c5 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -756,7 +756,7 @@ void send_IPI_self(int vector)
756 /* 756 /*
757 * Send the IPI. The write to APIC_ICR fires this off. 757 * Send the IPI. The write to APIC_ICR fires this off.
758 */ 758 */
759 apic_write_around(APIC_ICR, cfg); 759 apic_write(APIC_ICR, cfg);
760} 760}
761#endif /* !CONFIG_SMP */ 761#endif /* !CONFIG_SMP */
762 762
@@ -2030,7 +2030,7 @@ static void mask_lapic_irq(unsigned int irq)
2030 unsigned long v; 2030 unsigned long v;
2031 2031
2032 v = apic_read(APIC_LVT0); 2032 v = apic_read(APIC_LVT0);
2033 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); 2033 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
2034} 2034}
2035 2035
2036static void unmask_lapic_irq(unsigned int irq) 2036static void unmask_lapic_irq(unsigned int irq)
@@ -2038,7 +2038,7 @@ static void unmask_lapic_irq(unsigned int irq)
2038 unsigned long v; 2038 unsigned long v;
2039 2039
2040 v = apic_read(APIC_LVT0); 2040 v = apic_read(APIC_LVT0);
2041 apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); 2041 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
2042} 2042}
2043 2043
2044static struct irq_chip lapic_chip __read_mostly = { 2044static struct irq_chip lapic_chip __read_mostly = {
@@ -2168,7 +2168,7 @@ static inline void __init check_timer(void)
2168 * The AEOI mode will finish them in the 8259A 2168 * The AEOI mode will finish them in the 8259A
2169 * automatically. 2169 * automatically.
2170 */ 2170 */
2171 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); 2171 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2172 init_8259A(1); 2172 init_8259A(1);
2173 timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); 2173 timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
2174 2174
@@ -2177,8 +2177,9 @@ static inline void __init check_timer(void)
2177 pin2 = ioapic_i8259.pin; 2177 pin2 = ioapic_i8259.pin;
2178 apic2 = ioapic_i8259.apic; 2178 apic2 = ioapic_i8259.apic;
2179 2179
2180 printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", 2180 apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
2181 vector, apic1, pin1, apic2, pin2); 2181 "apic1=%d pin1=%d apic2=%d pin2=%d\n",
2182 vector, apic1, pin1, apic2, pin2);
2182 2183
2183 /* 2184 /*
2184 * Some BIOS writers are clueless and report the ExtINTA 2185 * Some BIOS writers are clueless and report the ExtINTA
@@ -2216,12 +2217,13 @@ static inline void __init check_timer(void)
2216 } 2217 }
2217 clear_IO_APIC_pin(apic1, pin1); 2218 clear_IO_APIC_pin(apic1, pin1);
2218 if (!no_pin1) 2219 if (!no_pin1)
2219 printk(KERN_ERR "..MP-BIOS bug: " 2220 apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
2220 "8254 timer not connected to IO-APIC\n"); 2221 "8254 timer not connected to IO-APIC\n");
2221 2222
2222 printk(KERN_INFO "...trying to set up timer (IRQ0) " 2223 apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
2223 "through the 8259A ... "); 2224 "(IRQ0) through the 8259A ...\n");
2224 printk("\n..... (found pin %d) ...", pin2); 2225 apic_printk(APIC_QUIET, KERN_INFO
2226 "..... (found apic %d pin %d) ...\n", apic2, pin2);
2225 /* 2227 /*
2226 * legacy devices should be connected to IO APIC #0 2228 * legacy devices should be connected to IO APIC #0
2227 */ 2229 */
@@ -2230,7 +2232,7 @@ static inline void __init check_timer(void)
2230 unmask_IO_APIC_irq(0); 2232 unmask_IO_APIC_irq(0);
2231 enable_8259A_irq(0); 2233 enable_8259A_irq(0);
2232 if (timer_irq_works()) { 2234 if (timer_irq_works()) {
2233 printk("works.\n"); 2235 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
2234 timer_through_8259 = 1; 2236 timer_through_8259 = 1;
2235 if (nmi_watchdog == NMI_IO_APIC) { 2237 if (nmi_watchdog == NMI_IO_APIC) {
2236 disable_8259A_irq(0); 2238 disable_8259A_irq(0);
@@ -2244,44 +2246,47 @@ static inline void __init check_timer(void)
2244 */ 2246 */
2245 disable_8259A_irq(0); 2247 disable_8259A_irq(0);
2246 clear_IO_APIC_pin(apic2, pin2); 2248 clear_IO_APIC_pin(apic2, pin2);
2247 printk(" failed.\n"); 2249 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
2248 } 2250 }
2249 2251
2250 if (nmi_watchdog == NMI_IO_APIC) { 2252 if (nmi_watchdog == NMI_IO_APIC) {
2251 printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); 2253 apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
2254 "through the IO-APIC - disabling NMI Watchdog!\n");
2252 nmi_watchdog = NMI_NONE; 2255 nmi_watchdog = NMI_NONE;
2253 } 2256 }
2254 timer_ack = 0; 2257 timer_ack = 0;
2255 2258
2256 printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); 2259 apic_printk(APIC_QUIET, KERN_INFO
2260 "...trying to set up timer as Virtual Wire IRQ...\n");
2257 2261
2258 lapic_register_intr(0, vector); 2262 lapic_register_intr(0, vector);
2259 apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ 2263 apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
2260 enable_8259A_irq(0); 2264 enable_8259A_irq(0);
2261 2265
2262 if (timer_irq_works()) { 2266 if (timer_irq_works()) {
2263 printk(" works.\n"); 2267 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
2264 goto out; 2268 goto out;
2265 } 2269 }
2266 disable_8259A_irq(0); 2270 disable_8259A_irq(0);
2267 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); 2271 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
2268 printk(" failed.\n"); 2272 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
2269 2273
2270 printk(KERN_INFO "...trying to set up timer as ExtINT IRQ..."); 2274 apic_printk(APIC_QUIET, KERN_INFO
2275 "...trying to set up timer as ExtINT IRQ...\n");
2271 2276
2272 init_8259A(0); 2277 init_8259A(0);
2273 make_8259A_irq(0); 2278 make_8259A_irq(0);
2274 apic_write_around(APIC_LVT0, APIC_DM_EXTINT); 2279 apic_write(APIC_LVT0, APIC_DM_EXTINT);
2275 2280
2276 unlock_ExtINT_logic(); 2281 unlock_ExtINT_logic();
2277 2282
2278 if (timer_irq_works()) { 2283 if (timer_irq_works()) {
2279 printk(" works.\n"); 2284 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
2280 goto out; 2285 goto out;
2281 } 2286 }
2282 printk(" failed :(.\n"); 2287 apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
2283 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " 2288 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
2284 "report. Then try booting with the 'noapic' option"); 2289 "report. Then try booting with the 'noapic' option.\n");
2285out: 2290out:
2286 local_irq_restore(flags); 2291 local_irq_restore(flags);
2287} 2292}
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 6510cde36b35..8269434d1707 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -45,6 +45,7 @@
45#include <asm/proto.h> 45#include <asm/proto.h>
46#include <asm/acpi.h> 46#include <asm/acpi.h>
47#include <asm/dma.h> 47#include <asm/dma.h>
48#include <asm/i8259.h>
48#include <asm/nmi.h> 49#include <asm/nmi.h>
49#include <asm/msidef.h> 50#include <asm/msidef.h>
50#include <asm/hypertransport.h> 51#include <asm/hypertransport.h>
@@ -731,7 +732,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
731 return 0; 732 return 0;
732 } 733 }
733 734
734 for_each_cpu_mask(cpu, mask) { 735 for_each_cpu_mask_nr(cpu, mask) {
735 cpumask_t domain, new_mask; 736 cpumask_t domain, new_mask;
736 int new_cpu; 737 int new_cpu;
737 int vector, offset; 738 int vector, offset;
@@ -752,7 +753,7 @@ next:
752 continue; 753 continue;
753 if (vector == IA32_SYSCALL_VECTOR) 754 if (vector == IA32_SYSCALL_VECTOR)
754 goto next; 755 goto next;
755 for_each_cpu_mask(new_cpu, new_mask) 756 for_each_cpu_mask_nr(new_cpu, new_mask)
756 if (per_cpu(vector_irq, new_cpu)[vector] != -1) 757 if (per_cpu(vector_irq, new_cpu)[vector] != -1)
757 goto next; 758 goto next;
758 /* Found one! */ 759 /* Found one! */
@@ -762,7 +763,7 @@ next:
762 cfg->move_in_progress = 1; 763 cfg->move_in_progress = 1;
763 cfg->old_domain = cfg->domain; 764 cfg->old_domain = cfg->domain;
764 } 765 }
765 for_each_cpu_mask(new_cpu, new_mask) 766 for_each_cpu_mask_nr(new_cpu, new_mask)
766 per_cpu(vector_irq, new_cpu)[vector] = irq; 767 per_cpu(vector_irq, new_cpu)[vector] = irq;
767 cfg->vector = vector; 768 cfg->vector = vector;
768 cfg->domain = domain; 769 cfg->domain = domain;
@@ -794,7 +795,7 @@ static void __clear_irq_vector(int irq)
794 795
795 vector = cfg->vector; 796 vector = cfg->vector;
796 cpus_and(mask, cfg->domain, cpu_online_map); 797 cpus_and(mask, cfg->domain, cpu_online_map);
797 for_each_cpu_mask(cpu, mask) 798 for_each_cpu_mask_nr(cpu, mask)
798 per_cpu(vector_irq, cpu)[vector] = -1; 799 per_cpu(vector_irq, cpu)[vector] = -1;
799 800
800 cfg->vector = 0; 801 cfg->vector = 0;
@@ -1372,12 +1373,10 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
1372static int ioapic_retrigger_irq(unsigned int irq) 1373static int ioapic_retrigger_irq(unsigned int irq)
1373{ 1374{
1374 struct irq_cfg *cfg = &irq_cfg[irq]; 1375 struct irq_cfg *cfg = &irq_cfg[irq];
1375 cpumask_t mask;
1376 unsigned long flags; 1376 unsigned long flags;
1377 1377
1378 spin_lock_irqsave(&vector_lock, flags); 1378 spin_lock_irqsave(&vector_lock, flags);
1379 mask = cpumask_of_cpu(first_cpu(cfg->domain)); 1379 send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector);
1380 send_IPI_mask(mask, cfg->vector);
1381 spin_unlock_irqrestore(&vector_lock, flags); 1380 spin_unlock_irqrestore(&vector_lock, flags);
1382 1381
1383 return 1; 1382 return 1;
@@ -1696,8 +1695,9 @@ static inline void __init check_timer(void)
1696 pin2 = ioapic_i8259.pin; 1695 pin2 = ioapic_i8259.pin;
1697 apic2 = ioapic_i8259.apic; 1696 apic2 = ioapic_i8259.apic;
1698 1697
1699 apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", 1698 apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
1700 cfg->vector, apic1, pin1, apic2, pin2); 1699 "apic1=%d pin1=%d apic2=%d pin2=%d\n",
1700 cfg->vector, apic1, pin1, apic2, pin2);
1701 1701
1702 /* 1702 /*
1703 * Some BIOS writers are clueless and report the ExtINTA 1703 * Some BIOS writers are clueless and report the ExtINTA
@@ -1735,14 +1735,13 @@ static inline void __init check_timer(void)
1735 } 1735 }
1736 clear_IO_APIC_pin(apic1, pin1); 1736 clear_IO_APIC_pin(apic1, pin1);
1737 if (!no_pin1) 1737 if (!no_pin1)
1738 apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: " 1738 apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
1739 "8254 timer not connected to IO-APIC\n"); 1739 "8254 timer not connected to IO-APIC\n");
1740 1740
1741 apic_printk(APIC_VERBOSE,KERN_INFO 1741 apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
1742 "...trying to set up timer (IRQ0) " 1742 "(IRQ0) through the 8259A ...\n");
1743 "through the 8259A ... "); 1743 apic_printk(APIC_QUIET, KERN_INFO
1744 apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...", 1744 "..... (found apic %d pin %d) ...\n", apic2, pin2);
1745 apic2, pin2);
1746 /* 1745 /*
1747 * legacy devices should be connected to IO APIC #0 1746 * legacy devices should be connected to IO APIC #0
1748 */ 1747 */
@@ -1751,7 +1750,7 @@ static inline void __init check_timer(void)
1751 unmask_IO_APIC_irq(0); 1750 unmask_IO_APIC_irq(0);
1752 enable_8259A_irq(0); 1751 enable_8259A_irq(0);
1753 if (timer_irq_works()) { 1752 if (timer_irq_works()) {
1754 apic_printk(APIC_VERBOSE," works.\n"); 1753 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
1755 timer_through_8259 = 1; 1754 timer_through_8259 = 1;
1756 if (nmi_watchdog == NMI_IO_APIC) { 1755 if (nmi_watchdog == NMI_IO_APIC) {
1757 disable_8259A_irq(0); 1756 disable_8259A_irq(0);
@@ -1765,29 +1764,32 @@ static inline void __init check_timer(void)
1765 */ 1764 */
1766 disable_8259A_irq(0); 1765 disable_8259A_irq(0);
1767 clear_IO_APIC_pin(apic2, pin2); 1766 clear_IO_APIC_pin(apic2, pin2);
1768 apic_printk(APIC_VERBOSE," failed.\n"); 1767 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
1769 } 1768 }
1770 1769
1771 if (nmi_watchdog == NMI_IO_APIC) { 1770 if (nmi_watchdog == NMI_IO_APIC) {
1772 printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); 1771 apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
1772 "through the IO-APIC - disabling NMI Watchdog!\n");
1773 nmi_watchdog = NMI_NONE; 1773 nmi_watchdog = NMI_NONE;
1774 } 1774 }
1775 1775
1776 apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); 1776 apic_printk(APIC_QUIET, KERN_INFO
1777 "...trying to set up timer as Virtual Wire IRQ...\n");
1777 1778
1778 lapic_register_intr(0); 1779 lapic_register_intr(0);
1779 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ 1780 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
1780 enable_8259A_irq(0); 1781 enable_8259A_irq(0);
1781 1782
1782 if (timer_irq_works()) { 1783 if (timer_irq_works()) {
1783 apic_printk(APIC_VERBOSE," works.\n"); 1784 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
1784 goto out; 1785 goto out;
1785 } 1786 }
1786 disable_8259A_irq(0); 1787 disable_8259A_irq(0);
1787 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); 1788 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
1788 apic_printk(APIC_VERBOSE," failed.\n"); 1789 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
1789 1790
1790 apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ..."); 1791 apic_printk(APIC_QUIET, KERN_INFO
1792 "...trying to set up timer as ExtINT IRQ...\n");
1791 1793
1792 init_8259A(0); 1794 init_8259A(0);
1793 make_8259A_irq(0); 1795 make_8259A_irq(0);
@@ -1796,11 +1798,12 @@ static inline void __init check_timer(void)
1796 unlock_ExtINT_logic(); 1798 unlock_ExtINT_logic();
1797 1799
1798 if (timer_irq_works()) { 1800 if (timer_irq_works()) {
1799 apic_printk(APIC_VERBOSE," works.\n"); 1801 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
1800 goto out; 1802 goto out;
1801 } 1803 }
1802 apic_printk(APIC_VERBOSE," failed :(.\n"); 1804 apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
1803 panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); 1805 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
1806 "report. Then try booting with the 'noapic' option.\n");
1804out: 1807out:
1805 local_irq_restore(flags); 1808 local_irq_restore(flags);
1806} 1809}
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
index 5921e5f0a640..1c3a66a67f83 100644
--- a/arch/x86/kernel/io_delay.c
+++ b/arch/x86/kernel/io_delay.c
@@ -103,6 +103,9 @@ void __init io_delay_init(void)
103 103
104static int __init io_delay_param(char *s) 104static int __init io_delay_param(char *s)
105{ 105{
106 if (!s)
107 return -EINVAL;
108
106 if (!strcmp(s, "0x80")) 109 if (!strcmp(s, "0x80"))
107 io_delay_type = CONFIG_IO_DELAY_TYPE_0X80; 110 io_delay_type = CONFIG_IO_DELAY_TYPE_0X80;
108 else if (!strcmp(s, "0xed")) 111 else if (!strcmp(s, "0xed"))
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c
index 9d98cda39ad9..3f7537b669d3 100644
--- a/arch/x86/kernel/ipi.c
+++ b/arch/x86/kernel/ipi.c
@@ -70,7 +70,7 @@ void __send_IPI_shortcut(unsigned int shortcut, int vector)
70 /* 70 /*
71 * Send the IPI. The write to APIC_ICR fires this off. 71 * Send the IPI. The write to APIC_ICR fires this off.
72 */ 72 */
73 apic_write_around(APIC_ICR, cfg); 73 apic_write(APIC_ICR, cfg);
74} 74}
75 75
76void send_IPI_self(int vector) 76void send_IPI_self(int vector)
@@ -98,7 +98,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
98 * prepare target chip field 98 * prepare target chip field
99 */ 99 */
100 cfg = __prepare_ICR2(mask); 100 cfg = __prepare_ICR2(mask);
101 apic_write_around(APIC_ICR2, cfg); 101 apic_write(APIC_ICR2, cfg);
102 102
103 /* 103 /*
104 * program the ICR 104 * program the ICR
@@ -108,7 +108,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
108 /* 108 /*
109 * Send the IPI. The write to APIC_ICR fires this off. 109 * Send the IPI. The write to APIC_ICR fires this off.
110 */ 110 */
111 apic_write_around(APIC_ICR, cfg); 111 apic_write(APIC_ICR, cfg);
112} 112}
113 113
114/* 114/*
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 47a6f6f12478..1cf8c1fcc088 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -83,11 +83,8 @@ union irq_ctx {
83static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; 83static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
84static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; 84static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
85 85
86static char softirq_stack[NR_CPUS * THREAD_SIZE] 86static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
87 __attribute__((__section__(".bss.page_aligned"))); 87static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
88
89static char hardirq_stack[NR_CPUS * THREAD_SIZE]
90 __attribute__((__section__(".bss.page_aligned")));
91 88
92static void call_on_stack(void *func, void *stack) 89static void call_on_stack(void *func, void *stack)
93{ 90{
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 0373e88de95a..1f26fd9ec4f4 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -43,10 +43,11 @@
43 43
44#define BUILD_IRQ(nr) \ 44#define BUILD_IRQ(nr) \
45 asmlinkage void IRQ_NAME(nr); \ 45 asmlinkage void IRQ_NAME(nr); \
46 asm("\n.p2align\n" \ 46 asm("\n.text\n.p2align\n" \
47 "IRQ" #nr "_interrupt:\n\t" \ 47 "IRQ" #nr "_interrupt:\n\t" \
48 "push $~(" #nr ") ; " \ 48 "push $~(" #nr ") ; " \
49 "jmp common_interrupt"); 49 "jmp common_interrupt\n" \
50 ".previous");
50 51
51#define BI(x,y) \ 52#define BI(x,y) \
52 BUILD_IRQ(x##y) 53 BUILD_IRQ(x##y)
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index c03205991718..f2d43bc75514 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -12,9 +12,13 @@
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/io.h> 13#include <linux/io.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/module.h>
15 16
16#include <asm/setup.h> 17#include <asm/setup.h>
17 18
19struct dentry *arch_debugfs_dir;
20EXPORT_SYMBOL(arch_debugfs_dir);
21
18#ifdef CONFIG_DEBUG_BOOT_PARAMS 22#ifdef CONFIG_DEBUG_BOOT_PARAMS
19struct setup_data_node { 23struct setup_data_node {
20 u64 paddr; 24 u64 paddr;
@@ -209,6 +213,10 @@ static int __init arch_kdebugfs_init(void)
209{ 213{
210 int error = 0; 214 int error = 0;
211 215
216 arch_debugfs_dir = debugfs_create_dir("x86", NULL);
217 if (!arch_debugfs_dir)
218 return -ENOMEM;
219
212#ifdef CONFIG_DEBUG_BOOT_PARAMS 220#ifdef CONFIG_DEBUG_BOOT_PARAMS
213 error = boot_params_kdebugfs_init(); 221 error = boot_params_kdebugfs_init();
214#endif 222#endif
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index b8c6743a13da..6c27679ec6aa 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -431,7 +431,6 @@ static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
431 regs->ip = (unsigned long)p->ainsn.insn; 431 regs->ip = (unsigned long)p->ainsn.insn;
432} 432}
433 433
434/* Called with kretprobe_lock held */
435void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, 434void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
436 struct pt_regs *regs) 435 struct pt_regs *regs)
437{ 436{
@@ -682,8 +681,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
682 unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; 681 unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
683 682
684 INIT_HLIST_HEAD(&empty_rp); 683 INIT_HLIST_HEAD(&empty_rp);
685 spin_lock_irqsave(&kretprobe_lock, flags); 684 kretprobe_hash_lock(current, &head, &flags);
686 head = kretprobe_inst_table_head(current);
687 /* fixup registers */ 685 /* fixup registers */
688#ifdef CONFIG_X86_64 686#ifdef CONFIG_X86_64
689 regs->cs = __KERNEL_CS; 687 regs->cs = __KERNEL_CS;
@@ -732,7 +730,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
732 730
733 kretprobe_assert(ri, orig_ret_address, trampoline_address); 731 kretprobe_assert(ri, orig_ret_address, trampoline_address);
734 732
735 spin_unlock_irqrestore(&kretprobe_lock, flags); 733 kretprobe_hash_unlock(current, &flags);
736 734
737 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { 735 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
738 hlist_del(&ri->hlist); 736 hlist_del(&ri->hlist);
@@ -860,7 +858,6 @@ static int __kprobes post_kprobe_handler(struct pt_regs *regs)
860 858
861 resume_execution(cur, regs, kcb); 859 resume_execution(cur, regs, kcb);
862 regs->flags |= kcb->kprobe_saved_flags; 860 regs->flags |= kcb->kprobe_saved_flags;
863 trace_hardirqs_fixup_flags(regs->flags);
864 861
865 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { 862 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
866 kcb->kprobe_status = KPROBE_HIT_SSDONE; 863 kcb->kprobe_status = KPROBE_HIT_SSDONE;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 87edf1ceb1df..d02def06ca91 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -113,7 +113,7 @@ static void kvm_setup_secondary_clock(void)
113#endif 113#endif
114 114
115#ifdef CONFIG_SMP 115#ifdef CONFIG_SMP
116void __init kvm_smp_prepare_boot_cpu(void) 116static void __init kvm_smp_prepare_boot_cpu(void)
117{ 117{
118 WARN_ON(kvm_register_clock("primary cpu clock")); 118 WARN_ON(kvm_register_clock("primary cpu clock"));
119 native_smp_prepare_boot_cpu(); 119 native_smp_prepare_boot_cpu();
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index a8449571858a..3fee2aa50f3f 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -62,12 +62,12 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
62 62
63 if (reload) { 63 if (reload) {
64#ifdef CONFIG_SMP 64#ifdef CONFIG_SMP
65 cpumask_t mask; 65 cpumask_of_cpu_ptr_declare(mask);
66 66
67 preempt_disable(); 67 preempt_disable();
68 load_LDT(pc); 68 load_LDT(pc);
69 mask = cpumask_of_cpu(smp_processor_id()); 69 cpumask_of_cpu_ptr_next(mask, smp_processor_id());
70 if (!cpus_equal(current->mm->cpu_vm_mask, mask)) 70 if (!cpus_equal(current->mm->cpu_vm_mask, *mask))
71 smp_call_function(flush_ldt, current->mm, 1); 71 smp_call_function(flush_ldt, current->mm, 1);
72 preempt_enable(); 72 preempt_enable();
73#else 73#else
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 8864230d55af..9fe478d98406 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -22,6 +22,7 @@
22#include <asm/cpufeature.h> 22#include <asm/cpufeature.h>
23#include <asm/desc.h> 23#include <asm/desc.h>
24#include <asm/system.h> 24#include <asm/system.h>
25#include <asm/cacheflush.h>
25 26
26#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) 27#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
27static u32 kexec_pgd[1024] PAGE_ALIGNED; 28static u32 kexec_pgd[1024] PAGE_ALIGNED;
@@ -85,10 +86,12 @@ static void load_segments(void)
85 * reboot code buffer to allow us to avoid allocations 86 * reboot code buffer to allow us to avoid allocations
86 * later. 87 * later.
87 * 88 *
88 * Currently nothing. 89 * Make control page executable.
89 */ 90 */
90int machine_kexec_prepare(struct kimage *image) 91int machine_kexec_prepare(struct kimage *image)
91{ 92{
93 if (nx_enabled)
94 set_pages_x(image->control_code_page, 1);
92 return 0; 95 return 0;
93} 96}
94 97
@@ -98,27 +101,48 @@ int machine_kexec_prepare(struct kimage *image)
98 */ 101 */
99void machine_kexec_cleanup(struct kimage *image) 102void machine_kexec_cleanup(struct kimage *image)
100{ 103{
104 if (nx_enabled)
105 set_pages_nx(image->control_code_page, 1);
101} 106}
102 107
103/* 108/*
104 * Do not allocate memory (or fail in any way) in machine_kexec(). 109 * Do not allocate memory (or fail in any way) in machine_kexec().
105 * We are past the point of no return, committed to rebooting now. 110 * We are past the point of no return, committed to rebooting now.
106 */ 111 */
107NORET_TYPE void machine_kexec(struct kimage *image) 112void machine_kexec(struct kimage *image)
108{ 113{
109 unsigned long page_list[PAGES_NR]; 114 unsigned long page_list[PAGES_NR];
110 void *control_page; 115 void *control_page;
116 asmlinkage unsigned long
117 (*relocate_kernel_ptr)(unsigned long indirection_page,
118 unsigned long control_page,
119 unsigned long start_address,
120 unsigned int has_pae,
121 unsigned int preserve_context);
111 122
112 tracer_disable(); 123 tracer_disable();
113 124
114 /* Interrupts aren't acceptable while we reboot */ 125 /* Interrupts aren't acceptable while we reboot */
115 local_irq_disable(); 126 local_irq_disable();
116 127
128 if (image->preserve_context) {
129#ifdef CONFIG_X86_IO_APIC
130 /* We need to put APICs in legacy mode so that we can
131 * get timer interrupts in second kernel. kexec/kdump
132 * paths already have calls to disable_IO_APIC() in
133 * one form or other. kexec jump path also need
134 * one.
135 */
136 disable_IO_APIC();
137#endif
138 }
139
117 control_page = page_address(image->control_code_page); 140 control_page = page_address(image->control_code_page);
118 memcpy(control_page, relocate_kernel, PAGE_SIZE); 141 memcpy(control_page, relocate_kernel, PAGE_SIZE/2);
119 142
143 relocate_kernel_ptr = control_page;
120 page_list[PA_CONTROL_PAGE] = __pa(control_page); 144 page_list[PA_CONTROL_PAGE] = __pa(control_page);
121 page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; 145 page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
122 page_list[PA_PGD] = __pa(kexec_pgd); 146 page_list[PA_PGD] = __pa(kexec_pgd);
123 page_list[VA_PGD] = (unsigned long)kexec_pgd; 147 page_list[VA_PGD] = (unsigned long)kexec_pgd;
124#ifdef CONFIG_X86_PAE 148#ifdef CONFIG_X86_PAE
@@ -131,6 +155,7 @@ NORET_TYPE void machine_kexec(struct kimage *image)
131 page_list[VA_PTE_0] = (unsigned long)kexec_pte0; 155 page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
132 page_list[PA_PTE_1] = __pa(kexec_pte1); 156 page_list[PA_PTE_1] = __pa(kexec_pte1);
133 page_list[VA_PTE_1] = (unsigned long)kexec_pte1; 157 page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
158 page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT);
134 159
135 /* The segment registers are funny things, they have both a 160 /* The segment registers are funny things, they have both a
136 * visible and an invisible part. Whenever the visible part is 161 * visible and an invisible part. Whenever the visible part is
@@ -149,8 +174,10 @@ NORET_TYPE void machine_kexec(struct kimage *image)
149 set_idt(phys_to_virt(0),0); 174 set_idt(phys_to_virt(0),0);
150 175
151 /* now call it */ 176 /* now call it */
152 relocate_kernel((unsigned long)image->head, (unsigned long)page_list, 177 image->start = relocate_kernel_ptr((unsigned long)image->head,
153 image->start, cpu_has_pae); 178 (unsigned long)page_list,
179 image->start, cpu_has_pae,
180 image->preserve_context);
154} 181}
155 182
156void arch_crash_save_vmcoreinfo(void) 183void arch_crash_save_vmcoreinfo(void)
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 9dd9262693a3..c43caa3a91f3 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -181,7 +181,7 @@ void machine_kexec_cleanup(struct kimage *image)
181 * Do not allocate memory (or fail in any way) in machine_kexec(). 181 * Do not allocate memory (or fail in any way) in machine_kexec().
182 * We are past the point of no return, committed to rebooting now. 182 * We are past the point of no return, committed to rebooting now.
183 */ 183 */
184NORET_TYPE void machine_kexec(struct kimage *image) 184void machine_kexec(struct kimage *image)
185{ 185{
186 unsigned long page_list[PAGES_NR]; 186 unsigned long page_list[PAGES_NR];
187 void *control_page; 187 void *control_page;
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
index 56b933119a04..6994c751590e 100644
--- a/arch/x86/kernel/microcode.c
+++ b/arch/x86/kernel/microcode.c
@@ -388,6 +388,7 @@ static int do_microcode_update (void)
388 void *new_mc = NULL; 388 void *new_mc = NULL;
389 int cpu; 389 int cpu;
390 cpumask_t old; 390 cpumask_t old;
391 cpumask_of_cpu_ptr_declare(newmask);
391 392
392 old = current->cpus_allowed; 393 old = current->cpus_allowed;
393 394
@@ -404,7 +405,8 @@ static int do_microcode_update (void)
404 405
405 if (!uci->valid) 406 if (!uci->valid)
406 continue; 407 continue;
407 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 408 cpumask_of_cpu_ptr_next(newmask, cpu);
409 set_cpus_allowed_ptr(current, newmask);
408 error = get_maching_microcode(new_mc, cpu); 410 error = get_maching_microcode(new_mc, cpu);
409 if (error < 0) 411 if (error < 0)
410 goto out; 412 goto out;
@@ -574,6 +576,7 @@ static int apply_microcode_check_cpu(int cpu)
574 struct cpuinfo_x86 *c = &cpu_data(cpu); 576 struct cpuinfo_x86 *c = &cpu_data(cpu);
575 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 577 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
576 cpumask_t old; 578 cpumask_t old;
579 cpumask_of_cpu_ptr(newmask, cpu);
577 unsigned int val[2]; 580 unsigned int val[2];
578 int err = 0; 581 int err = 0;
579 582
@@ -582,7 +585,7 @@ static int apply_microcode_check_cpu(int cpu)
582 return 0; 585 return 0;
583 586
584 old = current->cpus_allowed; 587 old = current->cpus_allowed;
585 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 588 set_cpus_allowed_ptr(current, newmask);
586 589
587 /* Check if the microcode we have in memory matches the CPU */ 590 /* Check if the microcode we have in memory matches the CPU */
588 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || 591 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
@@ -620,11 +623,12 @@ static int apply_microcode_check_cpu(int cpu)
620static void microcode_init_cpu(int cpu, int resume) 623static void microcode_init_cpu(int cpu, int resume)
621{ 624{
622 cpumask_t old; 625 cpumask_t old;
626 cpumask_of_cpu_ptr(newmask, cpu);
623 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 627 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
624 628
625 old = current->cpus_allowed; 629 old = current->cpus_allowed;
626 630
627 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 631 set_cpus_allowed_ptr(current, newmask);
628 mutex_lock(&microcode_mutex); 632 mutex_lock(&microcode_mutex);
629 collect_cpu_info(cpu); 633 collect_cpu_info(cpu);
630 if (uci->valid && system_state == SYSTEM_RUNNING && !resume) 634 if (uci->valid && system_state == SYSTEM_RUNNING && !resume)
@@ -644,7 +648,9 @@ static void microcode_fini_cpu(int cpu)
644 mutex_unlock(&microcode_mutex); 648 mutex_unlock(&microcode_mutex);
645} 649}
646 650
647static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz) 651static ssize_t reload_store(struct sys_device *dev,
652 struct sysdev_attribute *attr,
653 const char *buf, size_t sz)
648{ 654{
649 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; 655 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
650 char *end; 656 char *end;
@@ -656,11 +662,12 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
656 return -EINVAL; 662 return -EINVAL;
657 if (val == 1) { 663 if (val == 1) {
658 cpumask_t old; 664 cpumask_t old;
665 cpumask_of_cpu_ptr(newmask, cpu);
659 666
660 old = current->cpus_allowed; 667 old = current->cpus_allowed;
661 668
662 get_online_cpus(); 669 get_online_cpus();
663 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); 670 set_cpus_allowed_ptr(current, newmask);
664 671
665 mutex_lock(&microcode_mutex); 672 mutex_lock(&microcode_mutex);
666 if (uci->valid) 673 if (uci->valid)
@@ -674,14 +681,16 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
674 return sz; 681 return sz;
675} 682}
676 683
677static ssize_t version_show(struct sys_device *dev, char *buf) 684static ssize_t version_show(struct sys_device *dev,
685 struct sysdev_attribute *attr, char *buf)
678{ 686{
679 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; 687 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
680 688
681 return sprintf(buf, "0x%x\n", uci->rev); 689 return sprintf(buf, "0x%x\n", uci->rev);
682} 690}
683 691
684static ssize_t pf_show(struct sys_device *dev, char *buf) 692static ssize_t pf_show(struct sys_device *dev,
693 struct sysdev_attribute *attr, char *buf)
685{ 694{
686 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; 695 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
687 696
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c
index a888e67f5874..6ba87830d4b1 100644
--- a/arch/x86/kernel/module_64.c
+++ b/arch/x86/kernel/module_64.c
@@ -22,6 +22,7 @@
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/string.h> 23#include <linux/string.h>
24#include <linux/kernel.h> 24#include <linux/kernel.h>
25#include <linux/mm.h>
25#include <linux/slab.h> 26#include <linux/slab.h>
26#include <linux/bug.h> 27#include <linux/bug.h>
27 28
@@ -150,7 +151,8 @@ int module_finalize(const Elf_Ehdr *hdr,
150 const Elf_Shdr *sechdrs, 151 const Elf_Shdr *sechdrs,
151 struct module *me) 152 struct module *me)
152{ 153{
153 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; 154 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
155 *para = NULL;
154 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; 156 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
155 157
156 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { 158 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -160,6 +162,8 @@ int module_finalize(const Elf_Ehdr *hdr,
160 alt = s; 162 alt = s;
161 if (!strcmp(".smp_locks", secstrings + s->sh_name)) 163 if (!strcmp(".smp_locks", secstrings + s->sh_name))
162 locks= s; 164 locks= s;
165 if (!strcmp(".parainstructions", secstrings + s->sh_name))
166 para = s;
163 } 167 }
164 168
165 if (alt) { 169 if (alt) {
@@ -175,6 +179,11 @@ int module_finalize(const Elf_Ehdr *hdr,
175 tseg, tseg + text->sh_size); 179 tseg, tseg + text->sh_size);
176 } 180 }
177 181
182 if (para) {
183 void *pseg = (void *)para->sh_addr;
184 apply_paravirt(pseg, pseg + para->sh_size);
185 }
186
178 return module_bug_finalize(hdr, sechdrs, me); 187 return module_bug_finalize(hdr, sechdrs, me);
179} 188}
180 189
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 3b25e49380c6..6ae005ccaed8 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -27,6 +27,7 @@
27#include <asm/bios_ebda.h> 27#include <asm/bios_ebda.h>
28#include <asm/e820.h> 28#include <asm/e820.h>
29#include <asm/trampoline.h> 29#include <asm/trampoline.h>
30#include <asm/setup.h>
30 31
31#include <mach_apic.h> 32#include <mach_apic.h>
32#ifdef CONFIG_X86_32 33#ifdef CONFIG_X86_32
@@ -48,76 +49,6 @@ static int __init mpf_checksum(unsigned char *mp, int len)
48 return sum & 0xFF; 49 return sum & 0xFF;
49} 50}
50 51
51#ifdef CONFIG_X86_NUMAQ
52int found_numaq;
53/*
54 * Have to match translation table entries to main table entries by counter
55 * hence the mpc_record variable .... can't see a less disgusting way of
56 * doing this ....
57 */
58struct mpc_config_translation {
59 unsigned char mpc_type;
60 unsigned char trans_len;
61 unsigned char trans_type;
62 unsigned char trans_quad;
63 unsigned char trans_global;
64 unsigned char trans_local;
65 unsigned short trans_reserved;
66};
67
68
69static int mpc_record;
70static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
71 __cpuinitdata;
72
73static inline int generate_logical_apicid(int quad, int phys_apicid)
74{
75 return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
76}
77
78
79static inline int mpc_apic_id(struct mpc_config_processor *m,
80 struct mpc_config_translation *translation_record)
81{
82 int quad = translation_record->trans_quad;
83 int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
84
85 printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
86 m->mpc_apicid,
87 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
88 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
89 m->mpc_apicver, quad, logical_apicid);
90 return logical_apicid;
91}
92
93int mp_bus_id_to_node[MAX_MP_BUSSES];
94
95int mp_bus_id_to_local[MAX_MP_BUSSES];
96
97static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name,
98 struct mpc_config_translation *translation)
99{
100 int quad = translation->trans_quad;
101 int local = translation->trans_local;
102
103 mp_bus_id_to_node[m->mpc_busid] = quad;
104 mp_bus_id_to_local[m->mpc_busid] = local;
105 printk(KERN_INFO "Bus #%d is %s (node %d)\n",
106 m->mpc_busid, name, quad);
107}
108
109int quad_local_to_mp_bus_id [NR_CPUS/4][4];
110static void mpc_oem_pci_bus(struct mpc_config_bus *m,
111 struct mpc_config_translation *translation)
112{
113 int quad = translation->trans_quad;
114 int local = translation->trans_local;
115
116 quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
117}
118
119#endif
120
121static void __cpuinit MP_processor_info(struct mpc_config_processor *m) 52static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
122{ 53{
123 int apicid; 54 int apicid;
@@ -127,14 +58,12 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
127 disabled_cpus++; 58 disabled_cpus++;
128 return; 59 return;
129 } 60 }
130#ifdef CONFIG_X86_NUMAQ 61
131 if (found_numaq) 62 if (x86_quirks->mpc_apic_id)
132 apicid = mpc_apic_id(m, translation_table[mpc_record]); 63 apicid = x86_quirks->mpc_apic_id(m);
133 else 64 else
134 apicid = m->mpc_apicid; 65 apicid = m->mpc_apicid;
135#else 66
136 apicid = m->mpc_apicid;
137#endif
138 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { 67 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
139 bootup_cpu = " (Bootup-CPU)"; 68 bootup_cpu = " (Bootup-CPU)";
140 boot_cpu_physical_apicid = m->mpc_apicid; 69 boot_cpu_physical_apicid = m->mpc_apicid;
@@ -151,12 +80,10 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
151 memcpy(str, m->mpc_bustype, 6); 80 memcpy(str, m->mpc_bustype, 6);
152 str[6] = 0; 81 str[6] = 0;
153 82
154#ifdef CONFIG_X86_NUMAQ 83 if (x86_quirks->mpc_oem_bus_info)
155 if (found_numaq) 84 x86_quirks->mpc_oem_bus_info(m, str);
156 mpc_oem_bus_info(m, str, translation_table[mpc_record]); 85 else
157#else 86 printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str);
158 printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str);
159#endif
160 87
161#if MAX_MP_BUSSES < 256 88#if MAX_MP_BUSSES < 256
162 if (m->mpc_busid >= MAX_MP_BUSSES) { 89 if (m->mpc_busid >= MAX_MP_BUSSES) {
@@ -173,10 +100,9 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
173 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; 100 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
174#endif 101#endif
175 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { 102 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
176#ifdef CONFIG_X86_NUMAQ 103 if (x86_quirks->mpc_oem_pci_bus)
177 if (found_numaq) 104 x86_quirks->mpc_oem_pci_bus(m);
178 mpc_oem_pci_bus(m, translation_table[mpc_record]); 105
179#endif
180 clear_bit(m->mpc_busid, mp_bus_not_pci); 106 clear_bit(m->mpc_busid, mp_bus_not_pci);
181#if defined(CONFIG_EISA) || defined (CONFIG_MCA) 107#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
182 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; 108 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
@@ -316,83 +242,6 @@ static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
316 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); 242 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
317} 243}
318 244
319#ifdef CONFIG_X86_NUMAQ
320static void __init MP_translation_info(struct mpc_config_translation *m)
321{
322 printk(KERN_INFO
323 "Translation: record %d, type %d, quad %d, global %d, local %d\n",
324 mpc_record, m->trans_type, m->trans_quad, m->trans_global,
325 m->trans_local);
326
327 if (mpc_record >= MAX_MPC_ENTRY)
328 printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
329 else
330 translation_table[mpc_record] = m; /* stash this for later */
331 if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
332 node_set_online(m->trans_quad);
333}
334
335/*
336 * Read/parse the MPC oem tables
337 */
338
339static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
340 unsigned short oemsize)
341{
342 int count = sizeof(*oemtable); /* the header size */
343 unsigned char *oemptr = ((unsigned char *)oemtable) + count;
344
345 mpc_record = 0;
346 printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
347 oemtable);
348 if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
349 printk(KERN_WARNING
350 "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
351 oemtable->oem_signature[0], oemtable->oem_signature[1],
352 oemtable->oem_signature[2], oemtable->oem_signature[3]);
353 return;
354 }
355 if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
356 printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
357 return;
358 }
359 while (count < oemtable->oem_length) {
360 switch (*oemptr) {
361 case MP_TRANSLATION:
362 {
363 struct mpc_config_translation *m =
364 (struct mpc_config_translation *)oemptr;
365 MP_translation_info(m);
366 oemptr += sizeof(*m);
367 count += sizeof(*m);
368 ++mpc_record;
369 break;
370 }
371 default:
372 {
373 printk(KERN_WARNING
374 "Unrecognised OEM table entry type! - %d\n",
375 (int)*oemptr);
376 return;
377 }
378 }
379 }
380}
381
382void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
383 char *productid)
384{
385 if (strncmp(oem, "IBM NUMA", 8))
386 printk("Warning! Not a NUMA-Q system!\n");
387 else
388 found_numaq = 1;
389
390 if (mpc->mpc_oemptr)
391 smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
392 mpc->mpc_oemsize);
393}
394#endif /* CONFIG_X86_NUMAQ */
395
396/* 245/*
397 * Read/parse the MPC 246 * Read/parse the MPC
398 */ 247 */
@@ -457,7 +306,6 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
457 } else 306 } else
458 mps_oem_check(mpc, oem, str); 307 mps_oem_check(mpc, oem, str);
459#endif 308#endif
460
461 /* save the local APIC address, it might be non-default */ 309 /* save the local APIC address, it might be non-default */
462 if (!acpi_lapic) 310 if (!acpi_lapic)
463 mp_lapic_addr = mpc->mpc_lapic; 311 mp_lapic_addr = mpc->mpc_lapic;
@@ -465,12 +313,17 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
465 if (early) 313 if (early)
466 return 1; 314 return 1;
467 315
316 if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) {
317 struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr;
318 x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize);
319 }
320
468 /* 321 /*
469 * Now process the configuration blocks. 322 * Now process the configuration blocks.
470 */ 323 */
471#ifdef CONFIG_X86_NUMAQ 324 if (x86_quirks->mpc_record)
472 mpc_record = 0; 325 *x86_quirks->mpc_record = 0;
473#endif 326
474 while (count < mpc->mpc_length) { 327 while (count < mpc->mpc_length) {
475 switch (*mpt) { 328 switch (*mpt) {
476 case MP_PROCESSOR: 329 case MP_PROCESSOR:
@@ -536,9 +389,8 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
536 count = mpc->mpc_length; 389 count = mpc->mpc_length;
537 break; 390 break;
538 } 391 }
539#ifdef CONFIG_X86_NUMAQ 392 if (x86_quirks->mpc_record)
540 ++mpc_record; 393 (*x86_quirks->mpc_record)++;
541#endif
542 } 394 }
543 395
544#ifdef CONFIG_X86_GENERICARCH 396#ifdef CONFIG_X86_GENERICARCH
@@ -726,20 +578,14 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
726static struct intel_mp_floating *mpf_found; 578static struct intel_mp_floating *mpf_found;
727 579
728/* 580/*
729 * Machine specific quirk for finding the SMP config before other setup
730 * activities destroy the table:
731 */
732int (*mach_get_smp_config_quirk)(unsigned int early);
733
734/*
735 * Scan the memory blocks for an SMP configuration block. 581 * Scan the memory blocks for an SMP configuration block.
736 */ 582 */
737static void __init __get_smp_config(unsigned int early) 583static void __init __get_smp_config(unsigned int early)
738{ 584{
739 struct intel_mp_floating *mpf = mpf_found; 585 struct intel_mp_floating *mpf = mpf_found;
740 586
741 if (mach_get_smp_config_quirk) { 587 if (x86_quirks->mach_get_smp_config) {
742 if (mach_get_smp_config_quirk(early)) 588 if (x86_quirks->mach_get_smp_config(early))
743 return; 589 return;
744 } 590 }
745 if (acpi_lapic && early) 591 if (acpi_lapic && early)
@@ -899,14 +745,12 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
899 return 0; 745 return 0;
900} 746}
901 747
902int (*mach_find_smp_config_quirk)(unsigned int reserve);
903
904static void __init __find_smp_config(unsigned int reserve) 748static void __init __find_smp_config(unsigned int reserve)
905{ 749{
906 unsigned int address; 750 unsigned int address;
907 751
908 if (mach_find_smp_config_quirk) { 752 if (x86_quirks->mach_find_smp_config) {
909 if (mach_find_smp_config_quirk(reserve)) 753 if (x86_quirks->mach_find_smp_config(reserve))
910 return; 754 return;
911 } 755 }
912 /* 756 /*
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index a153b3905f60..9fd809552447 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -149,8 +149,8 @@ static int __cpuinit msr_device_create(int cpu)
149{ 149{
150 struct device *dev; 150 struct device *dev;
151 151
152 dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), 152 dev = device_create_drvdata(msr_class, NULL, MKDEV(MSR_MAJOR, cpu),
153 "msr%d", cpu); 153 NULL, "msr%d", cpu);
154 return IS_ERR(dev) ? PTR_ERR(dev) : 0; 154 return IS_ERR(dev) ? PTR_ERR(dev) : 0;
155} 155}
156 156
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index ec024b3baad0..ac6d51222e7d 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -263,7 +263,7 @@ late_initcall(init_lapic_nmi_sysfs);
263 263
264static void __acpi_nmi_enable(void *__unused) 264static void __acpi_nmi_enable(void *__unused)
265{ 265{
266 apic_write_around(APIC_LVT0, APIC_DM_NMI); 266 apic_write(APIC_LVT0, APIC_DM_NMI);
267} 267}
268 268
269/* 269/*
@@ -277,7 +277,7 @@ void acpi_nmi_enable(void)
277 277
278static void __acpi_nmi_disable(void *__unused) 278static void __acpi_nmi_disable(void *__unused)
279{ 279{
280 apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); 280 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
281} 281}
282 282
283/* 283/*
@@ -448,6 +448,13 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
448 448
449#ifdef CONFIG_SYSCTL 449#ifdef CONFIG_SYSCTL
450 450
451static int __init setup_unknown_nmi_panic(char *str)
452{
453 unknown_nmi_panic = 1;
454 return 1;
455}
456__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
457
451static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) 458static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
452{ 459{
453 unsigned char reason = get_nmi_reason(); 460 unsigned char reason = get_nmi_reason();
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index a23e8233b9ac..b8c45610b20a 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -33,6 +33,7 @@
33#include <asm/processor.h> 33#include <asm/processor.h>
34#include <asm/mpspec.h> 34#include <asm/mpspec.h>
35#include <asm/e820.h> 35#include <asm/e820.h>
36#include <asm/setup.h>
36 37
37#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) 38#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
38 39
@@ -71,6 +72,188 @@ static void __init smp_dump_qct(void)
71 } 72 }
72} 73}
73 74
75
76void __init numaq_tsc_disable(void)
77{
78 if (!found_numaq)
79 return;
80
81 if (num_online_nodes() > 1) {
82 printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
83 setup_clear_cpu_cap(X86_FEATURE_TSC);
84 }
85}
86
87static int __init numaq_pre_time_init(void)
88{
89 numaq_tsc_disable();
90 return 0;
91}
92
93int found_numaq;
94/*
95 * Have to match translation table entries to main table entries by counter
96 * hence the mpc_record variable .... can't see a less disgusting way of
97 * doing this ....
98 */
99struct mpc_config_translation {
100 unsigned char mpc_type;
101 unsigned char trans_len;
102 unsigned char trans_type;
103 unsigned char trans_quad;
104 unsigned char trans_global;
105 unsigned char trans_local;
106 unsigned short trans_reserved;
107};
108
109/* x86_quirks member */
110static int mpc_record;
111static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
112 __cpuinitdata;
113
114static inline int generate_logical_apicid(int quad, int phys_apicid)
115{
116 return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
117}
118
119/* x86_quirks member */
120static int mpc_apic_id(struct mpc_config_processor *m)
121{
122 int quad = translation_table[mpc_record]->trans_quad;
123 int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
124
125 printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
126 m->mpc_apicid,
127 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
128 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
129 m->mpc_apicver, quad, logical_apicid);
130 return logical_apicid;
131}
132
133int mp_bus_id_to_node[MAX_MP_BUSSES];
134
135int mp_bus_id_to_local[MAX_MP_BUSSES];
136
137/* x86_quirks member */
138static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name)
139{
140 int quad = translation_table[mpc_record]->trans_quad;
141 int local = translation_table[mpc_record]->trans_local;
142
143 mp_bus_id_to_node[m->mpc_busid] = quad;
144 mp_bus_id_to_local[m->mpc_busid] = local;
145 printk(KERN_INFO "Bus #%d is %s (node %d)\n",
146 m->mpc_busid, name, quad);
147}
148
149int quad_local_to_mp_bus_id [NR_CPUS/4][4];
150
151/* x86_quirks member */
152static void mpc_oem_pci_bus(struct mpc_config_bus *m)
153{
154 int quad = translation_table[mpc_record]->trans_quad;
155 int local = translation_table[mpc_record]->trans_local;
156
157 quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
158}
159
160static void __init MP_translation_info(struct mpc_config_translation *m)
161{
162 printk(KERN_INFO
163 "Translation: record %d, type %d, quad %d, global %d, local %d\n",
164 mpc_record, m->trans_type, m->trans_quad, m->trans_global,
165 m->trans_local);
166
167 if (mpc_record >= MAX_MPC_ENTRY)
168 printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
169 else
170 translation_table[mpc_record] = m; /* stash this for later */
171 if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
172 node_set_online(m->trans_quad);
173}
174
175static int __init mpf_checksum(unsigned char *mp, int len)
176{
177 int sum = 0;
178
179 while (len--)
180 sum += *mp++;
181
182 return sum & 0xFF;
183}
184
185/*
186 * Read/parse the MPC oem tables
187 */
188
189static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
190 unsigned short oemsize)
191{
192 int count = sizeof(*oemtable); /* the header size */
193 unsigned char *oemptr = ((unsigned char *)oemtable) + count;
194
195 mpc_record = 0;
196 printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
197 oemtable);
198 if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
199 printk(KERN_WARNING
200 "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
201 oemtable->oem_signature[0], oemtable->oem_signature[1],
202 oemtable->oem_signature[2], oemtable->oem_signature[3]);
203 return;
204 }
205 if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
206 printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
207 return;
208 }
209 while (count < oemtable->oem_length) {
210 switch (*oemptr) {
211 case MP_TRANSLATION:
212 {
213 struct mpc_config_translation *m =
214 (struct mpc_config_translation *)oemptr;
215 MP_translation_info(m);
216 oemptr += sizeof(*m);
217 count += sizeof(*m);
218 ++mpc_record;
219 break;
220 }
221 default:
222 {
223 printk(KERN_WARNING
224 "Unrecognised OEM table entry type! - %d\n",
225 (int)*oemptr);
226 return;
227 }
228 }
229 }
230}
231
232static struct x86_quirks numaq_x86_quirks __initdata = {
233 .arch_pre_time_init = numaq_pre_time_init,
234 .arch_time_init = NULL,
235 .arch_pre_intr_init = NULL,
236 .arch_memory_setup = NULL,
237 .arch_intr_init = NULL,
238 .arch_trap_init = NULL,
239 .mach_get_smp_config = NULL,
240 .mach_find_smp_config = NULL,
241 .mpc_record = &mpc_record,
242 .mpc_apic_id = mpc_apic_id,
243 .mpc_oem_bus_info = mpc_oem_bus_info,
244 .mpc_oem_pci_bus = mpc_oem_pci_bus,
245 .smp_read_mpc_oem = smp_read_mpc_oem,
246};
247
248void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
249 char *productid)
250{
251 if (strncmp(oem, "IBM NUMA", 8))
252 printk("Warning! Not a NUMA-Q system!\n");
253 else
254 found_numaq = 1;
255}
256
74static __init void early_check_numaq(void) 257static __init void early_check_numaq(void)
75{ 258{
76 /* 259 /*
@@ -82,6 +265,9 @@ static __init void early_check_numaq(void)
82 */ 265 */
83 if (smp_found_config) 266 if (smp_found_config)
84 early_get_smp_config(); 267 early_get_smp_config();
268
269 if (found_numaq)
270 x86_quirks = &numaq_x86_quirks;
85} 271}
86 272
87int __init get_memcfg_numaq(void) 273int __init get_memcfg_numaq(void)
@@ -92,14 +278,3 @@ int __init get_memcfg_numaq(void)
92 smp_dump_qct(); 278 smp_dump_qct();
93 return 1; 279 return 1;
94} 280}
95
96void __init numaq_tsc_disable(void)
97{
98 if (!found_numaq)
99 return;
100
101 if (num_online_nodes() > 1) {
102 printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
103 setup_clear_cpu_cap(X86_FEATURE_TSC);
104 }
105}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index e0f571d58c19..94da4d52d798 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -29,6 +29,7 @@
29#include <asm/desc.h> 29#include <asm/desc.h>
30#include <asm/setup.h> 30#include <asm/setup.h>
31#include <asm/arch_hooks.h> 31#include <asm/arch_hooks.h>
32#include <asm/pgtable.h>
32#include <asm/time.h> 33#include <asm/time.h>
33#include <asm/pgalloc.h> 34#include <asm/pgalloc.h>
34#include <asm/irq.h> 35#include <asm/irq.h>
@@ -123,6 +124,7 @@ static void *get_call_destination(u8 type)
123 .pv_irq_ops = pv_irq_ops, 124 .pv_irq_ops = pv_irq_ops,
124 .pv_apic_ops = pv_apic_ops, 125 .pv_apic_ops = pv_apic_ops,
125 .pv_mmu_ops = pv_mmu_ops, 126 .pv_mmu_ops = pv_mmu_ops,
127 .pv_lock_ops = pv_lock_ops,
126 }; 128 };
127 return *((void **)&tmpl + type); 129 return *((void **)&tmpl + type);
128} 130}
@@ -266,6 +268,17 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
266 return __get_cpu_var(paravirt_lazy_mode); 268 return __get_cpu_var(paravirt_lazy_mode);
267} 269}
268 270
271void __init paravirt_use_bytelocks(void)
272{
273#ifdef CONFIG_SMP
274 pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
275 pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
276 pv_lock_ops.spin_lock = __byte_spin_lock;
277 pv_lock_ops.spin_trylock = __byte_spin_trylock;
278 pv_lock_ops.spin_unlock = __byte_spin_unlock;
279#endif
280}
281
269struct pv_info pv_info = { 282struct pv_info pv_info = {
270 .name = "bare hardware", 283 .name = "bare hardware",
271 .paravirt_enabled = 0, 284 .paravirt_enabled = 0,
@@ -361,7 +374,6 @@ struct pv_cpu_ops pv_cpu_ops = {
361struct pv_apic_ops pv_apic_ops = { 374struct pv_apic_ops pv_apic_ops = {
362#ifdef CONFIG_X86_LOCAL_APIC 375#ifdef CONFIG_X86_LOCAL_APIC
363 .apic_write = native_apic_write, 376 .apic_write = native_apic_write,
364 .apic_write_atomic = native_apic_write_atomic,
365 .apic_read = native_apic_read, 377 .apic_read = native_apic_read,
366 .setup_boot_clock = setup_boot_APIC_clock, 378 .setup_boot_clock = setup_boot_APIC_clock,
367 .setup_secondary_clock = setup_secondary_APIC_clock, 379 .setup_secondary_clock = setup_secondary_APIC_clock,
@@ -373,6 +385,9 @@ struct pv_mmu_ops pv_mmu_ops = {
373#ifndef CONFIG_X86_64 385#ifndef CONFIG_X86_64
374 .pagetable_setup_start = native_pagetable_setup_start, 386 .pagetable_setup_start = native_pagetable_setup_start,
375 .pagetable_setup_done = native_pagetable_setup_done, 387 .pagetable_setup_done = native_pagetable_setup_done,
388#else
389 .pagetable_setup_start = paravirt_nop,
390 .pagetable_setup_done = paravirt_nop,
376#endif 391#endif
377 392
378 .read_cr2 = native_read_cr2, 393 .read_cr2 = native_read_cr2,
@@ -428,7 +443,7 @@ struct pv_mmu_ops pv_mmu_ops = {
428#endif /* PAGETABLE_LEVELS >= 3 */ 443#endif /* PAGETABLE_LEVELS >= 3 */
429 444
430 .pte_val = native_pte_val, 445 .pte_val = native_pte_val,
431 .pte_flags = native_pte_val, 446 .pte_flags = native_pte_flags,
432 .pgd_val = native_pgd_val, 447 .pgd_val = native_pgd_val,
433 448
434 .make_pte = native_make_pte, 449 .make_pte = native_make_pte,
@@ -446,6 +461,18 @@ struct pv_mmu_ops pv_mmu_ops = {
446 .set_fixmap = native_set_fixmap, 461 .set_fixmap = native_set_fixmap,
447}; 462};
448 463
464struct pv_lock_ops pv_lock_ops = {
465#ifdef CONFIG_SMP
466 .spin_is_locked = __ticket_spin_is_locked,
467 .spin_is_contended = __ticket_spin_is_contended,
468
469 .spin_lock = __ticket_spin_lock,
470 .spin_trylock = __ticket_spin_trylock,
471 .spin_unlock = __ticket_spin_unlock,
472#endif
473};
474EXPORT_SYMBOL_GPL(pv_lock_ops);
475
449EXPORT_SYMBOL_GPL(pv_time_ops); 476EXPORT_SYMBOL_GPL(pv_time_ops);
450EXPORT_SYMBOL (pv_cpu_ops); 477EXPORT_SYMBOL (pv_cpu_ops);
451EXPORT_SYMBOL (pv_mmu_ops); 478EXPORT_SYMBOL (pv_mmu_ops);
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 6959b5c45df4..b67a4b1d4eae 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -29,6 +29,7 @@
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/spinlock.h> 30#include <linux/spinlock.h>
31#include <linux/string.h> 31#include <linux/string.h>
32#include <linux/crash_dump.h>
32#include <linux/dma-mapping.h> 33#include <linux/dma-mapping.h>
33#include <linux/bitops.h> 34#include <linux/bitops.h>
34#include <linux/pci_ids.h> 35#include <linux/pci_ids.h>
@@ -36,7 +37,8 @@
36#include <linux/delay.h> 37#include <linux/delay.h>
37#include <linux/scatterlist.h> 38#include <linux/scatterlist.h>
38#include <linux/iommu-helper.h> 39#include <linux/iommu-helper.h>
39#include <asm/gart.h> 40
41#include <asm/iommu.h>
40#include <asm/calgary.h> 42#include <asm/calgary.h>
41#include <asm/tce.h> 43#include <asm/tce.h>
42#include <asm/pci-direct.h> 44#include <asm/pci-direct.h>
@@ -167,6 +169,8 @@ static void calgary_dump_error_regs(struct iommu_table *tbl);
167static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); 169static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
168static void calioc2_tce_cache_blast(struct iommu_table *tbl); 170static void calioc2_tce_cache_blast(struct iommu_table *tbl);
169static void calioc2_dump_error_regs(struct iommu_table *tbl); 171static void calioc2_dump_error_regs(struct iommu_table *tbl);
172static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl);
173static void get_tce_space_from_tar(void);
170 174
171static struct cal_chipset_ops calgary_chip_ops = { 175static struct cal_chipset_ops calgary_chip_ops = {
172 .handle_quirks = calgary_handle_quirks, 176 .handle_quirks = calgary_handle_quirks,
@@ -410,22 +414,6 @@ static void calgary_unmap_sg(struct device *dev,
410 } 414 }
411} 415}
412 416
413static int calgary_nontranslate_map_sg(struct device* dev,
414 struct scatterlist *sg, int nelems, int direction)
415{
416 struct scatterlist *s;
417 int i;
418
419 for_each_sg(sg, s, nelems, i) {
420 struct page *p = sg_page(s);
421
422 BUG_ON(!p);
423 s->dma_address = virt_to_bus(sg_virt(s));
424 s->dma_length = s->length;
425 }
426 return nelems;
427}
428
429static int calgary_map_sg(struct device *dev, struct scatterlist *sg, 417static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
430 int nelems, int direction) 418 int nelems, int direction)
431{ 419{
@@ -436,9 +424,6 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
436 unsigned long entry; 424 unsigned long entry;
437 int i; 425 int i;
438 426
439 if (!translation_enabled(tbl))
440 return calgary_nontranslate_map_sg(dev, sg, nelems, direction);
441
442 for_each_sg(sg, s, nelems, i) { 427 for_each_sg(sg, s, nelems, i) {
443 BUG_ON(!sg_page(s)); 428 BUG_ON(!sg_page(s));
444 429
@@ -474,7 +459,6 @@ error:
474static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, 459static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
475 size_t size, int direction) 460 size_t size, int direction)
476{ 461{
477 dma_addr_t dma_handle = bad_dma_address;
478 void *vaddr = phys_to_virt(paddr); 462 void *vaddr = phys_to_virt(paddr);
479 unsigned long uaddr; 463 unsigned long uaddr;
480 unsigned int npages; 464 unsigned int npages;
@@ -483,12 +467,7 @@ static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
483 uaddr = (unsigned long)vaddr; 467 uaddr = (unsigned long)vaddr;
484 npages = num_dma_pages(uaddr, size); 468 npages = num_dma_pages(uaddr, size);
485 469
486 if (translation_enabled(tbl)) 470 return iommu_alloc(dev, tbl, vaddr, npages, direction);
487 dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction);
488 else
489 dma_handle = virt_to_bus(vaddr);
490
491 return dma_handle;
492} 471}
493 472
494static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, 473static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
@@ -497,9 +476,6 @@ static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
497 struct iommu_table *tbl = find_iommu_table(dev); 476 struct iommu_table *tbl = find_iommu_table(dev);
498 unsigned int npages; 477 unsigned int npages;
499 478
500 if (!translation_enabled(tbl))
501 return;
502
503 npages = num_dma_pages(dma_handle, size); 479 npages = num_dma_pages(dma_handle, size);
504 iommu_free(tbl, dma_handle, npages); 480 iommu_free(tbl, dma_handle, npages);
505} 481}
@@ -522,18 +498,12 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
522 goto error; 498 goto error;
523 memset(ret, 0, size); 499 memset(ret, 0, size);
524 500
525 if (translation_enabled(tbl)) { 501 /* set up tces to cover the allocated range */
526 /* set up tces to cover the allocated range */ 502 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
527 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); 503 if (mapping == bad_dma_address)
528 if (mapping == bad_dma_address) 504 goto free;
529 goto free; 505 *dma_handle = mapping;
530
531 *dma_handle = mapping;
532 } else /* non translated slot */
533 *dma_handle = virt_to_bus(ret);
534
535 return ret; 506 return ret;
536
537free: 507free:
538 free_pages((unsigned long)ret, get_order(size)); 508 free_pages((unsigned long)ret, get_order(size));
539 ret = NULL; 509 ret = NULL;
@@ -541,7 +511,7 @@ error:
541 return ret; 511 return ret;
542} 512}
543 513
544static const struct dma_mapping_ops calgary_dma_ops = { 514static struct dma_mapping_ops calgary_dma_ops = {
545 .alloc_coherent = calgary_alloc_coherent, 515 .alloc_coherent = calgary_alloc_coherent,
546 .map_single = calgary_map_single, 516 .map_single = calgary_map_single,
547 .unmap_single = calgary_unmap_single, 517 .unmap_single = calgary_unmap_single,
@@ -830,7 +800,11 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar)
830 800
831 tbl = pci_iommu(dev->bus); 801 tbl = pci_iommu(dev->bus);
832 tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space; 802 tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space;
833 tce_free(tbl, 0, tbl->it_size); 803
804 if (is_kdump_kernel())
805 calgary_init_bitmap_from_tce_table(tbl);
806 else
807 tce_free(tbl, 0, tbl->it_size);
834 808
835 if (is_calgary(dev->device)) 809 if (is_calgary(dev->device))
836 tbl->chip_ops = &calgary_chip_ops; 810 tbl->chip_ops = &calgary_chip_ops;
@@ -1209,6 +1183,10 @@ static int __init calgary_init(void)
1209 if (ret) 1183 if (ret)
1210 return ret; 1184 return ret;
1211 1185
1186 /* Purely for kdump kernel case */
1187 if (is_kdump_kernel())
1188 get_tce_space_from_tar();
1189
1212 do { 1190 do {
1213 dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); 1191 dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
1214 if (!dev) 1192 if (!dev)
@@ -1230,6 +1208,16 @@ static int __init calgary_init(void)
1230 goto error; 1208 goto error;
1231 } while (1); 1209 } while (1);
1232 1210
1211 dev = NULL;
1212 for_each_pci_dev(dev) {
1213 struct iommu_table *tbl;
1214
1215 tbl = find_iommu_table(&dev->dev);
1216
1217 if (translation_enabled(tbl))
1218 dev->dev.archdata.dma_ops = &calgary_dma_ops;
1219 }
1220
1233 return ret; 1221 return ret;
1234 1222
1235error: 1223error:
@@ -1251,6 +1239,7 @@ error:
1251 calgary_disable_translation(dev); 1239 calgary_disable_translation(dev);
1252 calgary_free_bus(dev); 1240 calgary_free_bus(dev);
1253 pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ 1241 pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
1242 dev->dev.archdata.dma_ops = NULL;
1254 } while (1); 1243 } while (1);
1255 1244
1256 return ret; 1245 return ret;
@@ -1339,6 +1328,61 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
1339 return (val != 0xffffffff); 1328 return (val != 0xffffffff);
1340} 1329}
1341 1330
1331/*
1332 * calgary_init_bitmap_from_tce_table():
1333 * Funtion for kdump case. In the second/kdump kernel initialize
1334 * the bitmap based on the tce table entries obtained from first kernel
1335 */
1336static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
1337{
1338 u64 *tp;
1339 unsigned int index;
1340 tp = ((u64 *)tbl->it_base);
1341 for (index = 0 ; index < tbl->it_size; index++) {
1342 if (*tp != 0x0)
1343 set_bit(index, tbl->it_map);
1344 tp++;
1345 }
1346}
1347
1348/*
1349 * get_tce_space_from_tar():
1350 * Function for kdump case. Get the tce tables from first kernel
1351 * by reading the contents of the base adress register of calgary iommu
1352 */
1353static void get_tce_space_from_tar()
1354{
1355 int bus;
1356 void __iomem *target;
1357 unsigned long tce_space;
1358
1359 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
1360 struct calgary_bus_info *info = &bus_info[bus];
1361 unsigned short pci_device;
1362 u32 val;
1363
1364 val = read_pci_config(bus, 0, 0, 0);
1365 pci_device = (val & 0xFFFF0000) >> 16;
1366
1367 if (!is_cal_pci_dev(pci_device))
1368 continue;
1369 if (info->translation_disabled)
1370 continue;
1371
1372 if (calgary_bus_has_devices(bus, pci_device) ||
1373 translate_empty_slots) {
1374 target = calgary_reg(bus_info[bus].bbar,
1375 tar_offset(bus));
1376 tce_space = be64_to_cpu(readq(target));
1377 tce_space = tce_space & TAR_SW_BITS;
1378
1379 tce_space = tce_space & (~specified_table_size);
1380 info->tce_space = (u64 *)__va(tce_space);
1381 }
1382 }
1383 return;
1384}
1385
1342void __init detect_calgary(void) 1386void __init detect_calgary(void)
1343{ 1387{
1344 int bus; 1388 int bus;
@@ -1394,7 +1438,8 @@ void __init detect_calgary(void)
1394 return; 1438 return;
1395 } 1439 }
1396 1440
1397 specified_table_size = determine_tce_table_size(max_pfn * PAGE_SIZE); 1441 specified_table_size = determine_tce_table_size((is_kdump_kernel() ?
1442 saved_max_pfn : max_pfn) * PAGE_SIZE);
1398 1443
1399 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { 1444 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
1400 struct calgary_bus_info *info = &bus_info[bus]; 1445 struct calgary_bus_info *info = &bus_info[bus];
@@ -1412,10 +1457,16 @@ void __init detect_calgary(void)
1412 1457
1413 if (calgary_bus_has_devices(bus, pci_device) || 1458 if (calgary_bus_has_devices(bus, pci_device) ||
1414 translate_empty_slots) { 1459 translate_empty_slots) {
1415 tbl = alloc_tce_table(); 1460 /*
1416 if (!tbl) 1461 * If it is kdump kernel, find and use tce tables
1417 goto cleanup; 1462 * from first kernel, else allocate tce tables here
1418 info->tce_space = tbl; 1463 */
1464 if (!is_kdump_kernel()) {
1465 tbl = alloc_tce_table();
1466 if (!tbl)
1467 goto cleanup;
1468 info->tce_space = tbl;
1469 }
1419 calgary_found = 1; 1470 calgary_found = 1;
1420 } 1471 }
1421 } 1472 }
@@ -1430,6 +1481,10 @@ void __init detect_calgary(void)
1430 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " 1481 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, "
1431 "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, 1482 "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size,
1432 debugging ? "enabled" : "disabled"); 1483 debugging ? "enabled" : "disabled");
1484
1485 /* swiotlb for devices that aren't behind the Calgary. */
1486 if (max_pfn > MAX_DMA32_PFN)
1487 swiotlb = 1;
1433 } 1488 }
1434 return; 1489 return;
1435 1490
@@ -1446,7 +1501,7 @@ int __init calgary_iommu_init(void)
1446{ 1501{
1447 int ret; 1502 int ret;
1448 1503
1449 if (no_iommu || swiotlb) 1504 if (no_iommu || (swiotlb && !calgary_detected))
1450 return -ENODEV; 1505 return -ENODEV;
1451 1506
1452 if (!calgary_detected) 1507 if (!calgary_detected)
@@ -1459,15 +1514,14 @@ int __init calgary_iommu_init(void)
1459 if (ret) { 1514 if (ret) {
1460 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " 1515 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
1461 "falling back to no_iommu\n", ret); 1516 "falling back to no_iommu\n", ret);
1462 if (max_pfn > MAX_DMA32_PFN)
1463 printk(KERN_ERR "WARNING more than 4GB of memory, "
1464 "32bit PCI may malfunction.\n");
1465 return ret; 1517 return ret;
1466 } 1518 }
1467 1519
1468 force_iommu = 1; 1520 force_iommu = 1;
1469 bad_dma_address = 0x0; 1521 bad_dma_address = 0x0;
1470 dma_ops = &calgary_dma_ops; 1522 /* dma_ops is set to swiotlb or nommu */
1523 if (!dma_ops)
1524 dma_ops = &nommu_dma_ops;
1471 1525
1472 return 0; 1526 return 0;
1473} 1527}
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 8467ec2320f1..37544123896d 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -5,14 +5,13 @@
5 5
6#include <asm/proto.h> 6#include <asm/proto.h>
7#include <asm/dma.h> 7#include <asm/dma.h>
8#include <asm/gart.h> 8#include <asm/iommu.h>
9#include <asm/calgary.h> 9#include <asm/calgary.h>
10#include <asm/amd_iommu.h> 10#include <asm/amd_iommu.h>
11 11
12int forbid_dac __read_mostly; 12static int forbid_dac __read_mostly;
13EXPORT_SYMBOL(forbid_dac);
14 13
15const struct dma_mapping_ops *dma_ops; 14struct dma_mapping_ops *dma_ops;
16EXPORT_SYMBOL(dma_ops); 15EXPORT_SYMBOL(dma_ops);
17 16
18static int iommu_sac_force __read_mostly; 17static int iommu_sac_force __read_mostly;
@@ -114,21 +113,15 @@ void __init pci_iommu_alloc(void)
114 * The order of these functions is important for 113 * The order of these functions is important for
115 * fall-back/fail-over reasons 114 * fall-back/fail-over reasons
116 */ 115 */
117#ifdef CONFIG_GART_IOMMU
118 gart_iommu_hole_init(); 116 gart_iommu_hole_init();
119#endif
120 117
121#ifdef CONFIG_CALGARY_IOMMU
122 detect_calgary(); 118 detect_calgary();
123#endif
124 119
125 detect_intel_iommu(); 120 detect_intel_iommu();
126 121
127 amd_iommu_detect(); 122 amd_iommu_detect();
128 123
129#ifdef CONFIG_SWIOTLB
130 pci_swiotlb_init(); 124 pci_swiotlb_init();
131#endif
132} 125}
133#endif 126#endif
134 127
@@ -184,9 +177,7 @@ static __init int iommu_setup(char *p)
184 swiotlb = 1; 177 swiotlb = 1;
185#endif 178#endif
186 179
187#ifdef CONFIG_GART_IOMMU
188 gart_parse_options(p); 180 gart_parse_options(p);
189#endif
190 181
191#ifdef CONFIG_CALGARY_IOMMU 182#ifdef CONFIG_CALGARY_IOMMU
192 if (!strncmp(p, "calgary", 7)) 183 if (!strncmp(p, "calgary", 7))
@@ -321,16 +312,17 @@ static int dma_release_coherent(struct device *dev, int order, void *vaddr)
321 312
322int dma_supported(struct device *dev, u64 mask) 313int dma_supported(struct device *dev, u64 mask)
323{ 314{
315 struct dma_mapping_ops *ops = get_dma_ops(dev);
316
324#ifdef CONFIG_PCI 317#ifdef CONFIG_PCI
325 if (mask > 0xffffffff && forbid_dac > 0) { 318 if (mask > 0xffffffff && forbid_dac > 0) {
326 printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", 319 dev_info(dev, "PCI: Disallowing DAC for device\n");
327 dev->bus_id);
328 return 0; 320 return 0;
329 } 321 }
330#endif 322#endif
331 323
332 if (dma_ops->dma_supported) 324 if (ops->dma_supported)
333 return dma_ops->dma_supported(dev, mask); 325 return ops->dma_supported(dev, mask);
334 326
335 /* Copied from i386. Doesn't make much sense, because it will 327 /* Copied from i386. Doesn't make much sense, because it will
336 only work for pci_alloc_coherent. 328 only work for pci_alloc_coherent.
@@ -351,8 +343,7 @@ int dma_supported(struct device *dev, u64 mask)
351 type. Normally this doesn't make any difference, but gives 343 type. Normally this doesn't make any difference, but gives
352 more gentle handling of IOMMU overflow. */ 344 more gentle handling of IOMMU overflow. */
353 if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) { 345 if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
354 printk(KERN_INFO "%s: Force SAC with mask %Lx\n", 346 dev_info(dev, "Force SAC with mask %Lx\n", mask);
355 dev->bus_id, mask);
356 return 0; 347 return 0;
357 } 348 }
358 349
@@ -378,6 +369,7 @@ void *
378dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, 369dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
379 gfp_t gfp) 370 gfp_t gfp)
380{ 371{
372 struct dma_mapping_ops *ops = get_dma_ops(dev);
381 void *memory = NULL; 373 void *memory = NULL;
382 struct page *page; 374 struct page *page;
383 unsigned long dma_mask = 0; 375 unsigned long dma_mask = 0;
@@ -446,8 +438,8 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
446 /* Let low level make its own zone decisions */ 438 /* Let low level make its own zone decisions */
447 gfp &= ~(GFP_DMA32|GFP_DMA); 439 gfp &= ~(GFP_DMA32|GFP_DMA);
448 440
449 if (dma_ops->alloc_coherent) 441 if (ops->alloc_coherent)
450 return dma_ops->alloc_coherent(dev, size, 442 return ops->alloc_coherent(dev, size,
451 dma_handle, gfp); 443 dma_handle, gfp);
452 return NULL; 444 return NULL;
453 } 445 }
@@ -459,14 +451,14 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
459 } 451 }
460 } 452 }
461 453
462 if (dma_ops->alloc_coherent) { 454 if (ops->alloc_coherent) {
463 free_pages((unsigned long)memory, get_order(size)); 455 free_pages((unsigned long)memory, get_order(size));
464 gfp &= ~(GFP_DMA|GFP_DMA32); 456 gfp &= ~(GFP_DMA|GFP_DMA32);
465 return dma_ops->alloc_coherent(dev, size, dma_handle, gfp); 457 return ops->alloc_coherent(dev, size, dma_handle, gfp);
466 } 458 }
467 459
468 if (dma_ops->map_simple) { 460 if (ops->map_simple) {
469 *dma_handle = dma_ops->map_simple(dev, virt_to_phys(memory), 461 *dma_handle = ops->map_simple(dev, virt_to_phys(memory),
470 size, 462 size,
471 PCI_DMA_BIDIRECTIONAL); 463 PCI_DMA_BIDIRECTIONAL);
472 if (*dma_handle != bad_dma_address) 464 if (*dma_handle != bad_dma_address)
@@ -488,29 +480,27 @@ EXPORT_SYMBOL(dma_alloc_coherent);
488void dma_free_coherent(struct device *dev, size_t size, 480void dma_free_coherent(struct device *dev, size_t size,
489 void *vaddr, dma_addr_t bus) 481 void *vaddr, dma_addr_t bus)
490{ 482{
483 struct dma_mapping_ops *ops = get_dma_ops(dev);
484
491 int order = get_order(size); 485 int order = get_order(size);
492 WARN_ON(irqs_disabled()); /* for portability */ 486 WARN_ON(irqs_disabled()); /* for portability */
493 if (dma_release_coherent(dev, order, vaddr)) 487 if (dma_release_coherent(dev, order, vaddr))
494 return; 488 return;
495 if (dma_ops->unmap_single) 489 if (ops->unmap_single)
496 dma_ops->unmap_single(dev, bus, size, 0); 490 ops->unmap_single(dev, bus, size, 0);
497 free_pages((unsigned long)vaddr, order); 491 free_pages((unsigned long)vaddr, order);
498} 492}
499EXPORT_SYMBOL(dma_free_coherent); 493EXPORT_SYMBOL(dma_free_coherent);
500 494
501static int __init pci_iommu_init(void) 495static int __init pci_iommu_init(void)
502{ 496{
503#ifdef CONFIG_CALGARY_IOMMU
504 calgary_iommu_init(); 497 calgary_iommu_init();
505#endif
506 498
507 intel_iommu_init(); 499 intel_iommu_init();
508 500
509 amd_iommu_init(); 501 amd_iommu_init();
510 502
511#ifdef CONFIG_GART_IOMMU
512 gart_iommu_init(); 503 gart_iommu_init();
513#endif
514 504
515 no_iommu_init(); 505 no_iommu_init();
516 return 0; 506 return 0;
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index c3fe78406d18..744126e64950 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -32,6 +32,7 @@
32#include <asm/mtrr.h> 32#include <asm/mtrr.h>
33#include <asm/pgtable.h> 33#include <asm/pgtable.h>
34#include <asm/proto.h> 34#include <asm/proto.h>
35#include <asm/iommu.h>
35#include <asm/gart.h> 36#include <asm/gart.h>
36#include <asm/cacheflush.h> 37#include <asm/cacheflush.h>
37#include <asm/swiotlb.h> 38#include <asm/swiotlb.h>
@@ -197,9 +198,7 @@ static void iommu_full(struct device *dev, size_t size, int dir)
197 * out. Hopefully no network devices use single mappings that big. 198 * out. Hopefully no network devices use single mappings that big.
198 */ 199 */
199 200
200 printk(KERN_ERR 201 dev_err(dev, "PCI-DMA: Out of IOMMU space for %lu bytes\n", size);
201 "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n",
202 size, dev->bus_id);
203 202
204 if (size > PAGE_SIZE*EMERGENCY_PAGES) { 203 if (size > PAGE_SIZE*EMERGENCY_PAGES) {
205 if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) 204 if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
@@ -693,8 +692,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
693 692
694extern int agp_amd64_init(void); 693extern int agp_amd64_init(void);
695 694
696static const struct dma_mapping_ops gart_dma_ops = { 695static struct dma_mapping_ops gart_dma_ops = {
697 .mapping_error = NULL,
698 .map_single = gart_map_single, 696 .map_single = gart_map_single,
699 .map_simple = gart_map_simple, 697 .map_simple = gart_map_simple,
700 .unmap_single = gart_unmap_single, 698 .unmap_single = gart_unmap_single,
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index aec43d56f49c..3f91f71cdc3e 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -7,7 +7,7 @@
7#include <linux/dma-mapping.h> 7#include <linux/dma-mapping.h>
8#include <linux/scatterlist.h> 8#include <linux/scatterlist.h>
9 9
10#include <asm/gart.h> 10#include <asm/iommu.h>
11#include <asm/processor.h> 11#include <asm/processor.h>
12#include <asm/dma.h> 12#include <asm/dma.h>
13 13
@@ -72,21 +72,9 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
72 return nents; 72 return nents;
73} 73}
74 74
75/* Make sure we keep the same behaviour */ 75struct dma_mapping_ops nommu_dma_ops = {
76static int nommu_mapping_error(dma_addr_t dma_addr)
77{
78#ifdef CONFIG_X86_32
79 return 0;
80#else
81 return (dma_addr == bad_dma_address);
82#endif
83}
84
85
86const struct dma_mapping_ops nommu_dma_ops = {
87 .map_single = nommu_map_single, 76 .map_single = nommu_map_single,
88 .map_sg = nommu_map_sg, 77 .map_sg = nommu_map_sg,
89 .mapping_error = nommu_mapping_error,
90 .is_phys = 1, 78 .is_phys = 1,
91}; 79};
92 80
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 82299cd1d04d..c4ce0332759e 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -5,7 +5,7 @@
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/dma-mapping.h> 6#include <linux/dma-mapping.h>
7 7
8#include <asm/gart.h> 8#include <asm/iommu.h>
9#include <asm/swiotlb.h> 9#include <asm/swiotlb.h>
10#include <asm/dma.h> 10#include <asm/dma.h>
11 11
@@ -18,7 +18,7 @@ swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size,
18 return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction); 18 return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction);
19} 19}
20 20
21const struct dma_mapping_ops swiotlb_dma_ops = { 21struct dma_mapping_ops swiotlb_dma_ops = {
22 .mapping_error = swiotlb_dma_mapping_error, 22 .mapping_error = swiotlb_dma_mapping_error,
23 .alloc_coherent = swiotlb_alloc_coherent, 23 .alloc_coherent = swiotlb_alloc_coherent,
24 .free_coherent = swiotlb_free_coherent, 24 .free_coherent = swiotlb_free_coherent,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 4d629c62f4f8..7fc4d5b0a6a0 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -15,6 +15,7 @@ unsigned long idle_nomwait;
15EXPORT_SYMBOL(idle_nomwait); 15EXPORT_SYMBOL(idle_nomwait);
16 16
17struct kmem_cache *task_xstate_cachep; 17struct kmem_cache *task_xstate_cachep;
18static int force_mwait __cpuinitdata;
18 19
19int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 20int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
20{ 21{
@@ -199,6 +200,7 @@ static void poll_idle(void)
199 * 200 *
200 * idle=mwait overrides this decision and forces the usage of mwait. 201 * idle=mwait overrides this decision and forces the usage of mwait.
201 */ 202 */
203static int __cpuinitdata force_mwait;
202 204
203#define MWAIT_INFO 0x05 205#define MWAIT_INFO 0x05
204#define MWAIT_ECX_EXTENDED_INFO 0x01 206#define MWAIT_ECX_EXTENDED_INFO 0x01
@@ -326,6 +328,9 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
326 328
327static int __init idle_setup(char *str) 329static int __init idle_setup(char *str)
328{ 330{
331 if (!str)
332 return -EINVAL;
333
329 if (!strcmp(str, "poll")) { 334 if (!strcmp(str, "poll")) {
330 printk("using polling idle threads.\n"); 335 printk("using polling idle threads.\n");
331 pm_idle = poll_idle; 336 pm_idle = poll_idle;
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0c3927accb00..53bc653ed5ca 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -128,7 +128,7 @@ void cpu_idle(void)
128 128
129 /* endless idle loop with no priority at all */ 129 /* endless idle loop with no priority at all */
130 while (1) { 130 while (1) {
131 tick_nohz_stop_sched_tick(); 131 tick_nohz_stop_sched_tick(1);
132 while (!need_resched()) { 132 while (!need_resched()) {
133 133
134 check_pgt_cache(); 134 check_pgt_cache();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index a8e53626ac9a..3fb62a7d9a16 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -120,7 +120,7 @@ void cpu_idle(void)
120 current_thread_info()->status |= TS_POLLING; 120 current_thread_info()->status |= TS_POLLING;
121 /* endless idle loop with no priority at all */ 121 /* endless idle loop with no priority at all */
122 while (1) { 122 while (1) {
123 tick_nohz_stop_sched_tick(); 123 tick_nohz_stop_sched_tick(1);
124 while (!need_resched()) { 124 while (!need_resched()) {
125 125
126 rmb(); 126 rmb();
@@ -537,8 +537,8 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
537struct task_struct * 537struct task_struct *
538__switch_to(struct task_struct *prev_p, struct task_struct *next_p) 538__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
539{ 539{
540 struct thread_struct *prev = &prev_p->thread, 540 struct thread_struct *prev = &prev_p->thread;
541 *next = &next_p->thread; 541 struct thread_struct *next = &next_p->thread;
542 int cpu = smp_processor_id(); 542 int cpu = smp_processor_id();
543 struct tss_struct *tss = &per_cpu(init_tss, cpu); 543 struct tss_struct *tss = &per_cpu(init_tss, cpu);
544 unsigned fsindex, gsindex; 544 unsigned fsindex, gsindex;
@@ -586,35 +586,34 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
586 586
587 /* 587 /*
588 * Switch FS and GS. 588 * Switch FS and GS.
589 *
590 * Segment register != 0 always requires a reload. Also
591 * reload when it has changed. When prev process used 64bit
592 * base always reload to avoid an information leak.
589 */ 593 */
590 { 594 if (unlikely(fsindex | next->fsindex | prev->fs)) {
591 /* segment register != 0 always requires a reload. 595 loadsegment(fs, next->fsindex);
592 also reload when it has changed. 596 /*
593 when prev process used 64bit base always reload 597 * Check if the user used a selector != 0; if yes
594 to avoid an information leak. */ 598 * clear 64bit base, since overloaded base is always
595 if (unlikely(fsindex | next->fsindex | prev->fs)) { 599 * mapped to the Null selector
596 loadsegment(fs, next->fsindex); 600 */
597 /* check if the user used a selector != 0 601 if (fsindex)
598 * if yes clear 64bit base, since overloaded base
599 * is always mapped to the Null selector
600 */
601 if (fsindex)
602 prev->fs = 0; 602 prev->fs = 0;
603 } 603 }
604 /* when next process has a 64bit base use it */ 604 /* when next process has a 64bit base use it */
605 if (next->fs) 605 if (next->fs)
606 wrmsrl(MSR_FS_BASE, next->fs); 606 wrmsrl(MSR_FS_BASE, next->fs);
607 prev->fsindex = fsindex; 607 prev->fsindex = fsindex;
608 608
609 if (unlikely(gsindex | next->gsindex | prev->gs)) { 609 if (unlikely(gsindex | next->gsindex | prev->gs)) {
610 load_gs_index(next->gsindex); 610 load_gs_index(next->gsindex);
611 if (gsindex) 611 if (gsindex)
612 prev->gs = 0; 612 prev->gs = 0;
613 }
614 if (next->gs)
615 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
616 prev->gsindex = gsindex;
617 } 613 }
614 if (next->gs)
615 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
616 prev->gsindex = gsindex;
618 617
619 /* Must be after DS reload */ 618 /* Must be after DS reload */
620 unlazy_fpu(prev_p); 619 unlazy_fpu(prev_p);
@@ -627,7 +626,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
627 write_pda(pcurrent, next_p); 626 write_pda(pcurrent, next_p);
628 627
629 write_pda(kernelstack, 628 write_pda(kernelstack,
630 (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); 629 (unsigned long)task_stack_page(next_p) +
630 THREAD_SIZE - PDA_STACKOFFSET);
631#ifdef CONFIG_CC_STACKPROTECTOR 631#ifdef CONFIG_CC_STACKPROTECTOR
632 write_pda(stack_canary, next_p->stack_canary); 632 write_pda(stack_canary, next_p->stack_canary);
633 /* 633 /*
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 77040b6070e1..e37dccce85db 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1357,8 +1357,6 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
1357#endif 1357#endif
1358} 1358}
1359 1359
1360#ifdef CONFIG_X86_32
1361
1362void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) 1360void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1363{ 1361{
1364 struct siginfo info; 1362 struct siginfo info;
@@ -1377,89 +1375,10 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1377 force_sig_info(SIGTRAP, &info, tsk); 1375 force_sig_info(SIGTRAP, &info, tsk);
1378} 1376}
1379 1377
1380/* notification of system call entry/exit
1381 * - triggered by current->work.syscall_trace
1382 */
1383int do_syscall_trace(struct pt_regs *regs, int entryexit)
1384{
1385 int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
1386 /*
1387 * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall
1388 * interception
1389 */
1390 int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
1391 int ret = 0;
1392
1393 /* do the secure computing check first */
1394 if (!entryexit)
1395 secure_computing(regs->orig_ax);
1396
1397 if (unlikely(current->audit_context)) {
1398 if (entryexit)
1399 audit_syscall_exit(AUDITSC_RESULT(regs->ax),
1400 regs->ax);
1401 /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
1402 * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
1403 * not used, entry.S will call us only on syscall exit, not
1404 * entry; so when TIF_SYSCALL_AUDIT is used we must avoid
1405 * calling send_sigtrap() on syscall entry.
1406 *
1407 * Note that when PTRACE_SYSEMU_SINGLESTEP is used,
1408 * is_singlestep is false, despite his name, so we will still do
1409 * the correct thing.
1410 */
1411 else if (is_singlestep)
1412 goto out;
1413 }
1414
1415 if (!(current->ptrace & PT_PTRACED))
1416 goto out;
1417
1418 /* If a process stops on the 1st tracepoint with SYSCALL_TRACE
1419 * and then is resumed with SYSEMU_SINGLESTEP, it will come in
1420 * here. We have to check this and return */
1421 if (is_sysemu && entryexit)
1422 return 0;
1423
1424 /* Fake a debug trap */
1425 if (is_singlestep)
1426 send_sigtrap(current, regs, 0);
1427
1428 if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
1429 goto out;
1430
1431 /* the 0x80 provides a way for the tracing parent to distinguish
1432 between a syscall stop and SIGTRAP delivery */
1433 /* Note that the debugger could change the result of test_thread_flag!*/
1434 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0));
1435
1436 /*
1437 * this isn't the same as continuing with a signal, but it will do
1438 * for normal use. strace only continues with a signal if the
1439 * stopping signal is not SIGTRAP. -brl
1440 */
1441 if (current->exit_code) {
1442 send_sig(current->exit_code, current, 1);
1443 current->exit_code = 0;
1444 }
1445 ret = is_sysemu;
1446out:
1447 if (unlikely(current->audit_context) && !entryexit)
1448 audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax,
1449 regs->bx, regs->cx, regs->dx, regs->si);
1450 if (ret == 0)
1451 return 0;
1452
1453 regs->orig_ax = -1; /* force skip of syscall restarting */
1454 if (unlikely(current->audit_context))
1455 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1456 return 1;
1457}
1458
1459#else /* CONFIG_X86_64 */
1460
1461static void syscall_trace(struct pt_regs *regs) 1378static void syscall_trace(struct pt_regs *regs)
1462{ 1379{
1380 if (!(current->ptrace & PT_PTRACED))
1381 return;
1463 1382
1464#if 0 1383#if 0
1465 printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n", 1384 printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n",
@@ -1481,39 +1400,81 @@ static void syscall_trace(struct pt_regs *regs)
1481 } 1400 }
1482} 1401}
1483 1402
1484asmlinkage void syscall_trace_enter(struct pt_regs *regs) 1403#ifdef CONFIG_X86_32
1404# define IS_IA32 1
1405#elif defined CONFIG_IA32_EMULATION
1406# define IS_IA32 test_thread_flag(TIF_IA32)
1407#else
1408# define IS_IA32 0
1409#endif
1410
1411/*
1412 * We must return the syscall number to actually look up in the table.
1413 * This can be -1L to skip running any syscall at all.
1414 */
1415asmregparm long syscall_trace_enter(struct pt_regs *regs)
1485{ 1416{
1417 long ret = 0;
1418
1419 /*
1420 * If we stepped into a sysenter/syscall insn, it trapped in
1421 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
1422 * If user-mode had set TF itself, then it's still clear from
1423 * do_debug() and we need to set it again to restore the user
1424 * state. If we entered on the slow path, TF was already set.
1425 */
1426 if (test_thread_flag(TIF_SINGLESTEP))
1427 regs->flags |= X86_EFLAGS_TF;
1428
1486 /* do the secure computing check first */ 1429 /* do the secure computing check first */
1487 secure_computing(regs->orig_ax); 1430 secure_computing(regs->orig_ax);
1488 1431
1489 if (test_thread_flag(TIF_SYSCALL_TRACE) 1432 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
1490 && (current->ptrace & PT_PTRACED)) 1433 ret = -1L;
1434
1435 if (ret || test_thread_flag(TIF_SYSCALL_TRACE))
1491 syscall_trace(regs); 1436 syscall_trace(regs);
1492 1437
1493 if (unlikely(current->audit_context)) { 1438 if (unlikely(current->audit_context)) {
1494 if (test_thread_flag(TIF_IA32)) { 1439 if (IS_IA32)
1495 audit_syscall_entry(AUDIT_ARCH_I386, 1440 audit_syscall_entry(AUDIT_ARCH_I386,
1496 regs->orig_ax, 1441 regs->orig_ax,
1497 regs->bx, regs->cx, 1442 regs->bx, regs->cx,
1498 regs->dx, regs->si); 1443 regs->dx, regs->si);
1499 } else { 1444#ifdef CONFIG_X86_64
1445 else
1500 audit_syscall_entry(AUDIT_ARCH_X86_64, 1446 audit_syscall_entry(AUDIT_ARCH_X86_64,
1501 regs->orig_ax, 1447 regs->orig_ax,
1502 regs->di, regs->si, 1448 regs->di, regs->si,
1503 regs->dx, regs->r10); 1449 regs->dx, regs->r10);
1504 } 1450#endif
1505 } 1451 }
1452
1453 return ret ?: regs->orig_ax;
1506} 1454}
1507 1455
1508asmlinkage void syscall_trace_leave(struct pt_regs *regs) 1456asmregparm void syscall_trace_leave(struct pt_regs *regs)
1509{ 1457{
1510 if (unlikely(current->audit_context)) 1458 if (unlikely(current->audit_context))
1511 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); 1459 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1512 1460
1513 if ((test_thread_flag(TIF_SYSCALL_TRACE) 1461 if (test_thread_flag(TIF_SYSCALL_TRACE))
1514 || test_thread_flag(TIF_SINGLESTEP))
1515 && (current->ptrace & PT_PTRACED))
1516 syscall_trace(regs); 1462 syscall_trace(regs);
1517}
1518 1463
1519#endif /* CONFIG_X86_32 */ 1464 /*
1465 * If TIF_SYSCALL_EMU is set, we only get here because of
1466 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
1467 * We already reported this syscall instruction in
1468 * syscall_trace_enter(), so don't do any more now.
1469 */
1470 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
1471 return;
1472
1473 /*
1474 * If we are single-stepping, synthesize a trap to follow the
1475 * system call instruction.
1476 */
1477 if (test_thread_flag(TIF_SINGLESTEP) &&
1478 (current->ptrace & PT_PTRACED))
1479 send_sigtrap(current, regs, 0);
1480}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index f8a62160e151..06a9f643817e 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -177,6 +177,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
177 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"), 177 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
178 }, 178 },
179 }, 179 },
180 { /* Handle problems with rebooting on Dell T5400's */
181 .callback = set_bios_reboot,
182 .ident = "Dell Precision T5400",
183 .matches = {
184 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
185 DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"),
186 },
187 },
180 { /* Handle problems with rebooting on HP laptops */ 188 { /* Handle problems with rebooting on HP laptops */
181 .callback = set_bios_reboot, 189 .callback = set_bios_reboot,
182 .ident = "HP Compaq Laptop", 190 .ident = "HP Compaq Laptop",
@@ -403,24 +411,28 @@ void native_machine_shutdown(void)
403{ 411{
404 /* Stop the cpus and apics */ 412 /* Stop the cpus and apics */
405#ifdef CONFIG_SMP 413#ifdef CONFIG_SMP
406 int reboot_cpu_id;
407 414
408 /* The boot cpu is always logical cpu 0 */ 415 /* The boot cpu is always logical cpu 0 */
409 reboot_cpu_id = 0; 416 int reboot_cpu_id = 0;
417 cpumask_of_cpu_ptr(newmask, reboot_cpu_id);
410 418
411#ifdef CONFIG_X86_32 419#ifdef CONFIG_X86_32
412 /* See if there has been given a command line override */ 420 /* See if there has been given a command line override */
413 if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) && 421 if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) &&
414 cpu_online(reboot_cpu)) 422 cpu_online(reboot_cpu)) {
415 reboot_cpu_id = reboot_cpu; 423 reboot_cpu_id = reboot_cpu;
424 cpumask_of_cpu_ptr_next(newmask, reboot_cpu_id);
425 }
416#endif 426#endif
417 427
418 /* Make certain the cpu I'm about to reboot on is online */ 428 /* Make certain the cpu I'm about to reboot on is online */
419 if (!cpu_online(reboot_cpu_id)) 429 if (!cpu_online(reboot_cpu_id)) {
420 reboot_cpu_id = smp_processor_id(); 430 reboot_cpu_id = smp_processor_id();
431 cpumask_of_cpu_ptr_next(newmask, reboot_cpu_id);
432 }
421 433
422 /* Make certain I only run on the appropriate processor */ 434 /* Make certain I only run on the appropriate processor */
423 set_cpus_allowed_ptr(current, &cpumask_of_cpu(reboot_cpu_id)); 435 set_cpus_allowed_ptr(current, newmask);
424 436
425 /* O.K Now that I'm on the appropriate processor, 437 /* O.K Now that I'm on the appropriate processor,
426 * stop all of the others. 438 * stop all of the others.
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index c30fe25d470d..703310a99023 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -20,11 +20,44 @@
20#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) 20#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
21#define PAE_PGD_ATTR (_PAGE_PRESENT) 21#define PAE_PGD_ATTR (_PAGE_PRESENT)
22 22
23/* control_page + PAGE_SIZE/2 ~ control_page + PAGE_SIZE * 3/4 are
24 * used to save some data for jumping back
25 */
26#define DATA(offset) (PAGE_SIZE/2+(offset))
27
28/* Minimal CPU state */
29#define ESP DATA(0x0)
30#define CR0 DATA(0x4)
31#define CR3 DATA(0x8)
32#define CR4 DATA(0xc)
33
34/* other data */
35#define CP_VA_CONTROL_PAGE DATA(0x10)
36#define CP_PA_PGD DATA(0x14)
37#define CP_PA_SWAP_PAGE DATA(0x18)
38#define CP_PA_BACKUP_PAGES_MAP DATA(0x1c)
39
23 .text 40 .text
24 .align PAGE_SIZE 41 .align PAGE_SIZE
25 .globl relocate_kernel 42 .globl relocate_kernel
26relocate_kernel: 43relocate_kernel:
27 movl 8(%esp), %ebp /* list of pages */ 44 /* Save the CPU context, used for jumping back */
45
46 pushl %ebx
47 pushl %esi
48 pushl %edi
49 pushl %ebp
50 pushf
51
52 movl 20+8(%esp), %ebp /* list of pages */
53 movl PTR(VA_CONTROL_PAGE)(%ebp), %edi
54 movl %esp, ESP(%edi)
55 movl %cr0, %eax
56 movl %eax, CR0(%edi)
57 movl %cr3, %eax
58 movl %eax, CR3(%edi)
59 movl %cr4, %eax
60 movl %eax, CR4(%edi)
28 61
29#ifdef CONFIG_X86_PAE 62#ifdef CONFIG_X86_PAE
30 /* map the control page at its virtual address */ 63 /* map the control page at its virtual address */
@@ -138,15 +171,25 @@ relocate_kernel:
138 171
139relocate_new_kernel: 172relocate_new_kernel:
140 /* read the arguments and say goodbye to the stack */ 173 /* read the arguments and say goodbye to the stack */
141 movl 4(%esp), %ebx /* page_list */ 174 movl 20+4(%esp), %ebx /* page_list */
142 movl 8(%esp), %ebp /* list of pages */ 175 movl 20+8(%esp), %ebp /* list of pages */
143 movl 12(%esp), %edx /* start address */ 176 movl 20+12(%esp), %edx /* start address */
144 movl 16(%esp), %ecx /* cpu_has_pae */ 177 movl 20+16(%esp), %ecx /* cpu_has_pae */
178 movl 20+20(%esp), %esi /* preserve_context */
145 179
146 /* zero out flags, and disable interrupts */ 180 /* zero out flags, and disable interrupts */
147 pushl $0 181 pushl $0
148 popfl 182 popfl
149 183
184 /* save some information for jumping back */
185 movl PTR(VA_CONTROL_PAGE)(%ebp), %edi
186 movl %edi, CP_VA_CONTROL_PAGE(%edi)
187 movl PTR(PA_PGD)(%ebp), %eax
188 movl %eax, CP_PA_PGD(%edi)
189 movl PTR(PA_SWAP_PAGE)(%ebp), %eax
190 movl %eax, CP_PA_SWAP_PAGE(%edi)
191 movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi)
192
150 /* get physical address of control page now */ 193 /* get physical address of control page now */
151 /* this is impossible after page table switch */ 194 /* this is impossible after page table switch */
152 movl PTR(PA_CONTROL_PAGE)(%ebp), %edi 195 movl PTR(PA_CONTROL_PAGE)(%ebp), %edi
@@ -197,8 +240,90 @@ identity_mapped:
197 xorl %eax, %eax 240 xorl %eax, %eax
198 movl %eax, %cr3 241 movl %eax, %cr3
199 242
243 movl CP_PA_SWAP_PAGE(%edi), %eax
244 pushl %eax
245 pushl %ebx
246 call swap_pages
247 addl $8, %esp
248
249 /* To be certain of avoiding problems with self-modifying code
250 * I need to execute a serializing instruction here.
251 * So I flush the TLB, it's handy, and not processor dependent.
252 */
253 xorl %eax, %eax
254 movl %eax, %cr3
255
256 /* set all of the registers to known values */
257 /* leave %esp alone */
258
259 testl %esi, %esi
260 jnz 1f
261 xorl %edi, %edi
262 xorl %eax, %eax
263 xorl %ebx, %ebx
264 xorl %ecx, %ecx
265 xorl %edx, %edx
266 xorl %esi, %esi
267 xorl %ebp, %ebp
268 ret
2691:
270 popl %edx
271 movl CP_PA_SWAP_PAGE(%edi), %esp
272 addl $PAGE_SIZE, %esp
2732:
274 call *%edx
275
276 /* get the re-entry point of the peer system */
277 movl 0(%esp), %ebp
278 call 1f
2791:
280 popl %ebx
281 subl $(1b - relocate_kernel), %ebx
282 movl CP_VA_CONTROL_PAGE(%ebx), %edi
283 lea PAGE_SIZE(%ebx), %esp
284 movl CP_PA_SWAP_PAGE(%ebx), %eax
285 movl CP_PA_BACKUP_PAGES_MAP(%ebx), %edx
286 pushl %eax
287 pushl %edx
288 call swap_pages
289 addl $8, %esp
290 movl CP_PA_PGD(%ebx), %eax
291 movl %eax, %cr3
292 movl %cr0, %eax
293 orl $(1<<31), %eax
294 movl %eax, %cr0
295 lea PAGE_SIZE(%edi), %esp
296 movl %edi, %eax
297 addl $(virtual_mapped - relocate_kernel), %eax
298 pushl %eax
299 ret
300
301virtual_mapped:
302 movl CR4(%edi), %eax
303 movl %eax, %cr4
304 movl CR3(%edi), %eax
305 movl %eax, %cr3
306 movl CR0(%edi), %eax
307 movl %eax, %cr0
308 movl ESP(%edi), %esp
309 movl %ebp, %eax
310
311 popf
312 popl %ebp
313 popl %edi
314 popl %esi
315 popl %ebx
316 ret
317
200 /* Do the copies */ 318 /* Do the copies */
201 movl %ebx, %ecx 319swap_pages:
320 movl 8(%esp), %edx
321 movl 4(%esp), %ecx
322 pushl %ebp
323 pushl %ebx
324 pushl %edi
325 pushl %esi
326 movl %ecx, %ebx
202 jmp 1f 327 jmp 1f
203 328
2040: /* top, read another word from the indirection page */ 3290: /* top, read another word from the indirection page */
@@ -226,27 +351,28 @@ identity_mapped:
226 movl %ecx, %esi /* For every source page do a copy */ 351 movl %ecx, %esi /* For every source page do a copy */
227 andl $0xfffff000, %esi 352 andl $0xfffff000, %esi
228 353
354 movl %edi, %eax
355 movl %esi, %ebp
356
357 movl %edx, %edi
229 movl $1024, %ecx 358 movl $1024, %ecx
230 rep ; movsl 359 rep ; movsl
231 jmp 0b
232 360
2333: 361 movl %ebp, %edi
234 362 movl %eax, %esi
235 /* To be certain of avoiding problems with self-modifying code 363 movl $1024, %ecx
236 * I need to execute a serializing instruction here. 364 rep ; movsl
237 * So I flush the TLB, it's handy, and not processor dependent.
238 */
239 xorl %eax, %eax
240 movl %eax, %cr3
241 365
242 /* set all of the registers to known values */ 366 movl %eax, %edi
243 /* leave %esp alone */ 367 movl %edx, %esi
368 movl $1024, %ecx
369 rep ; movsl
244 370
245 xorl %eax, %eax 371 lea PAGE_SIZE(%ebp), %esi
246 xorl %ebx, %ebx 372 jmp 0b
247 xorl %ecx, %ecx 3733:
248 xorl %edx, %edx 374 popl %esi
249 xorl %esi, %esi 375 popl %edi
250 xorl %edi, %edi 376 popl %ebx
251 xorl %ebp, %ebp 377 popl %ebp
252 ret 378 ret
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 531b55b8e81a..b520dae02bf4 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -57,12 +57,8 @@
57#include <linux/slab.h> 57#include <linux/slab.h>
58#include <linux/user.h> 58#include <linux/user.h>
59#include <linux/delay.h> 59#include <linux/delay.h>
60#include <linux/highmem.h>
61 60
62#include <linux/kallsyms.h> 61#include <linux/kallsyms.h>
63#include <linux/edd.h>
64#include <linux/iscsi_ibft.h>
65#include <linux/kexec.h>
66#include <linux/cpufreq.h> 62#include <linux/cpufreq.h>
67#include <linux/dma-mapping.h> 63#include <linux/dma-mapping.h>
68#include <linux/ctype.h> 64#include <linux/ctype.h>
@@ -96,7 +92,7 @@
96#include <asm/smp.h> 92#include <asm/smp.h>
97#include <asm/desc.h> 93#include <asm/desc.h>
98#include <asm/dma.h> 94#include <asm/dma.h>
99#include <asm/gart.h> 95#include <asm/iommu.h>
100#include <asm/mmu_context.h> 96#include <asm/mmu_context.h>
101#include <asm/proto.h> 97#include <asm/proto.h>
102 98
@@ -104,7 +100,6 @@
104#include <asm/paravirt.h> 100#include <asm/paravirt.h>
105 101
106#include <asm/percpu.h> 102#include <asm/percpu.h>
107#include <asm/sections.h>
108#include <asm/topology.h> 103#include <asm/topology.h>
109#include <asm/apicdef.h> 104#include <asm/apicdef.h>
110#ifdef CONFIG_X86_64 105#ifdef CONFIG_X86_64
@@ -579,6 +574,10 @@ static int __init setup_elfcorehdr(char *arg)
579early_param("elfcorehdr", setup_elfcorehdr); 574early_param("elfcorehdr", setup_elfcorehdr);
580#endif 575#endif
581 576
577static struct x86_quirks default_x86_quirks __initdata;
578
579struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
580
582/* 581/*
583 * Determine if we were loaded by an EFI loader. If so, then we have also been 582 * Determine if we were loaded by an EFI loader. If so, then we have also been
584 * passed the efi memmap, systab, etc., so we should use these data structures 583 * passed the efi memmap, systab, etc., so we should use these data structures
@@ -598,11 +597,11 @@ void __init setup_arch(char **cmdline_p)
598 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); 597 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
599 visws_early_detect(); 598 visws_early_detect();
600 pre_setup_arch_hook(); 599 pre_setup_arch_hook();
601 early_cpu_init();
602#else 600#else
603 printk(KERN_INFO "Command line: %s\n", boot_command_line); 601 printk(KERN_INFO "Command line: %s\n", boot_command_line);
604#endif 602#endif
605 603
604 early_cpu_init();
606 early_ioremap_init(); 605 early_ioremap_init();
607 606
608 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); 607 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
@@ -666,9 +665,6 @@ void __init setup_arch(char **cmdline_p)
666 bss_resource.start = virt_to_phys(&__bss_start); 665 bss_resource.start = virt_to_phys(&__bss_start);
667 bss_resource.end = virt_to_phys(&__bss_stop)-1; 666 bss_resource.end = virt_to_phys(&__bss_stop)-1;
668 667
669#ifdef CONFIG_X86_64
670 early_cpu_init();
671#endif
672 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); 668 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
673 *cmdline_p = command_line; 669 *cmdline_p = command_line;
674 670
@@ -681,7 +677,7 @@ void __init setup_arch(char **cmdline_p)
681#ifdef CONFIG_X86_LOCAL_APIC 677#ifdef CONFIG_X86_LOCAL_APIC
682 disable_apic = 1; 678 disable_apic = 1;
683#endif 679#endif
684 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 680 setup_clear_cpu_cap(X86_FEATURE_APIC);
685 } 681 }
686 682
687#ifdef CONFIG_PCI 683#ifdef CONFIG_PCI
@@ -824,7 +820,10 @@ void __init setup_arch(char **cmdline_p)
824 vmi_init(); 820 vmi_init();
825#endif 821#endif
826 822
823 paravirt_pagetable_setup_start(swapper_pg_dir);
827 paging_init(); 824 paging_init();
825 paravirt_pagetable_setup_done(swapper_pg_dir);
826 paravirt_post_allocator_init();
828 827
829#ifdef CONFIG_X86_64 828#ifdef CONFIG_X86_64
830 map_vsyscall(); 829 map_vsyscall();
@@ -854,14 +853,6 @@ void __init setup_arch(char **cmdline_p)
854 init_cpu_to_node(); 853 init_cpu_to_node();
855#endif 854#endif
856 855
857#ifdef CONFIG_X86_NUMAQ
858 /*
859 * need to check online nodes num, call it
860 * here before time_init/tsc_init
861 */
862 numaq_tsc_disable();
863#endif
864
865 init_apic_mappings(); 856 init_apic_mappings();
866 ioapic_init_mappings(); 857 ioapic_init_mappings();
867 858
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index cac68430d31f..f7745f94c006 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -227,8 +227,8 @@ static void __init setup_node_to_cpumask_map(void)
227 /* allocate the map */ 227 /* allocate the map */
228 map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); 228 map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
229 229
230 Dprintk(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n", 230 pr_debug(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n",
231 map, nr_node_ids); 231 map, nr_node_ids);
232 232
233 /* node_to_cpumask() will now work */ 233 /* node_to_cpumask() will now work */
234 node_to_cpumask_map = map; 234 node_to_cpumask_map = map;
@@ -248,7 +248,7 @@ void __cpuinit numa_set_node(int cpu, int node)
248 per_cpu(x86_cpu_to_node_map, cpu) = node; 248 per_cpu(x86_cpu_to_node_map, cpu) = node;
249 249
250 else 250 else
251 Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu); 251 pr_debug("Setting node for non-present cpu %d\n", cpu);
252} 252}
253 253
254void __cpuinit numa_clear_node(int cpu) 254void __cpuinit numa_clear_node(int cpu)
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index d92373630963..6fb5bcdd8933 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -212,7 +212,7 @@ asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
212 212
213badframe: 213badframe:
214 if (show_unhandled_signals && printk_ratelimit()) { 214 if (show_unhandled_signals && printk_ratelimit()) {
215 printk(KERN_INFO "%s%s[%d] bad frame in sigreturn frame:" 215 printk("%s%s[%d] bad frame in sigreturn frame:"
216 "%p ip:%lx sp:%lx oeax:%lx", 216 "%p ip:%lx sp:%lx oeax:%lx",
217 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, 217 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
218 current->comm, task_pid_nr(current), frame, regs->ip, 218 current->comm, task_pid_nr(current), frame, regs->ip,
@@ -657,18 +657,9 @@ static void do_signal(struct pt_regs *regs)
657void 657void
658do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 658do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
659{ 659{
660 /* Pending single-step? */
661 if (thread_info_flags & _TIF_SINGLESTEP) {
662 regs->flags |= X86_EFLAGS_TF;
663 clear_thread_flag(TIF_SINGLESTEP);
664 }
665
666 /* deal with pending signal delivery */ 660 /* deal with pending signal delivery */
667 if (thread_info_flags & _TIF_SIGPENDING) 661 if (thread_info_flags & _TIF_SIGPENDING)
668 do_signal(regs); 662 do_signal(regs);
669 663
670 if (thread_info_flags & _TIF_HRTICK_RESCHED)
671 hrtick_resched();
672
673 clear_thread_flag(TIF_IRET); 664 clear_thread_flag(TIF_IRET);
674} 665}
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index e53b267662e7..b45ef8ddd651 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -53,6 +53,59 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
53 return do_sigaltstack(uss, uoss, regs->sp); 53 return do_sigaltstack(uss, uoss, regs->sp);
54} 54}
55 55
56/*
57 * Signal frame handlers.
58 */
59
60static inline int save_i387(struct _fpstate __user *buf)
61{
62 struct task_struct *tsk = current;
63 int err = 0;
64
65 BUILD_BUG_ON(sizeof(struct user_i387_struct) !=
66 sizeof(tsk->thread.xstate->fxsave));
67
68 if ((unsigned long)buf % 16)
69 printk("save_i387: bad fpstate %p\n", buf);
70
71 if (!used_math())
72 return 0;
73 clear_used_math(); /* trigger finit */
74 if (task_thread_info(tsk)->status & TS_USEDFPU) {
75 err = save_i387_checking((struct i387_fxsave_struct __user *)
76 buf);
77 if (err)
78 return err;
79 task_thread_info(tsk)->status &= ~TS_USEDFPU;
80 stts();
81 } else {
82 if (__copy_to_user(buf, &tsk->thread.xstate->fxsave,
83 sizeof(struct i387_fxsave_struct)))
84 return -1;
85 }
86 return 1;
87}
88
89/*
90 * This restores directly out of user space. Exceptions are handled.
91 */
92static inline int restore_i387(struct _fpstate __user *buf)
93{
94 struct task_struct *tsk = current;
95 int err;
96
97 if (!used_math()) {
98 err = init_fpu(tsk);
99 if (err)
100 return err;
101 }
102
103 if (!(task_thread_info(current)->status & TS_USEDFPU)) {
104 clts();
105 task_thread_info(current)->status |= TS_USEDFPU;
106 }
107 return restore_fpu_checking((__force struct i387_fxsave_struct *)buf);
108}
56 109
57/* 110/*
58 * Do a signal return; undo the signal stack. 111 * Do a signal return; undo the signal stack.
@@ -487,12 +540,6 @@ static void do_signal(struct pt_regs *regs)
487void do_notify_resume(struct pt_regs *regs, void *unused, 540void do_notify_resume(struct pt_regs *regs, void *unused,
488 __u32 thread_info_flags) 541 __u32 thread_info_flags)
489{ 542{
490 /* Pending single-step? */
491 if (thread_info_flags & _TIF_SINGLESTEP) {
492 regs->flags |= X86_EFLAGS_TF;
493 clear_thread_flag(TIF_SINGLESTEP);
494 }
495
496#ifdef CONFIG_X86_MCE 543#ifdef CONFIG_X86_MCE
497 /* notify userspace of pending MCEs */ 544 /* notify userspace of pending MCEs */
498 if (thread_info_flags & _TIF_MCE_NOTIFY) 545 if (thread_info_flags & _TIF_MCE_NOTIFY)
@@ -502,9 +549,6 @@ void do_notify_resume(struct pt_regs *regs, void *unused,
502 /* deal with pending signal delivery */ 549 /* deal with pending signal delivery */
503 if (thread_info_flags & _TIF_SIGPENDING) 550 if (thread_info_flags & _TIF_SIGPENDING)
504 do_signal(regs); 551 do_signal(regs);
505
506 if (thread_info_flags & _TIF_HRTICK_RESCHED)
507 hrtick_resched();
508} 552}
509 553
510void signal_fault(struct pt_regs *regs, void __user *frame, char *where) 554void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 687376ab07e8..332512767f4f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -216,7 +216,7 @@ static void __cpuinit smp_callin(void)
216 panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__, 216 panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__,
217 phys_id, cpuid); 217 phys_id, cpuid);
218 } 218 }
219 Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); 219 pr_debug("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
220 220
221 /* 221 /*
222 * STARTUP IPIs are fragile beasts as they might sometimes 222 * STARTUP IPIs are fragile beasts as they might sometimes
@@ -251,7 +251,7 @@ static void __cpuinit smp_callin(void)
251 * boards) 251 * boards)
252 */ 252 */
253 253
254 Dprintk("CALLIN, before setup_local_APIC().\n"); 254 pr_debug("CALLIN, before setup_local_APIC().\n");
255 smp_callin_clear_local_apic(); 255 smp_callin_clear_local_apic();
256 setup_local_APIC(); 256 setup_local_APIC();
257 end_local_APIC_setup(); 257 end_local_APIC_setup();
@@ -266,7 +266,7 @@ static void __cpuinit smp_callin(void)
266 local_irq_enable(); 266 local_irq_enable();
267 calibrate_delay(); 267 calibrate_delay();
268 local_irq_disable(); 268 local_irq_disable();
269 Dprintk("Stack at about %p\n", &cpuid); 269 pr_debug("Stack at about %p\n", &cpuid);
270 270
271 /* 271 /*
272 * Save our processor parameters 272 * Save our processor parameters
@@ -438,7 +438,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
438 cpu_set(cpu, cpu_sibling_setup_map); 438 cpu_set(cpu, cpu_sibling_setup_map);
439 439
440 if (smp_num_siblings > 1) { 440 if (smp_num_siblings > 1) {
441 for_each_cpu_mask(i, cpu_sibling_setup_map) { 441 for_each_cpu_mask_nr(i, cpu_sibling_setup_map) {
442 if (c->phys_proc_id == cpu_data(i).phys_proc_id && 442 if (c->phys_proc_id == cpu_data(i).phys_proc_id &&
443 c->cpu_core_id == cpu_data(i).cpu_core_id) { 443 c->cpu_core_id == cpu_data(i).cpu_core_id) {
444 cpu_set(i, per_cpu(cpu_sibling_map, cpu)); 444 cpu_set(i, per_cpu(cpu_sibling_map, cpu));
@@ -461,7 +461,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
461 return; 461 return;
462 } 462 }
463 463
464 for_each_cpu_mask(i, cpu_sibling_setup_map) { 464 for_each_cpu_mask_nr(i, cpu_sibling_setup_map) {
465 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && 465 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
466 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { 466 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
467 cpu_set(i, c->llc_shared_map); 467 cpu_set(i, c->llc_shared_map);
@@ -513,7 +513,7 @@ static void impress_friends(void)
513 /* 513 /*
514 * Allow the user to impress friends. 514 * Allow the user to impress friends.
515 */ 515 */
516 Dprintk("Before bogomips.\n"); 516 pr_debug("Before bogomips.\n");
517 for_each_possible_cpu(cpu) 517 for_each_possible_cpu(cpu)
518 if (cpu_isset(cpu, cpu_callout_map)) 518 if (cpu_isset(cpu, cpu_callout_map))
519 bogosum += cpu_data(cpu).loops_per_jiffy; 519 bogosum += cpu_data(cpu).loops_per_jiffy;
@@ -523,7 +523,7 @@ static void impress_friends(void)
523 bogosum/(500000/HZ), 523 bogosum/(500000/HZ),
524 (bogosum/(5000/HZ))%100); 524 (bogosum/(5000/HZ))%100);
525 525
526 Dprintk("Before bogocount - setting activated=1.\n"); 526 pr_debug("Before bogocount - setting activated=1.\n");
527} 527}
528 528
529static inline void __inquire_remote_apic(int apicid) 529static inline void __inquire_remote_apic(int apicid)
@@ -546,8 +546,8 @@ static inline void __inquire_remote_apic(int apicid)
546 printk(KERN_CONT 546 printk(KERN_CONT
547 "a previous APIC delivery may have failed\n"); 547 "a previous APIC delivery may have failed\n");
548 548
549 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); 549 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
550 apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); 550 apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]);
551 551
552 timeout = 0; 552 timeout = 0;
553 do { 553 do {
@@ -579,29 +579,24 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
579 int maxlvt; 579 int maxlvt;
580 580
581 /* Target chip */ 581 /* Target chip */
582 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid)); 582 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
583 583
584 /* Boot on the stack */ 584 /* Boot on the stack */
585 /* Kick the second */ 585 /* Kick the second */
586 apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); 586 apic_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
587 587
588 Dprintk("Waiting for send to finish...\n"); 588 pr_debug("Waiting for send to finish...\n");
589 send_status = safe_apic_wait_icr_idle(); 589 send_status = safe_apic_wait_icr_idle();
590 590
591 /* 591 /*
592 * Give the other CPU some time to accept the IPI. 592 * Give the other CPU some time to accept the IPI.
593 */ 593 */
594 udelay(200); 594 udelay(200);
595 /*
596 * Due to the Pentium erratum 3AP.
597 */
598 maxlvt = lapic_get_maxlvt(); 595 maxlvt = lapic_get_maxlvt();
599 if (maxlvt > 3) { 596 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
600 apic_read_around(APIC_SPIV);
601 apic_write(APIC_ESR, 0); 597 apic_write(APIC_ESR, 0);
602 }
603 accept_status = (apic_read(APIC_ESR) & 0xEF); 598 accept_status = (apic_read(APIC_ESR) & 0xEF);
604 Dprintk("NMI sent.\n"); 599 pr_debug("NMI sent.\n");
605 600
606 if (send_status) 601 if (send_status)
607 printk(KERN_ERR "APIC never delivered???\n"); 602 printk(KERN_ERR "APIC never delivered???\n");
@@ -625,42 +620,44 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
625 return send_status; 620 return send_status;
626 } 621 }
627 622
623 maxlvt = lapic_get_maxlvt();
624
628 /* 625 /*
629 * Be paranoid about clearing APIC errors. 626 * Be paranoid about clearing APIC errors.
630 */ 627 */
631 if (APIC_INTEGRATED(apic_version[phys_apicid])) { 628 if (APIC_INTEGRATED(apic_version[phys_apicid])) {
632 apic_read_around(APIC_SPIV); 629 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
633 apic_write(APIC_ESR, 0); 630 apic_write(APIC_ESR, 0);
634 apic_read(APIC_ESR); 631 apic_read(APIC_ESR);
635 } 632 }
636 633
637 Dprintk("Asserting INIT.\n"); 634 pr_debug("Asserting INIT.\n");
638 635
639 /* 636 /*
640 * Turn INIT on target chip 637 * Turn INIT on target chip
641 */ 638 */
642 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); 639 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
643 640
644 /* 641 /*
645 * Send IPI 642 * Send IPI
646 */ 643 */
647 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT 644 apic_write(APIC_ICR,
648 | APIC_DM_INIT); 645 APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT);
649 646
650 Dprintk("Waiting for send to finish...\n"); 647 pr_debug("Waiting for send to finish...\n");
651 send_status = safe_apic_wait_icr_idle(); 648 send_status = safe_apic_wait_icr_idle();
652 649
653 mdelay(10); 650 mdelay(10);
654 651
655 Dprintk("Deasserting INIT.\n"); 652 pr_debug("Deasserting INIT.\n");
656 653
657 /* Target chip */ 654 /* Target chip */
658 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); 655 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
659 656
660 /* Send IPI */ 657 /* Send IPI */
661 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); 658 apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
662 659
663 Dprintk("Waiting for send to finish...\n"); 660 pr_debug("Waiting for send to finish...\n");
664 send_status = safe_apic_wait_icr_idle(); 661 send_status = safe_apic_wait_icr_idle();
665 662
666 mb(); 663 mb();
@@ -687,55 +684,47 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
687 /* 684 /*
688 * Run STARTUP IPI loop. 685 * Run STARTUP IPI loop.
689 */ 686 */
690 Dprintk("#startup loops: %d.\n", num_starts); 687 pr_debug("#startup loops: %d.\n", num_starts);
691
692 maxlvt = lapic_get_maxlvt();
693 688
694 for (j = 1; j <= num_starts; j++) { 689 for (j = 1; j <= num_starts; j++) {
695 Dprintk("Sending STARTUP #%d.\n", j); 690 pr_debug("Sending STARTUP #%d.\n", j);
696 apic_read_around(APIC_SPIV); 691 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
697 apic_write(APIC_ESR, 0); 692 apic_write(APIC_ESR, 0);
698 apic_read(APIC_ESR); 693 apic_read(APIC_ESR);
699 Dprintk("After apic_write.\n"); 694 pr_debug("After apic_write.\n");
700 695
701 /* 696 /*
702 * STARTUP IPI 697 * STARTUP IPI
703 */ 698 */
704 699
705 /* Target chip */ 700 /* Target chip */
706 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); 701 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
707 702
708 /* Boot on the stack */ 703 /* Boot on the stack */
709 /* Kick the second */ 704 /* Kick the second */
710 apic_write_around(APIC_ICR, APIC_DM_STARTUP 705 apic_write(APIC_ICR, APIC_DM_STARTUP | (start_eip >> 12));
711 | (start_eip >> 12));
712 706
713 /* 707 /*
714 * Give the other CPU some time to accept the IPI. 708 * Give the other CPU some time to accept the IPI.
715 */ 709 */
716 udelay(300); 710 udelay(300);
717 711
718 Dprintk("Startup point 1.\n"); 712 pr_debug("Startup point 1.\n");
719 713
720 Dprintk("Waiting for send to finish...\n"); 714 pr_debug("Waiting for send to finish...\n");
721 send_status = safe_apic_wait_icr_idle(); 715 send_status = safe_apic_wait_icr_idle();
722 716
723 /* 717 /*
724 * Give the other CPU some time to accept the IPI. 718 * Give the other CPU some time to accept the IPI.
725 */ 719 */
726 udelay(200); 720 udelay(200);
727 /* 721 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
728 * Due to the Pentium erratum 3AP.
729 */
730 if (maxlvt > 3) {
731 apic_read_around(APIC_SPIV);
732 apic_write(APIC_ESR, 0); 722 apic_write(APIC_ESR, 0);
733 }
734 accept_status = (apic_read(APIC_ESR) & 0xEF); 723 accept_status = (apic_read(APIC_ESR) & 0xEF);
735 if (send_status || accept_status) 724 if (send_status || accept_status)
736 break; 725 break;
737 } 726 }
738 Dprintk("After Startup.\n"); 727 pr_debug("After Startup.\n");
739 728
740 if (send_status) 729 if (send_status)
741 printk(KERN_ERR "APIC never delivered???\n"); 730 printk(KERN_ERR "APIC never delivered???\n");
@@ -768,7 +757,7 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
768 * 757 *
769 * Must be called after the _cpu_pda pointer table is initialized. 758 * Must be called after the _cpu_pda pointer table is initialized.
770 */ 759 */
771static int __cpuinit get_local_pda(int cpu) 760int __cpuinit get_local_pda(int cpu)
772{ 761{
773 struct x8664_pda *oldpda, *newpda; 762 struct x8664_pda *oldpda, *newpda;
774 unsigned long size = sizeof(struct x8664_pda); 763 unsigned long size = sizeof(struct x8664_pda);
@@ -886,7 +875,7 @@ do_rest:
886 875
887 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { 876 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
888 877
889 Dprintk("Setting warm reset code and vector.\n"); 878 pr_debug("Setting warm reset code and vector.\n");
890 879
891 store_NMI_vector(&nmi_high, &nmi_low); 880 store_NMI_vector(&nmi_high, &nmi_low);
892 881
@@ -907,9 +896,9 @@ do_rest:
907 /* 896 /*
908 * allow APs to start initializing. 897 * allow APs to start initializing.
909 */ 898 */
910 Dprintk("Before Callout %d.\n", cpu); 899 pr_debug("Before Callout %d.\n", cpu);
911 cpu_set(cpu, cpu_callout_map); 900 cpu_set(cpu, cpu_callout_map);
912 Dprintk("After Callout %d.\n", cpu); 901 pr_debug("After Callout %d.\n", cpu);
913 902
914 /* 903 /*
915 * Wait 5s total for a response 904 * Wait 5s total for a response
@@ -922,10 +911,10 @@ do_rest:
922 911
923 if (cpu_isset(cpu, cpu_callin_map)) { 912 if (cpu_isset(cpu, cpu_callin_map)) {
924 /* number CPUs logically, starting from 1 (BSP is 0) */ 913 /* number CPUs logically, starting from 1 (BSP is 0) */
925 Dprintk("OK.\n"); 914 pr_debug("OK.\n");
926 printk(KERN_INFO "CPU%d: ", cpu); 915 printk(KERN_INFO "CPU%d: ", cpu);
927 print_cpu_info(&cpu_data(cpu)); 916 print_cpu_info(&cpu_data(cpu));
928 Dprintk("CPU has booted.\n"); 917 pr_debug("CPU has booted.\n");
929 } else { 918 } else {
930 boot_error = 1; 919 boot_error = 1;
931 if (*((volatile unsigned char *)trampoline_base) 920 if (*((volatile unsigned char *)trampoline_base)
@@ -970,7 +959,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
970 959
971 WARN_ON(irqs_disabled()); 960 WARN_ON(irqs_disabled());
972 961
973 Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu); 962 pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu);
974 963
975 if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || 964 if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||
976 !physid_isset(apicid, phys_cpu_present_map)) { 965 !physid_isset(apicid, phys_cpu_present_map)) {
@@ -982,7 +971,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
982 * Already booted CPU? 971 * Already booted CPU?
983 */ 972 */
984 if (cpu_isset(cpu, cpu_callin_map)) { 973 if (cpu_isset(cpu, cpu_callin_map)) {
985 Dprintk("do_boot_cpu %d Already started\n", cpu); 974 pr_debug("do_boot_cpu %d Already started\n", cpu);
986 return -ENOSYS; 975 return -ENOSYS;
987 } 976 }
988 977
@@ -1009,7 +998,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
1009 err = do_boot_cpu(apicid, cpu); 998 err = do_boot_cpu(apicid, cpu);
1010#endif 999#endif
1011 if (err) { 1000 if (err) {
1012 Dprintk("do_boot_cpu failed %d\n", err); 1001 pr_debug("do_boot_cpu failed %d\n", err);
1013 return -EIO; 1002 return -EIO;
1014 } 1003 }
1015 1004
@@ -1213,7 +1202,7 @@ void __init native_smp_prepare_boot_cpu(void)
1213 1202
1214void __init native_smp_cpus_done(unsigned int max_cpus) 1203void __init native_smp_cpus_done(unsigned int max_cpus)
1215{ 1204{
1216 Dprintk("Boot done.\n"); 1205 pr_debug("Boot done.\n");
1217 1206
1218 impress_friends(); 1207 impress_friends();
1219 smp_checks(); 1208 smp_checks();
@@ -1230,7 +1219,7 @@ static void remove_siblinginfo(int cpu)
1230 int sibling; 1219 int sibling;
1231 struct cpuinfo_x86 *c = &cpu_data(cpu); 1220 struct cpuinfo_x86 *c = &cpu_data(cpu);
1232 1221
1233 for_each_cpu_mask(sibling, per_cpu(cpu_core_map, cpu)) { 1222 for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) {
1234 cpu_clear(cpu, per_cpu(cpu_core_map, sibling)); 1223 cpu_clear(cpu, per_cpu(cpu_core_map, sibling));
1235 /*/ 1224 /*/
1236 * last thread sibling in this cpu core going down 1225 * last thread sibling in this cpu core going down
@@ -1239,7 +1228,7 @@ static void remove_siblinginfo(int cpu)
1239 cpu_data(sibling).booted_cores--; 1228 cpu_data(sibling).booted_cores--;
1240 } 1229 }
1241 1230
1242 for_each_cpu_mask(sibling, per_cpu(cpu_sibling_map, cpu)) 1231 for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu))
1243 cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling)); 1232 cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling));
1244 cpus_clear(per_cpu(cpu_sibling_map, cpu)); 1233 cpus_clear(per_cpu(cpu_sibling_map, cpu));
1245 cpus_clear(per_cpu(cpu_core_map, cpu)); 1234 cpus_clear(per_cpu(cpu_core_map, cpu));
@@ -1311,7 +1300,7 @@ static void __ref remove_cpu_from_maps(int cpu)
1311 cpu_clear(cpu, cpu_callout_map); 1300 cpu_clear(cpu, cpu_callout_map);
1312 cpu_clear(cpu, cpu_callin_map); 1301 cpu_clear(cpu, cpu_callin_map);
1313 /* was set by cpu_init() */ 1302 /* was set by cpu_init() */
1314 clear_bit(cpu, (unsigned long *)&cpu_initialized); 1303 cpu_clear(cpu, cpu_initialized);
1315 numa_remove_cpu(cpu); 1304 numa_remove_cpu(cpu);
1316} 1305}
1317 1306
@@ -1390,7 +1379,8 @@ static int __init parse_maxcpus(char *arg)
1390{ 1379{
1391 extern unsigned int maxcpus; 1380 extern unsigned int maxcpus;
1392 1381
1393 maxcpus = simple_strtoul(arg, NULL, 0); 1382 if (arg)
1383 maxcpus = simple_strtoul(arg, NULL, 0);
1394 return 0; 1384 return 0;
1395} 1385}
1396early_param("maxcpus", parse_maxcpus); 1386early_param("maxcpus", parse_maxcpus);
diff --git a/arch/x86/kernel/smpcommon_32.c b/arch/x86/kernel/smpcommon_32.c
deleted file mode 100644
index 8b137891791f..000000000000
--- a/arch/x86/kernel/smpcommon_32.c
+++ /dev/null
@@ -1 +0,0 @@
1
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 92c20fee6781..e8b9863ef8c4 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -105,6 +105,20 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
105static int enable_single_step(struct task_struct *child) 105static int enable_single_step(struct task_struct *child)
106{ 106{
107 struct pt_regs *regs = task_pt_regs(child); 107 struct pt_regs *regs = task_pt_regs(child);
108 unsigned long oflags;
109
110 /*
111 * If we stepped into a sysenter/syscall insn, it trapped in
112 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
113 * If user-mode had set TF itself, then it's still clear from
114 * do_debug() and we need to set it again to restore the user
115 * state so we don't wrongly set TIF_FORCED_TF below.
116 * If enable_single_step() was used last and that is what
117 * set TIF_SINGLESTEP, then both TF and TIF_FORCED_TF are
118 * already set and our bookkeeping is fine.
119 */
120 if (unlikely(test_tsk_thread_flag(child, TIF_SINGLESTEP)))
121 regs->flags |= X86_EFLAGS_TF;
108 122
109 /* 123 /*
110 * Always set TIF_SINGLESTEP - this guarantees that 124 * Always set TIF_SINGLESTEP - this guarantees that
@@ -113,11 +127,7 @@ static int enable_single_step(struct task_struct *child)
113 */ 127 */
114 set_tsk_thread_flag(child, TIF_SINGLESTEP); 128 set_tsk_thread_flag(child, TIF_SINGLESTEP);
115 129
116 /* 130 oflags = regs->flags;
117 * If TF was already set, don't do anything else
118 */
119 if (regs->flags & X86_EFLAGS_TF)
120 return 0;
121 131
122 /* Set TF on the kernel stack.. */ 132 /* Set TF on the kernel stack.. */
123 regs->flags |= X86_EFLAGS_TF; 133 regs->flags |= X86_EFLAGS_TF;
@@ -126,9 +136,22 @@ static int enable_single_step(struct task_struct *child)
126 * ..but if TF is changed by the instruction we will trace, 136 * ..but if TF is changed by the instruction we will trace,
127 * don't mark it as being "us" that set it, so that we 137 * don't mark it as being "us" that set it, so that we
128 * won't clear it by hand later. 138 * won't clear it by hand later.
139 *
140 * Note that if we don't actually execute the popf because
141 * of a signal arriving right now or suchlike, we will lose
142 * track of the fact that it really was "us" that set it.
129 */ 143 */
130 if (is_setting_trap_flag(child, regs)) 144 if (is_setting_trap_flag(child, regs)) {
145 clear_tsk_thread_flag(child, TIF_FORCED_TF);
131 return 0; 146 return 0;
147 }
148
149 /*
150 * If TF was already set, check whether it was us who set it.
151 * If not, we should never attempt a block step.
152 */
153 if (oflags & X86_EFLAGS_TF)
154 return test_tsk_thread_flag(child, TIF_FORCED_TF);
132 155
133 set_tsk_thread_flag(child, TIF_FORCED_TF); 156 set_tsk_thread_flag(child, TIF_FORCED_TF);
134 157
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index adff5562f5fd..d44395ff34c3 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -326,3 +326,9 @@ ENTRY(sys_call_table)
326 .long sys_fallocate 326 .long sys_fallocate
327 .long sys_timerfd_settime /* 325 */ 327 .long sys_timerfd_settime /* 325 */
328 .long sys_timerfd_gettime 328 .long sys_timerfd_gettime
329 .long sys_signalfd4
330 .long sys_eventfd2
331 .long sys_epoll_create1
332 .long sys_dup3 /* 330 */
333 .long sys_pipe2
334 .long sys_inotify_init1
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index 059ca6ee59b4..ffe3c664afc0 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -129,6 +129,7 @@ void __init hpet_time_init(void)
129 */ 129 */
130void __init time_init(void) 130void __init time_init(void)
131{ 131{
132 pre_time_init_hook();
132 tsc_init(); 133 tsc_init();
133 late_time_init = choose_time_init(); 134 late_time_init = choose_time_init();
134} 135}
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
index 8a768973c4f0..03df8e45e5a1 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -58,6 +58,7 @@
58#include <asm/nmi.h> 58#include <asm/nmi.h>
59#include <asm/smp.h> 59#include <asm/smp.h>
60#include <asm/io.h> 60#include <asm/io.h>
61#include <asm/traps.h>
61 62
62#include "mach_traps.h" 63#include "mach_traps.h"
63 64
@@ -77,26 +78,6 @@ char ignore_fpu_irq;
77gate_desc idt_table[256] 78gate_desc idt_table[256]
78 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; 79 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
79 80
80asmlinkage void divide_error(void);
81asmlinkage void debug(void);
82asmlinkage void nmi(void);
83asmlinkage void int3(void);
84asmlinkage void overflow(void);
85asmlinkage void bounds(void);
86asmlinkage void invalid_op(void);
87asmlinkage void device_not_available(void);
88asmlinkage void coprocessor_segment_overrun(void);
89asmlinkage void invalid_TSS(void);
90asmlinkage void segment_not_present(void);
91asmlinkage void stack_segment(void);
92asmlinkage void general_protection(void);
93asmlinkage void page_fault(void);
94asmlinkage void coprocessor_error(void);
95asmlinkage void simd_coprocessor_error(void);
96asmlinkage void alignment_check(void);
97asmlinkage void spurious_interrupt_bug(void);
98asmlinkage void machine_check(void);
99
100int panic_on_unrecovered_nmi; 81int panic_on_unrecovered_nmi;
101int kstack_depth_to_print = 24; 82int kstack_depth_to_print = 24;
102static unsigned int code_bytes = 64; 83static unsigned int code_bytes = 64;
@@ -256,7 +237,7 @@ static const struct stacktrace_ops print_trace_ops = {
256 237
257static void 238static void
258show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 239show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
259 unsigned long *stack, unsigned long bp, char *log_lvl) 240 unsigned long *stack, unsigned long bp, char *log_lvl)
260{ 241{
261 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); 242 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
262 printk("%s =======================\n", log_lvl); 243 printk("%s =======================\n", log_lvl);
@@ -383,6 +364,54 @@ int is_valid_bugaddr(unsigned long ip)
383 return ud2 == 0x0b0f; 364 return ud2 == 0x0b0f;
384} 365}
385 366
367static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
368static int die_owner = -1;
369static unsigned int die_nest_count;
370
371unsigned __kprobes long oops_begin(void)
372{
373 unsigned long flags;
374
375 oops_enter();
376
377 if (die_owner != raw_smp_processor_id()) {
378 console_verbose();
379 raw_local_irq_save(flags);
380 __raw_spin_lock(&die_lock);
381 die_owner = smp_processor_id();
382 die_nest_count = 0;
383 bust_spinlocks(1);
384 } else {
385 raw_local_irq_save(flags);
386 }
387 die_nest_count++;
388 return flags;
389}
390
391void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
392{
393 bust_spinlocks(0);
394 die_owner = -1;
395 add_taint(TAINT_DIE);
396 __raw_spin_unlock(&die_lock);
397 raw_local_irq_restore(flags);
398
399 if (!regs)
400 return;
401
402 if (kexec_should_crash(current))
403 crash_kexec(regs);
404
405 if (in_interrupt())
406 panic("Fatal exception in interrupt");
407
408 if (panic_on_oops)
409 panic("Fatal exception");
410
411 oops_exit();
412 do_exit(signr);
413}
414
386int __kprobes __die(const char *str, struct pt_regs *regs, long err) 415int __kprobes __die(const char *str, struct pt_regs *regs, long err)
387{ 416{
388 unsigned short ss; 417 unsigned short ss;
@@ -423,31 +452,9 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
423 */ 452 */
424void die(const char *str, struct pt_regs *regs, long err) 453void die(const char *str, struct pt_regs *regs, long err)
425{ 454{
426 static struct { 455 unsigned long flags = oops_begin();
427 raw_spinlock_t lock;
428 u32 lock_owner;
429 int lock_owner_depth;
430 } die = {
431 .lock = __RAW_SPIN_LOCK_UNLOCKED,
432 .lock_owner = -1,
433 .lock_owner_depth = 0
434 };
435 unsigned long flags;
436
437 oops_enter();
438
439 if (die.lock_owner != raw_smp_processor_id()) {
440 console_verbose();
441 raw_local_irq_save(flags);
442 __raw_spin_lock(&die.lock);
443 die.lock_owner = smp_processor_id();
444 die.lock_owner_depth = 0;
445 bust_spinlocks(1);
446 } else {
447 raw_local_irq_save(flags);
448 }
449 456
450 if (++die.lock_owner_depth < 3) { 457 if (die_nest_count < 3) {
451 report_bug(regs->ip, regs); 458 report_bug(regs->ip, regs);
452 459
453 if (__die(str, regs, err)) 460 if (__die(str, regs, err))
@@ -456,26 +463,7 @@ void die(const char *str, struct pt_regs *regs, long err)
456 printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); 463 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
457 } 464 }
458 465
459 bust_spinlocks(0); 466 oops_end(flags, regs, SIGSEGV);
460 die.lock_owner = -1;
461 add_taint(TAINT_DIE);
462 __raw_spin_unlock(&die.lock);
463 raw_local_irq_restore(flags);
464
465 if (!regs)
466 return;
467
468 if (kexec_should_crash(current))
469 crash_kexec(regs);
470
471 if (in_interrupt())
472 panic("Fatal exception in interrupt");
473
474 if (panic_on_oops)
475 panic("Fatal exception");
476
477 oops_exit();
478 do_exit(SIGSEGV);
479} 467}
480 468
481static inline void 469static inline void
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index 2696a6837782..3f18d73f420c 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -51,30 +51,10 @@
51#include <asm/pgalloc.h> 51#include <asm/pgalloc.h>
52#include <asm/proto.h> 52#include <asm/proto.h>
53#include <asm/pda.h> 53#include <asm/pda.h>
54#include <asm/traps.h>
54 55
55#include <mach_traps.h> 56#include <mach_traps.h>
56 57
57asmlinkage void divide_error(void);
58asmlinkage void debug(void);
59asmlinkage void nmi(void);
60asmlinkage void int3(void);
61asmlinkage void overflow(void);
62asmlinkage void bounds(void);
63asmlinkage void invalid_op(void);
64asmlinkage void device_not_available(void);
65asmlinkage void double_fault(void);
66asmlinkage void coprocessor_segment_overrun(void);
67asmlinkage void invalid_TSS(void);
68asmlinkage void segment_not_present(void);
69asmlinkage void stack_segment(void);
70asmlinkage void general_protection(void);
71asmlinkage void page_fault(void);
72asmlinkage void coprocessor_error(void);
73asmlinkage void simd_coprocessor_error(void);
74asmlinkage void alignment_check(void);
75asmlinkage void spurious_interrupt_bug(void);
76asmlinkage void machine_check(void);
77
78int panic_on_unrecovered_nmi; 58int panic_on_unrecovered_nmi;
79int kstack_depth_to_print = 12; 59int kstack_depth_to_print = 12;
80static unsigned int code_bytes = 64; 60static unsigned int code_bytes = 64;
@@ -355,17 +335,24 @@ static const struct stacktrace_ops print_trace_ops = {
355 .address = print_trace_address, 335 .address = print_trace_address,
356}; 336};
357 337
358void show_trace(struct task_struct *task, struct pt_regs *regs, 338static void
359 unsigned long *stack, unsigned long bp) 339show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
340 unsigned long *stack, unsigned long bp, char *log_lvl)
360{ 341{
361 printk("\nCall Trace:\n"); 342 printk("\nCall Trace:\n");
362 dump_trace(task, regs, stack, bp, &print_trace_ops, NULL); 343 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
363 printk("\n"); 344 printk("\n");
364} 345}
365 346
347void show_trace(struct task_struct *task, struct pt_regs *regs,
348 unsigned long *stack, unsigned long bp)
349{
350 show_trace_log_lvl(task, regs, stack, bp, "");
351}
352
366static void 353static void
367_show_stack(struct task_struct *task, struct pt_regs *regs, 354show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
368 unsigned long *sp, unsigned long bp) 355 unsigned long *sp, unsigned long bp, char *log_lvl)
369{ 356{
370 unsigned long *stack; 357 unsigned long *stack;
371 int i; 358 int i;
@@ -399,12 +386,12 @@ _show_stack(struct task_struct *task, struct pt_regs *regs,
399 printk(" %016lx", *stack++); 386 printk(" %016lx", *stack++);
400 touch_nmi_watchdog(); 387 touch_nmi_watchdog();
401 } 388 }
402 show_trace(task, regs, sp, bp); 389 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
403} 390}
404 391
405void show_stack(struct task_struct *task, unsigned long *sp) 392void show_stack(struct task_struct *task, unsigned long *sp)
406{ 393{
407 _show_stack(task, NULL, sp, 0); 394 show_stack_log_lvl(task, NULL, sp, 0, "");
408} 395}
409 396
410/* 397/*
@@ -454,7 +441,8 @@ void show_registers(struct pt_regs *regs)
454 u8 *ip; 441 u8 *ip;
455 442
456 printk("Stack: "); 443 printk("Stack: ");
457 _show_stack(NULL, regs, (unsigned long *)sp, regs->bp); 444 show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
445 regs->bp, "");
458 printk("\n"); 446 printk("\n");
459 447
460 printk(KERN_EMERG "Code: "); 448 printk(KERN_EMERG "Code: ");
@@ -518,7 +506,7 @@ unsigned __kprobes long oops_begin(void)
518} 506}
519 507
520void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) 508void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
521{ 509{
522 die_owner = -1; 510 die_owner = -1;
523 bust_spinlocks(0); 511 bust_spinlocks(0);
524 die_nest_count--; 512 die_nest_count--;
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index e94bdb6add1d..41e01b145c48 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -73,7 +73,7 @@ int is_visws_box(void)
73 return visws_board_type >= 0; 73 return visws_board_type >= 0;
74} 74}
75 75
76static int __init visws_time_init_quirk(void) 76static int __init visws_time_init(void)
77{ 77{
78 printk(KERN_INFO "Starting Cobalt Timer system clock\n"); 78 printk(KERN_INFO "Starting Cobalt Timer system clock\n");
79 79
@@ -93,7 +93,7 @@ static int __init visws_time_init_quirk(void)
93 return 0; 93 return 0;
94} 94}
95 95
96static int __init visws_pre_intr_init_quirk(void) 96static int __init visws_pre_intr_init(void)
97{ 97{
98 init_VISWS_APIC_irqs(); 98 init_VISWS_APIC_irqs();
99 99
@@ -114,7 +114,7 @@ EXPORT_SYMBOL(sgivwfb_mem_size);
114 114
115long long mem_size __initdata = 0; 115long long mem_size __initdata = 0;
116 116
117static char * __init visws_memory_setup_quirk(void) 117static char * __init visws_memory_setup(void)
118{ 118{
119 long long gfx_mem_size = 8 * MB; 119 long long gfx_mem_size = 8 * MB;
120 120
@@ -176,7 +176,7 @@ static void visws_machine_power_off(void)
176 outl(PIIX_SPECIAL_STOP, 0xCFC); 176 outl(PIIX_SPECIAL_STOP, 0xCFC);
177} 177}
178 178
179static int __init visws_get_smp_config_quirk(unsigned int early) 179static int __init visws_get_smp_config(unsigned int early)
180{ 180{
181 /* 181 /*
182 * Prevent MP-table parsing by the generic code: 182 * Prevent MP-table parsing by the generic code:
@@ -192,7 +192,7 @@ extern unsigned int __cpuinitdata maxcpus;
192 * No problem for Linux. 192 * No problem for Linux.
193 */ 193 */
194 194
195static void __init MP_processor_info (struct mpc_config_processor *m) 195static void __init MP_processor_info(struct mpc_config_processor *m)
196{ 196{
197 int ver, logical_apicid; 197 int ver, logical_apicid;
198 physid_mask_t apic_cpus; 198 physid_mask_t apic_cpus;
@@ -232,7 +232,7 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
232 apic_version[m->mpc_apicid] = ver; 232 apic_version[m->mpc_apicid] = ver;
233} 233}
234 234
235int __init visws_find_smp_config_quirk(unsigned int reserve) 235static int __init visws_find_smp_config(unsigned int reserve)
236{ 236{
237 struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS); 237 struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS);
238 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); 238 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
@@ -258,7 +258,17 @@ int __init visws_find_smp_config_quirk(unsigned int reserve)
258 return 1; 258 return 1;
259} 259}
260 260
261extern int visws_trap_init_quirk(void); 261static int visws_trap_init(void);
262
263static struct x86_quirks visws_x86_quirks __initdata = {
264 .arch_time_init = visws_time_init,
265 .arch_pre_intr_init = visws_pre_intr_init,
266 .arch_memory_setup = visws_memory_setup,
267 .arch_intr_init = NULL,
268 .arch_trap_init = visws_trap_init,
269 .mach_get_smp_config = visws_get_smp_config,
270 .mach_find_smp_config = visws_find_smp_config,
271};
262 272
263void __init visws_early_detect(void) 273void __init visws_early_detect(void)
264{ 274{
@@ -272,16 +282,10 @@ void __init visws_early_detect(void)
272 282
273 /* 283 /*
274 * Install special quirks for timer, interrupt and memory setup: 284 * Install special quirks for timer, interrupt and memory setup:
275 */
276 arch_time_init_quirk = visws_time_init_quirk;
277 arch_pre_intr_init_quirk = visws_pre_intr_init_quirk;
278 arch_memory_setup_quirk = visws_memory_setup_quirk;
279
280 /*
281 * Fall back to generic behavior for traps: 285 * Fall back to generic behavior for traps:
286 * Override generic MP-table parsing:
282 */ 287 */
283 arch_intr_init_quirk = NULL; 288 x86_quirks = &visws_x86_quirks;
284 arch_trap_init_quirk = visws_trap_init_quirk;
285 289
286 /* 290 /*
287 * Install reboot quirks: 291 * Install reboot quirks:
@@ -294,12 +298,6 @@ void __init visws_early_detect(void)
294 */ 298 */
295 no_broadcast = 0; 299 no_broadcast = 0;
296 300
297 /*
298 * Override generic MP-table parsing:
299 */
300 mach_get_smp_config_quirk = visws_get_smp_config_quirk;
301 mach_find_smp_config_quirk = visws_find_smp_config_quirk;
302
303#ifdef CONFIG_X86_IO_APIC 301#ifdef CONFIG_X86_IO_APIC
304 /* 302 /*
305 * Turn off IO-APIC detection and initialization: 303 * Turn off IO-APIC detection and initialization:
@@ -426,7 +424,7 @@ static __init void cobalt_init(void)
426 co_apic_read(CO_APIC_ID)); 424 co_apic_read(CO_APIC_ID));
427} 425}
428 426
429int __init visws_trap_init_quirk(void) 427static int __init visws_trap_init(void)
430{ 428{
431 lithium_init(); 429 lithium_init();
432 cobalt_init(); 430 cobalt_init();
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index b15346092b7b..0a1b1a9d922d 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -906,7 +906,6 @@ static inline int __init activate_vmi(void)
906#ifdef CONFIG_X86_LOCAL_APIC 906#ifdef CONFIG_X86_LOCAL_APIC
907 para_fill(pv_apic_ops.apic_read, APICRead); 907 para_fill(pv_apic_ops.apic_read, APICRead);
908 para_fill(pv_apic_ops.apic_write, APICWrite); 908 para_fill(pv_apic_ops.apic_write, APICWrite);
909 para_fill(pv_apic_ops.apic_write_atomic, APICWrite);
910#endif 909#endif
911 910
912 /* 911 /*