aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/acpi/boot.c33
-rw-r--r--arch/x86/kernel/acpi/sleep.c4
-rw-r--r--arch/x86/kernel/alternative.c44
-rw-r--r--arch/x86/kernel/amd_iommu.c369
-rw-r--r--arch/x86/kernel/amd_iommu_init.c216
-rw-r--r--arch/x86/kernel/aperture_64.c6
-rw-r--r--arch/x86/kernel/apic_32.c8
-rw-r--r--arch/x86/kernel/apic_64.c7
-rw-r--r--arch/x86/kernel/apm_32.c4
-rw-r--r--arch/x86/kernel/bios_uv.c10
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c17
-rw-r--r--arch/x86/kernel/cpu/amd.c9
-rw-r--r--arch/x86/kernel/cpu/centaur.c11
-rw-r--r--arch/x86/kernel/cpu/common.c18
-rw-r--r--arch/x86/kernel/cpu/common_64.c87
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c109
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.h3
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c2
-rw-r--r--arch/x86/kernel/cpu/cyrix.c50
-rw-r--r--arch/x86/kernel/cpu/feature_names.c3
-rw-r--r--arch/x86/kernel/cpu/intel.c3
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c5
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd_64.c18
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c15
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c4
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c279
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c94
-rw-r--r--arch/x86/kernel/cpuid.c16
-rw-r--r--arch/x86/kernel/crash_dump_64.c13
-rw-r--r--arch/x86/kernel/ds.c954
-rw-r--r--arch/x86/kernel/e820.c2
-rw-r--r--arch/x86/kernel/early-quirks.c18
-rw-r--r--arch/x86/kernel/efi.c6
-rw-r--r--arch/x86/kernel/efi_32.c4
-rw-r--r--arch/x86/kernel/genx2apic_uv_x.c10
-rw-r--r--arch/x86/kernel/head64.c6
-rw-r--r--arch/x86/kernel/head_32.S34
-rw-r--r--arch/x86/kernel/head_64.S4
-rw-r--r--arch/x86/kernel/hpet.c43
-rw-r--r--arch/x86/kernel/io_delay.c8
-rw-r--r--arch/x86/kernel/irq_32.c2
-rw-r--r--arch/x86/kernel/irq_64.c2
-rw-r--r--arch/x86/kernel/k8.c5
-rw-r--r--arch/x86/kernel/kdebugfs.c1
-rw-r--r--arch/x86/kernel/kgdb.c50
-rw-r--r--arch/x86/kernel/kvm.c2
-rw-r--r--arch/x86/kernel/machine_kexec_32.c20
-rw-r--r--arch/x86/kernel/mfgpt_32.c52
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c2
-rw-r--r--arch/x86/kernel/mpparse.c6
-rw-r--r--arch/x86/kernel/msr.c40
-rw-r--r--arch/x86/kernel/nmi.c39
-rw-r--r--arch/x86/kernel/numaq_32.c2
-rw-r--r--arch/x86/kernel/olpc.c6
-rw-r--r--arch/x86/kernel/paravirt.c3
-rw-r--r--arch/x86/kernel/paravirt_patch_32.c2
-rw-r--r--arch/x86/kernel/pci-calgary_64.c34
-rw-r--r--arch/x86/kernel/pci-dma.c179
-rw-r--r--arch/x86/kernel/pci-gart_64.c162
-rw-r--r--arch/x86/kernel/pci-nommu.c10
-rw-r--r--arch/x86/kernel/pcspeaker.c13
-rw-r--r--arch/x86/kernel/process.c20
-rw-r--r--arch/x86/kernel/process_32.c67
-rw-r--r--arch/x86/kernel/process_64.c176
-rw-r--r--arch/x86/kernel/ptrace.c478
-rw-r--r--arch/x86/kernel/reboot.c6
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S10
-rw-r--r--arch/x86/kernel/setup.c39
-rw-r--r--arch/x86/kernel/sigframe.h5
-rw-r--r--arch/x86/kernel/signal_32.c11
-rw-r--r--arch/x86/kernel/signal_64.c111
-rw-r--r--arch/x86/kernel/smpboot.c71
-rw-r--r--arch/x86/kernel/smpcommon.c17
-rw-r--r--arch/x86/kernel/sys_x86_64.c43
-rw-r--r--arch/x86/kernel/tlb_uv.c3
-rw-r--r--arch/x86/kernel/traps_64.c66
-rw-r--r--arch/x86/kernel/tsc.c424
-rw-r--r--arch/x86/kernel/tsc_sync.c6
-rw-r--r--arch/x86/kernel/visws_quirks.c22
-rw-r--r--arch/x86/kernel/vmi_32.c12
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S8
-rw-r--r--arch/x86/kernel/vsmp_64.c2
83 files changed, 3204 insertions, 1573 deletions
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index d9770a56511a..c2ac1b4515a0 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -58,7 +58,6 @@ EXPORT_SYMBOL(acpi_disabled);
58#ifdef CONFIG_X86_64 58#ifdef CONFIG_X86_64
59 59
60#include <asm/proto.h> 60#include <asm/proto.h>
61#include <asm/genapic.h>
62 61
63#else /* X86 */ 62#else /* X86 */
64 63
@@ -158,6 +157,16 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
158struct acpi_mcfg_allocation *pci_mmcfg_config; 157struct acpi_mcfg_allocation *pci_mmcfg_config;
159int pci_mmcfg_config_num; 158int pci_mmcfg_config_num;
160 159
160static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
161
162static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg)
163{
164 if (!strcmp(mcfg->header.oem_id, "SGI"))
165 acpi_mcfg_64bit_base_addr = TRUE;
166
167 return 0;
168}
169
161int __init acpi_parse_mcfg(struct acpi_table_header *header) 170int __init acpi_parse_mcfg(struct acpi_table_header *header)
162{ 171{
163 struct acpi_table_mcfg *mcfg; 172 struct acpi_table_mcfg *mcfg;
@@ -190,8 +199,12 @@ int __init acpi_parse_mcfg(struct acpi_table_header *header)
190 } 199 }
191 200
192 memcpy(pci_mmcfg_config, &mcfg[1], config_size); 201 memcpy(pci_mmcfg_config, &mcfg[1], config_size);
202
203 acpi_mcfg_oem_check(mcfg);
204
193 for (i = 0; i < pci_mmcfg_config_num; ++i) { 205 for (i = 0; i < pci_mmcfg_config_num; ++i) {
194 if (pci_mmcfg_config[i].address > 0xFFFFFFFF) { 206 if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) &&
207 !acpi_mcfg_64bit_base_addr) {
195 printk(KERN_ERR PREFIX 208 printk(KERN_ERR PREFIX
196 "MMCONFIG not in low 4GB of memory\n"); 209 "MMCONFIG not in low 4GB of memory\n");
197 kfree(pci_mmcfg_config); 210 kfree(pci_mmcfg_config);
@@ -1589,6 +1602,14 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
1589 */ 1602 */
1590 { 1603 {
1591 .callback = dmi_ignore_irq0_timer_override, 1604 .callback = dmi_ignore_irq0_timer_override,
1605 .ident = "HP nx6115 laptop",
1606 .matches = {
1607 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
1608 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6115"),
1609 },
1610 },
1611 {
1612 .callback = dmi_ignore_irq0_timer_override,
1592 .ident = "HP NX6125 laptop", 1613 .ident = "HP NX6125 laptop",
1593 .matches = { 1614 .matches = {
1594 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), 1615 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
@@ -1603,6 +1624,14 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
1603 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"), 1624 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"),
1604 }, 1625 },
1605 }, 1626 },
1627 {
1628 .callback = dmi_ignore_irq0_timer_override,
1629 .ident = "HP 6715b laptop",
1630 .matches = {
1631 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
1632 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"),
1633 },
1634 },
1606 {} 1635 {}
1607}; 1636};
1608 1637
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index fa2161d5003b..426e5d91b63a 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -20,7 +20,7 @@ unsigned long acpi_realmode_flags;
20/* address in low memory of the wakeup routine. */ 20/* address in low memory of the wakeup routine. */
21static unsigned long acpi_realmode; 21static unsigned long acpi_realmode;
22 22
23#ifdef CONFIG_64BIT 23#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
24static char temp_stack[10240]; 24static char temp_stack[10240];
25#endif 25#endif
26 26
@@ -86,7 +86,7 @@ int acpi_save_state_mem(void)
86#endif /* !CONFIG_64BIT */ 86#endif /* !CONFIG_64BIT */
87 87
88 header->pmode_cr0 = read_cr0(); 88 header->pmode_cr0 = read_cr0();
89 header->pmode_cr4 = read_cr4(); 89 header->pmode_cr4 = read_cr4_safe();
90 header->realmode_flags = acpi_realmode_flags; 90 header->realmode_flags = acpi_realmode_flags;
91 header->real_magic = 0x12345678; 91 header->real_magic = 0x12345678;
92 92
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 2763cb37b553..fb04e49776ba 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -145,35 +145,25 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = {
145extern char __vsyscall_0; 145extern char __vsyscall_0;
146const unsigned char *const *find_nop_table(void) 146const unsigned char *const *find_nop_table(void)
147{ 147{
148 return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || 148 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
149 boot_cpu_data.x86 < 6 ? k8_nops : p6_nops; 149 boot_cpu_has(X86_FEATURE_NOPL))
150 return p6_nops;
151 else
152 return k8_nops;
150} 153}
151 154
152#else /* CONFIG_X86_64 */ 155#else /* CONFIG_X86_64 */
153 156
154static const struct nop {
155 int cpuid;
156 const unsigned char *const *noptable;
157} noptypes[] = {
158 { X86_FEATURE_K8, k8_nops },
159 { X86_FEATURE_K7, k7_nops },
160 { X86_FEATURE_P4, p6_nops },
161 { X86_FEATURE_P3, p6_nops },
162 { -1, NULL }
163};
164
165const unsigned char *const *find_nop_table(void) 157const unsigned char *const *find_nop_table(void)
166{ 158{
167 const unsigned char *const *noptable = intel_nops; 159 if (boot_cpu_has(X86_FEATURE_K8))
168 int i; 160 return k8_nops;
169 161 else if (boot_cpu_has(X86_FEATURE_K7))
170 for (i = 0; noptypes[i].cpuid >= 0; i++) { 162 return k7_nops;
171 if (boot_cpu_has(noptypes[i].cpuid)) { 163 else if (boot_cpu_has(X86_FEATURE_NOPL))
172 noptable = noptypes[i].noptable; 164 return p6_nops;
173 break; 165 else
174 } 166 return intel_nops;
175 }
176 return noptable;
177} 167}
178 168
179#endif /* CONFIG_X86_64 */ 169#endif /* CONFIG_X86_64 */
@@ -241,25 +231,25 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
241 continue; 231 continue;
242 if (*ptr > text_end) 232 if (*ptr > text_end)
243 continue; 233 continue;
244 text_poke(*ptr, ((unsigned char []){0xf0}), 1); /* add lock prefix */ 234 /* turn DS segment override prefix into lock prefix */
235 text_poke(*ptr, ((unsigned char []){0xf0}), 1);
245 }; 236 };
246} 237}
247 238
248static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) 239static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
249{ 240{
250 u8 **ptr; 241 u8 **ptr;
251 char insn[1];
252 242
253 if (noreplace_smp) 243 if (noreplace_smp)
254 return; 244 return;
255 245
256 add_nops(insn, 1);
257 for (ptr = start; ptr < end; ptr++) { 246 for (ptr = start; ptr < end; ptr++) {
258 if (*ptr < text) 247 if (*ptr < text)
259 continue; 248 continue;
260 if (*ptr > text_end) 249 if (*ptr > text_end)
261 continue; 250 continue;
262 text_poke(*ptr, insn, 1); 251 /* turn lock prefix into DS segment override prefix */
252 text_poke(*ptr, ((unsigned char []){0x3E}), 1);
263 }; 253 };
264} 254}
265 255
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 22d7d050905d..34e4d112b1ef 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -33,6 +33,10 @@
33 33
34static DEFINE_RWLOCK(amd_iommu_devtable_lock); 34static DEFINE_RWLOCK(amd_iommu_devtable_lock);
35 35
36/* A list of preallocated protection domains */
37static LIST_HEAD(iommu_pd_list);
38static DEFINE_SPINLOCK(iommu_pd_list_lock);
39
36/* 40/*
37 * general struct to manage commands send to an IOMMU 41 * general struct to manage commands send to an IOMMU
38 */ 42 */
@@ -51,6 +55,102 @@ static int iommu_has_npcache(struct amd_iommu *iommu)
51 55
52/**************************************************************************** 56/****************************************************************************
53 * 57 *
58 * Interrupt handling functions
59 *
60 ****************************************************************************/
61
62static void iommu_print_event(void *__evt)
63{
64 u32 *event = __evt;
65 int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK;
66 int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
67 int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
68 int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
69 u64 address = (u64)(((u64)event[3]) << 32) | event[2];
70
71 printk(KERN_ERR "AMD IOMMU: Event logged [");
72
73 switch (type) {
74 case EVENT_TYPE_ILL_DEV:
75 printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x "
76 "address=0x%016llx flags=0x%04x]\n",
77 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
78 address, flags);
79 break;
80 case EVENT_TYPE_IO_FAULT:
81 printk("IO_PAGE_FAULT device=%02x:%02x.%x "
82 "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
83 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
84 domid, address, flags);
85 break;
86 case EVENT_TYPE_DEV_TAB_ERR:
87 printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
88 "address=0x%016llx flags=0x%04x]\n",
89 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
90 address, flags);
91 break;
92 case EVENT_TYPE_PAGE_TAB_ERR:
93 printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
94 "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
95 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
96 domid, address, flags);
97 break;
98 case EVENT_TYPE_ILL_CMD:
99 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
100 break;
101 case EVENT_TYPE_CMD_HARD_ERR:
102 printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
103 "flags=0x%04x]\n", address, flags);
104 break;
105 case EVENT_TYPE_IOTLB_INV_TO:
106 printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x "
107 "address=0x%016llx]\n",
108 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
109 address);
110 break;
111 case EVENT_TYPE_INV_DEV_REQ:
112 printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x "
113 "address=0x%016llx flags=0x%04x]\n",
114 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
115 address, flags);
116 break;
117 default:
118 printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type);
119 }
120}
121
122static void iommu_poll_events(struct amd_iommu *iommu)
123{
124 u32 head, tail;
125 unsigned long flags;
126
127 spin_lock_irqsave(&iommu->lock, flags);
128
129 head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
130 tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
131
132 while (head != tail) {
133 iommu_print_event(iommu->evt_buf + head);
134 head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
135 }
136
137 writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
138
139 spin_unlock_irqrestore(&iommu->lock, flags);
140}
141
142irqreturn_t amd_iommu_int_handler(int irq, void *data)
143{
144 struct amd_iommu *iommu;
145
146 list_for_each_entry(iommu, &amd_iommu_list, list)
147 iommu_poll_events(iommu);
148
149 return IRQ_HANDLED;
150}
151
152/****************************************************************************
153 *
54 * IOMMU command queuing functions 154 * IOMMU command queuing functions
55 * 155 *
56 ****************************************************************************/ 156 ****************************************************************************/
@@ -65,7 +165,7 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
65 u8 *target; 165 u8 *target;
66 166
67 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); 167 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
68 target = (iommu->cmd_buf + tail); 168 target = iommu->cmd_buf + tail;
69 memcpy_toio(target, cmd, sizeof(*cmd)); 169 memcpy_toio(target, cmd, sizeof(*cmd));
70 tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; 170 tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
71 head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); 171 head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
@@ -101,32 +201,39 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
101 */ 201 */
102static int iommu_completion_wait(struct amd_iommu *iommu) 202static int iommu_completion_wait(struct amd_iommu *iommu)
103{ 203{
104 int ret; 204 int ret = 0, ready = 0;
205 unsigned status = 0;
105 struct iommu_cmd cmd; 206 struct iommu_cmd cmd;
106 volatile u64 ready = 0; 207 unsigned long flags, i = 0;
107 unsigned long ready_phys = virt_to_phys(&ready);
108 unsigned long i = 0;
109 208
110 memset(&cmd, 0, sizeof(cmd)); 209 memset(&cmd, 0, sizeof(cmd));
111 cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK; 210 cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
112 cmd.data[1] = upper_32_bits(ready_phys);
113 cmd.data[2] = 1; /* value written to 'ready' */
114 CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT); 211 CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
115 212
116 iommu->need_sync = 0; 213 iommu->need_sync = 0;
117 214
118 ret = iommu_queue_command(iommu, &cmd); 215 spin_lock_irqsave(&iommu->lock, flags);
216
217 ret = __iommu_queue_command(iommu, &cmd);
119 218
120 if (ret) 219 if (ret)
121 return ret; 220 goto out;
122 221
123 while (!ready && (i < EXIT_LOOP_COUNT)) { 222 while (!ready && (i < EXIT_LOOP_COUNT)) {
124 ++i; 223 ++i;
125 cpu_relax(); 224 /* wait for the bit to become one */
225 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
226 ready = status & MMIO_STATUS_COM_WAIT_INT_MASK;
126 } 227 }
127 228
229 /* set bit back to zero */
230 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
231 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
232
128 if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit())) 233 if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit()))
129 printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n"); 234 printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n");
235out:
236 spin_unlock_irqrestore(&iommu->lock, flags);
130 237
131 return 0; 238 return 0;
132} 239}
@@ -137,6 +244,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
137static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) 244static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
138{ 245{
139 struct iommu_cmd cmd; 246 struct iommu_cmd cmd;
247 int ret;
140 248
141 BUG_ON(iommu == NULL); 249 BUG_ON(iommu == NULL);
142 250
@@ -144,9 +252,11 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
144 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); 252 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
145 cmd.data[0] = devid; 253 cmd.data[0] = devid;
146 254
255 ret = iommu_queue_command(iommu, &cmd);
256
147 iommu->need_sync = 1; 257 iommu->need_sync = 1;
148 258
149 return iommu_queue_command(iommu, &cmd); 259 return ret;
150} 260}
151 261
152/* 262/*
@@ -156,21 +266,24 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
156 u64 address, u16 domid, int pde, int s) 266 u64 address, u16 domid, int pde, int s)
157{ 267{
158 struct iommu_cmd cmd; 268 struct iommu_cmd cmd;
269 int ret;
159 270
160 memset(&cmd, 0, sizeof(cmd)); 271 memset(&cmd, 0, sizeof(cmd));
161 address &= PAGE_MASK; 272 address &= PAGE_MASK;
162 CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES); 273 CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES);
163 cmd.data[1] |= domid; 274 cmd.data[1] |= domid;
164 cmd.data[2] = LOW_U32(address); 275 cmd.data[2] = lower_32_bits(address);
165 cmd.data[3] = upper_32_bits(address); 276 cmd.data[3] = upper_32_bits(address);
166 if (s) /* size bit - we flush more than one 4kb page */ 277 if (s) /* size bit - we flush more than one 4kb page */
167 cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; 278 cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
168 if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ 279 if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
169 cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; 280 cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
170 281
282 ret = iommu_queue_command(iommu, &cmd);
283
171 iommu->need_sync = 1; 284 iommu->need_sync = 1;
172 285
173 return iommu_queue_command(iommu, &cmd); 286 return ret;
174} 287}
175 288
176/* 289/*
@@ -200,6 +313,14 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
200 return 0; 313 return 0;
201} 314}
202 315
316/* Flush the whole IO/TLB for a given protection domain */
317static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
318{
319 u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
320
321 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1);
322}
323
203/**************************************************************************** 324/****************************************************************************
204 * 325 *
205 * The functions below are used the create the page table mappings for 326 * The functions below are used the create the page table mappings for
@@ -359,11 +480,6 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
359 * efficient allocator. 480 * efficient allocator.
360 * 481 *
361 ****************************************************************************/ 482 ****************************************************************************/
362static unsigned long dma_mask_to_pages(unsigned long mask)
363{
364 return (mask >> PAGE_SHIFT) +
365 (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT);
366}
367 483
368/* 484/*
369 * The address allocator core function. 485 * The address allocator core function.
@@ -372,25 +488,31 @@ static unsigned long dma_mask_to_pages(unsigned long mask)
372 */ 488 */
373static unsigned long dma_ops_alloc_addresses(struct device *dev, 489static unsigned long dma_ops_alloc_addresses(struct device *dev,
374 struct dma_ops_domain *dom, 490 struct dma_ops_domain *dom,
375 unsigned int pages) 491 unsigned int pages,
492 unsigned long align_mask,
493 u64 dma_mask)
376{ 494{
377 unsigned long limit = dma_mask_to_pages(*dev->dma_mask); 495 unsigned long limit;
378 unsigned long address; 496 unsigned long address;
379 unsigned long size = dom->aperture_size >> PAGE_SHIFT;
380 unsigned long boundary_size; 497 unsigned long boundary_size;
381 498
382 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, 499 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
383 PAGE_SIZE) >> PAGE_SHIFT; 500 PAGE_SIZE) >> PAGE_SHIFT;
384 limit = limit < size ? limit : size; 501 limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0,
502 dma_mask >> PAGE_SHIFT);
385 503
386 if (dom->next_bit >= limit) 504 if (dom->next_bit >= limit) {
387 dom->next_bit = 0; 505 dom->next_bit = 0;
506 dom->need_flush = true;
507 }
388 508
389 address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages, 509 address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
390 0 , boundary_size, 0); 510 0 , boundary_size, align_mask);
391 if (address == -1) 511 if (address == -1) {
392 address = iommu_area_alloc(dom->bitmap, limit, 0, pages, 512 address = iommu_area_alloc(dom->bitmap, limit, 0, pages,
393 0, boundary_size, 0); 513 0, boundary_size, align_mask);
514 dom->need_flush = true;
515 }
394 516
395 if (likely(address != -1)) { 517 if (likely(address != -1)) {
396 dom->next_bit = address + pages; 518 dom->next_bit = address + pages;
@@ -456,7 +578,7 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
456 if (start_page + pages > last_page) 578 if (start_page + pages > last_page)
457 pages = last_page - start_page; 579 pages = last_page - start_page;
458 580
459 set_bit_string(dom->bitmap, start_page, pages); 581 iommu_area_reserve(dom->bitmap, start_page, pages);
460} 582}
461 583
462static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) 584static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
@@ -550,6 +672,9 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
550 dma_dom->bitmap[0] = 1; 672 dma_dom->bitmap[0] = 1;
551 dma_dom->next_bit = 0; 673 dma_dom->next_bit = 0;
552 674
675 dma_dom->need_flush = false;
676 dma_dom->target_dev = 0xffff;
677
553 /* Intialize the exclusion range if necessary */ 678 /* Intialize the exclusion range if necessary */
554 if (iommu->exclusion_start && 679 if (iommu->exclusion_start &&
555 iommu->exclusion_start < dma_dom->aperture_size) { 680 iommu->exclusion_start < dma_dom->aperture_size) {
@@ -620,12 +745,13 @@ static void set_device_domain(struct amd_iommu *iommu,
620 745
621 u64 pte_root = virt_to_phys(domain->pt_root); 746 u64 pte_root = virt_to_phys(domain->pt_root);
622 747
623 pte_root |= (domain->mode & 0x07) << 9; 748 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
624 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | 2; 749 << DEV_ENTRY_MODE_SHIFT;
750 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
625 751
626 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 752 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
627 amd_iommu_dev_table[devid].data[0] = pte_root; 753 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
628 amd_iommu_dev_table[devid].data[1] = pte_root >> 32; 754 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
629 amd_iommu_dev_table[devid].data[2] = domain->id; 755 amd_iommu_dev_table[devid].data[2] = domain->id;
630 756
631 amd_iommu_pd_table[devid] = domain; 757 amd_iommu_pd_table[devid] = domain;
@@ -643,6 +769,45 @@ static void set_device_domain(struct amd_iommu *iommu,
643 *****************************************************************************/ 769 *****************************************************************************/
644 770
645/* 771/*
772 * This function checks if the driver got a valid device from the caller to
773 * avoid dereferencing invalid pointers.
774 */
775static bool check_device(struct device *dev)
776{
777 if (!dev || !dev->dma_mask)
778 return false;
779
780 return true;
781}
782
783/*
784 * In this function the list of preallocated protection domains is traversed to
785 * find the domain for a specific device
786 */
787static struct dma_ops_domain *find_protection_domain(u16 devid)
788{
789 struct dma_ops_domain *entry, *ret = NULL;
790 unsigned long flags;
791
792 if (list_empty(&iommu_pd_list))
793 return NULL;
794
795 spin_lock_irqsave(&iommu_pd_list_lock, flags);
796
797 list_for_each_entry(entry, &iommu_pd_list, list) {
798 if (entry->target_dev == devid) {
799 ret = entry;
800 list_del(&ret->list);
801 break;
802 }
803 }
804
805 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
806
807 return ret;
808}
809
810/*
646 * In the dma_ops path we only have the struct device. This function 811 * In the dma_ops path we only have the struct device. This function
647 * finds the corresponding IOMMU, the protection domain and the 812 * finds the corresponding IOMMU, the protection domain and the
648 * requestor id for a given device. 813 * requestor id for a given device.
@@ -658,27 +823,30 @@ static int get_device_resources(struct device *dev,
658 struct pci_dev *pcidev; 823 struct pci_dev *pcidev;
659 u16 _bdf; 824 u16 _bdf;
660 825
661 BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask); 826 *iommu = NULL;
827 *domain = NULL;
828 *bdf = 0xffff;
829
830 if (dev->bus != &pci_bus_type)
831 return 0;
662 832
663 pcidev = to_pci_dev(dev); 833 pcidev = to_pci_dev(dev);
664 _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); 834 _bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
665 835
666 /* device not translated by any IOMMU in the system? */ 836 /* device not translated by any IOMMU in the system? */
667 if (_bdf > amd_iommu_last_bdf) { 837 if (_bdf > amd_iommu_last_bdf)
668 *iommu = NULL;
669 *domain = NULL;
670 *bdf = 0xffff;
671 return 0; 838 return 0;
672 }
673 839
674 *bdf = amd_iommu_alias_table[_bdf]; 840 *bdf = amd_iommu_alias_table[_bdf];
675 841
676 *iommu = amd_iommu_rlookup_table[*bdf]; 842 *iommu = amd_iommu_rlookup_table[*bdf];
677 if (*iommu == NULL) 843 if (*iommu == NULL)
678 return 0; 844 return 0;
679 dma_dom = (*iommu)->default_dom;
680 *domain = domain_for_device(*bdf); 845 *domain = domain_for_device(*bdf);
681 if (*domain == NULL) { 846 if (*domain == NULL) {
847 dma_dom = find_protection_domain(*bdf);
848 if (!dma_dom)
849 dma_dom = (*iommu)->default_dom;
682 *domain = &dma_dom->domain; 850 *domain = &dma_dom->domain;
683 set_device_domain(*iommu, *domain, *bdf); 851 set_device_domain(*iommu, *domain, *bdf);
684 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " 852 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
@@ -757,17 +925,24 @@ static dma_addr_t __map_single(struct device *dev,
757 struct dma_ops_domain *dma_dom, 925 struct dma_ops_domain *dma_dom,
758 phys_addr_t paddr, 926 phys_addr_t paddr,
759 size_t size, 927 size_t size,
760 int dir) 928 int dir,
929 bool align,
930 u64 dma_mask)
761{ 931{
762 dma_addr_t offset = paddr & ~PAGE_MASK; 932 dma_addr_t offset = paddr & ~PAGE_MASK;
763 dma_addr_t address, start; 933 dma_addr_t address, start;
764 unsigned int pages; 934 unsigned int pages;
935 unsigned long align_mask = 0;
765 int i; 936 int i;
766 937
767 pages = iommu_num_pages(paddr, size); 938 pages = iommu_num_pages(paddr, size);
768 paddr &= PAGE_MASK; 939 paddr &= PAGE_MASK;
769 940
770 address = dma_ops_alloc_addresses(dev, dma_dom, pages); 941 if (align)
942 align_mask = (1UL << get_order(size)) - 1;
943
944 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
945 dma_mask);
771 if (unlikely(address == bad_dma_address)) 946 if (unlikely(address == bad_dma_address))
772 goto out; 947 goto out;
773 948
@@ -779,6 +954,12 @@ static dma_addr_t __map_single(struct device *dev,
779 } 954 }
780 address += offset; 955 address += offset;
781 956
957 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
958 iommu_flush_tlb(iommu, dma_dom->domain.id);
959 dma_dom->need_flush = false;
960 } else if (unlikely(iommu_has_npcache(iommu)))
961 iommu_flush_pages(iommu, dma_dom->domain.id, address, size);
962
782out: 963out:
783 return address; 964 return address;
784} 965}
@@ -809,6 +990,9 @@ static void __unmap_single(struct amd_iommu *iommu,
809 } 990 }
810 991
811 dma_ops_free_addresses(dma_dom, dma_addr, pages); 992 dma_ops_free_addresses(dma_dom, dma_addr, pages);
993
994 if (amd_iommu_unmap_flush)
995 iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size);
812} 996}
813 997
814/* 998/*
@@ -822,6 +1006,12 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
822 struct protection_domain *domain; 1006 struct protection_domain *domain;
823 u16 devid; 1007 u16 devid;
824 dma_addr_t addr; 1008 dma_addr_t addr;
1009 u64 dma_mask;
1010
1011 if (!check_device(dev))
1012 return bad_dma_address;
1013
1014 dma_mask = *dev->dma_mask;
825 1015
826 get_device_resources(dev, &iommu, &domain, &devid); 1016 get_device_resources(dev, &iommu, &domain, &devid);
827 1017
@@ -830,14 +1020,12 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
830 return (dma_addr_t)paddr; 1020 return (dma_addr_t)paddr;
831 1021
832 spin_lock_irqsave(&domain->lock, flags); 1022 spin_lock_irqsave(&domain->lock, flags);
833 addr = __map_single(dev, iommu, domain->priv, paddr, size, dir); 1023 addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false,
1024 dma_mask);
834 if (addr == bad_dma_address) 1025 if (addr == bad_dma_address)
835 goto out; 1026 goto out;
836 1027
837 if (iommu_has_npcache(iommu)) 1028 if (unlikely(iommu->need_sync))
838 iommu_flush_pages(iommu, domain->id, addr, size);
839
840 if (iommu->need_sync)
841 iommu_completion_wait(iommu); 1029 iommu_completion_wait(iommu);
842 1030
843out: 1031out:
@@ -857,7 +1045,8 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
857 struct protection_domain *domain; 1045 struct protection_domain *domain;
858 u16 devid; 1046 u16 devid;
859 1047
860 if (!get_device_resources(dev, &iommu, &domain, &devid)) 1048 if (!check_device(dev) ||
1049 !get_device_resources(dev, &iommu, &domain, &devid))
861 /* device not handled by any AMD IOMMU */ 1050 /* device not handled by any AMD IOMMU */
862 return; 1051 return;
863 1052
@@ -865,9 +1054,7 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
865 1054
866 __unmap_single(iommu, domain->priv, dma_addr, size, dir); 1055 __unmap_single(iommu, domain->priv, dma_addr, size, dir);
867 1056
868 iommu_flush_pages(iommu, domain->id, dma_addr, size); 1057 if (unlikely(iommu->need_sync))
869
870 if (iommu->need_sync)
871 iommu_completion_wait(iommu); 1058 iommu_completion_wait(iommu);
872 1059
873 spin_unlock_irqrestore(&domain->lock, flags); 1060 spin_unlock_irqrestore(&domain->lock, flags);
@@ -906,6 +1093,12 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
906 struct scatterlist *s; 1093 struct scatterlist *s;
907 phys_addr_t paddr; 1094 phys_addr_t paddr;
908 int mapped_elems = 0; 1095 int mapped_elems = 0;
1096 u64 dma_mask;
1097
1098 if (!check_device(dev))
1099 return 0;
1100
1101 dma_mask = *dev->dma_mask;
909 1102
910 get_device_resources(dev, &iommu, &domain, &devid); 1103 get_device_resources(dev, &iommu, &domain, &devid);
911 1104
@@ -918,19 +1111,17 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
918 paddr = sg_phys(s); 1111 paddr = sg_phys(s);
919 1112
920 s->dma_address = __map_single(dev, iommu, domain->priv, 1113 s->dma_address = __map_single(dev, iommu, domain->priv,
921 paddr, s->length, dir); 1114 paddr, s->length, dir, false,
1115 dma_mask);
922 1116
923 if (s->dma_address) { 1117 if (s->dma_address) {
924 s->dma_length = s->length; 1118 s->dma_length = s->length;
925 mapped_elems++; 1119 mapped_elems++;
926 } else 1120 } else
927 goto unmap; 1121 goto unmap;
928 if (iommu_has_npcache(iommu))
929 iommu_flush_pages(iommu, domain->id, s->dma_address,
930 s->dma_length);
931 } 1122 }
932 1123
933 if (iommu->need_sync) 1124 if (unlikely(iommu->need_sync))
934 iommu_completion_wait(iommu); 1125 iommu_completion_wait(iommu);
935 1126
936out: 1127out:
@@ -964,7 +1155,8 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
964 u16 devid; 1155 u16 devid;
965 int i; 1156 int i;
966 1157
967 if (!get_device_resources(dev, &iommu, &domain, &devid)) 1158 if (!check_device(dev) ||
1159 !get_device_resources(dev, &iommu, &domain, &devid))
968 return; 1160 return;
969 1161
970 spin_lock_irqsave(&domain->lock, flags); 1162 spin_lock_irqsave(&domain->lock, flags);
@@ -972,12 +1164,10 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
972 for_each_sg(sglist, s, nelems, i) { 1164 for_each_sg(sglist, s, nelems, i) {
973 __unmap_single(iommu, domain->priv, s->dma_address, 1165 __unmap_single(iommu, domain->priv, s->dma_address,
974 s->dma_length, dir); 1166 s->dma_length, dir);
975 iommu_flush_pages(iommu, domain->id, s->dma_address,
976 s->dma_length);
977 s->dma_address = s->dma_length = 0; 1167 s->dma_address = s->dma_length = 0;
978 } 1168 }
979 1169
980 if (iommu->need_sync) 1170 if (unlikely(iommu->need_sync))
981 iommu_completion_wait(iommu); 1171 iommu_completion_wait(iommu);
982 1172
983 spin_unlock_irqrestore(&domain->lock, flags); 1173 spin_unlock_irqrestore(&domain->lock, flags);
@@ -995,25 +1185,33 @@ static void *alloc_coherent(struct device *dev, size_t size,
995 struct protection_domain *domain; 1185 struct protection_domain *domain;
996 u16 devid; 1186 u16 devid;
997 phys_addr_t paddr; 1187 phys_addr_t paddr;
1188 u64 dma_mask = dev->coherent_dma_mask;
1189
1190 if (!check_device(dev))
1191 return NULL;
1192
1193 if (!get_device_resources(dev, &iommu, &domain, &devid))
1194 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
998 1195
1196 flag |= __GFP_ZERO;
999 virt_addr = (void *)__get_free_pages(flag, get_order(size)); 1197 virt_addr = (void *)__get_free_pages(flag, get_order(size));
1000 if (!virt_addr) 1198 if (!virt_addr)
1001 return 0; 1199 return 0;
1002 1200
1003 memset(virt_addr, 0, size);
1004 paddr = virt_to_phys(virt_addr); 1201 paddr = virt_to_phys(virt_addr);
1005 1202
1006 get_device_resources(dev, &iommu, &domain, &devid);
1007
1008 if (!iommu || !domain) { 1203 if (!iommu || !domain) {
1009 *dma_addr = (dma_addr_t)paddr; 1204 *dma_addr = (dma_addr_t)paddr;
1010 return virt_addr; 1205 return virt_addr;
1011 } 1206 }
1012 1207
1208 if (!dma_mask)
1209 dma_mask = *dev->dma_mask;
1210
1013 spin_lock_irqsave(&domain->lock, flags); 1211 spin_lock_irqsave(&domain->lock, flags);
1014 1212
1015 *dma_addr = __map_single(dev, iommu, domain->priv, paddr, 1213 *dma_addr = __map_single(dev, iommu, domain->priv, paddr,
1016 size, DMA_BIDIRECTIONAL); 1214 size, DMA_BIDIRECTIONAL, true, dma_mask);
1017 1215
1018 if (*dma_addr == bad_dma_address) { 1216 if (*dma_addr == bad_dma_address) {
1019 free_pages((unsigned long)virt_addr, get_order(size)); 1217 free_pages((unsigned long)virt_addr, get_order(size));
@@ -1021,10 +1219,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
1021 goto out; 1219 goto out;
1022 } 1220 }
1023 1221
1024 if (iommu_has_npcache(iommu)) 1222 if (unlikely(iommu->need_sync))
1025 iommu_flush_pages(iommu, domain->id, *dma_addr, size);
1026
1027 if (iommu->need_sync)
1028 iommu_completion_wait(iommu); 1223 iommu_completion_wait(iommu);
1029 1224
1030out: 1225out:
@@ -1035,8 +1230,6 @@ out:
1035 1230
1036/* 1231/*
1037 * The exported free_coherent function for dma_ops. 1232 * The exported free_coherent function for dma_ops.
1038 * FIXME: fix the generic x86 DMA layer so that it actually calls that
1039 * function.
1040 */ 1233 */
1041static void free_coherent(struct device *dev, size_t size, 1234static void free_coherent(struct device *dev, size_t size,
1042 void *virt_addr, dma_addr_t dma_addr) 1235 void *virt_addr, dma_addr_t dma_addr)
@@ -1046,6 +1239,9 @@ static void free_coherent(struct device *dev, size_t size,
1046 struct protection_domain *domain; 1239 struct protection_domain *domain;
1047 u16 devid; 1240 u16 devid;
1048 1241
1242 if (!check_device(dev))
1243 return;
1244
1049 get_device_resources(dev, &iommu, &domain, &devid); 1245 get_device_resources(dev, &iommu, &domain, &devid);
1050 1246
1051 if (!iommu || !domain) 1247 if (!iommu || !domain)
@@ -1054,9 +1250,8 @@ static void free_coherent(struct device *dev, size_t size,
1054 spin_lock_irqsave(&domain->lock, flags); 1250 spin_lock_irqsave(&domain->lock, flags);
1055 1251
1056 __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); 1252 __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
1057 iommu_flush_pages(iommu, domain->id, dma_addr, size);
1058 1253
1059 if (iommu->need_sync) 1254 if (unlikely(iommu->need_sync))
1060 iommu_completion_wait(iommu); 1255 iommu_completion_wait(iommu);
1061 1256
1062 spin_unlock_irqrestore(&domain->lock, flags); 1257 spin_unlock_irqrestore(&domain->lock, flags);
@@ -1066,6 +1261,30 @@ free_mem:
1066} 1261}
1067 1262
1068/* 1263/*
1264 * This function is called by the DMA layer to find out if we can handle a
1265 * particular device. It is part of the dma_ops.
1266 */
1267static int amd_iommu_dma_supported(struct device *dev, u64 mask)
1268{
1269 u16 bdf;
1270 struct pci_dev *pcidev;
1271
1272 /* No device or no PCI device */
1273 if (!dev || dev->bus != &pci_bus_type)
1274 return 0;
1275
1276 pcidev = to_pci_dev(dev);
1277
1278 bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
1279
1280 /* Out of our scope? */
1281 if (bdf > amd_iommu_last_bdf)
1282 return 0;
1283
1284 return 1;
1285}
1286
1287/*
1069 * The function for pre-allocating protection domains. 1288 * The function for pre-allocating protection domains.
1070 * 1289 *
1071 * If the driver core informs the DMA layer if a driver grabs a device 1290 * If the driver core informs the DMA layer if a driver grabs a device
@@ -1094,10 +1313,9 @@ void prealloc_protection_domains(void)
1094 if (!dma_dom) 1313 if (!dma_dom)
1095 continue; 1314 continue;
1096 init_unity_mappings_for_device(dma_dom, devid); 1315 init_unity_mappings_for_device(dma_dom, devid);
1097 set_device_domain(iommu, &dma_dom->domain, devid); 1316 dma_dom->target_dev = devid;
1098 printk(KERN_INFO "AMD IOMMU: Allocated domain %d for device ", 1317
1099 dma_dom->domain.id); 1318 list_add_tail(&dma_dom->list, &iommu_pd_list);
1100 print_devid(devid, 1);
1101 } 1319 }
1102} 1320}
1103 1321
@@ -1108,6 +1326,7 @@ static struct dma_mapping_ops amd_iommu_dma_ops = {
1108 .unmap_single = unmap_single, 1326 .unmap_single = unmap_single,
1109 .map_sg = map_sg, 1327 .map_sg = map_sg,
1110 .unmap_sg = unmap_sg, 1328 .unmap_sg = unmap_sg,
1329 .dma_supported = amd_iommu_dma_supported,
1111}; 1330};
1112 1331
1113/* 1332/*
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index d9a9da597e79..148fcfe22f17 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -22,6 +22,8 @@
22#include <linux/gfp.h> 22#include <linux/gfp.h>
23#include <linux/list.h> 23#include <linux/list.h>
24#include <linux/sysdev.h> 24#include <linux/sysdev.h>
25#include <linux/interrupt.h>
26#include <linux/msi.h>
25#include <asm/pci-direct.h> 27#include <asm/pci-direct.h>
26#include <asm/amd_iommu_types.h> 28#include <asm/amd_iommu_types.h>
27#include <asm/amd_iommu.h> 29#include <asm/amd_iommu.h>
@@ -30,7 +32,6 @@
30/* 32/*
31 * definitions for the ACPI scanning code 33 * definitions for the ACPI scanning code
32 */ 34 */
33#define PCI_BUS(x) (((x) >> 8) & 0xff)
34#define IVRS_HEADER_LENGTH 48 35#define IVRS_HEADER_LENGTH 48
35 36
36#define ACPI_IVHD_TYPE 0x10 37#define ACPI_IVHD_TYPE 0x10
@@ -121,6 +122,7 @@ LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
121 we find in ACPI */ 122 we find in ACPI */
122unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ 123unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
123int amd_iommu_isolate; /* if 1, device isolation is enabled */ 124int amd_iommu_isolate; /* if 1, device isolation is enabled */
125bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
124 126
125LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the 127LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
126 system */ 128 system */
@@ -234,7 +236,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
234{ 236{
235 u32 ctrl; 237 u32 ctrl;
236 238
237 ctrl = (u64)readl(iommu->mmio_base + MMIO_CONTROL_OFFSET); 239 ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
238 ctrl &= ~(1 << bit); 240 ctrl &= ~(1 << bit);
239 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); 241 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
240} 242}
@@ -242,13 +244,23 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
242/* Function to enable the hardware */ 244/* Function to enable the hardware */
243void __init iommu_enable(struct amd_iommu *iommu) 245void __init iommu_enable(struct amd_iommu *iommu)
244{ 246{
245 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at "); 247 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU "
246 print_devid(iommu->devid, 0); 248 "at %02x:%02x.%x cap 0x%hx\n",
247 printk(" cap 0x%hx\n", iommu->cap_ptr); 249 iommu->dev->bus->number,
250 PCI_SLOT(iommu->dev->devfn),
251 PCI_FUNC(iommu->dev->devfn),
252 iommu->cap_ptr);
248 253
249 iommu_feature_enable(iommu, CONTROL_IOMMU_EN); 254 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
250} 255}
251 256
257/* Function to enable IOMMU event logging and event interrupts */
258void __init iommu_enable_event_logging(struct amd_iommu *iommu)
259{
260 iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
261 iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
262}
263
252/* 264/*
253 * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in 265 * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
254 * the system has one. 266 * the system has one.
@@ -286,6 +298,14 @@ static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
286 ****************************************************************************/ 298 ****************************************************************************/
287 299
288/* 300/*
301 * This function calculates the length of a given IVHD entry
302 */
303static inline int ivhd_entry_length(u8 *ivhd)
304{
305 return 0x04 << (*ivhd >> 6);
306}
307
308/*
289 * This function reads the last device id the IOMMU has to handle from the PCI 309 * This function reads the last device id the IOMMU has to handle from the PCI
290 * capability header for this IOMMU 310 * capability header for this IOMMU
291 */ 311 */
@@ -329,7 +349,7 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
329 default: 349 default:
330 break; 350 break;
331 } 351 }
332 p += 0x04 << (*p >> 6); 352 p += ivhd_entry_length(p);
333 } 353 }
334 354
335 WARN_ON(p != end); 355 WARN_ON(p != end);
@@ -414,7 +434,32 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
414 434
415static void __init free_command_buffer(struct amd_iommu *iommu) 435static void __init free_command_buffer(struct amd_iommu *iommu)
416{ 436{
417 free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE)); 437 free_pages((unsigned long)iommu->cmd_buf,
438 get_order(iommu->cmd_buf_size));
439}
440
441/* allocates the memory where the IOMMU will log its events to */
442static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
443{
444 u64 entry;
445 iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
446 get_order(EVT_BUFFER_SIZE));
447
448 if (iommu->evt_buf == NULL)
449 return NULL;
450
451 entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
452 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
453 &entry, sizeof(entry));
454
455 iommu->evt_buf_size = EVT_BUFFER_SIZE;
456
457 return iommu->evt_buf;
458}
459
460static void __init free_event_buffer(struct amd_iommu *iommu)
461{
462 free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE));
418} 463}
419 464
420/* sets a specific bit in the device table entry. */ 465/* sets a specific bit in the device table entry. */
@@ -487,19 +532,21 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
487 */ 532 */
488static void __init init_iommu_from_pci(struct amd_iommu *iommu) 533static void __init init_iommu_from_pci(struct amd_iommu *iommu)
489{ 534{
490 int bus = PCI_BUS(iommu->devid);
491 int dev = PCI_SLOT(iommu->devid);
492 int fn = PCI_FUNC(iommu->devid);
493 int cap_ptr = iommu->cap_ptr; 535 int cap_ptr = iommu->cap_ptr;
494 u32 range; 536 u32 range, misc;
495 537
496 iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET); 538 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
539 &iommu->cap);
540 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET,
541 &range);
542 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET,
543 &misc);
497 544
498 range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
499 iommu->first_device = calc_devid(MMIO_GET_BUS(range), 545 iommu->first_device = calc_devid(MMIO_GET_BUS(range),
500 MMIO_GET_FD(range)); 546 MMIO_GET_FD(range));
501 iommu->last_device = calc_devid(MMIO_GET_BUS(range), 547 iommu->last_device = calc_devid(MMIO_GET_BUS(range),
502 MMIO_GET_LD(range)); 548 MMIO_GET_LD(range));
549 iommu->evt_msi_num = MMIO_MSI_NUM(misc);
503} 550}
504 551
505/* 552/*
@@ -604,7 +651,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
604 break; 651 break;
605 } 652 }
606 653
607 p += 0x04 << (e->type >> 6); 654 p += ivhd_entry_length(p);
608 } 655 }
609} 656}
610 657
@@ -622,6 +669,7 @@ static int __init init_iommu_devices(struct amd_iommu *iommu)
622static void __init free_iommu_one(struct amd_iommu *iommu) 669static void __init free_iommu_one(struct amd_iommu *iommu)
623{ 670{
624 free_command_buffer(iommu); 671 free_command_buffer(iommu);
672 free_event_buffer(iommu);
625 iommu_unmap_mmio_space(iommu); 673 iommu_unmap_mmio_space(iommu);
626} 674}
627 675
@@ -649,8 +697,12 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
649 /* 697 /*
650 * Copy data from ACPI table entry to the iommu struct 698 * Copy data from ACPI table entry to the iommu struct
651 */ 699 */
652 iommu->devid = h->devid; 700 iommu->dev = pci_get_bus_and_slot(PCI_BUS(h->devid), h->devid & 0xff);
701 if (!iommu->dev)
702 return 1;
703
653 iommu->cap_ptr = h->cap_ptr; 704 iommu->cap_ptr = h->cap_ptr;
705 iommu->pci_seg = h->pci_seg;
654 iommu->mmio_phys = h->mmio_phys; 706 iommu->mmio_phys = h->mmio_phys;
655 iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys); 707 iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
656 if (!iommu->mmio_base) 708 if (!iommu->mmio_base)
@@ -661,10 +713,18 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
661 if (!iommu->cmd_buf) 713 if (!iommu->cmd_buf)
662 return -ENOMEM; 714 return -ENOMEM;
663 715
716 iommu->evt_buf = alloc_event_buffer(iommu);
717 if (!iommu->evt_buf)
718 return -ENOMEM;
719
720 iommu->int_enabled = false;
721
664 init_iommu_from_pci(iommu); 722 init_iommu_from_pci(iommu);
665 init_iommu_from_acpi(iommu, h); 723 init_iommu_from_acpi(iommu, h);
666 init_iommu_devices(iommu); 724 init_iommu_devices(iommu);
667 725
726 pci_enable_device(iommu->dev);
727
668 return 0; 728 return 0;
669} 729}
670 730
@@ -706,6 +766,95 @@ static int __init init_iommu_all(struct acpi_table_header *table)
706 766
707/**************************************************************************** 767/****************************************************************************
708 * 768 *
769 * The following functions initialize the MSI interrupts for all IOMMUs
770 * in the system. Its a bit challenging because there could be multiple
771 * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per
772 * pci_dev.
773 *
774 ****************************************************************************/
775
776static int __init iommu_setup_msix(struct amd_iommu *iommu)
777{
778 struct amd_iommu *curr;
779 struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */
780 int nvec = 0, i;
781
782 list_for_each_entry(curr, &amd_iommu_list, list) {
783 if (curr->dev == iommu->dev) {
784 entries[nvec].entry = curr->evt_msi_num;
785 entries[nvec].vector = 0;
786 curr->int_enabled = true;
787 nvec++;
788 }
789 }
790
791 if (pci_enable_msix(iommu->dev, entries, nvec)) {
792 pci_disable_msix(iommu->dev);
793 return 1;
794 }
795
796 for (i = 0; i < nvec; ++i) {
797 int r = request_irq(entries->vector, amd_iommu_int_handler,
798 IRQF_SAMPLE_RANDOM,
799 "AMD IOMMU",
800 NULL);
801 if (r)
802 goto out_free;
803 }
804
805 return 0;
806
807out_free:
808 for (i -= 1; i >= 0; --i)
809 free_irq(entries->vector, NULL);
810
811 pci_disable_msix(iommu->dev);
812
813 return 1;
814}
815
816static int __init iommu_setup_msi(struct amd_iommu *iommu)
817{
818 int r;
819 struct amd_iommu *curr;
820
821 list_for_each_entry(curr, &amd_iommu_list, list) {
822 if (curr->dev == iommu->dev)
823 curr->int_enabled = true;
824 }
825
826
827 if (pci_enable_msi(iommu->dev))
828 return 1;
829
830 r = request_irq(iommu->dev->irq, amd_iommu_int_handler,
831 IRQF_SAMPLE_RANDOM,
832 "AMD IOMMU",
833 NULL);
834
835 if (r) {
836 pci_disable_msi(iommu->dev);
837 return 1;
838 }
839
840 return 0;
841}
842
843static int __init iommu_init_msi(struct amd_iommu *iommu)
844{
845 if (iommu->int_enabled)
846 return 0;
847
848 if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX))
849 return iommu_setup_msix(iommu);
850 else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
851 return iommu_setup_msi(iommu);
852
853 return 1;
854}
855
856/****************************************************************************
857 *
709 * The next functions belong to the third pass of parsing the ACPI 858 * The next functions belong to the third pass of parsing the ACPI
710 * table. In this last pass the memory mapping requirements are 859 * table. In this last pass the memory mapping requirements are
711 * gathered (like exclusion and unity mapping reanges). 860 * gathered (like exclusion and unity mapping reanges).
@@ -801,6 +950,20 @@ static int __init init_memory_definitions(struct acpi_table_header *table)
801} 950}
802 951
803/* 952/*
953 * Init the device table to not allow DMA access for devices and
954 * suppress all page faults
955 */
956static void init_device_table(void)
957{
958 u16 devid;
959
960 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
961 set_dev_entry_bit(devid, DEV_ENTRY_VALID);
962 set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION);
963 }
964}
965
966/*
804 * This function finally enables all IOMMUs found in the system after 967 * This function finally enables all IOMMUs found in the system after
805 * they have been initialized 968 * they have been initialized
806 */ 969 */
@@ -810,6 +973,8 @@ static void __init enable_iommus(void)
810 973
811 list_for_each_entry(iommu, &amd_iommu_list, list) { 974 list_for_each_entry(iommu, &amd_iommu_list, list) {
812 iommu_set_exclusion_range(iommu); 975 iommu_set_exclusion_range(iommu);
976 iommu_init_msi(iommu);
977 iommu_enable_event_logging(iommu);
813 iommu_enable(iommu); 978 iommu_enable(iommu);
814 } 979 }
815} 980}
@@ -931,6 +1096,9 @@ int __init amd_iommu_init(void)
931 if (amd_iommu_pd_alloc_bitmap == NULL) 1096 if (amd_iommu_pd_alloc_bitmap == NULL)
932 goto free; 1097 goto free;
933 1098
1099 /* init the device table */
1100 init_device_table();
1101
934 /* 1102 /*
935 * let all alias entries point to itself 1103 * let all alias entries point to itself
936 */ 1104 */
@@ -954,15 +1122,15 @@ int __init amd_iommu_init(void)
954 if (acpi_table_parse("IVRS", init_memory_definitions) != 0) 1122 if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
955 goto free; 1123 goto free;
956 1124
957 ret = amd_iommu_init_dma_ops(); 1125 ret = sysdev_class_register(&amd_iommu_sysdev_class);
958 if (ret) 1126 if (ret)
959 goto free; 1127 goto free;
960 1128
961 ret = sysdev_class_register(&amd_iommu_sysdev_class); 1129 ret = sysdev_register(&device_amd_iommu);
962 if (ret) 1130 if (ret)
963 goto free; 1131 goto free;
964 1132
965 ret = sysdev_register(&device_amd_iommu); 1133 ret = amd_iommu_init_dma_ops();
966 if (ret) 1134 if (ret)
967 goto free; 1135 goto free;
968 1136
@@ -977,11 +1145,17 @@ int __init amd_iommu_init(void)
977 else 1145 else
978 printk("disabled\n"); 1146 printk("disabled\n");
979 1147
1148 if (amd_iommu_unmap_flush)
1149 printk(KERN_INFO "AMD IOMMU: IO/TLB flush on unmap enabled\n");
1150 else
1151 printk(KERN_INFO "AMD IOMMU: Lazy IO/TLB flushing enabled\n");
1152
980out: 1153out:
981 return ret; 1154 return ret;
982 1155
983free: 1156free:
984 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1); 1157 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
1158 get_order(MAX_DOMAIN_ID/8));
985 1159
986 free_pages((unsigned long)amd_iommu_pd_table, 1160 free_pages((unsigned long)amd_iommu_pd_table,
987 get_order(rlookup_table_size)); 1161 get_order(rlookup_table_size));
@@ -1039,8 +1213,10 @@ void __init amd_iommu_detect(void)
1039static int __init parse_amd_iommu_options(char *str) 1213static int __init parse_amd_iommu_options(char *str)
1040{ 1214{
1041 for (; *str; ++str) { 1215 for (; *str; ++str) {
1042 if (strcmp(str, "isolate") == 0) 1216 if (strncmp(str, "isolate", 7) == 0)
1043 amd_iommu_isolate = 1; 1217 amd_iommu_isolate = 1;
1218 if (strncmp(str, "fullflush", 11) == 0)
1219 amd_iommu_unmap_flush = true;
1044 } 1220 }
1045 1221
1046 return 1; 1222 return 1;
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 44e21826db11..9a32b37ee2ee 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -455,11 +455,11 @@ out:
455 force_iommu || 455 force_iommu ||
456 valid_agp || 456 valid_agp ||
457 fallback_aper_force) { 457 fallback_aper_force) {
458 printk(KERN_ERR 458 printk(KERN_INFO
459 "Your BIOS doesn't leave a aperture memory hole\n"); 459 "Your BIOS doesn't leave a aperture memory hole\n");
460 printk(KERN_ERR 460 printk(KERN_INFO
461 "Please enable the IOMMU option in the BIOS setup\n"); 461 "Please enable the IOMMU option in the BIOS setup\n");
462 printk(KERN_ERR 462 printk(KERN_INFO
463 "This costs you %d MB of RAM\n", 463 "This costs you %d MB of RAM\n",
464 32 << fallback_aper_order); 464 32 << fallback_aper_order);
465 465
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index b8d80c291650..a91c57cb666a 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -114,8 +114,6 @@ static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
114static int enabled_via_apicbase; 114static int enabled_via_apicbase;
115 115
116static unsigned long apic_phys; 116static unsigned long apic_phys;
117unsigned int __cpuinitdata maxcpus = NR_CPUS;
118
119 117
120/* 118/*
121 * Get the LAPIC version 119 * Get the LAPIC version
@@ -1507,12 +1505,6 @@ void __cpuinit generic_processor_info(int apicid, int version)
1507 return; 1505 return;
1508 } 1506 }
1509 1507
1510 if (num_processors >= maxcpus) {
1511 printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
1512 " Processor ignored.\n", maxcpus);
1513 return;
1514 }
1515
1516 num_processors++; 1508 num_processors++;
1517 cpus_complement(tmp_map, cpu_present_map); 1509 cpus_complement(tmp_map, cpu_present_map);
1518 cpu = first_cpu(tmp_map); 1510 cpu = first_cpu(tmp_map);
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index 37e037606f30..53898b65a6ae 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -98,7 +98,6 @@ static struct clock_event_device lapic_clockevent = {
98static DEFINE_PER_CPU(struct clock_event_device, lapic_events); 98static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
99 99
100static unsigned long apic_phys; 100static unsigned long apic_phys;
101unsigned int __cpuinitdata maxcpus = NR_CPUS;
102 101
103unsigned long mp_lapic_addr; 102unsigned long mp_lapic_addr;
104 103
@@ -1444,12 +1443,6 @@ void __cpuinit generic_processor_info(int apicid, int version)
1444 return; 1443 return;
1445 } 1444 }
1446 1445
1447 if (num_processors >= maxcpus) {
1448 printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
1449 " Processor ignored.\n", maxcpus);
1450 return;
1451 }
1452
1453 num_processors++; 1446 num_processors++;
1454 cpus_complement(tmp_map, cpu_present_map); 1447 cpus_complement(tmp_map, cpu_present_map);
1455 cpu = first_cpu(tmp_map); 1448 cpu = first_cpu(tmp_map);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 9ee24e6bc4b0..5145a6e72bbb 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -228,12 +228,12 @@
228#include <linux/suspend.h> 228#include <linux/suspend.h>
229#include <linux/kthread.h> 229#include <linux/kthread.h>
230#include <linux/jiffies.h> 230#include <linux/jiffies.h>
231#include <linux/smp_lock.h>
232 231
233#include <asm/system.h> 232#include <asm/system.h>
234#include <asm/uaccess.h> 233#include <asm/uaccess.h>
235#include <asm/desc.h> 234#include <asm/desc.h>
236#include <asm/i8253.h> 235#include <asm/i8253.h>
236#include <asm/olpc.h>
237#include <asm/paravirt.h> 237#include <asm/paravirt.h>
238#include <asm/reboot.h> 238#include <asm/reboot.h>
239 239
@@ -2217,7 +2217,7 @@ static int __init apm_init(void)
2217 2217
2218 dmi_check_system(apm_dmi_table); 2218 dmi_check_system(apm_dmi_table);
2219 2219
2220 if (apm_info.bios.version == 0 || paravirt_enabled()) { 2220 if (apm_info.bios.version == 0 || paravirt_enabled() || machine_is_olpc()) {
2221 printk(KERN_INFO "apm: BIOS not found.\n"); 2221 printk(KERN_INFO "apm: BIOS not found.\n");
2222 return -ENODEV; 2222 return -ENODEV;
2223 } 2223 }
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index c639bd55391c..fdd585f9c53d 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -25,11 +25,11 @@ x86_bios_strerror(long status)
25{ 25{
26 const char *str; 26 const char *str;
27 switch (status) { 27 switch (status) {
28 case 0: str = "Call completed without error"; break; 28 case 0: str = "Call completed without error"; break;
29 case -1: str = "Not implemented"; break; 29 case -1: str = "Not implemented"; break;
30 case -2: str = "Invalid argument"; break; 30 case -2: str = "Invalid argument"; break;
31 case -3: str = "Call completed with error"; break; 31 case -3: str = "Call completed with error"; break;
32 default: str = "Unknown BIOS status code"; break; 32 default: str = "Unknown BIOS status code"; break;
33 } 33 }
34 return str; 34 return str;
35} 35}
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 84a8220a6072..a6ef672adbba 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -56,9 +56,22 @@ void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
56 56
57 switch (c->x86_vendor) { 57 switch (c->x86_vendor) {
58 case X86_VENDOR_INTEL: 58 case X86_VENDOR_INTEL:
59 if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15)) 59 /*
60 * There is a known erratum on Pentium III and Core Solo
61 * and Core Duo CPUs.
62 * " Page with PAT set to WC while associated MTRR is UC
63 * may consolidate to UC "
64 * Because of this erratum, it is better to stick with
65 * setting WC in MTRR rather than using PAT on these CPUs.
66 *
67 * Enable PAT WC only on P4, Core 2 or later CPUs.
68 */
69 if (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 15))
60 return; 70 return;
61 break; 71
72 pat_disable("PAT WC disabled due to known CPU erratum.");
73 return;
74
62 case X86_VENDOR_AMD: 75 case X86_VENDOR_AMD:
63 case X86_VENDOR_CENTAUR: 76 case X86_VENDOR_CENTAUR:
64 case X86_VENDOR_TRANSMETA: 77 case X86_VENDOR_TRANSMETA:
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index cae9cabc3031..18514ed26104 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -31,6 +31,11 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
31 if (c->x86_power & (1<<8)) 31 if (c->x86_power & (1<<8))
32 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 32 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
33 } 33 }
34
35 /* Set MTRR capability flag if appropriate */
36 if (c->x86_model == 13 || c->x86_model == 9 ||
37 (c->x86_model == 8 && c->x86_mask >= 8))
38 set_cpu_cap(c, X86_FEATURE_K6_MTRR);
34} 39}
35 40
36static void __cpuinit init_amd(struct cpuinfo_x86 *c) 41static void __cpuinit init_amd(struct cpuinfo_x86 *c)
@@ -166,10 +171,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
166 mbytes); 171 mbytes);
167 } 172 }
168 173
169 /* Set MTRR capability flag if appropriate */
170 if (c->x86_model == 13 || c->x86_model == 9 ||
171 (c->x86_model == 8 && c->x86_mask >= 8))
172 set_cpu_cap(c, X86_FEATURE_K6_MTRR);
173 break; 174 break;
174 } 175 }
175 176
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index e0f45edd6a55..a0534c04d38a 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -314,6 +314,16 @@ enum {
314 EAMD3D = 1<<20, 314 EAMD3D = 1<<20,
315}; 315};
316 316
317static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
318{
319 switch (c->x86) {
320 case 5:
321 /* Emulate MTRRs using Centaur's MCR. */
322 set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
323 break;
324 }
325}
326
317static void __cpuinit init_centaur(struct cpuinfo_x86 *c) 327static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
318{ 328{
319 329
@@ -462,6 +472,7 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
462static struct cpu_dev centaur_cpu_dev __cpuinitdata = { 472static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
463 .c_vendor = "Centaur", 473 .c_vendor = "Centaur",
464 .c_ident = { "CentaurHauls" }, 474 .c_ident = { "CentaurHauls" },
475 .c_early_init = early_init_centaur,
465 .c_init = init_centaur, 476 .c_init = init_centaur,
466 .c_size_cache = centaur_size_cache, 477 .c_size_cache = centaur_size_cache,
467}; 478};
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 80ab20d4fa39..4e456bd955bb 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -13,6 +13,7 @@
13#include <asm/mtrr.h> 13#include <asm/mtrr.h>
14#include <asm/mce.h> 14#include <asm/mce.h>
15#include <asm/pat.h> 15#include <asm/pat.h>
16#include <asm/asm.h>
16#ifdef CONFIG_X86_LOCAL_APIC 17#ifdef CONFIG_X86_LOCAL_APIC
17#include <asm/mpspec.h> 18#include <asm/mpspec.h>
18#include <asm/apic.h> 19#include <asm/apic.h>
@@ -334,11 +335,24 @@ static void __init early_cpu_detect(void)
334 335
335 get_cpu_vendor(c, 1); 336 get_cpu_vendor(c, 1);
336 337
338 early_get_cap(c);
339
337 if (c->x86_vendor != X86_VENDOR_UNKNOWN && 340 if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
338 cpu_devs[c->x86_vendor]->c_early_init) 341 cpu_devs[c->x86_vendor]->c_early_init)
339 cpu_devs[c->x86_vendor]->c_early_init(c); 342 cpu_devs[c->x86_vendor]->c_early_init(c);
343}
340 344
341 early_get_cap(c); 345/*
346 * The NOPL instruction is supposed to exist on all CPUs with
347 * family >= 6; unfortunately, that's not true in practice because
348 * of early VIA chips and (more importantly) broken virtualizers that
349 * are not easy to detect. In the latter case it doesn't even *fail*
350 * reliably, so probing for it doesn't even work. Disable it completely
351 * unless we can find a reliable way to detect all the broken cases.
352 */
353static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
354{
355 clear_cpu_cap(c, X86_FEATURE_NOPL);
342} 356}
343 357
344static void __cpuinit generic_identify(struct cpuinfo_x86 *c) 358static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
@@ -395,8 +409,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
395 } 409 }
396 410
397 init_scattered_cpuid_features(c); 411 init_scattered_cpuid_features(c);
412 detect_nopl(c);
398 } 413 }
399
400} 414}
401 415
402static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) 416static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
index cc6efe86249d..43f1aa51da5d 100644
--- a/arch/x86/kernel/cpu/common_64.c
+++ b/arch/x86/kernel/cpu/common_64.c
@@ -18,6 +18,7 @@
18#include <asm/mtrr.h> 18#include <asm/mtrr.h>
19#include <asm/mce.h> 19#include <asm/mce.h>
20#include <asm/pat.h> 20#include <asm/pat.h>
21#include <asm/asm.h>
21#include <asm/numa.h> 22#include <asm/numa.h>
22#ifdef CONFIG_X86_LOCAL_APIC 23#ifdef CONFIG_X86_LOCAL_APIC
23#include <asm/mpspec.h> 24#include <asm/mpspec.h>
@@ -215,6 +216,39 @@ static void __init early_cpu_support_print(void)
215 } 216 }
216} 217}
217 218
219/*
220 * The NOPL instruction is supposed to exist on all CPUs with
221 * family >= 6, unfortunately, that's not true in practice because
222 * of early VIA chips and (more importantly) broken virtualizers that
223 * are not easy to detect. Hence, probe for it based on first
224 * principles.
225 *
226 * Note: no 64-bit chip is known to lack these, but put the code here
227 * for consistency with 32 bits, and to make it utterly trivial to
228 * diagnose the problem should it ever surface.
229 */
230static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
231{
232 const u32 nopl_signature = 0x888c53b1; /* Random number */
233 u32 has_nopl = nopl_signature;
234
235 clear_cpu_cap(c, X86_FEATURE_NOPL);
236 if (c->x86 >= 6) {
237 asm volatile("\n"
238 "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */
239 "2:\n"
240 " .section .fixup,\"ax\"\n"
241 "3: xor %0,%0\n"
242 " jmp 2b\n"
243 " .previous\n"
244 _ASM_EXTABLE(1b,3b)
245 : "+a" (has_nopl));
246
247 if (has_nopl == nopl_signature)
248 set_cpu_cap(c, X86_FEATURE_NOPL);
249 }
250}
251
218static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c); 252static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
219 253
220void __init early_cpu_init(void) 254void __init early_cpu_init(void)
@@ -313,6 +347,8 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
313 c->x86_phys_bits = eax & 0xff; 347 c->x86_phys_bits = eax & 0xff;
314 } 348 }
315 349
350 detect_nopl(c);
351
316 if (c->x86_vendor != X86_VENDOR_UNKNOWN && 352 if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
317 cpu_devs[c->x86_vendor]->c_early_init) 353 cpu_devs[c->x86_vendor]->c_early_init)
318 cpu_devs[c->x86_vendor]->c_early_init(c); 354 cpu_devs[c->x86_vendor]->c_early_init(c);
@@ -394,6 +430,49 @@ static __init int setup_noclflush(char *arg)
394} 430}
395__setup("noclflush", setup_noclflush); 431__setup("noclflush", setup_noclflush);
396 432
433struct msr_range {
434 unsigned min;
435 unsigned max;
436};
437
438static struct msr_range msr_range_array[] __cpuinitdata = {
439 { 0x00000000, 0x00000418},
440 { 0xc0000000, 0xc000040b},
441 { 0xc0010000, 0xc0010142},
442 { 0xc0011000, 0xc001103b},
443};
444
445static void __cpuinit print_cpu_msr(void)
446{
447 unsigned index;
448 u64 val;
449 int i;
450 unsigned index_min, index_max;
451
452 for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
453 index_min = msr_range_array[i].min;
454 index_max = msr_range_array[i].max;
455 for (index = index_min; index < index_max; index++) {
456 if (rdmsrl_amd_safe(index, &val))
457 continue;
458 printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
459 }
460 }
461}
462
463static int show_msr __cpuinitdata;
464static __init int setup_show_msr(char *arg)
465{
466 int num;
467
468 get_option(&arg, &num);
469
470 if (num > 0)
471 show_msr = num;
472 return 1;
473}
474__setup("show_msr=", setup_show_msr);
475
397void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) 476void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
398{ 477{
399 if (c->x86_model_id[0]) 478 if (c->x86_model_id[0])
@@ -403,6 +482,14 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
403 printk(KERN_CONT " stepping %02x\n", c->x86_mask); 482 printk(KERN_CONT " stepping %02x\n", c->x86_mask);
404 else 483 else
405 printk(KERN_CONT "\n"); 484 printk(KERN_CONT "\n");
485
486#ifdef CONFIG_SMP
487 if (c->cpu_index < show_msr)
488 print_cpu_msr();
489#else
490 if (show_msr)
491 print_cpu_msr();
492#endif
406} 493}
407 494
408static __init int setup_disablecpuid(char *arg) 495static __init int setup_disablecpuid(char *arg)
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index f1685fb91fbd..b8e05ee4f736 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -171,7 +171,7 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
171 } 171 }
172 172
173 if (c->x86 != 0xF) { 173 if (c->x86 != 0xF) {
174 printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@lists.linux.org.uk>\n"); 174 printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@vger.kernel.org>\n");
175 return 0; 175 return 0;
176 } 176 }
177 177
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 4e7271999a74..84bb395038d8 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -737,63 +737,44 @@ static int find_psb_table(struct powernow_k8_data *data)
737#ifdef CONFIG_X86_POWERNOW_K8_ACPI 737#ifdef CONFIG_X86_POWERNOW_K8_ACPI
738static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) 738static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index)
739{ 739{
740 if (!data->acpi_data->state_count || (cpu_family == CPU_HW_PSTATE)) 740 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
741 return; 741 return;
742 742
743 data->irt = (data->acpi_data->states[index].control >> IRT_SHIFT) & IRT_MASK; 743 data->irt = (data->acpi_data.states[index].control >> IRT_SHIFT) & IRT_MASK;
744 data->rvo = (data->acpi_data->states[index].control >> RVO_SHIFT) & RVO_MASK; 744 data->rvo = (data->acpi_data.states[index].control >> RVO_SHIFT) & RVO_MASK;
745 data->exttype = (data->acpi_data->states[index].control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; 745 data->exttype = (data->acpi_data.states[index].control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
746 data->plllock = (data->acpi_data->states[index].control >> PLL_L_SHIFT) & PLL_L_MASK; 746 data->plllock = (data->acpi_data.states[index].control >> PLL_L_SHIFT) & PLL_L_MASK;
747 data->vidmvs = 1 << ((data->acpi_data->states[index].control >> MVS_SHIFT) & MVS_MASK); 747 data->vidmvs = 1 << ((data->acpi_data.states[index].control >> MVS_SHIFT) & MVS_MASK);
748 data->vstable = (data->acpi_data->states[index].control >> VST_SHIFT) & VST_MASK; 748 data->vstable = (data->acpi_data.states[index].control >> VST_SHIFT) & VST_MASK;
749}
750
751
752static struct acpi_processor_performance *acpi_perf_data;
753static int preregister_valid;
754
755static int powernow_k8_cpu_preinit_acpi(void)
756{
757 acpi_perf_data = alloc_percpu(struct acpi_processor_performance);
758 if (!acpi_perf_data)
759 return -ENODEV;
760
761 if (acpi_processor_preregister_performance(acpi_perf_data))
762 return -ENODEV;
763 else
764 preregister_valid = 1;
765 return 0;
766} 749}
767 750
768static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) 751static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
769{ 752{
770 struct cpufreq_frequency_table *powernow_table; 753 struct cpufreq_frequency_table *powernow_table;
771 int ret_val; 754 int ret_val;
772 int cpu = 0;
773 755
774 data->acpi_data = percpu_ptr(acpi_perf_data, cpu); 756 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
775 if (acpi_processor_register_performance(data->acpi_data, data->cpu)) {
776 dprintk("register performance failed: bad ACPI data\n"); 757 dprintk("register performance failed: bad ACPI data\n");
777 return -EIO; 758 return -EIO;
778 } 759 }
779 760
780 /* verify the data contained in the ACPI structures */ 761 /* verify the data contained in the ACPI structures */
781 if (data->acpi_data->state_count <= 1) { 762 if (data->acpi_data.state_count <= 1) {
782 dprintk("No ACPI P-States\n"); 763 dprintk("No ACPI P-States\n");
783 goto err_out; 764 goto err_out;
784 } 765 }
785 766
786 if ((data->acpi_data->control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) || 767 if ((data->acpi_data.control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
787 (data->acpi_data->status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) { 768 (data->acpi_data.status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
788 dprintk("Invalid control/status registers (%x - %x)\n", 769 dprintk("Invalid control/status registers (%x - %x)\n",
789 data->acpi_data->control_register.space_id, 770 data->acpi_data.control_register.space_id,
790 data->acpi_data->status_register.space_id); 771 data->acpi_data.status_register.space_id);
791 goto err_out; 772 goto err_out;
792 } 773 }
793 774
794 /* fill in data->powernow_table */ 775 /* fill in data->powernow_table */
795 powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) 776 powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
796 * (data->acpi_data->state_count + 1)), GFP_KERNEL); 777 * (data->acpi_data.state_count + 1)), GFP_KERNEL);
797 if (!powernow_table) { 778 if (!powernow_table) {
798 dprintk("powernow_table memory alloc failure\n"); 779 dprintk("powernow_table memory alloc failure\n");
799 goto err_out; 780 goto err_out;
@@ -806,12 +787,12 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
806 if (ret_val) 787 if (ret_val)
807 goto err_out_mem; 788 goto err_out_mem;
808 789
809 powernow_table[data->acpi_data->state_count].frequency = CPUFREQ_TABLE_END; 790 powernow_table[data->acpi_data.state_count].frequency = CPUFREQ_TABLE_END;
810 powernow_table[data->acpi_data->state_count].index = 0; 791 powernow_table[data->acpi_data.state_count].index = 0;
811 data->powernow_table = powernow_table; 792 data->powernow_table = powernow_table;
812 793
813 /* fill in data */ 794 /* fill in data */
814 data->numps = data->acpi_data->state_count; 795 data->numps = data->acpi_data.state_count;
815 if (first_cpu(per_cpu(cpu_core_map, data->cpu)) == data->cpu) 796 if (first_cpu(per_cpu(cpu_core_map, data->cpu)) == data->cpu)
816 print_basics(data); 797 print_basics(data);
817 powernow_k8_acpi_pst_values(data, 0); 798 powernow_k8_acpi_pst_values(data, 0);
@@ -819,31 +800,16 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
819 /* notify BIOS that we exist */ 800 /* notify BIOS that we exist */
820 acpi_processor_notify_smm(THIS_MODULE); 801 acpi_processor_notify_smm(THIS_MODULE);
821 802
822 /* determine affinity, from ACPI if available */
823 if (preregister_valid) {
824 if ((data->acpi_data->shared_type == CPUFREQ_SHARED_TYPE_ALL) ||
825 (data->acpi_data->shared_type == CPUFREQ_SHARED_TYPE_ANY))
826 data->starting_core_affinity = data->acpi_data->shared_cpu_map;
827 else
828 data->starting_core_affinity = cpumask_of_cpu(data->cpu);
829 } else {
830 /* best guess from family if not */
831 if (cpu_family == CPU_HW_PSTATE)
832 data->starting_core_affinity = cpumask_of_cpu(data->cpu);
833 else
834 data->starting_core_affinity = per_cpu(cpu_core_map, data->cpu);
835 }
836
837 return 0; 803 return 0;
838 804
839err_out_mem: 805err_out_mem:
840 kfree(powernow_table); 806 kfree(powernow_table);
841 807
842err_out: 808err_out:
843 acpi_processor_unregister_performance(data->acpi_data, data->cpu); 809 acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
844 810
845 /* data->acpi_data.state_count informs us at ->exit() whether ACPI was used */ 811 /* data->acpi_data.state_count informs us at ->exit() whether ACPI was used */
846 data->acpi_data->state_count = 0; 812 data->acpi_data.state_count = 0;
847 813
848 return -ENODEV; 814 return -ENODEV;
849} 815}
@@ -855,10 +821,10 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpuf
855 rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo); 821 rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo);
856 data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; 822 data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
857 823
858 for (i = 0; i < data->acpi_data->state_count; i++) { 824 for (i = 0; i < data->acpi_data.state_count; i++) {
859 u32 index; 825 u32 index;
860 826
861 index = data->acpi_data->states[i].control & HW_PSTATE_MASK; 827 index = data->acpi_data.states[i].control & HW_PSTATE_MASK;
862 if (index > data->max_hw_pstate) { 828 if (index > data->max_hw_pstate) {
863 printk(KERN_ERR PFX "invalid pstate %d - bad value %d.\n", i, index); 829 printk(KERN_ERR PFX "invalid pstate %d - bad value %d.\n", i, index);
864 printk(KERN_ERR PFX "Please report to BIOS manufacturer\n"); 830 printk(KERN_ERR PFX "Please report to BIOS manufacturer\n");
@@ -874,7 +840,7 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpuf
874 840
875 powernow_table[i].index = index; 841 powernow_table[i].index = index;
876 842
877 powernow_table[i].frequency = data->acpi_data->states[i].core_frequency * 1000; 843 powernow_table[i].frequency = data->acpi_data.states[i].core_frequency * 1000;
878 } 844 }
879 return 0; 845 return 0;
880} 846}
@@ -883,16 +849,16 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpuf
883{ 849{
884 int i; 850 int i;
885 int cntlofreq = 0; 851 int cntlofreq = 0;
886 for (i = 0; i < data->acpi_data->state_count; i++) { 852 for (i = 0; i < data->acpi_data.state_count; i++) {
887 u32 fid; 853 u32 fid;
888 u32 vid; 854 u32 vid;
889 855
890 if (data->exttype) { 856 if (data->exttype) {
891 fid = data->acpi_data->states[i].status & EXT_FID_MASK; 857 fid = data->acpi_data.states[i].status & EXT_FID_MASK;
892 vid = (data->acpi_data->states[i].status >> VID_SHIFT) & EXT_VID_MASK; 858 vid = (data->acpi_data.states[i].status >> VID_SHIFT) & EXT_VID_MASK;
893 } else { 859 } else {
894 fid = data->acpi_data->states[i].control & FID_MASK; 860 fid = data->acpi_data.states[i].control & FID_MASK;
895 vid = (data->acpi_data->states[i].control >> VID_SHIFT) & VID_MASK; 861 vid = (data->acpi_data.states[i].control >> VID_SHIFT) & VID_MASK;
896 } 862 }
897 863
898 dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid); 864 dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid);
@@ -933,10 +899,10 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpuf
933 cntlofreq = i; 899 cntlofreq = i;
934 } 900 }
935 901
936 if (powernow_table[i].frequency != (data->acpi_data->states[i].core_frequency * 1000)) { 902 if (powernow_table[i].frequency != (data->acpi_data.states[i].core_frequency * 1000)) {
937 printk(KERN_INFO PFX "invalid freq entries %u kHz vs. %u kHz\n", 903 printk(KERN_INFO PFX "invalid freq entries %u kHz vs. %u kHz\n",
938 powernow_table[i].frequency, 904 powernow_table[i].frequency,
939 (unsigned int) (data->acpi_data->states[i].core_frequency * 1000)); 905 (unsigned int) (data->acpi_data.states[i].core_frequency * 1000));
940 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; 906 powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
941 continue; 907 continue;
942 } 908 }
@@ -946,12 +912,11 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpuf
946 912
947static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) 913static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data)
948{ 914{
949 if (data->acpi_data->state_count) 915 if (data->acpi_data.state_count)
950 acpi_processor_unregister_performance(data->acpi_data, data->cpu); 916 acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
951} 917}
952 918
953#else 919#else
954static int powernow_k8_cpu_preinit_acpi(void) { return -ENODEV; }
955static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { return -ENODEV; } 920static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { return -ENODEV; }
956static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) { return; } 921static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) { return; }
957static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { return; } 922static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { return; }
@@ -1136,7 +1101,7 @@ static int powernowk8_verify(struct cpufreq_policy *pol)
1136static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) 1101static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1137{ 1102{
1138 struct powernow_k8_data *data; 1103 struct powernow_k8_data *data;
1139 cpumask_t oldmask = CPU_MASK_ALL; 1104 cpumask_t oldmask;
1140 int rc; 1105 int rc;
1141 1106
1142 if (!cpu_online(pol->cpu)) 1107 if (!cpu_online(pol->cpu))
@@ -1209,7 +1174,10 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1209 /* run on any CPU again */ 1174 /* run on any CPU again */
1210 set_cpus_allowed_ptr(current, &oldmask); 1175 set_cpus_allowed_ptr(current, &oldmask);
1211 1176
1212 pol->cpus = data->starting_core_affinity; 1177 if (cpu_family == CPU_HW_PSTATE)
1178 pol->cpus = cpumask_of_cpu(pol->cpu);
1179 else
1180 pol->cpus = per_cpu(cpu_core_map, pol->cpu);
1213 data->available_cores = &(pol->cpus); 1181 data->available_cores = &(pol->cpus);
1214 1182
1215 /* Take a crude guess here. 1183 /* Take a crude guess here.
@@ -1332,7 +1300,6 @@ static int __cpuinit powernowk8_init(void)
1332 } 1300 }
1333 1301
1334 if (supported_cpus == num_online_cpus()) { 1302 if (supported_cpus == num_online_cpus()) {
1335 powernow_k8_cpu_preinit_acpi();
1336 printk(KERN_INFO PFX "Found %d %s " 1303 printk(KERN_INFO PFX "Found %d %s "
1337 "processors (%d cpu cores) (" VERSION ")\n", 1304 "processors (%d cpu cores) (" VERSION ")\n",
1338 num_online_nodes(), 1305 num_online_nodes(),
@@ -1349,10 +1316,6 @@ static void __exit powernowk8_exit(void)
1349 dprintk("exit\n"); 1316 dprintk("exit\n");
1350 1317
1351 cpufreq_unregister_driver(&cpufreq_amd64_driver); 1318 cpufreq_unregister_driver(&cpufreq_amd64_driver);
1352
1353#ifdef CONFIG_X86_POWERNOW_K8_ACPI
1354 free_percpu(acpi_perf_data);
1355#endif
1356} 1319}
1357 1320
1358MODULE_AUTHOR("Paul Devriendt <paul.devriendt@amd.com> and Mark Langsdorf <mark.langsdorf@amd.com>"); 1321MODULE_AUTHOR("Paul Devriendt <paul.devriendt@amd.com> and Mark Langsdorf <mark.langsdorf@amd.com>");
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
index a62612cd4be8..ab48cfed4d96 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -33,13 +33,12 @@ struct powernow_k8_data {
33#ifdef CONFIG_X86_POWERNOW_K8_ACPI 33#ifdef CONFIG_X86_POWERNOW_K8_ACPI
34 /* the acpi table needs to be kept. it's only available if ACPI was 34 /* the acpi table needs to be kept. it's only available if ACPI was
35 * used to determine valid frequency/vid/fid states */ 35 * used to determine valid frequency/vid/fid states */
36 struct acpi_processor_performance *acpi_data; 36 struct acpi_processor_performance acpi_data;
37#endif 37#endif
38 /* we need to keep track of associated cores, but let cpufreq 38 /* we need to keep track of associated cores, but let cpufreq
39 * handle hotplug events - so just point at cpufreq pol->cpus 39 * handle hotplug events - so just point at cpufreq pol->cpus
40 * structure */ 40 * structure */
41 cpumask_t *available_cores; 41 cpumask_t *available_cores;
42 cpumask_t starting_core_affinity;
43}; 42};
44 43
45 44
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index 15e13c01cc36..3b5f06423e77 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -26,7 +26,7 @@
26#include <asm/cpufeature.h> 26#include <asm/cpufeature.h>
27 27
28#define PFX "speedstep-centrino: " 28#define PFX "speedstep-centrino: "
29#define MAINTAINER "cpufreq@lists.linux.org.uk" 29#define MAINTAINER "cpufreq@vger.kernel.org"
30 30
31#define dprintk(msg...) \ 31#define dprintk(msg...) \
32 cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) 32 cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 3fd7a67bb06a..898a5a2002ed 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -15,13 +15,11 @@
15/* 15/*
16 * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU 16 * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU
17 */ 17 */
18static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) 18static void __cpuinit __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
19{ 19{
20 unsigned char ccr2, ccr3; 20 unsigned char ccr2, ccr3;
21 unsigned long flags;
22 21
23 /* we test for DEVID by checking whether CCR3 is writable */ 22 /* we test for DEVID by checking whether CCR3 is writable */
24 local_irq_save(flags);
25 ccr3 = getCx86(CX86_CCR3); 23 ccr3 = getCx86(CX86_CCR3);
26 setCx86(CX86_CCR3, ccr3 ^ 0x80); 24 setCx86(CX86_CCR3, ccr3 ^ 0x80);
27 getCx86(0xc0); /* dummy to change bus */ 25 getCx86(0xc0); /* dummy to change bus */
@@ -44,9 +42,16 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
44 *dir0 = getCx86(CX86_DIR0); 42 *dir0 = getCx86(CX86_DIR0);
45 *dir1 = getCx86(CX86_DIR1); 43 *dir1 = getCx86(CX86_DIR1);
46 } 44 }
47 local_irq_restore(flags);
48} 45}
49 46
47static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
48{
49 unsigned long flags;
50
51 local_irq_save(flags);
52 __do_cyrix_devid(dir0, dir1);
53 local_irq_restore(flags);
54}
50/* 55/*
51 * Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in 56 * Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in
52 * order to identify the Cyrix CPU model after we're out of setup.c 57 * order to identify the Cyrix CPU model after we're out of setup.c
@@ -134,23 +139,6 @@ static void __cpuinit set_cx86_memwb(void)
134 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14); 139 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14);
135} 140}
136 141
137static void __cpuinit set_cx86_inc(void)
138{
139 unsigned char ccr3;
140
141 printk(KERN_INFO "Enable Incrementor on Cyrix/NSC processor.\n");
142
143 ccr3 = getCx86(CX86_CCR3);
144 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
145 /* PCR1 -- Performance Control */
146 /* Incrementor on, whatever that is */
147 setCx86(CX86_PCR1, getCx86(CX86_PCR1) | 0x02);
148 /* PCR0 -- Performance Control */
149 /* Incrementor Margin 10 */
150 setCx86(CX86_PCR0, getCx86(CX86_PCR0) | 0x04);
151 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
152}
153
154/* 142/*
155 * Configure later MediaGX and/or Geode processor. 143 * Configure later MediaGX and/or Geode processor.
156 */ 144 */
@@ -174,11 +162,28 @@ static void __cpuinit geode_configure(void)
174 162
175 set_cx86_memwb(); 163 set_cx86_memwb();
176 set_cx86_reorder(); 164 set_cx86_reorder();
177 set_cx86_inc();
178 165
179 local_irq_restore(flags); 166 local_irq_restore(flags);
180} 167}
181 168
169static void __cpuinit early_init_cyrix(struct cpuinfo_x86 *c)
170{
171 unsigned char dir0, dir0_msn, dir1 = 0;
172
173 __do_cyrix_devid(&dir0, &dir1);
174 dir0_msn = dir0 >> 4; /* identifies CPU "family" */
175
176 switch (dir0_msn) {
177 case 3: /* 6x86/6x86L */
178 /* Emulate MTRRs using Cyrix's ARRs. */
179 set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
180 break;
181 case 5: /* 6x86MX/M II */
182 /* Emulate MTRRs using Cyrix's ARRs. */
183 set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
184 break;
185 }
186}
182 187
183static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) 188static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
184{ 189{
@@ -434,6 +439,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
434static struct cpu_dev cyrix_cpu_dev __cpuinitdata = { 439static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
435 .c_vendor = "Cyrix", 440 .c_vendor = "Cyrix",
436 .c_ident = { "CyrixInstead" }, 441 .c_ident = { "CyrixInstead" },
442 .c_early_init = early_init_cyrix,
437 .c_init = init_cyrix, 443 .c_init = init_cyrix,
438 .c_identify = cyrix_identify, 444 .c_identify = cyrix_identify,
439}; 445};
diff --git a/arch/x86/kernel/cpu/feature_names.c b/arch/x86/kernel/cpu/feature_names.c
index 0bf4d37a0483..b96b69545fbf 100644
--- a/arch/x86/kernel/cpu/feature_names.c
+++ b/arch/x86/kernel/cpu/feature_names.c
@@ -39,7 +39,8 @@ const char * const x86_cap_flags[NCAPINTS*32] = {
39 NULL, NULL, NULL, NULL, 39 NULL, NULL, NULL, NULL,
40 "constant_tsc", "up", NULL, "arch_perfmon", 40 "constant_tsc", "up", NULL, "arch_perfmon",
41 "pebs", "bts", NULL, NULL, 41 "pebs", "bts", NULL, NULL,
42 "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL, 42 "rep_good", NULL, NULL, NULL,
43 "nopl", NULL, NULL, NULL,
43 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 44 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
44 45
45 /* Intel-defined (#2) */ 46 /* Intel-defined (#2) */
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index b75f2569b8f8..f113ef4595f6 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -222,10 +222,11 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
222 set_cpu_cap(c, X86_FEATURE_BTS); 222 set_cpu_cap(c, X86_FEATURE_BTS);
223 if (!(l1 & (1<<12))) 223 if (!(l1 & (1<<12)))
224 set_cpu_cap(c, X86_FEATURE_PEBS); 224 set_cpu_cap(c, X86_FEATURE_PEBS);
225 ds_init_intel(c);
225 } 226 }
226 227
227 if (cpu_has_bts) 228 if (cpu_has_bts)
228 ds_init_intel(c); 229 ptrace_bts_init_intel(c);
229 230
230 /* 231 /*
231 * See if we have a good local APIC by checking for buggy Pentia, 232 * See if we have a good local APIC by checking for buggy Pentia,
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 65a339678ece..726a5fcdf341 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -759,6 +759,7 @@ static struct sysdev_class mce_sysclass = {
759}; 759};
760 760
761DEFINE_PER_CPU(struct sys_device, device_mce); 761DEFINE_PER_CPU(struct sys_device, device_mce);
762void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;
762 763
763/* Why are there no generic functions for this? */ 764/* Why are there no generic functions for this? */
764#define ACCESSOR(name, var, start) \ 765#define ACCESSOR(name, var, start) \
@@ -883,9 +884,13 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
883 case CPU_ONLINE: 884 case CPU_ONLINE:
884 case CPU_ONLINE_FROZEN: 885 case CPU_ONLINE_FROZEN:
885 mce_create_device(cpu); 886 mce_create_device(cpu);
887 if (threshold_cpu_callback)
888 threshold_cpu_callback(action, cpu);
886 break; 889 break;
887 case CPU_DEAD: 890 case CPU_DEAD:
888 case CPU_DEAD_FROZEN: 891 case CPU_DEAD_FROZEN:
892 if (threshold_cpu_callback)
893 threshold_cpu_callback(action, cpu);
889 mce_remove_device(cpu); 894 mce_remove_device(cpu);
890 break; 895 break;
891 } 896 }
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 88736cadbaa6..5eb390a4b2e9 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -628,6 +628,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
628 deallocate_threshold_block(cpu, bank); 628 deallocate_threshold_block(cpu, bank);
629 629
630free_out: 630free_out:
631 kobject_del(b->kobj);
631 kobject_put(b->kobj); 632 kobject_put(b->kobj);
632 kfree(b); 633 kfree(b);
633 per_cpu(threshold_banks, cpu)[bank] = NULL; 634 per_cpu(threshold_banks, cpu)[bank] = NULL;
@@ -645,14 +646,11 @@ static void threshold_remove_device(unsigned int cpu)
645} 646}
646 647
647/* get notified when a cpu comes on/off */ 648/* get notified when a cpu comes on/off */
648static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb, 649static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action,
649 unsigned long action, void *hcpu) 650 unsigned int cpu)
650{ 651{
651 /* cpu was unsigned int to begin with */
652 unsigned int cpu = (unsigned long)hcpu;
653
654 if (cpu >= NR_CPUS) 652 if (cpu >= NR_CPUS)
655 goto out; 653 return;
656 654
657 switch (action) { 655 switch (action) {
658 case CPU_ONLINE: 656 case CPU_ONLINE:
@@ -666,14 +664,8 @@ static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb,
666 default: 664 default:
667 break; 665 break;
668 } 666 }
669 out:
670 return NOTIFY_OK;
671} 667}
672 668
673static struct notifier_block threshold_cpu_notifier __cpuinitdata = {
674 .notifier_call = threshold_cpu_callback,
675};
676
677static __init int threshold_init_device(void) 669static __init int threshold_init_device(void)
678{ 670{
679 unsigned lcpu = 0; 671 unsigned lcpu = 0;
@@ -684,7 +676,7 @@ static __init int threshold_init_device(void)
684 if (err) 676 if (err)
685 return err; 677 return err;
686 } 678 }
687 register_hotcpu_notifier(&threshold_cpu_notifier); 679 threshold_cpu_callback = amd_64_threshold_cpu_callback;
688 return 0; 680 return 0;
689} 681}
690 682
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 509bd3d9eacd..4e8d77f01eeb 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -379,6 +379,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
379 unsigned long *size, mtrr_type *type) 379 unsigned long *size, mtrr_type *type)
380{ 380{
381 unsigned int mask_lo, mask_hi, base_lo, base_hi; 381 unsigned int mask_lo, mask_hi, base_lo, base_hi;
382 unsigned int tmp, hi;
382 383
383 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); 384 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
384 if ((mask_lo & 0x800) == 0) { 385 if ((mask_lo & 0x800) == 0) {
@@ -392,8 +393,18 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
392 rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); 393 rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi);
393 394
394 /* Work out the shifted address mask. */ 395 /* Work out the shifted address mask. */
395 mask_lo = size_or_mask | mask_hi << (32 - PAGE_SHIFT) 396 tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT;
396 | mask_lo >> PAGE_SHIFT; 397 mask_lo = size_or_mask | tmp;
398 /* Expand tmp with high bits to all 1s*/
399 hi = fls(tmp);
400 if (hi > 0) {
401 tmp |= ~((1<<(hi - 1)) - 1);
402
403 if (tmp != mask_lo) {
404 WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");
405 mask_lo = tmp;
406 }
407 }
397 408
398 /* This works correctly if size is a power of two, i.e. a 409 /* This works correctly if size is a power of two, i.e. a
399 contiguous range. */ 410 contiguous range. */
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 84c480bb3715..4c4214690dd1 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -405,9 +405,9 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
405 } 405 }
406 /* RED-PEN: base can be > 32bit */ 406 /* RED-PEN: base can be > 32bit */
407 len += seq_printf(seq, 407 len += seq_printf(seq,
408 "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n", 408 "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n",
409 i, base, base >> (20 - PAGE_SHIFT), size, factor, 409 i, base, base >> (20 - PAGE_SHIFT), size, factor,
410 mtrr_attrib_to_str(type), mtrr_usage_table[i]); 410 mtrr_usage_table[i], mtrr_attrib_to_str(type));
411 } 411 }
412 } 412 }
413 return 0; 413 return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index b5ade28ca8f8..c78c04821ea1 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -759,7 +759,8 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
759 /* take out UC ranges */ 759 /* take out UC ranges */
760 for (i = 0; i < num_var_ranges; i++) { 760 for (i = 0; i < num_var_ranges; i++) {
761 type = range_state[i].type; 761 type = range_state[i].type;
762 if (type != MTRR_TYPE_UNCACHABLE) 762 if (type != MTRR_TYPE_UNCACHABLE &&
763 type != MTRR_TYPE_WRPROT)
763 continue; 764 continue;
764 size = range_state[i].size_pfn; 765 size = range_state[i].size_pfn;
765 if (!size) 766 if (!size)
@@ -834,7 +835,14 @@ static int __init enable_mtrr_cleanup_setup(char *str)
834 enable_mtrr_cleanup = 1; 835 enable_mtrr_cleanup = 1;
835 return 0; 836 return 0;
836} 837}
837early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup); 838early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
839
840static int __init mtrr_cleanup_debug_setup(char *str)
841{
842 debug_print = 1;
843 return 0;
844}
845early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
838 846
839struct var_mtrr_state { 847struct var_mtrr_state {
840 unsigned long range_startk; 848 unsigned long range_startk;
@@ -898,6 +906,27 @@ set_var_mtrr_all(unsigned int address_bits)
898 } 906 }
899} 907}
900 908
909static unsigned long to_size_factor(unsigned long sizek, char *factorp)
910{
911 char factor;
912 unsigned long base = sizek;
913
914 if (base & ((1<<10) - 1)) {
915 /* not MB alignment */
916 factor = 'K';
917 } else if (base & ((1<<20) - 1)){
918 factor = 'M';
919 base >>= 10;
920 } else {
921 factor = 'G';
922 base >>= 20;
923 }
924
925 *factorp = factor;
926
927 return base;
928}
929
901static unsigned int __init 930static unsigned int __init
902range_to_mtrr(unsigned int reg, unsigned long range_startk, 931range_to_mtrr(unsigned int reg, unsigned long range_startk,
903 unsigned long range_sizek, unsigned char type) 932 unsigned long range_sizek, unsigned char type)
@@ -919,13 +948,21 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
919 align = max_align; 948 align = max_align;
920 949
921 sizek = 1 << align; 950 sizek = 1 << align;
922 if (debug_print) 951 if (debug_print) {
952 char start_factor = 'K', size_factor = 'K';
953 unsigned long start_base, size_base;
954
955 start_base = to_size_factor(range_startk, &start_factor),
956 size_base = to_size_factor(sizek, &size_factor),
957
923 printk(KERN_DEBUG "Setting variable MTRR %d, " 958 printk(KERN_DEBUG "Setting variable MTRR %d, "
924 "base: %ldMB, range: %ldMB, type %s\n", 959 "base: %ld%cB, range: %ld%cB, type %s\n",
925 reg, range_startk >> 10, sizek >> 10, 960 reg, start_base, start_factor,
961 size_base, size_factor,
926 (type == MTRR_TYPE_UNCACHABLE)?"UC": 962 (type == MTRR_TYPE_UNCACHABLE)?"UC":
927 ((type == MTRR_TYPE_WRBACK)?"WB":"Other") 963 ((type == MTRR_TYPE_WRBACK)?"WB":"Other")
928 ); 964 );
965 }
929 save_var_mtrr(reg++, range_startk, sizek, type); 966 save_var_mtrr(reg++, range_startk, sizek, type);
930 range_startk += sizek; 967 range_startk += sizek;
931 range_sizek -= sizek; 968 range_sizek -= sizek;
@@ -970,6 +1007,8 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
970 /* try to append some small hole */ 1007 /* try to append some small hole */
971 range0_basek = state->range_startk; 1008 range0_basek = state->range_startk;
972 range0_sizek = ALIGN(state->range_sizek, chunk_sizek); 1009 range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
1010
1011 /* no increase */
973 if (range0_sizek == state->range_sizek) { 1012 if (range0_sizek == state->range_sizek) {
974 if (debug_print) 1013 if (debug_print)
975 printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", 1014 printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
@@ -980,13 +1019,40 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
980 return 0; 1019 return 0;
981 } 1020 }
982 1021
983 range0_sizek -= chunk_sizek; 1022 /* only cut back, when it is not the last */
984 if (range0_sizek && sizek) { 1023 if (sizek) {
985 while (range0_basek + range0_sizek > (basek + sizek)) { 1024 while (range0_basek + range0_sizek > (basek + sizek)) {
986 range0_sizek -= chunk_sizek; 1025 if (range0_sizek >= chunk_sizek)
987 if (!range0_sizek) 1026 range0_sizek -= chunk_sizek;
988 break; 1027 else
989 } 1028 range0_sizek = 0;
1029
1030 if (!range0_sizek)
1031 break;
1032 }
1033 }
1034
1035second_try:
1036 range_basek = range0_basek + range0_sizek;
1037
1038 /* one hole in the middle */
1039 if (range_basek > basek && range_basek <= (basek + sizek))
1040 second_sizek = range_basek - basek;
1041
1042 if (range0_sizek > state->range_sizek) {
1043
1044 /* one hole in middle or at end */
1045 hole_sizek = range0_sizek - state->range_sizek - second_sizek;
1046
1047 /* hole size should be less than half of range0 size */
1048 if (hole_sizek >= (range0_sizek >> 1) &&
1049 range0_sizek >= chunk_sizek) {
1050 range0_sizek -= chunk_sizek;
1051 second_sizek = 0;
1052 hole_sizek = 0;
1053
1054 goto second_try;
1055 }
990 } 1056 }
991 1057
992 if (range0_sizek) { 1058 if (range0_sizek) {
@@ -996,50 +1062,28 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
996 (range0_basek + range0_sizek)<<10); 1062 (range0_basek + range0_sizek)<<10);
997 state->reg = range_to_mtrr(state->reg, range0_basek, 1063 state->reg = range_to_mtrr(state->reg, range0_basek,
998 range0_sizek, MTRR_TYPE_WRBACK); 1064 range0_sizek, MTRR_TYPE_WRBACK);
999
1000 }
1001
1002 range_basek = range0_basek + range0_sizek;
1003 range_sizek = chunk_sizek;
1004
1005 if (range_basek + range_sizek > basek &&
1006 range_basek + range_sizek <= (basek + sizek)) {
1007 /* one hole */
1008 second_basek = basek;
1009 second_sizek = range_basek + range_sizek - basek;
1010 } 1065 }
1011 1066
1012 /* if last piece, only could one hole near end */ 1067 if (range0_sizek < state->range_sizek) {
1013 if ((second_basek || !basek) && 1068 /* need to handle left over */
1014 range_sizek - (state->range_sizek - range0_sizek) - second_sizek <
1015 (chunk_sizek >> 1)) {
1016 /*
1017 * one hole in middle (second_sizek is 0) or at end
1018 * (second_sizek is 0 )
1019 */
1020 hole_sizek = range_sizek - (state->range_sizek - range0_sizek)
1021 - second_sizek;
1022 hole_basek = range_basek + range_sizek - hole_sizek
1023 - second_sizek;
1024 } else {
1025 /* fallback for big hole, or several holes */
1026 range_sizek = state->range_sizek - range0_sizek; 1069 range_sizek = state->range_sizek - range0_sizek;
1027 second_basek = 0; 1070
1028 second_sizek = 0; 1071 if (debug_print)
1072 printk(KERN_DEBUG "range: %016lx - %016lx\n",
1073 range_basek<<10,
1074 (range_basek + range_sizek)<<10);
1075 state->reg = range_to_mtrr(state->reg, range_basek,
1076 range_sizek, MTRR_TYPE_WRBACK);
1029 } 1077 }
1030 1078
1031 if (debug_print)
1032 printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10,
1033 (range_basek + range_sizek)<<10);
1034 state->reg = range_to_mtrr(state->reg, range_basek, range_sizek,
1035 MTRR_TYPE_WRBACK);
1036 if (hole_sizek) { 1079 if (hole_sizek) {
1080 hole_basek = range_basek - hole_sizek - second_sizek;
1037 if (debug_print) 1081 if (debug_print)
1038 printk(KERN_DEBUG "hole: %016lx - %016lx\n", 1082 printk(KERN_DEBUG "hole: %016lx - %016lx\n",
1039 hole_basek<<10, (hole_basek + hole_sizek)<<10); 1083 hole_basek<<10,
1040 state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek, 1084 (hole_basek + hole_sizek)<<10);
1041 MTRR_TYPE_UNCACHABLE); 1085 state->reg = range_to_mtrr(state->reg, hole_basek,
1042 1086 hole_sizek, MTRR_TYPE_UNCACHABLE);
1043 } 1087 }
1044 1088
1045 return second_sizek; 1089 return second_sizek;
@@ -1154,11 +1198,11 @@ struct mtrr_cleanup_result {
1154}; 1198};
1155 1199
1156/* 1200/*
1157 * gran_size: 1M, 2M, ..., 2G 1201 * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
1158 * chunk size: gran_size, ..., 4G 1202 * chunk size: gran_size, ..., 2G
1159 * so we need (2+13)*6 1203 * so we need (1+16)*8
1160 */ 1204 */
1161#define NUM_RESULT 90 1205#define NUM_RESULT 136
1162#define PSHIFT (PAGE_SHIFT - 10) 1206#define PSHIFT (PAGE_SHIFT - 10)
1163 1207
1164static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; 1208static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
@@ -1168,13 +1212,14 @@ static unsigned long __initdata min_loss_pfn[RANGE_NUM];
1168static int __init mtrr_cleanup(unsigned address_bits) 1212static int __init mtrr_cleanup(unsigned address_bits)
1169{ 1213{
1170 unsigned long extra_remove_base, extra_remove_size; 1214 unsigned long extra_remove_base, extra_remove_size;
1171 unsigned long i, base, size, def, dummy; 1215 unsigned long base, size, def, dummy;
1172 mtrr_type type; 1216 mtrr_type type;
1173 int nr_range, nr_range_new; 1217 int nr_range, nr_range_new;
1174 u64 chunk_size, gran_size; 1218 u64 chunk_size, gran_size;
1175 unsigned long range_sums, range_sums_new; 1219 unsigned long range_sums, range_sums_new;
1176 int index_good; 1220 int index_good;
1177 int num_reg_good; 1221 int num_reg_good;
1222 int i;
1178 1223
1179 /* extra one for all 0 */ 1224 /* extra one for all 0 */
1180 int num[MTRR_NUM_TYPES + 1]; 1225 int num[MTRR_NUM_TYPES + 1];
@@ -1204,6 +1249,8 @@ static int __init mtrr_cleanup(unsigned address_bits)
1204 continue; 1249 continue;
1205 if (!size) 1250 if (!size)
1206 type = MTRR_NUM_TYPES; 1251 type = MTRR_NUM_TYPES;
1252 if (type == MTRR_TYPE_WRPROT)
1253 type = MTRR_TYPE_UNCACHABLE;
1207 num[type]++; 1254 num[type]++;
1208 } 1255 }
1209 1256
@@ -1216,23 +1263,57 @@ static int __init mtrr_cleanup(unsigned address_bits)
1216 num_var_ranges - num[MTRR_NUM_TYPES]) 1263 num_var_ranges - num[MTRR_NUM_TYPES])
1217 return 0; 1264 return 0;
1218 1265
1266 /* print original var MTRRs at first, for debugging: */
1267 printk(KERN_DEBUG "original variable MTRRs\n");
1268 for (i = 0; i < num_var_ranges; i++) {
1269 char start_factor = 'K', size_factor = 'K';
1270 unsigned long start_base, size_base;
1271
1272 size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
1273 if (!size_base)
1274 continue;
1275
1276 size_base = to_size_factor(size_base, &size_factor),
1277 start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
1278 start_base = to_size_factor(start_base, &start_factor),
1279 type = range_state[i].type;
1280
1281 printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
1282 i, start_base, start_factor,
1283 size_base, size_factor,
1284 (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
1285 ((type == MTRR_TYPE_WRPROT) ? "WP" :
1286 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
1287 );
1288 }
1289
1219 memset(range, 0, sizeof(range)); 1290 memset(range, 0, sizeof(range));
1220 extra_remove_size = 0; 1291 extra_remove_size = 0;
1221 if (mtrr_tom2) { 1292 extra_remove_base = 1 << (32 - PAGE_SHIFT);
1222 extra_remove_base = 1 << (32 - PAGE_SHIFT); 1293 if (mtrr_tom2)
1223 extra_remove_size = 1294 extra_remove_size =
1224 (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base; 1295 (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
1225 }
1226 nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, 1296 nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
1227 extra_remove_size); 1297 extra_remove_size);
1298 /*
1299 * [0, 1M) should always be coverred by var mtrr with WB
1300 * and fixed mtrrs should take effective before var mtrr for it
1301 */
1302 nr_range = add_range_with_merge(range, nr_range, 0,
1303 (1ULL<<(20 - PAGE_SHIFT)) - 1);
1304 /* sort the ranges */
1305 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
1306
1228 range_sums = sum_ranges(range, nr_range); 1307 range_sums = sum_ranges(range, nr_range);
1229 printk(KERN_INFO "total RAM coverred: %ldM\n", 1308 printk(KERN_INFO "total RAM coverred: %ldM\n",
1230 range_sums >> (20 - PAGE_SHIFT)); 1309 range_sums >> (20 - PAGE_SHIFT));
1231 1310
1232 if (mtrr_chunk_size && mtrr_gran_size) { 1311 if (mtrr_chunk_size && mtrr_gran_size) {
1233 int num_reg; 1312 int num_reg;
1313 char gran_factor, chunk_factor, lose_factor;
1314 unsigned long gran_base, chunk_base, lose_base;
1234 1315
1235 debug_print = 1; 1316 debug_print++;
1236 /* convert ranges to var ranges state */ 1317 /* convert ranges to var ranges state */
1237 num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size, 1318 num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size,
1238 mtrr_gran_size); 1319 mtrr_gran_size);
@@ -1256,34 +1337,48 @@ static int __init mtrr_cleanup(unsigned address_bits)
1256 result[i].lose_cover_sizek = 1337 result[i].lose_cover_sizek =
1257 (range_sums - range_sums_new) << PSHIFT; 1338 (range_sums - range_sums_new) << PSHIFT;
1258 1339
1259 printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", 1340 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1260 result[i].bad?"*BAD*":" ", result[i].gran_sizek >> 10, 1341 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1261 result[i].chunk_sizek >> 10); 1342 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1262 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ldM \n", 1343 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1344 result[i].bad?"*BAD*":" ",
1345 gran_base, gran_factor, chunk_base, chunk_factor);
1346 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1263 result[i].num_reg, result[i].bad?"-":"", 1347 result[i].num_reg, result[i].bad?"-":"",
1264 result[i].lose_cover_sizek >> 10); 1348 lose_base, lose_factor);
1265 if (!result[i].bad) { 1349 if (!result[i].bad) {
1266 set_var_mtrr_all(address_bits); 1350 set_var_mtrr_all(address_bits);
1267 return 1; 1351 return 1;
1268 } 1352 }
1269 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " 1353 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
1270 "will find optimal one\n"); 1354 "will find optimal one\n");
1271 debug_print = 0; 1355 debug_print--;
1272 memset(result, 0, sizeof(result[0])); 1356 memset(result, 0, sizeof(result[0]));
1273 } 1357 }
1274 1358
1275 i = 0; 1359 i = 0;
1276 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); 1360 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
1277 memset(result, 0, sizeof(result)); 1361 memset(result, 0, sizeof(result));
1278 for (gran_size = (1ULL<<20); gran_size < (1ULL<<32); gran_size <<= 1) { 1362 for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
1279 for (chunk_size = gran_size; chunk_size < (1ULL<<33); 1363 char gran_factor;
1364 unsigned long gran_base;
1365
1366 if (debug_print)
1367 gran_base = to_size_factor(gran_size >> 10, &gran_factor);
1368
1369 for (chunk_size = gran_size; chunk_size < (1ULL<<32);
1280 chunk_size <<= 1) { 1370 chunk_size <<= 1) {
1281 int num_reg; 1371 int num_reg;
1282 1372
1283 if (debug_print) 1373 if (debug_print) {
1284 printk(KERN_INFO 1374 char chunk_factor;
1285 "\ngran_size: %lldM chunk_size_size: %lldM\n", 1375 unsigned long chunk_base;
1286 gran_size >> 20, chunk_size >> 20); 1376
1377 chunk_base = to_size_factor(chunk_size>>10, &chunk_factor),
1378 printk(KERN_INFO "\n");
1379 printk(KERN_INFO "gran_size: %ld%c chunk_size: %ld%c \n",
1380 gran_base, gran_factor, chunk_base, chunk_factor);
1381 }
1287 if (i >= NUM_RESULT) 1382 if (i >= NUM_RESULT)
1288 continue; 1383 continue;
1289 1384
@@ -1326,12 +1421,18 @@ static int __init mtrr_cleanup(unsigned address_bits)
1326 1421
1327 /* print out all */ 1422 /* print out all */
1328 for (i = 0; i < NUM_RESULT; i++) { 1423 for (i = 0; i < NUM_RESULT; i++) {
1329 printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", 1424 char gran_factor, chunk_factor, lose_factor;
1330 result[i].bad?"*BAD* ":" ", result[i].gran_sizek >> 10, 1425 unsigned long gran_base, chunk_base, lose_base;
1331 result[i].chunk_sizek >> 10); 1426
1332 printk(KERN_CONT "num_reg: %d \tlose RAM: %s%ldM\n", 1427 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1333 result[i].num_reg, result[i].bad?"-":"", 1428 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1334 result[i].lose_cover_sizek >> 10); 1429 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1430 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1431 result[i].bad?"*BAD*":" ",
1432 gran_base, gran_factor, chunk_base, chunk_factor);
1433 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1434 result[i].num_reg, result[i].bad?"-":"",
1435 lose_base, lose_factor);
1335 } 1436 }
1336 1437
1337 /* try to find the optimal index */ 1438 /* try to find the optimal index */
@@ -1339,10 +1440,8 @@ static int __init mtrr_cleanup(unsigned address_bits)
1339 nr_mtrr_spare_reg = num_var_ranges - 1; 1440 nr_mtrr_spare_reg = num_var_ranges - 1;
1340 num_reg_good = -1; 1441 num_reg_good = -1;
1341 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { 1442 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
1342 if (!min_loss_pfn[i]) { 1443 if (!min_loss_pfn[i])
1343 num_reg_good = i; 1444 num_reg_good = i;
1344 break;
1345 }
1346 } 1445 }
1347 1446
1348 index_good = -1; 1447 index_good = -1;
@@ -1358,21 +1457,26 @@ static int __init mtrr_cleanup(unsigned address_bits)
1358 } 1457 }
1359 1458
1360 if (index_good != -1) { 1459 if (index_good != -1) {
1460 char gran_factor, chunk_factor, lose_factor;
1461 unsigned long gran_base, chunk_base, lose_base;
1462
1361 printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); 1463 printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
1362 i = index_good; 1464 i = index_good;
1363 printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t", 1465 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1364 result[i].gran_sizek >> 10, 1466 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1365 result[i].chunk_sizek >> 10); 1467 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1366 printk(KERN_CONT "num_reg: %d \tlose RAM: %ldM\n", 1468 printk(KERN_INFO "gran_size: %ld%c \tchunk_size: %ld%c \t",
1367 result[i].num_reg, 1469 gran_base, gran_factor, chunk_base, chunk_factor);
1368 result[i].lose_cover_sizek >> 10); 1470 printk(KERN_CONT "num_reg: %d \tlose RAM: %ld%c\n",
1471 result[i].num_reg, lose_base, lose_factor);
1369 /* convert ranges to var ranges state */ 1472 /* convert ranges to var ranges state */
1370 chunk_size = result[i].chunk_sizek; 1473 chunk_size = result[i].chunk_sizek;
1371 chunk_size <<= 10; 1474 chunk_size <<= 10;
1372 gran_size = result[i].gran_sizek; 1475 gran_size = result[i].gran_sizek;
1373 gran_size <<= 10; 1476 gran_size <<= 10;
1374 debug_print = 1; 1477 debug_print++;
1375 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); 1478 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
1479 debug_print--;
1376 set_var_mtrr_all(address_bits); 1480 set_var_mtrr_all(address_bits);
1377 return 1; 1481 return 1;
1378 } 1482 }
@@ -1496,11 +1600,8 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1496 1600
1497 /* kvm/qemu doesn't have mtrr set right, don't trim them all */ 1601 /* kvm/qemu doesn't have mtrr set right, don't trim them all */
1498 if (!highest_pfn) { 1602 if (!highest_pfn) {
1499 if (!kvm_para_available()) { 1603 WARN(!kvm_para_available(), KERN_WARNING
1500 printk(KERN_WARNING
1501 "WARNING: strange, CPU MTRRs all blank?\n"); 1604 "WARNING: strange, CPU MTRRs all blank?\n");
1502 WARN_ON(1);
1503 }
1504 return 0; 1605 return 0;
1505 } 1606 }
1506 1607
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index de7439f82b92..6bff382094f5 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -295,13 +295,19 @@ static int setup_k7_watchdog(unsigned nmi_hz)
295 /* setup the timer */ 295 /* setup the timer */
296 wrmsr(evntsel_msr, evntsel, 0); 296 wrmsr(evntsel_msr, evntsel, 0);
297 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz); 297 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz);
298 apic_write(APIC_LVTPC, APIC_DM_NMI);
299 evntsel |= K7_EVNTSEL_ENABLE;
300 wrmsr(evntsel_msr, evntsel, 0);
301 298
299 /* initialize the wd struct before enabling */
302 wd->perfctr_msr = perfctr_msr; 300 wd->perfctr_msr = perfctr_msr;
303 wd->evntsel_msr = evntsel_msr; 301 wd->evntsel_msr = evntsel_msr;
304 wd->cccr_msr = 0; /* unused */ 302 wd->cccr_msr = 0; /* unused */
303
304 /* ok, everything is initialized, announce that we're set */
305 cpu_nmi_set_wd_enabled();
306
307 apic_write(APIC_LVTPC, APIC_DM_NMI);
308 evntsel |= K7_EVNTSEL_ENABLE;
309 wrmsr(evntsel_msr, evntsel, 0);
310
305 return 1; 311 return 1;
306} 312}
307 313
@@ -379,13 +385,19 @@ static int setup_p6_watchdog(unsigned nmi_hz)
379 wrmsr(evntsel_msr, evntsel, 0); 385 wrmsr(evntsel_msr, evntsel, 0);
380 nmi_hz = adjust_for_32bit_ctr(nmi_hz); 386 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
381 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz); 387 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz);
382 apic_write(APIC_LVTPC, APIC_DM_NMI);
383 evntsel |= P6_EVNTSEL0_ENABLE;
384 wrmsr(evntsel_msr, evntsel, 0);
385 388
389 /* initialize the wd struct before enabling */
386 wd->perfctr_msr = perfctr_msr; 390 wd->perfctr_msr = perfctr_msr;
387 wd->evntsel_msr = evntsel_msr; 391 wd->evntsel_msr = evntsel_msr;
388 wd->cccr_msr = 0; /* unused */ 392 wd->cccr_msr = 0; /* unused */
393
394 /* ok, everything is initialized, announce that we're set */
395 cpu_nmi_set_wd_enabled();
396
397 apic_write(APIC_LVTPC, APIC_DM_NMI);
398 evntsel |= P6_EVNTSEL0_ENABLE;
399 wrmsr(evntsel_msr, evntsel, 0);
400
389 return 1; 401 return 1;
390} 402}
391 403
@@ -432,6 +444,27 @@ static const struct wd_ops p6_wd_ops = {
432#define P4_CCCR_ENABLE (1 << 12) 444#define P4_CCCR_ENABLE (1 << 12)
433#define P4_CCCR_OVF (1 << 31) 445#define P4_CCCR_OVF (1 << 31)
434 446
447#define P4_CONTROLS 18
448static unsigned int p4_controls[18] = {
449 MSR_P4_BPU_CCCR0,
450 MSR_P4_BPU_CCCR1,
451 MSR_P4_BPU_CCCR2,
452 MSR_P4_BPU_CCCR3,
453 MSR_P4_MS_CCCR0,
454 MSR_P4_MS_CCCR1,
455 MSR_P4_MS_CCCR2,
456 MSR_P4_MS_CCCR3,
457 MSR_P4_FLAME_CCCR0,
458 MSR_P4_FLAME_CCCR1,
459 MSR_P4_FLAME_CCCR2,
460 MSR_P4_FLAME_CCCR3,
461 MSR_P4_IQ_CCCR0,
462 MSR_P4_IQ_CCCR1,
463 MSR_P4_IQ_CCCR2,
464 MSR_P4_IQ_CCCR3,
465 MSR_P4_IQ_CCCR4,
466 MSR_P4_IQ_CCCR5,
467};
435/* 468/*
436 * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter 469 * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
437 * CRU_ESCR0 (with any non-null event selector) through a complemented 470 * CRU_ESCR0 (with any non-null event selector) through a complemented
@@ -473,12 +506,38 @@ static int setup_p4_watchdog(unsigned nmi_hz)
473 evntsel_msr = MSR_P4_CRU_ESCR0; 506 evntsel_msr = MSR_P4_CRU_ESCR0;
474 cccr_msr = MSR_P4_IQ_CCCR0; 507 cccr_msr = MSR_P4_IQ_CCCR0;
475 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); 508 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
509
510 /*
511 * If we're on the kdump kernel or other situation, we may
512 * still have other performance counter registers set to
513 * interrupt and they'll keep interrupting forever because
514 * of the P4_CCCR_OVF quirk. So we need to ACK all the
515 * pending interrupts and disable all the registers here,
516 * before reenabling the NMI delivery. Refer to p4_rearm()
517 * about the P4_CCCR_OVF quirk.
518 */
519 if (reset_devices) {
520 unsigned int low, high;
521 int i;
522
523 for (i = 0; i < P4_CONTROLS; i++) {
524 rdmsr(p4_controls[i], low, high);
525 low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF);
526 wrmsr(p4_controls[i], low, high);
527 }
528 }
476 } else { 529 } else {
477 /* logical cpu 1 */ 530 /* logical cpu 1 */
478 perfctr_msr = MSR_P4_IQ_PERFCTR1; 531 perfctr_msr = MSR_P4_IQ_PERFCTR1;
479 evntsel_msr = MSR_P4_CRU_ESCR0; 532 evntsel_msr = MSR_P4_CRU_ESCR0;
480 cccr_msr = MSR_P4_IQ_CCCR1; 533 cccr_msr = MSR_P4_IQ_CCCR1;
481 cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4); 534
535 /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */
536 if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4)
537 cccr_val = P4_CCCR_OVF_PMI0;
538 else
539 cccr_val = P4_CCCR_OVF_PMI1;
540 cccr_val |= P4_CCCR_ESCR_SELECT(4);
482 } 541 }
483 542
484 evntsel = P4_ESCR_EVENT_SELECT(0x3F) 543 evntsel = P4_ESCR_EVENT_SELECT(0x3F)
@@ -493,12 +552,17 @@ static int setup_p4_watchdog(unsigned nmi_hz)
493 wrmsr(evntsel_msr, evntsel, 0); 552 wrmsr(evntsel_msr, evntsel, 0);
494 wrmsr(cccr_msr, cccr_val, 0); 553 wrmsr(cccr_msr, cccr_val, 0);
495 write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz); 554 write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
496 apic_write(APIC_LVTPC, APIC_DM_NMI); 555
497 cccr_val |= P4_CCCR_ENABLE;
498 wrmsr(cccr_msr, cccr_val, 0);
499 wd->perfctr_msr = perfctr_msr; 556 wd->perfctr_msr = perfctr_msr;
500 wd->evntsel_msr = evntsel_msr; 557 wd->evntsel_msr = evntsel_msr;
501 wd->cccr_msr = cccr_msr; 558 wd->cccr_msr = cccr_msr;
559
560 /* ok, everything is initialized, announce that we're set */
561 cpu_nmi_set_wd_enabled();
562
563 apic_write(APIC_LVTPC, APIC_DM_NMI);
564 cccr_val |= P4_CCCR_ENABLE;
565 wrmsr(cccr_msr, cccr_val, 0);
502 return 1; 566 return 1;
503} 567}
504 568
@@ -614,13 +678,17 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz)
614 wrmsr(evntsel_msr, evntsel, 0); 678 wrmsr(evntsel_msr, evntsel, 0);
615 nmi_hz = adjust_for_32bit_ctr(nmi_hz); 679 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
616 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz); 680 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
617 apic_write(APIC_LVTPC, APIC_DM_NMI);
618 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
619 wrmsr(evntsel_msr, evntsel, 0);
620 681
621 wd->perfctr_msr = perfctr_msr; 682 wd->perfctr_msr = perfctr_msr;
622 wd->evntsel_msr = evntsel_msr; 683 wd->evntsel_msr = evntsel_msr;
623 wd->cccr_msr = 0; /* unused */ 684 wd->cccr_msr = 0; /* unused */
685
686 /* ok, everything is initialized, announce that we're set */
687 cpu_nmi_set_wd_enabled();
688
689 apic_write(APIC_LVTPC, APIC_DM_NMI);
690 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
691 wrmsr(evntsel_msr, evntsel, 0);
624 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); 692 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
625 return 1; 693 return 1;
626} 694}
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 14b11b3be31c..6a44d6465991 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -36,7 +36,6 @@
36#include <linux/smp_lock.h> 36#include <linux/smp_lock.h>
37#include <linux/major.h> 37#include <linux/major.h>
38#include <linux/fs.h> 38#include <linux/fs.h>
39#include <linux/smp_lock.h>
40#include <linux/device.h> 39#include <linux/device.h>
41#include <linux/cpu.h> 40#include <linux/cpu.h>
42#include <linux/notifier.h> 41#include <linux/notifier.h>
@@ -89,6 +88,8 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
89 struct cpuid_regs cmd; 88 struct cpuid_regs cmd;
90 int cpu = iminor(file->f_path.dentry->d_inode); 89 int cpu = iminor(file->f_path.dentry->d_inode);
91 u64 pos = *ppos; 90 u64 pos = *ppos;
91 ssize_t bytes = 0;
92 int err = 0;
92 93
93 if (count % 16) 94 if (count % 16)
94 return -EINVAL; /* Invalid chunk size */ 95 return -EINVAL; /* Invalid chunk size */
@@ -96,14 +97,19 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
96 for (; count; count -= 16) { 97 for (; count; count -= 16) {
97 cmd.eax = pos; 98 cmd.eax = pos;
98 cmd.ecx = pos >> 32; 99 cmd.ecx = pos >> 32;
99 smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1); 100 err = smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1);
100 if (copy_to_user(tmp, &cmd, 16)) 101 if (err)
101 return -EFAULT; 102 break;
103 if (copy_to_user(tmp, &cmd, 16)) {
104 err = -EFAULT;
105 break;
106 }
102 tmp += 16; 107 tmp += 16;
108 bytes += 16;
103 *ppos = ++pos; 109 *ppos = ++pos;
104 } 110 }
105 111
106 return tmp - buf; 112 return bytes ? bytes : err;
107} 113}
108 114
109static int cpuid_open(struct inode *inode, struct file *file) 115static int cpuid_open(struct inode *inode, struct file *file)
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 15e6c6bc4a46..e90a60ef10c2 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -7,9 +7,8 @@
7 7
8#include <linux/errno.h> 8#include <linux/errno.h>
9#include <linux/crash_dump.h> 9#include <linux/crash_dump.h>
10 10#include <linux/uaccess.h>
11#include <asm/uaccess.h> 11#include <linux/io.h>
12#include <asm/io.h>
13 12
14/** 13/**
15 * copy_oldmem_page - copy one page from "oldmem" 14 * copy_oldmem_page - copy one page from "oldmem"
@@ -25,7 +24,7 @@
25 * in the current kernel. We stitch up a pte, similar to kmap_atomic. 24 * in the current kernel. We stitch up a pte, similar to kmap_atomic.
26 */ 25 */
27ssize_t copy_oldmem_page(unsigned long pfn, char *buf, 26ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
28 size_t csize, unsigned long offset, int userbuf) 27 size_t csize, unsigned long offset, int userbuf)
29{ 28{
30 void *vaddr; 29 void *vaddr;
31 30
@@ -33,14 +32,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
33 return 0; 32 return 0;
34 33
35 vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); 34 vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
35 if (!vaddr)
36 return -ENOMEM;
36 37
37 if (userbuf) { 38 if (userbuf) {
38 if (copy_to_user(buf, (vaddr + offset), csize)) { 39 if (copy_to_user(buf, vaddr + offset, csize)) {
39 iounmap(vaddr); 40 iounmap(vaddr);
40 return -EFAULT; 41 return -EFAULT;
41 } 42 }
42 } else 43 } else
43 memcpy(buf, (vaddr + offset), csize); 44 memcpy(buf, vaddr + offset, csize);
44 45
45 iounmap(vaddr); 46 iounmap(vaddr);
46 return csize; 47 return csize;
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 11c11b8ec48d..2b69994fd3a8 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -2,26 +2,49 @@
2 * Debug Store support 2 * Debug Store support
3 * 3 *
4 * This provides a low-level interface to the hardware's Debug Store 4 * This provides a low-level interface to the hardware's Debug Store
5 * feature that is used for last branch recording (LBR) and 5 * feature that is used for branch trace store (BTS) and
6 * precise-event based sampling (PEBS). 6 * precise-event based sampling (PEBS).
7 * 7 *
8 * Different architectures use a different DS layout/pointer size. 8 * It manages:
9 * The below functions therefore work on a void*. 9 * - per-thread and per-cpu allocation of BTS and PEBS
10 * - buffer memory allocation (optional)
11 * - buffer overflow handling
12 * - buffer access
10 * 13 *
14 * It assumes:
15 * - get_task_struct on all parameter tasks
16 * - current is allowed to trace parameter tasks
11 * 17 *
12 * Since there is no user for PEBS, yet, only LBR (or branch
13 * trace store, BTS) is supported.
14 * 18 *
15 * 19 * Copyright (C) 2007-2008 Intel Corporation.
16 * Copyright (C) 2007 Intel Corporation. 20 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008
17 * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
18 */ 21 */
19 22
23
24#ifdef CONFIG_X86_DS
25
20#include <asm/ds.h> 26#include <asm/ds.h>
21 27
22#include <linux/errno.h> 28#include <linux/errno.h>
23#include <linux/string.h> 29#include <linux/string.h>
24#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/sched.h>
32#include <linux/mm.h>
33
34
35/*
36 * The configuration for a particular DS hardware implementation.
37 */
38struct ds_configuration {
39 /* the size of the DS structure in bytes */
40 unsigned char sizeof_ds;
41 /* the size of one pointer-typed field in the DS structure in bytes;
42 this covers the first 8 fields related to buffer management. */
43 unsigned char sizeof_field;
44 /* the size of a BTS/PEBS record in bytes */
45 unsigned char sizeof_rec[2];
46};
47static struct ds_configuration ds_cfg;
25 48
26 49
27/* 50/*
@@ -44,378 +67,747 @@
44 * (interrupt occurs when write pointer passes interrupt pointer) 67 * (interrupt occurs when write pointer passes interrupt pointer)
45 * - value to which counter is reset following counter overflow 68 * - value to which counter is reset following counter overflow
46 * 69 *
47 * On later architectures, the last branch recording hardware uses 70 * Later architectures use 64bit pointers throughout, whereas earlier
48 * 64bit pointers even in 32bit mode. 71 * architectures use 32bit pointers in 32bit mode.
49 *
50 *
51 * Branch Trace Store (BTS) records store information about control
52 * flow changes. They at least provide the following information:
53 * - source linear address
54 * - destination linear address
55 * 72 *
56 * Netburst supported a predicated bit that had been dropped in later
57 * architectures. We do not suppor it.
58 * 73 *
74 * We compute the base address for the first 8 fields based on:
75 * - the field size stored in the DS configuration
76 * - the relative field position
77 * - an offset giving the start of the respective region
59 * 78 *
60 * In order to abstract from the actual DS and BTS layout, we describe 79 * This offset is further used to index various arrays holding
61 * the access to the relevant fields. 80 * information for BTS and PEBS at the respective index.
62 * Thanks to Andi Kleen for proposing this design.
63 * 81 *
64 * The implementation, however, is not as general as it might seem. In 82 * On later 32bit processors, we only access the lower 32bit of the
65 * order to stay somewhat simple and efficient, we assume an 83 * 64bit pointer fields. The upper halves will be zeroed out.
66 * underlying unsigned type (mostly a pointer type) and we expect the
67 * field to be at least as big as that type.
68 */ 84 */
69 85
70/* 86enum ds_field {
71 * A special from_ip address to indicate that the BTS record is an 87 ds_buffer_base = 0,
72 * info record that needs to be interpreted or skipped. 88 ds_index,
73 */ 89 ds_absolute_maximum,
74#define BTS_ESCAPE_ADDRESS (-1) 90 ds_interrupt_threshold,
91};
75 92
76/* 93enum ds_qualifier {
77 * A field access descriptor 94 ds_bts = 0,
78 */ 95 ds_pebs
79struct access_desc {
80 unsigned char offset;
81 unsigned char size;
82}; 96};
83 97
98static inline unsigned long ds_get(const unsigned char *base,
99 enum ds_qualifier qual, enum ds_field field)
100{
101 base += (ds_cfg.sizeof_field * (field + (4 * qual)));
102 return *(unsigned long *)base;
103}
104
105static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
106 enum ds_field field, unsigned long value)
107{
108 base += (ds_cfg.sizeof_field * (field + (4 * qual)));
109 (*(unsigned long *)base) = value;
110}
111
112
84/* 113/*
85 * The configuration for a particular DS/BTS hardware implementation. 114 * Locking is done only for allocating BTS or PEBS resources and for
115 * guarding context and buffer memory allocation.
116 *
117 * Most functions require the current task to own the ds context part
118 * they are going to access. All the locking is done when validating
119 * access to the context.
86 */ 120 */
87struct ds_configuration { 121static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
88 /* the DS configuration */
89 unsigned char sizeof_ds;
90 struct access_desc bts_buffer_base;
91 struct access_desc bts_index;
92 struct access_desc bts_absolute_maximum;
93 struct access_desc bts_interrupt_threshold;
94 /* the BTS configuration */
95 unsigned char sizeof_bts;
96 struct access_desc from_ip;
97 struct access_desc to_ip;
98 /* BTS variants used to store additional information like
99 timestamps */
100 struct access_desc info_type;
101 struct access_desc info_data;
102 unsigned long debugctl_mask;
103};
104 122
105/* 123/*
106 * The global configuration used by the below accessor functions 124 * Validate that the current task is allowed to access the BTS/PEBS
125 * buffer of the parameter task.
126 *
127 * Returns 0, if access is granted; -Eerrno, otherwise.
107 */ 128 */
108static struct ds_configuration ds_cfg; 129static inline int ds_validate_access(struct ds_context *context,
130 enum ds_qualifier qual)
131{
132 if (!context)
133 return -EPERM;
134
135 if (context->owner[qual] == current)
136 return 0;
137
138 return -EPERM;
139}
140
109 141
110/* 142/*
111 * Accessor functions for some DS and BTS fields using the above 143 * We either support (system-wide) per-cpu or per-thread allocation.
112 * global ptrace_bts_cfg. 144 * We distinguish the two based on the task_struct pointer, where a
145 * NULL pointer indicates per-cpu allocation for the current cpu.
146 *
147 * Allocations are use-counted. As soon as resources are allocated,
148 * further allocations must be of the same type (per-cpu or
149 * per-thread). We model this by counting allocations (i.e. the number
150 * of tracers of a certain type) for one type negatively:
151 * =0 no tracers
152 * >0 number of per-thread tracers
153 * <0 number of per-cpu tracers
154 *
155 * The below functions to get and put tracers and to check the
156 * allocation type require the ds_lock to be held by the caller.
157 *
158 * Tracers essentially gives the number of ds contexts for a certain
159 * type of allocation.
113 */ 160 */
114static inline unsigned long get_bts_buffer_base(char *base) 161static long tracers;
162
163static inline void get_tracer(struct task_struct *task)
115{ 164{
116 return *(unsigned long *)(base + ds_cfg.bts_buffer_base.offset); 165 tracers += (task ? 1 : -1);
117} 166}
118static inline void set_bts_buffer_base(char *base, unsigned long value) 167
168static inline void put_tracer(struct task_struct *task)
119{ 169{
120 (*(unsigned long *)(base + ds_cfg.bts_buffer_base.offset)) = value; 170 tracers -= (task ? 1 : -1);
121} 171}
122static inline unsigned long get_bts_index(char *base) 172
173static inline int check_tracer(struct task_struct *task)
123{ 174{
124 return *(unsigned long *)(base + ds_cfg.bts_index.offset); 175 return (task ? (tracers >= 0) : (tracers <= 0));
125} 176}
126static inline void set_bts_index(char *base, unsigned long value) 177
178
179/*
180 * The DS context is either attached to a thread or to a cpu:
181 * - in the former case, the thread_struct contains a pointer to the
182 * attached context.
183 * - in the latter case, we use a static array of per-cpu context
184 * pointers.
185 *
186 * Contexts are use-counted. They are allocated on first access and
187 * deallocated when the last user puts the context.
188 *
189 * We distinguish between an allocating and a non-allocating get of a
190 * context:
191 * - the allocating get is used for requesting BTS/PEBS resources. It
192 * requires the caller to hold the global ds_lock.
193 * - the non-allocating get is used for all other cases. A
194 * non-existing context indicates an error. It acquires and releases
195 * the ds_lock itself for obtaining the context.
196 *
197 * A context and its DS configuration are allocated and deallocated
198 * together. A context always has a DS configuration of the
199 * appropriate size.
200 */
201static DEFINE_PER_CPU(struct ds_context *, system_context);
202
203#define this_system_context per_cpu(system_context, smp_processor_id())
204
205/*
206 * Returns the pointer to the parameter task's context or to the
207 * system-wide context, if task is NULL.
208 *
209 * Increases the use count of the returned context, if not NULL.
210 */
211static inline struct ds_context *ds_get_context(struct task_struct *task)
127{ 212{
128 (*(unsigned long *)(base + ds_cfg.bts_index.offset)) = value; 213 struct ds_context *context;
214
215 spin_lock(&ds_lock);
216
217 context = (task ? task->thread.ds_ctx : this_system_context);
218 if (context)
219 context->count++;
220
221 spin_unlock(&ds_lock);
222
223 return context;
129} 224}
130static inline unsigned long get_bts_absolute_maximum(char *base) 225
226/*
227 * Same as ds_get_context, but allocates the context and it's DS
228 * structure, if necessary; returns NULL; if out of memory.
229 *
230 * pre: requires ds_lock to be held
231 */
232static inline struct ds_context *ds_alloc_context(struct task_struct *task)
131{ 233{
132 return *(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset); 234 struct ds_context **p_context =
235 (task ? &task->thread.ds_ctx : &this_system_context);
236 struct ds_context *context = *p_context;
237
238 if (!context) {
239 context = kzalloc(sizeof(*context), GFP_KERNEL);
240
241 if (!context)
242 return NULL;
243
244 context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
245 if (!context->ds) {
246 kfree(context);
247 return NULL;
248 }
249
250 *p_context = context;
251
252 context->this = p_context;
253 context->task = task;
254
255 if (task)
256 set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
257
258 if (!task || (task == current))
259 wrmsr(MSR_IA32_DS_AREA, (unsigned long)context->ds, 0);
260
261 get_tracer(task);
262 }
263
264 context->count++;
265
266 return context;
133} 267}
134static inline void set_bts_absolute_maximum(char *base, unsigned long value) 268
269/*
270 * Decreases the use count of the parameter context, if not NULL.
271 * Deallocates the context, if the use count reaches zero.
272 */
273static inline void ds_put_context(struct ds_context *context)
135{ 274{
136 (*(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset)) = value; 275 if (!context)
276 return;
277
278 spin_lock(&ds_lock);
279
280 if (--context->count)
281 goto out;
282
283 *(context->this) = NULL;
284
285 if (context->task)
286 clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
287
288 if (!context->task || (context->task == current))
289 wrmsrl(MSR_IA32_DS_AREA, 0);
290
291 put_tracer(context->task);
292
293 /* free any leftover buffers from tracers that did not
294 * deallocate them properly. */
295 kfree(context->buffer[ds_bts]);
296 kfree(context->buffer[ds_pebs]);
297 kfree(context->ds);
298 kfree(context);
299 out:
300 spin_unlock(&ds_lock);
137} 301}
138static inline unsigned long get_bts_interrupt_threshold(char *base) 302
303
304/*
305 * Handle a buffer overflow
306 *
307 * task: the task whose buffers are overflowing;
308 * NULL for a buffer overflow on the current cpu
309 * context: the ds context
310 * qual: the buffer type
311 */
312static void ds_overflow(struct task_struct *task, struct ds_context *context,
313 enum ds_qualifier qual)
139{ 314{
140 return *(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset); 315 if (!context)
316 return;
317
318 if (context->callback[qual])
319 (*context->callback[qual])(task);
320
321 /* todo: do some more overflow handling */
141} 322}
142static inline void set_bts_interrupt_threshold(char *base, unsigned long value) 323
324
325/*
326 * Allocate a non-pageable buffer of the parameter size.
327 * Checks the memory and the locked memory rlimit.
328 *
329 * Returns the buffer, if successful;
330 * NULL, if out of memory or rlimit exceeded.
331 *
332 * size: the requested buffer size in bytes
333 * pages (out): if not NULL, contains the number of pages reserved
334 */
335static inline void *ds_allocate_buffer(size_t size, unsigned int *pages)
143{ 336{
144 (*(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset)) = value; 337 unsigned long rlim, vm, pgsz;
338 void *buffer;
339
340 pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
341
342 rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
343 vm = current->mm->total_vm + pgsz;
344 if (rlim < vm)
345 return NULL;
346
347 rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
348 vm = current->mm->locked_vm + pgsz;
349 if (rlim < vm)
350 return NULL;
351
352 buffer = kzalloc(size, GFP_KERNEL);
353 if (!buffer)
354 return NULL;
355
356 current->mm->total_vm += pgsz;
357 current->mm->locked_vm += pgsz;
358
359 if (pages)
360 *pages = pgsz;
361
362 return buffer;
145} 363}
146static inline unsigned long get_from_ip(char *base) 364
365static int ds_request(struct task_struct *task, void *base, size_t size,
366 ds_ovfl_callback_t ovfl, enum ds_qualifier qual)
147{ 367{
148 return *(unsigned long *)(base + ds_cfg.from_ip.offset); 368 struct ds_context *context;
369 unsigned long buffer, adj;
370 const unsigned long alignment = (1 << 3);
371 int error = 0;
372
373 if (!ds_cfg.sizeof_ds)
374 return -EOPNOTSUPP;
375
376 /* we require some space to do alignment adjustments below */
377 if (size < (alignment + ds_cfg.sizeof_rec[qual]))
378 return -EINVAL;
379
380 /* buffer overflow notification is not yet implemented */
381 if (ovfl)
382 return -EOPNOTSUPP;
383
384
385 spin_lock(&ds_lock);
386
387 if (!check_tracer(task))
388 return -EPERM;
389
390 error = -ENOMEM;
391 context = ds_alloc_context(task);
392 if (!context)
393 goto out_unlock;
394
395 error = -EALREADY;
396 if (context->owner[qual] == current)
397 goto out_unlock;
398 error = -EPERM;
399 if (context->owner[qual] != NULL)
400 goto out_unlock;
401 context->owner[qual] = current;
402
403 spin_unlock(&ds_lock);
404
405
406 error = -ENOMEM;
407 if (!base) {
408 base = ds_allocate_buffer(size, &context->pages[qual]);
409 if (!base)
410 goto out_release;
411
412 context->buffer[qual] = base;
413 }
414 error = 0;
415
416 context->callback[qual] = ovfl;
417
418 /* adjust the buffer address and size to meet alignment
419 * constraints:
420 * - buffer is double-word aligned
421 * - size is multiple of record size
422 *
423 * We checked the size at the very beginning; we have enough
424 * space to do the adjustment.
425 */
426 buffer = (unsigned long)base;
427
428 adj = ALIGN(buffer, alignment) - buffer;
429 buffer += adj;
430 size -= adj;
431
432 size /= ds_cfg.sizeof_rec[qual];
433 size *= ds_cfg.sizeof_rec[qual];
434
435 ds_set(context->ds, qual, ds_buffer_base, buffer);
436 ds_set(context->ds, qual, ds_index, buffer);
437 ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
438
439 if (ovfl) {
440 /* todo: select a suitable interrupt threshold */
441 } else
442 ds_set(context->ds, qual,
443 ds_interrupt_threshold, buffer + size + 1);
444
445 /* we keep the context until ds_release */
446 return error;
447
448 out_release:
449 context->owner[qual] = NULL;
450 ds_put_context(context);
451 return error;
452
453 out_unlock:
454 spin_unlock(&ds_lock);
455 ds_put_context(context);
456 return error;
149} 457}
150static inline void set_from_ip(char *base, unsigned long value) 458
459int ds_request_bts(struct task_struct *task, void *base, size_t size,
460 ds_ovfl_callback_t ovfl)
151{ 461{
152 (*(unsigned long *)(base + ds_cfg.from_ip.offset)) = value; 462 return ds_request(task, base, size, ovfl, ds_bts);
153} 463}
154static inline unsigned long get_to_ip(char *base) 464
465int ds_request_pebs(struct task_struct *task, void *base, size_t size,
466 ds_ovfl_callback_t ovfl)
155{ 467{
156 return *(unsigned long *)(base + ds_cfg.to_ip.offset); 468 return ds_request(task, base, size, ovfl, ds_pebs);
157} 469}
158static inline void set_to_ip(char *base, unsigned long value) 470
471static int ds_release(struct task_struct *task, enum ds_qualifier qual)
159{ 472{
160 (*(unsigned long *)(base + ds_cfg.to_ip.offset)) = value; 473 struct ds_context *context;
474 int error;
475
476 context = ds_get_context(task);
477 error = ds_validate_access(context, qual);
478 if (error < 0)
479 goto out;
480
481 kfree(context->buffer[qual]);
482 context->buffer[qual] = NULL;
483
484 current->mm->total_vm -= context->pages[qual];
485 current->mm->locked_vm -= context->pages[qual];
486 context->pages[qual] = 0;
487 context->owner[qual] = NULL;
488
489 /*
490 * we put the context twice:
491 * once for the ds_get_context
492 * once for the corresponding ds_request
493 */
494 ds_put_context(context);
495 out:
496 ds_put_context(context);
497 return error;
161} 498}
162static inline unsigned char get_info_type(char *base) 499
500int ds_release_bts(struct task_struct *task)
163{ 501{
164 return *(unsigned char *)(base + ds_cfg.info_type.offset); 502 return ds_release(task, ds_bts);
165} 503}
166static inline void set_info_type(char *base, unsigned char value) 504
505int ds_release_pebs(struct task_struct *task)
167{ 506{
168 (*(unsigned char *)(base + ds_cfg.info_type.offset)) = value; 507 return ds_release(task, ds_pebs);
169} 508}
170static inline unsigned long get_info_data(char *base) 509
510static int ds_get_index(struct task_struct *task, size_t *pos,
511 enum ds_qualifier qual)
171{ 512{
172 return *(unsigned long *)(base + ds_cfg.info_data.offset); 513 struct ds_context *context;
514 unsigned long base, index;
515 int error;
516
517 context = ds_get_context(task);
518 error = ds_validate_access(context, qual);
519 if (error < 0)
520 goto out;
521
522 base = ds_get(context->ds, qual, ds_buffer_base);
523 index = ds_get(context->ds, qual, ds_index);
524
525 error = ((index - base) / ds_cfg.sizeof_rec[qual]);
526 if (pos)
527 *pos = error;
528 out:
529 ds_put_context(context);
530 return error;
173} 531}
174static inline void set_info_data(char *base, unsigned long value) 532
533int ds_get_bts_index(struct task_struct *task, size_t *pos)
175{ 534{
176 (*(unsigned long *)(base + ds_cfg.info_data.offset)) = value; 535 return ds_get_index(task, pos, ds_bts);
177} 536}
178 537
538int ds_get_pebs_index(struct task_struct *task, size_t *pos)
539{
540 return ds_get_index(task, pos, ds_pebs);
541}
179 542
180int ds_allocate(void **dsp, size_t bts_size_in_bytes) 543static int ds_get_end(struct task_struct *task, size_t *pos,
544 enum ds_qualifier qual)
181{ 545{
182 size_t bts_size_in_records; 546 struct ds_context *context;
183 unsigned long bts; 547 unsigned long base, end;
184 void *ds; 548 int error;
549
550 context = ds_get_context(task);
551 error = ds_validate_access(context, qual);
552 if (error < 0)
553 goto out;
554
555 base = ds_get(context->ds, qual, ds_buffer_base);
556 end = ds_get(context->ds, qual, ds_absolute_maximum);
557
558 error = ((end - base) / ds_cfg.sizeof_rec[qual]);
559 if (pos)
560 *pos = error;
561 out:
562 ds_put_context(context);
563 return error;
564}
185 565
186 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) 566int ds_get_bts_end(struct task_struct *task, size_t *pos)
187 return -EOPNOTSUPP; 567{
568 return ds_get_end(task, pos, ds_bts);
569}
188 570
189 if (bts_size_in_bytes < 0) 571int ds_get_pebs_end(struct task_struct *task, size_t *pos)
190 return -EINVAL; 572{
573 return ds_get_end(task, pos, ds_pebs);
574}
191 575
192 bts_size_in_records = 576static int ds_access(struct task_struct *task, size_t index,
193 bts_size_in_bytes / ds_cfg.sizeof_bts; 577 const void **record, enum ds_qualifier qual)
194 bts_size_in_bytes = 578{
195 bts_size_in_records * ds_cfg.sizeof_bts; 579 struct ds_context *context;
580 unsigned long base, idx;
581 int error;
196 582
197 if (bts_size_in_bytes <= 0) 583 if (!record)
198 return -EINVAL; 584 return -EINVAL;
199 585
200 bts = (unsigned long)kzalloc(bts_size_in_bytes, GFP_KERNEL); 586 context = ds_get_context(task);
201 587 error = ds_validate_access(context, qual);
202 if (!bts) 588 if (error < 0)
203 return -ENOMEM; 589 goto out;
204 590
205 ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); 591 base = ds_get(context->ds, qual, ds_buffer_base);
592 idx = base + (index * ds_cfg.sizeof_rec[qual]);
206 593
207 if (!ds) { 594 error = -EINVAL;
208 kfree((void *)bts); 595 if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
209 return -ENOMEM; 596 goto out;
210 }
211
212 set_bts_buffer_base(ds, bts);
213 set_bts_index(ds, bts);
214 set_bts_absolute_maximum(ds, bts + bts_size_in_bytes);
215 set_bts_interrupt_threshold(ds, bts + bts_size_in_bytes + 1);
216 597
217 *dsp = ds; 598 *record = (const void *)idx;
218 return 0; 599 error = ds_cfg.sizeof_rec[qual];
600 out:
601 ds_put_context(context);
602 return error;
219} 603}
220 604
221int ds_free(void **dsp) 605int ds_access_bts(struct task_struct *task, size_t index, const void **record)
222{ 606{
223 if (*dsp) { 607 return ds_access(task, index, record, ds_bts);
224 kfree((void *)get_bts_buffer_base(*dsp));
225 kfree(*dsp);
226 *dsp = NULL;
227 }
228 return 0;
229} 608}
230 609
231int ds_get_bts_size(void *ds) 610int ds_access_pebs(struct task_struct *task, size_t index, const void **record)
232{ 611{
233 int size_in_bytes; 612 return ds_access(task, index, record, ds_pebs);
234
235 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
236 return -EOPNOTSUPP;
237
238 if (!ds)
239 return 0;
240
241 size_in_bytes =
242 get_bts_absolute_maximum(ds) -
243 get_bts_buffer_base(ds);
244 return size_in_bytes;
245} 613}
246 614
247int ds_get_bts_end(void *ds) 615static int ds_write(struct task_struct *task, const void *record, size_t size,
616 enum ds_qualifier qual, int force)
248{ 617{
249 int size_in_bytes = ds_get_bts_size(ds); 618 struct ds_context *context;
250 619 int error;
251 if (size_in_bytes <= 0)
252 return size_in_bytes;
253 620
254 return size_in_bytes / ds_cfg.sizeof_bts; 621 if (!record)
255} 622 return -EINVAL;
256 623
257int ds_get_bts_index(void *ds) 624 error = -EPERM;
258{ 625 context = ds_get_context(task);
259 int index_offset_in_bytes; 626 if (!context)
627 goto out;
260 628
261 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) 629 if (!force) {
262 return -EOPNOTSUPP; 630 error = ds_validate_access(context, qual);
631 if (error < 0)
632 goto out;
633 }
263 634
264 index_offset_in_bytes = 635 error = 0;
265 get_bts_index(ds) - 636 while (size) {
266 get_bts_buffer_base(ds); 637 unsigned long base, index, end, write_end, int_th;
638 unsigned long write_size, adj_write_size;
639
640 /*
641 * write as much as possible without producing an
642 * overflow interrupt.
643 *
644 * interrupt_threshold must either be
645 * - bigger than absolute_maximum or
646 * - point to a record between buffer_base and absolute_maximum
647 *
648 * index points to a valid record.
649 */
650 base = ds_get(context->ds, qual, ds_buffer_base);
651 index = ds_get(context->ds, qual, ds_index);
652 end = ds_get(context->ds, qual, ds_absolute_maximum);
653 int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
654
655 write_end = min(end, int_th);
656
657 /* if we are already beyond the interrupt threshold,
658 * we fill the entire buffer */
659 if (write_end <= index)
660 write_end = end;
661
662 if (write_end <= index)
663 goto out;
664
665 write_size = min((unsigned long) size, write_end - index);
666 memcpy((void *)index, record, write_size);
667
668 record = (const char *)record + write_size;
669 size -= write_size;
670 error += write_size;
671
672 adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
673 adj_write_size *= ds_cfg.sizeof_rec[qual];
674
675 /* zero out trailing bytes */
676 memset((char *)index + write_size, 0,
677 adj_write_size - write_size);
678 index += adj_write_size;
679
680 if (index >= end)
681 index = base;
682 ds_set(context->ds, qual, ds_index, index);
683
684 if (index >= int_th)
685 ds_overflow(task, context, qual);
686 }
267 687
268 return index_offset_in_bytes / ds_cfg.sizeof_bts; 688 out:
689 ds_put_context(context);
690 return error;
269} 691}
270 692
271int ds_set_overflow(void *ds, int method) 693int ds_write_bts(struct task_struct *task, const void *record, size_t size)
272{ 694{
273 switch (method) { 695 return ds_write(task, record, size, ds_bts, /* force = */ 0);
274 case DS_O_SIGNAL:
275 return -EOPNOTSUPP;
276 case DS_O_WRAP:
277 return 0;
278 default:
279 return -EINVAL;
280 }
281} 696}
282 697
283int ds_get_overflow(void *ds) 698int ds_write_pebs(struct task_struct *task, const void *record, size_t size)
284{ 699{
285 return DS_O_WRAP; 700 return ds_write(task, record, size, ds_pebs, /* force = */ 0);
286} 701}
287 702
288int ds_clear(void *ds) 703int ds_unchecked_write_bts(struct task_struct *task,
704 const void *record, size_t size)
289{ 705{
290 int bts_size = ds_get_bts_size(ds); 706 return ds_write(task, record, size, ds_bts, /* force = */ 1);
291 unsigned long bts_base;
292
293 if (bts_size <= 0)
294 return bts_size;
295
296 bts_base = get_bts_buffer_base(ds);
297 memset((void *)bts_base, 0, bts_size);
298
299 set_bts_index(ds, bts_base);
300 return 0;
301} 707}
302 708
303int ds_read_bts(void *ds, int index, struct bts_struct *out) 709int ds_unchecked_write_pebs(struct task_struct *task,
710 const void *record, size_t size)
304{ 711{
305 void *bts; 712 return ds_write(task, record, size, ds_pebs, /* force = */ 1);
713}
306 714
307 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) 715static int ds_reset_or_clear(struct task_struct *task,
308 return -EOPNOTSUPP; 716 enum ds_qualifier qual, int clear)
717{
718 struct ds_context *context;
719 unsigned long base, end;
720 int error;
309 721
310 if (index < 0) 722 context = ds_get_context(task);
311 return -EINVAL; 723 error = ds_validate_access(context, qual);
724 if (error < 0)
725 goto out;
312 726
313 if (index >= ds_get_bts_size(ds)) 727 base = ds_get(context->ds, qual, ds_buffer_base);
314 return -EINVAL; 728 end = ds_get(context->ds, qual, ds_absolute_maximum);
315 729
316 bts = (void *)(get_bts_buffer_base(ds) + (index * ds_cfg.sizeof_bts)); 730 if (clear)
731 memset((void *)base, 0, end - base);
317 732
318 memset(out, 0, sizeof(*out)); 733 ds_set(context->ds, qual, ds_index, base);
319 if (get_from_ip(bts) == BTS_ESCAPE_ADDRESS) {
320 out->qualifier = get_info_type(bts);
321 out->variant.jiffies = get_info_data(bts);
322 } else {
323 out->qualifier = BTS_BRANCH;
324 out->variant.lbr.from_ip = get_from_ip(bts);
325 out->variant.lbr.to_ip = get_to_ip(bts);
326 }
327 734
328 return sizeof(*out);; 735 error = 0;
736 out:
737 ds_put_context(context);
738 return error;
329} 739}
330 740
331int ds_write_bts(void *ds, const struct bts_struct *in) 741int ds_reset_bts(struct task_struct *task)
332{ 742{
333 unsigned long bts; 743 return ds_reset_or_clear(task, ds_bts, /* clear = */ 0);
334 744}
335 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
336 return -EOPNOTSUPP;
337
338 if (ds_get_bts_size(ds) <= 0)
339 return -ENXIO;
340 745
341 bts = get_bts_index(ds); 746int ds_reset_pebs(struct task_struct *task)
747{
748 return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0);
749}
342 750
343 memset((void *)bts, 0, ds_cfg.sizeof_bts); 751int ds_clear_bts(struct task_struct *task)
344 switch (in->qualifier) { 752{
345 case BTS_INVALID: 753 return ds_reset_or_clear(task, ds_bts, /* clear = */ 1);
346 break; 754}
347 755
348 case BTS_BRANCH: 756int ds_clear_pebs(struct task_struct *task)
349 set_from_ip((void *)bts, in->variant.lbr.from_ip); 757{
350 set_to_ip((void *)bts, in->variant.lbr.to_ip); 758 return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1);
351 break; 759}
352 760
353 case BTS_TASK_ARRIVES: 761int ds_get_pebs_reset(struct task_struct *task, u64 *value)
354 case BTS_TASK_DEPARTS: 762{
355 set_from_ip((void *)bts, BTS_ESCAPE_ADDRESS); 763 struct ds_context *context;
356 set_info_type((void *)bts, in->qualifier); 764 int error;
357 set_info_data((void *)bts, in->variant.jiffies);
358 break;
359 765
360 default: 766 if (!value)
361 return -EINVAL; 767 return -EINVAL;
362 }
363 768
364 bts = bts + ds_cfg.sizeof_bts; 769 context = ds_get_context(task);
365 if (bts >= get_bts_absolute_maximum(ds)) 770 error = ds_validate_access(context, ds_pebs);
366 bts = get_bts_buffer_base(ds); 771 if (error < 0)
367 set_bts_index(ds, bts); 772 goto out;
368 773
369 return ds_cfg.sizeof_bts; 774 *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8));
775
776 error = 0;
777 out:
778 ds_put_context(context);
779 return error;
370} 780}
371 781
372unsigned long ds_debugctl_mask(void) 782int ds_set_pebs_reset(struct task_struct *task, u64 value)
373{ 783{
374 return ds_cfg.debugctl_mask; 784 struct ds_context *context;
375} 785 int error;
376 786
377#ifdef __i386__ 787 context = ds_get_context(task);
378static const struct ds_configuration ds_cfg_netburst = { 788 error = ds_validate_access(context, ds_pebs);
379 .sizeof_ds = 9 * 4, 789 if (error < 0)
380 .bts_buffer_base = { 0, 4 }, 790 goto out;
381 .bts_index = { 4, 4 },
382 .bts_absolute_maximum = { 8, 4 },
383 .bts_interrupt_threshold = { 12, 4 },
384 .sizeof_bts = 3 * 4,
385 .from_ip = { 0, 4 },
386 .to_ip = { 4, 4 },
387 .info_type = { 4, 1 },
388 .info_data = { 8, 4 },
389 .debugctl_mask = (1<<2)|(1<<3)
390};
391 791
392static const struct ds_configuration ds_cfg_pentium_m = { 792 *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value;
393 .sizeof_ds = 9 * 4, 793
394 .bts_buffer_base = { 0, 4 }, 794 error = 0;
395 .bts_index = { 4, 4 }, 795 out:
396 .bts_absolute_maximum = { 8, 4 }, 796 ds_put_context(context);
397 .bts_interrupt_threshold = { 12, 4 }, 797 return error;
398 .sizeof_bts = 3 * 4, 798}
399 .from_ip = { 0, 4 }, 799
400 .to_ip = { 4, 4 }, 800static const struct ds_configuration ds_cfg_var = {
401 .info_type = { 4, 1 }, 801 .sizeof_ds = sizeof(long) * 12,
402 .info_data = { 8, 4 }, 802 .sizeof_field = sizeof(long),
403 .debugctl_mask = (1<<6)|(1<<7) 803 .sizeof_rec[ds_bts] = sizeof(long) * 3,
804 .sizeof_rec[ds_pebs] = sizeof(long) * 10
404}; 805};
405#endif /* _i386_ */ 806static const struct ds_configuration ds_cfg_64 = {
406 807 .sizeof_ds = 8 * 12,
407static const struct ds_configuration ds_cfg_core2 = { 808 .sizeof_field = 8,
408 .sizeof_ds = 9 * 8, 809 .sizeof_rec[ds_bts] = 8 * 3,
409 .bts_buffer_base = { 0, 8 }, 810 .sizeof_rec[ds_pebs] = 8 * 10
410 .bts_index = { 8, 8 },
411 .bts_absolute_maximum = { 16, 8 },
412 .bts_interrupt_threshold = { 24, 8 },
413 .sizeof_bts = 3 * 8,
414 .from_ip = { 0, 8 },
415 .to_ip = { 8, 8 },
416 .info_type = { 8, 1 },
417 .info_data = { 16, 8 },
418 .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
419}; 811};
420 812
421static inline void 813static inline void
@@ -429,14 +821,13 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
429 switch (c->x86) { 821 switch (c->x86) {
430 case 0x6: 822 case 0x6:
431 switch (c->x86_model) { 823 switch (c->x86_model) {
432#ifdef __i386__
433 case 0xD: 824 case 0xD:
434 case 0xE: /* Pentium M */ 825 case 0xE: /* Pentium M */
435 ds_configure(&ds_cfg_pentium_m); 826 ds_configure(&ds_cfg_var);
436 break; 827 break;
437#endif /* _i386_ */
438 case 0xF: /* Core2 */ 828 case 0xF: /* Core2 */
439 ds_configure(&ds_cfg_core2); 829 case 0x1C: /* Atom */
830 ds_configure(&ds_cfg_64);
440 break; 831 break;
441 default: 832 default:
442 /* sorry, don't know about them */ 833 /* sorry, don't know about them */
@@ -445,13 +836,11 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
445 break; 836 break;
446 case 0xF: 837 case 0xF:
447 switch (c->x86_model) { 838 switch (c->x86_model) {
448#ifdef __i386__
449 case 0x0: 839 case 0x0:
450 case 0x1: 840 case 0x1:
451 case 0x2: /* Netburst */ 841 case 0x2: /* Netburst */
452 ds_configure(&ds_cfg_netburst); 842 ds_configure(&ds_cfg_var);
453 break; 843 break;
454#endif /* _i386_ */
455 default: 844 default:
456 /* sorry, don't know about them */ 845 /* sorry, don't know about them */
457 break; 846 break;
@@ -462,3 +851,14 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
462 break; 851 break;
463 } 852 }
464} 853}
854
855void ds_free(struct ds_context *context)
856{
857 /* This is called when the task owning the parameter context
858 * is dying. There should not be any user of that context left
859 * to disturb us, anymore. */
860 unsigned long leftovers = context->count;
861 while (leftovers--)
862 ds_put_context(context);
863}
864#endif /* CONFIG_X86_DS */
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 9af89078f7bb..66e48aa2dd1b 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1203,7 +1203,7 @@ static int __init parse_memmap_opt(char *p)
1203 if (!p) 1203 if (!p)
1204 return -EINVAL; 1204 return -EINVAL;
1205 1205
1206 if (!strcmp(p, "exactmap")) { 1206 if (!strncmp(p, "exactmap", 8)) {
1207#ifdef CONFIG_CRASH_DUMP 1207#ifdef CONFIG_CRASH_DUMP
1208 /* 1208 /*
1209 * If we are doing a crash dump, we still need to know 1209 * If we are doing a crash dump, we still need to know
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 4353cf5e6fac..24bb5faf5efa 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -95,6 +95,20 @@ static void __init nvidia_bugs(int num, int slot, int func)
95 95
96} 96}
97 97
98#ifdef CONFIG_DMAR
99static void __init intel_g33_dmar(int num, int slot, int func)
100{
101 struct acpi_table_header *dmar_tbl;
102 acpi_status status;
103
104 status = acpi_get_table(ACPI_SIG_DMAR, 0, &dmar_tbl);
105 if (ACPI_SUCCESS(status)) {
106 printk(KERN_INFO "BIOS BUG: DMAR advertised on Intel G31/G33 chipset -- ignoring\n");
107 dmar_disabled = 1;
108 }
109}
110#endif
111
98#define QFLAG_APPLY_ONCE 0x1 112#define QFLAG_APPLY_ONCE 0x1
99#define QFLAG_APPLIED 0x2 113#define QFLAG_APPLIED 0x2
100#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) 114#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -114,6 +128,10 @@ static struct chipset early_qrk[] __initdata = {
114 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs }, 128 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs },
115 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, 129 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
116 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, 130 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config },
131#ifdef CONFIG_DMAR
132 { PCI_VENDOR_ID_INTEL, 0x29c0,
133 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, intel_g33_dmar },
134#endif
117 {} 135 {}
118}; 136};
119 137
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 06cc8d4254b1..945a31cdd81f 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -414,9 +414,11 @@ void __init efi_init(void)
414 if (memmap.map == NULL) 414 if (memmap.map == NULL)
415 printk(KERN_ERR "Could not map the EFI memory map!\n"); 415 printk(KERN_ERR "Could not map the EFI memory map!\n");
416 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); 416 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
417
417 if (memmap.desc_size != sizeof(efi_memory_desc_t)) 418 if (memmap.desc_size != sizeof(efi_memory_desc_t))
418 printk(KERN_WARNING "Kernel-defined memdesc" 419 printk(KERN_WARNING
419 "doesn't match the one from EFI!\n"); 420 "Kernel-defined memdesc doesn't match the one from EFI!\n");
421
420 if (add_efi_memmap) 422 if (add_efi_memmap)
421 do_add_efi_memmap(); 423 do_add_efi_memmap();
422 424
diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c
index 4b63c8e1f13b..5cab48ee61a4 100644
--- a/arch/x86/kernel/efi_32.c
+++ b/arch/x86/kernel/efi_32.c
@@ -53,7 +53,7 @@ void efi_call_phys_prelog(void)
53 * directory. If I have PAE, I just need to duplicate one entry in 53 * directory. If I have PAE, I just need to duplicate one entry in
54 * page directory. 54 * page directory.
55 */ 55 */
56 cr4 = read_cr4(); 56 cr4 = read_cr4_safe();
57 57
58 if (cr4 & X86_CR4_PAE) { 58 if (cr4 & X86_CR4_PAE) {
59 efi_bak_pg_dir_pointer[0].pgd = 59 efi_bak_pg_dir_pointer[0].pgd =
@@ -91,7 +91,7 @@ void efi_call_phys_epilog(void)
91 gdt_descr.size = GDT_SIZE - 1; 91 gdt_descr.size = GDT_SIZE - 1;
92 load_gdt(&gdt_descr); 92 load_gdt(&gdt_descr);
93 93
94 cr4 = read_cr4(); 94 cr4 = read_cr4_safe();
95 95
96 if (cr4 & X86_CR4_PAE) { 96 if (cr4 & X86_CR4_PAE) {
97 swapper_pg_dir[pgd_index(0)].pgd = 97 swapper_pg_dir[pgd_index(0)].pgd =
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index 16a93ed7baf1..ae2ffc8a400c 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -286,7 +286,7 @@ static __init void map_low_mmrs(void)
286 286
287enum map_type {map_wb, map_uc}; 287enum map_type {map_wb, map_uc};
288 288
289static void map_high(char *id, unsigned long base, int shift, enum map_type map_type) 289static __init void map_high(char *id, unsigned long base, int shift, enum map_type map_type)
290{ 290{
291 unsigned long bytes, paddr; 291 unsigned long bytes, paddr;
292 292
@@ -357,7 +357,9 @@ static __init void uv_rtc_init(void)
357 sn_rtc_cycles_per_second = ticks_per_sec; 357 sn_rtc_cycles_per_second = ticks_per_sec;
358} 358}
359 359
360static __init void uv_system_init(void) 360static bool uv_system_inited;
361
362void __init uv_system_init(void)
361{ 363{
362 union uvh_si_addr_map_config_u m_n_config; 364 union uvh_si_addr_map_config_u m_n_config;
363 union uvh_node_id_u node_id; 365 union uvh_node_id_u node_id;
@@ -447,6 +449,7 @@ static __init void uv_system_init(void)
447 map_mmr_high(max_pnode); 449 map_mmr_high(max_pnode);
448 map_config_high(max_pnode); 450 map_config_high(max_pnode);
449 map_mmioh_high(max_pnode); 451 map_mmioh_high(max_pnode);
452 uv_system_inited = true;
450} 453}
451 454
452/* 455/*
@@ -455,8 +458,7 @@ static __init void uv_system_init(void)
455 */ 458 */
456void __cpuinit uv_cpu_init(void) 459void __cpuinit uv_cpu_init(void)
457{ 460{
458 if (!uv_node_to_blade) 461 BUG_ON(!uv_system_inited);
459 uv_system_init();
460 462
461 uv_blade_info[uv_numa_blade_id()].nr_online_cpus++; 463 uv_blade_info[uv_numa_blade_id()].nr_online_cpus++;
462 464
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 1b318e903bf6..d16084f90649 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -88,6 +88,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
88 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); 88 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
89 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == 89 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
90 (__START_KERNEL & PGDIR_MASK))); 90 (__START_KERNEL & PGDIR_MASK)));
91 BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
91 92
92 /* clear bss before set_intr_gate with early_idt_handler */ 93 /* clear bss before set_intr_gate with early_idt_handler */
93 clear_bss(); 94 clear_bss();
@@ -107,12 +108,11 @@ void __init x86_64_start_kernel(char * real_mode_data)
107 } 108 }
108 load_idt((const struct desc_ptr *)&idt_descr); 109 load_idt((const struct desc_ptr *)&idt_descr);
109 110
110 early_printk("Kernel alive\n"); 111 if (console_loglevel == 10)
112 early_printk("Kernel alive\n");
111 113
112 x86_64_init_pda(); 114 x86_64_init_pda();
113 115
114 early_printk("Kernel really alive\n");
115
116 x86_64_start_reservations(real_mode_data); 116 x86_64_start_reservations(real_mode_data);
117} 117}
118 118
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index a7010c3a377a..e835b4eea70b 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -172,10 +172,6 @@ num_subarch_entries = (. - subarch_entries) / 4
172 * 172 *
173 * Note that the stack is not yet set up! 173 * Note that the stack is not yet set up!
174 */ 174 */
175#define PTE_ATTR 0x007 /* PRESENT+RW+USER */
176#define PDE_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */
177#define PGD_ATTR 0x001 /* PRESENT (no other attributes) */
178
179default_entry: 175default_entry:
180#ifdef CONFIG_X86_PAE 176#ifdef CONFIG_X86_PAE
181 177
@@ -196,9 +192,9 @@ default_entry:
196 movl $pa(pg0), %edi 192 movl $pa(pg0), %edi
197 movl %edi, pa(init_pg_tables_start) 193 movl %edi, pa(init_pg_tables_start)
198 movl $pa(swapper_pg_pmd), %edx 194 movl $pa(swapper_pg_pmd), %edx
199 movl $PTE_ATTR, %eax 195 movl $PTE_IDENT_ATTR, %eax
20010: 19610:
201 leal PDE_ATTR(%edi),%ecx /* Create PMD entry */ 197 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */
202 movl %ecx,(%edx) /* Store PMD entry */ 198 movl %ecx,(%edx) /* Store PMD entry */
203 /* Upper half already zero */ 199 /* Upper half already zero */
204 addl $8,%edx 200 addl $8,%edx
@@ -215,7 +211,7 @@ default_entry:
215 * End condition: we must map up to and including INIT_MAP_BEYOND_END 211 * End condition: we must map up to and including INIT_MAP_BEYOND_END
216 * bytes beyond the end of our own page tables. 212 * bytes beyond the end of our own page tables.
217 */ 213 */
218 leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp 214 leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
219 cmpl %ebp,%eax 215 cmpl %ebp,%eax
220 jb 10b 216 jb 10b
2211: 2171:
@@ -224,7 +220,7 @@ default_entry:
224 movl %eax, pa(max_pfn_mapped) 220 movl %eax, pa(max_pfn_mapped)
225 221
226 /* Do early initialization of the fixmap area */ 222 /* Do early initialization of the fixmap area */
227 movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax 223 movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
228 movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8) 224 movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8)
229#else /* Not PAE */ 225#else /* Not PAE */
230 226
@@ -233,9 +229,9 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
233 movl $pa(pg0), %edi 229 movl $pa(pg0), %edi
234 movl %edi, pa(init_pg_tables_start) 230 movl %edi, pa(init_pg_tables_start)
235 movl $pa(swapper_pg_dir), %edx 231 movl $pa(swapper_pg_dir), %edx
236 movl $PTE_ATTR, %eax 232 movl $PTE_IDENT_ATTR, %eax
23710: 23310:
238 leal PDE_ATTR(%edi),%ecx /* Create PDE entry */ 234 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */
239 movl %ecx,(%edx) /* Store identity PDE entry */ 235 movl %ecx,(%edx) /* Store identity PDE entry */
240 movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ 236 movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
241 addl $4,%edx 237 addl $4,%edx
@@ -249,7 +245,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
249 * bytes beyond the end of our own page tables; the +0x007 is 245 * bytes beyond the end of our own page tables; the +0x007 is
250 * the attribute bits 246 * the attribute bits
251 */ 247 */
252 leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp 248 leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
253 cmpl %ebp,%eax 249 cmpl %ebp,%eax
254 jb 10b 250 jb 10b
255 movl %edi,pa(init_pg_tables_end) 251 movl %edi,pa(init_pg_tables_end)
@@ -257,7 +253,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
257 movl %eax, pa(max_pfn_mapped) 253 movl %eax, pa(max_pfn_mapped)
258 254
259 /* Do early initialization of the fixmap area */ 255 /* Do early initialization of the fixmap area */
260 movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax 256 movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
261 movl %eax,pa(swapper_pg_dir+0xffc) 257 movl %eax,pa(swapper_pg_dir+0xffc)
262#endif 258#endif
263 jmp 3f 259 jmp 3f
@@ -634,19 +630,19 @@ ENTRY(empty_zero_page)
634 /* Page-aligned for the benefit of paravirt? */ 630 /* Page-aligned for the benefit of paravirt? */
635 .align PAGE_SIZE_asm 631 .align PAGE_SIZE_asm
636ENTRY(swapper_pg_dir) 632ENTRY(swapper_pg_dir)
637 .long pa(swapper_pg_pmd+PGD_ATTR),0 /* low identity map */ 633 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
638# if KPMDS == 3 634# if KPMDS == 3
639 .long pa(swapper_pg_pmd+PGD_ATTR),0 635 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
640 .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 636 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
641 .long pa(swapper_pg_pmd+PGD_ATTR+0x2000),0 637 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0
642# elif KPMDS == 2 638# elif KPMDS == 2
643 .long 0,0 639 .long 0,0
644 .long pa(swapper_pg_pmd+PGD_ATTR),0 640 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
645 .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 641 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
646# elif KPMDS == 1 642# elif KPMDS == 1
647 .long 0,0 643 .long 0,0
648 .long 0,0 644 .long 0,0
649 .long pa(swapper_pg_pmd+PGD_ATTR),0 645 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
650# else 646# else
651# error "Kernel PMDs should be 1, 2 or 3" 647# error "Kernel PMDs should be 1, 2 or 3"
652# endif 648# endif
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index db3280afe886..26cfdc1d7c7f 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -110,7 +110,7 @@ startup_64:
110 movq %rdi, %rax 110 movq %rdi, %rax
111 shrq $PMD_SHIFT, %rax 111 shrq $PMD_SHIFT, %rax
112 andq $(PTRS_PER_PMD - 1), %rax 112 andq $(PTRS_PER_PMD - 1), %rax
113 leaq __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx 113 leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
114 leaq level2_spare_pgt(%rip), %rbx 114 leaq level2_spare_pgt(%rip), %rbx
115 movq %rdx, 0(%rbx, %rax, 8) 115 movq %rdx, 0(%rbx, %rax, 8)
116ident_complete: 116ident_complete:
@@ -374,7 +374,7 @@ NEXT_PAGE(level2_ident_pgt)
374 /* Since I easily can, map the first 1G. 374 /* Since I easily can, map the first 1G.
375 * Don't set NX because code runs from these pages. 375 * Don't set NX because code runs from these pages.
376 */ 376 */
377 PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) 377 PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
378 378
379NEXT_PAGE(level2_kernel_pgt) 379NEXT_PAGE(level2_kernel_pgt)
380 /* 380 /*
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ad2b15a1334d..73deaffadd03 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -210,8 +210,8 @@ static void hpet_legacy_clockevent_register(void)
210 /* Calculate the min / max delta */ 210 /* Calculate the min / max delta */
211 hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, 211 hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
212 &hpet_clockevent); 212 &hpet_clockevent);
213 hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30, 213 /* 5 usec minimum reprogramming delta. */
214 &hpet_clockevent); 214 hpet_clockevent.min_delta_ns = 5000;
215 215
216 /* 216 /*
217 * Start hpet with the boot cpu mask and make it 217 * Start hpet with the boot cpu mask and make it
@@ -270,15 +270,22 @@ static void hpet_legacy_set_mode(enum clock_event_mode mode,
270} 270}
271 271
272static int hpet_legacy_next_event(unsigned long delta, 272static int hpet_legacy_next_event(unsigned long delta,
273 struct clock_event_device *evt) 273 struct clock_event_device *evt)
274{ 274{
275 unsigned long cnt; 275 u32 cnt;
276 276
277 cnt = hpet_readl(HPET_COUNTER); 277 cnt = hpet_readl(HPET_COUNTER);
278 cnt += delta; 278 cnt += (u32) delta;
279 hpet_writel(cnt, HPET_T0_CMP); 279 hpet_writel(cnt, HPET_T0_CMP);
280 280
281 return ((long)(hpet_readl(HPET_COUNTER) - cnt ) > 0) ? -ETIME : 0; 281 /*
282 * We need to read back the CMP register to make sure that
283 * what we wrote hit the chip before we compare it to the
284 * counter.
285 */
286 WARN_ON((u32)hpet_readl(HPET_T0_CMP) != cnt);
287
288 return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
282} 289}
283 290
284/* 291/*
@@ -359,6 +366,7 @@ static int hpet_clocksource_register(void)
359int __init hpet_enable(void) 366int __init hpet_enable(void)
360{ 367{
361 unsigned long id; 368 unsigned long id;
369 int i;
362 370
363 if (!is_hpet_capable()) 371 if (!is_hpet_capable())
364 return 0; 372 return 0;
@@ -369,6 +377,29 @@ int __init hpet_enable(void)
369 * Read the period and check for a sane value: 377 * Read the period and check for a sane value:
370 */ 378 */
371 hpet_period = hpet_readl(HPET_PERIOD); 379 hpet_period = hpet_readl(HPET_PERIOD);
380
381 /*
382 * AMD SB700 based systems with spread spectrum enabled use a
383 * SMM based HPET emulation to provide proper frequency
384 * setting. The SMM code is initialized with the first HPET
385 * register access and takes some time to complete. During
386 * this time the config register reads 0xffffffff. We check
387 * for max. 1000 loops whether the config register reads a non
388 * 0xffffffff value to make sure that HPET is up and running
389 * before we go further. A counting loop is safe, as the HPET
390 * access takes thousands of CPU cycles. On non SB700 based
391 * machines this check is only done once and has no side
392 * effects.
393 */
394 for (i = 0; hpet_readl(HPET_CFG) == 0xFFFFFFFF; i++) {
395 if (i == 1000) {
396 printk(KERN_WARNING
397 "HPET config register value = 0xFFFFFFFF. "
398 "Disabling HPET\n");
399 goto out_nohpet;
400 }
401 }
402
372 if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD) 403 if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD)
373 goto out_nohpet; 404 goto out_nohpet;
374 405
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
index 1c3a66a67f83..720d2607aacb 100644
--- a/arch/x86/kernel/io_delay.c
+++ b/arch/x86/kernel/io_delay.c
@@ -92,6 +92,14 @@ static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = {
92 DMI_MATCH(DMI_BOARD_NAME, "30BF") 92 DMI_MATCH(DMI_BOARD_NAME, "30BF")
93 } 93 }
94 }, 94 },
95 {
96 .callback = dmi_io_delay_0xed_port,
97 .ident = "Presario F700",
98 .matches = {
99 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
100 DMI_MATCH(DMI_BOARD_NAME, "30D3")
101 }
102 },
95 { } 103 { }
96}; 104};
97 105
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 1cf8c1fcc088..b71e02d42f4f 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -325,7 +325,7 @@ skip:
325 for_each_online_cpu(j) 325 for_each_online_cpu(j)
326 seq_printf(p, "%10u ", 326 seq_printf(p, "%10u ",
327 per_cpu(irq_stat,j).irq_call_count); 327 per_cpu(irq_stat,j).irq_call_count);
328 seq_printf(p, " function call interrupts\n"); 328 seq_printf(p, " Function call interrupts\n");
329 seq_printf(p, "TLB: "); 329 seq_printf(p, "TLB: ");
330 for_each_online_cpu(j) 330 for_each_online_cpu(j)
331 seq_printf(p, "%10u ", 331 seq_printf(p, "%10u ",
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 1f78b238d8d2..f065fe9071b9 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -129,7 +129,7 @@ skip:
129 seq_printf(p, "CAL: "); 129 seq_printf(p, "CAL: ");
130 for_each_online_cpu(j) 130 for_each_online_cpu(j)
131 seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count); 131 seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count);
132 seq_printf(p, " function call interrupts\n"); 132 seq_printf(p, " Function call interrupts\n");
133 seq_printf(p, "TLB: "); 133 seq_printf(p, "TLB: ");
134 for_each_online_cpu(j) 134 for_each_online_cpu(j)
135 seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count); 135 seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count);
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c
index 7377ccb21335..304d8bad6559 100644
--- a/arch/x86/kernel/k8.c
+++ b/arch/x86/kernel/k8.c
@@ -16,8 +16,9 @@ EXPORT_SYMBOL(num_k8_northbridges);
16static u32 *flush_words; 16static u32 *flush_words;
17 17
18struct pci_device_id k8_nb_ids[] = { 18struct pci_device_id k8_nb_ids[] = {
19 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, 19 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
20 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, 20 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
21 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_11H_NB_MISC) },
21 {} 22 {}
22}; 23};
23EXPORT_SYMBOL(k8_nb_ids); 24EXPORT_SYMBOL(k8_nb_ids);
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index f2d43bc75514..ff7d3b0124f1 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -139,6 +139,7 @@ static int __init create_setup_data_nodes(struct dentry *parent)
139 if (PageHighMem(pg)) { 139 if (PageHighMem(pg)) {
140 data = ioremap_cache(pa_data, sizeof(*data)); 140 data = ioremap_cache(pa_data, sizeof(*data));
141 if (!data) { 141 if (!data) {
142 kfree(node);
142 error = -ENXIO; 143 error = -ENXIO;
143 goto err_dir; 144 goto err_dir;
144 } 145 }
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index f47f0eb886b8..10435a120d22 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -69,6 +69,9 @@ static int gdb_x86vector = -1;
69 */ 69 */
70void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) 70void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
71{ 71{
72#ifndef CONFIG_X86_32
73 u32 *gdb_regs32 = (u32 *)gdb_regs;
74#endif
72 gdb_regs[GDB_AX] = regs->ax; 75 gdb_regs[GDB_AX] = regs->ax;
73 gdb_regs[GDB_BX] = regs->bx; 76 gdb_regs[GDB_BX] = regs->bx;
74 gdb_regs[GDB_CX] = regs->cx; 77 gdb_regs[GDB_CX] = regs->cx;
@@ -76,9 +79,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
76 gdb_regs[GDB_SI] = regs->si; 79 gdb_regs[GDB_SI] = regs->si;
77 gdb_regs[GDB_DI] = regs->di; 80 gdb_regs[GDB_DI] = regs->di;
78 gdb_regs[GDB_BP] = regs->bp; 81 gdb_regs[GDB_BP] = regs->bp;
79 gdb_regs[GDB_PS] = regs->flags;
80 gdb_regs[GDB_PC] = regs->ip; 82 gdb_regs[GDB_PC] = regs->ip;
81#ifdef CONFIG_X86_32 83#ifdef CONFIG_X86_32
84 gdb_regs[GDB_PS] = regs->flags;
82 gdb_regs[GDB_DS] = regs->ds; 85 gdb_regs[GDB_DS] = regs->ds;
83 gdb_regs[GDB_ES] = regs->es; 86 gdb_regs[GDB_ES] = regs->es;
84 gdb_regs[GDB_CS] = regs->cs; 87 gdb_regs[GDB_CS] = regs->cs;
@@ -94,6 +97,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
94 gdb_regs[GDB_R13] = regs->r13; 97 gdb_regs[GDB_R13] = regs->r13;
95 gdb_regs[GDB_R14] = regs->r14; 98 gdb_regs[GDB_R14] = regs->r14;
96 gdb_regs[GDB_R15] = regs->r15; 99 gdb_regs[GDB_R15] = regs->r15;
100 gdb_regs32[GDB_PS] = regs->flags;
101 gdb_regs32[GDB_CS] = regs->cs;
102 gdb_regs32[GDB_SS] = regs->ss;
97#endif 103#endif
98 gdb_regs[GDB_SP] = regs->sp; 104 gdb_regs[GDB_SP] = regs->sp;
99} 105}
@@ -112,6 +118,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
112 */ 118 */
113void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) 119void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
114{ 120{
121#ifndef CONFIG_X86_32
122 u32 *gdb_regs32 = (u32 *)gdb_regs;
123#endif
115 gdb_regs[GDB_AX] = 0; 124 gdb_regs[GDB_AX] = 0;
116 gdb_regs[GDB_BX] = 0; 125 gdb_regs[GDB_BX] = 0;
117 gdb_regs[GDB_CX] = 0; 126 gdb_regs[GDB_CX] = 0;
@@ -129,8 +138,10 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
129 gdb_regs[GDB_FS] = 0xFFFF; 138 gdb_regs[GDB_FS] = 0xFFFF;
130 gdb_regs[GDB_GS] = 0xFFFF; 139 gdb_regs[GDB_GS] = 0xFFFF;
131#else 140#else
132 gdb_regs[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); 141 gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8);
133 gdb_regs[GDB_PC] = 0; 142 gdb_regs32[GDB_CS] = __KERNEL_CS;
143 gdb_regs32[GDB_SS] = __KERNEL_DS;
144 gdb_regs[GDB_PC] = p->thread.ip;
134 gdb_regs[GDB_R8] = 0; 145 gdb_regs[GDB_R8] = 0;
135 gdb_regs[GDB_R9] = 0; 146 gdb_regs[GDB_R9] = 0;
136 gdb_regs[GDB_R10] = 0; 147 gdb_regs[GDB_R10] = 0;
@@ -153,6 +164,9 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
153 */ 164 */
154void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) 165void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
155{ 166{
167#ifndef CONFIG_X86_32
168 u32 *gdb_regs32 = (u32 *)gdb_regs;
169#endif
156 regs->ax = gdb_regs[GDB_AX]; 170 regs->ax = gdb_regs[GDB_AX];
157 regs->bx = gdb_regs[GDB_BX]; 171 regs->bx = gdb_regs[GDB_BX];
158 regs->cx = gdb_regs[GDB_CX]; 172 regs->cx = gdb_regs[GDB_CX];
@@ -160,9 +174,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
160 regs->si = gdb_regs[GDB_SI]; 174 regs->si = gdb_regs[GDB_SI];
161 regs->di = gdb_regs[GDB_DI]; 175 regs->di = gdb_regs[GDB_DI];
162 regs->bp = gdb_regs[GDB_BP]; 176 regs->bp = gdb_regs[GDB_BP];
163 regs->flags = gdb_regs[GDB_PS];
164 regs->ip = gdb_regs[GDB_PC]; 177 regs->ip = gdb_regs[GDB_PC];
165#ifdef CONFIG_X86_32 178#ifdef CONFIG_X86_32
179 regs->flags = gdb_regs[GDB_PS];
166 regs->ds = gdb_regs[GDB_DS]; 180 regs->ds = gdb_regs[GDB_DS];
167 regs->es = gdb_regs[GDB_ES]; 181 regs->es = gdb_regs[GDB_ES];
168 regs->cs = gdb_regs[GDB_CS]; 182 regs->cs = gdb_regs[GDB_CS];
@@ -175,6 +189,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
175 regs->r13 = gdb_regs[GDB_R13]; 189 regs->r13 = gdb_regs[GDB_R13];
176 regs->r14 = gdb_regs[GDB_R14]; 190 regs->r14 = gdb_regs[GDB_R14];
177 regs->r15 = gdb_regs[GDB_R15]; 191 regs->r15 = gdb_regs[GDB_R15];
192 regs->flags = gdb_regs32[GDB_PS];
193 regs->cs = gdb_regs32[GDB_CS];
194 regs->ss = gdb_regs32[GDB_SS];
178#endif 195#endif
179} 196}
180 197
@@ -378,10 +395,8 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
378 if (remcomInBuffer[0] == 's') { 395 if (remcomInBuffer[0] == 's') {
379 linux_regs->flags |= X86_EFLAGS_TF; 396 linux_regs->flags |= X86_EFLAGS_TF;
380 kgdb_single_step = 1; 397 kgdb_single_step = 1;
381 if (kgdb_contthread) { 398 atomic_set(&kgdb_cpu_doing_single_step,
382 atomic_set(&kgdb_cpu_doing_single_step, 399 raw_smp_processor_id());
383 raw_smp_processor_id());
384 }
385 } 400 }
386 401
387 get_debugreg(dr6, 6); 402 get_debugreg(dr6, 6);
@@ -440,12 +455,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
440 return NOTIFY_DONE; 455 return NOTIFY_DONE;
441 456
442 case DIE_NMI_IPI: 457 case DIE_NMI_IPI:
443 if (atomic_read(&kgdb_active) != -1) { 458 /* Just ignore, we will handle the roundup on DIE_NMI. */
444 /* KGDB CPU roundup */
445 kgdb_nmicallback(raw_smp_processor_id(), regs);
446 was_in_debug_nmi[raw_smp_processor_id()] = 1;
447 touch_nmi_watchdog();
448 }
449 return NOTIFY_DONE; 459 return NOTIFY_DONE;
450 460
451 case DIE_NMIUNKNOWN: 461 case DIE_NMIUNKNOWN:
@@ -466,9 +476,15 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
466 476
467 case DIE_DEBUG: 477 case DIE_DEBUG:
468 if (atomic_read(&kgdb_cpu_doing_single_step) == 478 if (atomic_read(&kgdb_cpu_doing_single_step) ==
469 raw_smp_processor_id() && 479 raw_smp_processor_id()) {
470 user_mode(regs)) 480 if (user_mode(regs))
471 return single_step_cont(regs, args); 481 return single_step_cont(regs, args);
482 break;
483 } else if (test_thread_flag(TIF_SINGLESTEP))
484 /* This means a user thread is single stepping
485 * a system call which should be ignored
486 */
487 return NOTIFY_DONE;
472 /* fall through */ 488 /* fall through */
473 default: 489 default:
474 if (user_mode(regs)) 490 if (user_mode(regs))
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 8b7a3cf37d2b..478bca986eca 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -178,7 +178,7 @@ static void kvm_flush_tlb(void)
178 kvm_deferred_mmu_op(&ftlb, sizeof ftlb); 178 kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
179} 179}
180 180
181static void kvm_release_pt(u32 pfn) 181static void kvm_release_pt(unsigned long pfn)
182{ 182{
183 struct kvm_mmu_op_release_pt rpt = { 183 struct kvm_mmu_op_release_pt rpt = {
184 .header.op = KVM_MMU_OP_RELEASE_PT, 184 .header.op = KVM_MMU_OP_RELEASE_PT,
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 9fe478d98406..0732adba05ca 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -12,6 +12,7 @@
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/numa.h> 13#include <linux/numa.h>
14#include <linux/ftrace.h> 14#include <linux/ftrace.h>
15#include <linux/suspend.h>
15 16
16#include <asm/pgtable.h> 17#include <asm/pgtable.h>
17#include <asm/pgalloc.h> 18#include <asm/pgalloc.h>
@@ -78,7 +79,7 @@ static void load_segments(void)
78/* 79/*
79 * A architecture hook called to validate the 80 * A architecture hook called to validate the
80 * proposed image and prepare the control pages 81 * proposed image and prepare the control pages
81 * as needed. The pages for KEXEC_CONTROL_CODE_SIZE 82 * as needed. The pages for KEXEC_CONTROL_PAGE_SIZE
82 * have been allocated, but the segments have yet 83 * have been allocated, but the segments have yet
83 * been copied into the kernel. 84 * been copied into the kernel.
84 * 85 *
@@ -113,6 +114,7 @@ void machine_kexec(struct kimage *image)
113{ 114{
114 unsigned long page_list[PAGES_NR]; 115 unsigned long page_list[PAGES_NR];
115 void *control_page; 116 void *control_page;
117 int save_ftrace_enabled;
116 asmlinkage unsigned long 118 asmlinkage unsigned long
117 (*relocate_kernel_ptr)(unsigned long indirection_page, 119 (*relocate_kernel_ptr)(unsigned long indirection_page,
118 unsigned long control_page, 120 unsigned long control_page,
@@ -120,7 +122,12 @@ void machine_kexec(struct kimage *image)
120 unsigned int has_pae, 122 unsigned int has_pae,
121 unsigned int preserve_context); 123 unsigned int preserve_context);
122 124
123 tracer_disable(); 125#ifdef CONFIG_KEXEC_JUMP
126 if (kexec_image->preserve_context)
127 save_processor_state();
128#endif
129
130 save_ftrace_enabled = __ftrace_enabled_save();
124 131
125 /* Interrupts aren't acceptable while we reboot */ 132 /* Interrupts aren't acceptable while we reboot */
126 local_irq_disable(); 133 local_irq_disable();
@@ -138,7 +145,7 @@ void machine_kexec(struct kimage *image)
138 } 145 }
139 146
140 control_page = page_address(image->control_code_page); 147 control_page = page_address(image->control_code_page);
141 memcpy(control_page, relocate_kernel, PAGE_SIZE/2); 148 memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
142 149
143 relocate_kernel_ptr = control_page; 150 relocate_kernel_ptr = control_page;
144 page_list[PA_CONTROL_PAGE] = __pa(control_page); 151 page_list[PA_CONTROL_PAGE] = __pa(control_page);
@@ -178,6 +185,13 @@ void machine_kexec(struct kimage *image)
178 (unsigned long)page_list, 185 (unsigned long)page_list,
179 image->start, cpu_has_pae, 186 image->start, cpu_has_pae,
180 image->preserve_context); 187 image->preserve_context);
188
189#ifdef CONFIG_KEXEC_JUMP
190 if (kexec_image->preserve_context)
191 restore_processor_state();
192#endif
193
194 __ftrace_enabled_restore(save_ftrace_enabled);
181} 195}
182 196
183void arch_crash_save_vmcoreinfo(void) 197void arch_crash_save_vmcoreinfo(void)
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 07c0f828f488..3b599518c322 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -33,6 +33,8 @@
33#include <linux/module.h> 33#include <linux/module.h>
34#include <asm/geode.h> 34#include <asm/geode.h>
35 35
36#define MFGPT_DEFAULT_IRQ 7
37
36static struct mfgpt_timer_t { 38static struct mfgpt_timer_t {
37 unsigned int avail:1; 39 unsigned int avail:1;
38} mfgpt_timers[MFGPT_MAX_TIMERS]; 40} mfgpt_timers[MFGPT_MAX_TIMERS];
@@ -157,29 +159,48 @@ int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable)
157} 159}
158EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event); 160EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event);
159 161
160int geode_mfgpt_set_irq(int timer, int cmp, int irq, int enable) 162int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable)
161{ 163{
162 u32 val, dummy; 164 u32 zsel, lpc, dummy;
163 int offset; 165 int shift;
164 166
165 if (timer < 0 || timer >= MFGPT_MAX_TIMERS) 167 if (timer < 0 || timer >= MFGPT_MAX_TIMERS)
166 return -EIO; 168 return -EIO;
167 169
168 if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable)) 170 /*
171 * Unfortunately, MFGPTs come in pairs sharing their IRQ lines. If VSA
172 * is using the same CMP of the timer's Siamese twin, the IRQ is set to
173 * 2, and we mustn't use nor change it.
174 * XXX: Likewise, 2 Linux drivers might clash if the 2nd overwrites the
175 * IRQ of the 1st. This can only happen if forcing an IRQ, calling this
176 * with *irq==0 is safe. Currently there _are_ no 2 drivers.
177 */
178 rdmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
179 shift = ((cmp == MFGPT_CMP1 ? 0 : 4) + timer % 4) * 4;
180 if (((zsel >> shift) & 0xF) == 2)
169 return -EIO; 181 return -EIO;
170 182
171 rdmsr(MSR_PIC_ZSEL_LOW, val, dummy); 183 /* Choose IRQ: if none supplied, keep IRQ already set or use default */
184 if (!*irq)
185 *irq = (zsel >> shift) & 0xF;
186 if (!*irq)
187 *irq = MFGPT_DEFAULT_IRQ;
172 188
173 offset = (timer % 4) * 4; 189 /* Can't use IRQ if it's 0 (=disabled), 2, or routed to LPC */
174 190 if (*irq < 1 || *irq == 2 || *irq > 15)
175 val &= ~((0xF << offset) | (0xF << (offset + 16))); 191 return -EIO;
192 rdmsr(MSR_PIC_IRQM_LPC, lpc, dummy);
193 if (lpc & (1 << *irq))
194 return -EIO;
176 195
196 /* All chosen and checked - go for it */
197 if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable))
198 return -EIO;
177 if (enable) { 199 if (enable) {
178 val |= (irq & 0x0F) << (offset); 200 zsel = (zsel & ~(0xF << shift)) | (*irq << shift);
179 val |= (irq & 0x0F) << (offset + 16); 201 wrmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
180 } 202 }
181 203
182 wrmsr(MSR_PIC_ZSEL_LOW, val, dummy);
183 return 0; 204 return 0;
184} 205}
185 206
@@ -242,7 +263,7 @@ EXPORT_SYMBOL_GPL(geode_mfgpt_alloc_timer);
242static unsigned int mfgpt_tick_mode = CLOCK_EVT_MODE_SHUTDOWN; 263static unsigned int mfgpt_tick_mode = CLOCK_EVT_MODE_SHUTDOWN;
243static u16 mfgpt_event_clock; 264static u16 mfgpt_event_clock;
244 265
245static int irq = 7; 266static int irq;
246static int __init mfgpt_setup(char *str) 267static int __init mfgpt_setup(char *str)
247{ 268{
248 get_option(&str, &irq); 269 get_option(&str, &irq);
@@ -346,7 +367,7 @@ int __init mfgpt_timer_setup(void)
346 mfgpt_event_clock = timer; 367 mfgpt_event_clock = timer;
347 368
348 /* Set up the IRQ on the MFGPT side */ 369 /* Set up the IRQ on the MFGPT side */
349 if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, irq)) { 370 if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, &irq)) {
350 printk(KERN_ERR "mfgpt-timer: Could not set up IRQ %d\n", irq); 371 printk(KERN_ERR "mfgpt-timer: Could not set up IRQ %d\n", irq);
351 return -EIO; 372 return -EIO;
352 } 373 }
@@ -374,13 +395,14 @@ int __init mfgpt_timer_setup(void)
374 &mfgpt_clockevent); 395 &mfgpt_clockevent);
375 396
376 printk(KERN_INFO 397 printk(KERN_INFO
377 "mfgpt-timer: registering the MFGPT timer as a clock event.\n"); 398 "mfgpt-timer: Registering MFGPT timer %d as a clock event, using IRQ %d\n",
399 timer, irq);
378 clockevents_register_device(&mfgpt_clockevent); 400 clockevents_register_device(&mfgpt_clockevent);
379 401
380 return 0; 402 return 0;
381 403
382err: 404err:
383 geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, irq); 405 geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, &irq);
384 printk(KERN_ERR 406 printk(KERN_ERR
385 "mfgpt-timer: Unable to set up the MFGPT clock source\n"); 407 "mfgpt-timer: Unable to set up the MFGPT clock source\n");
386 return -EIO; 408 return -EIO;
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index fdfdc550b366..efc2f361fe85 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -238,7 +238,7 @@ static struct dmi_system_id __devinitdata mmconf_dmi_table[] = {
238 {} 238 {}
239}; 239};
240 240
241void __init check_enable_amd_mmconf_dmi(void) 241void __cpuinit check_enable_amd_mmconf_dmi(void)
242{ 242{
243 dmi_check_system(mmconf_dmi_table); 243 dmi_check_system(mmconf_dmi_table);
244} 244}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index e5d23675bb7c..f98f4e1dba09 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -49,7 +49,7 @@ static int __init mpf_checksum(unsigned char *mp, int len)
49 return sum & 0xFF; 49 return sum & 0xFF;
50} 50}
51 51
52static void __cpuinit MP_processor_info(struct mpc_config_processor *m) 52static void __init MP_processor_info(struct mpc_config_processor *m)
53{ 53{
54 int apicid; 54 int apicid;
55 char *bootup_cpu = ""; 55 char *bootup_cpu = "";
@@ -486,7 +486,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
486} 486}
487 487
488 488
489static void construct_ioapic_table(int mpc_default_type) 489static void __init construct_ioapic_table(int mpc_default_type)
490{ 490{
491 struct mpc_config_ioapic ioapic; 491 struct mpc_config_ioapic ioapic;
492 struct mpc_config_bus bus; 492 struct mpc_config_bus bus;
@@ -531,7 +531,7 @@ static void construct_ioapic_table(int mpc_default_type)
531 construct_default_ioirq_mptable(mpc_default_type); 531 construct_default_ioirq_mptable(mpc_default_type);
532} 532}
533#else 533#else
534static inline void construct_ioapic_table(int mpc_default_type) { } 534static inline void __init construct_ioapic_table(int mpc_default_type) { }
535#endif 535#endif
536 536
537static inline void __init construct_default_ISA_mptable(int mpc_default_type) 537static inline void __init construct_default_ISA_mptable(int mpc_default_type)
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 9fd809552447..2e2af5d18191 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -72,21 +72,28 @@ static ssize_t msr_read(struct file *file, char __user *buf,
72 u32 data[2]; 72 u32 data[2];
73 u32 reg = *ppos; 73 u32 reg = *ppos;
74 int cpu = iminor(file->f_path.dentry->d_inode); 74 int cpu = iminor(file->f_path.dentry->d_inode);
75 int err; 75 int err = 0;
76 ssize_t bytes = 0;
76 77
77 if (count % 8) 78 if (count % 8)
78 return -EINVAL; /* Invalid chunk size */ 79 return -EINVAL; /* Invalid chunk size */
79 80
80 for (; count; count -= 8) { 81 for (; count; count -= 8) {
81 err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]); 82 err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]);
82 if (err) 83 if (err) {
83 return -EIO; 84 if (err == -EFAULT) /* Fix idiotic error code */
84 if (copy_to_user(tmp, &data, 8)) 85 err = -EIO;
85 return -EFAULT; 86 break;
87 }
88 if (copy_to_user(tmp, &data, 8)) {
89 err = -EFAULT;
90 break;
91 }
86 tmp += 2; 92 tmp += 2;
93 bytes += 8;
87 } 94 }
88 95
89 return ((char __user *)tmp) - buf; 96 return bytes ? bytes : err;
90} 97}
91 98
92static ssize_t msr_write(struct file *file, const char __user *buf, 99static ssize_t msr_write(struct file *file, const char __user *buf,
@@ -96,21 +103,28 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
96 u32 data[2]; 103 u32 data[2];
97 u32 reg = *ppos; 104 u32 reg = *ppos;
98 int cpu = iminor(file->f_path.dentry->d_inode); 105 int cpu = iminor(file->f_path.dentry->d_inode);
99 int err; 106 int err = 0;
107 ssize_t bytes = 0;
100 108
101 if (count % 8) 109 if (count % 8)
102 return -EINVAL; /* Invalid chunk size */ 110 return -EINVAL; /* Invalid chunk size */
103 111
104 for (; count; count -= 8) { 112 for (; count; count -= 8) {
105 if (copy_from_user(&data, tmp, 8)) 113 if (copy_from_user(&data, tmp, 8)) {
106 return -EFAULT; 114 err = -EFAULT;
115 break;
116 }
107 err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]); 117 err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]);
108 if (err) 118 if (err) {
109 return -EIO; 119 if (err == -EFAULT) /* Fix idiotic error code */
120 err = -EIO;
121 break;
122 }
110 tmp += 2; 123 tmp += 2;
124 bytes += 8;
111 } 125 }
112 126
113 return ((char __user *)tmp) - buf; 127 return bytes ? bytes : err;
114} 128}
115 129
116static int msr_open(struct inode *inode, struct file *file) 130static int msr_open(struct inode *inode, struct file *file)
@@ -131,7 +145,7 @@ static int msr_open(struct inode *inode, struct file *file)
131 ret = -EIO; /* MSR not supported */ 145 ret = -EIO; /* MSR not supported */
132out: 146out:
133 unlock_kernel(); 147 unlock_kernel();
134 return 0; 148 return ret;
135} 149}
136 150
137/* 151/*
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index ac6d51222e7d..2c97f07f1c2c 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -114,6 +114,23 @@ static __init void nmi_cpu_busy(void *data)
114} 114}
115#endif 115#endif
116 116
117static void report_broken_nmi(int cpu, int *prev_nmi_count)
118{
119 printk(KERN_CONT "\n");
120
121 printk(KERN_WARNING
122 "WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n",
123 cpu, prev_nmi_count[cpu], get_nmi_count(cpu));
124
125 printk(KERN_WARNING
126 "Please report this to bugzilla.kernel.org,\n");
127 printk(KERN_WARNING
128 "and attach the output of the 'dmesg' command.\n");
129
130 per_cpu(wd_enabled, cpu) = 0;
131 atomic_dec(&nmi_active);
132}
133
117int __init check_nmi_watchdog(void) 134int __init check_nmi_watchdog(void)
118{ 135{
119 unsigned int *prev_nmi_count; 136 unsigned int *prev_nmi_count;
@@ -141,15 +158,8 @@ int __init check_nmi_watchdog(void)
141 for_each_online_cpu(cpu) { 158 for_each_online_cpu(cpu) {
142 if (!per_cpu(wd_enabled, cpu)) 159 if (!per_cpu(wd_enabled, cpu))
143 continue; 160 continue;
144 if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { 161 if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5)
145 printk(KERN_WARNING "WARNING: CPU#%d: NMI " 162 report_broken_nmi(cpu, prev_nmi_count);
146 "appears to be stuck (%d->%d)!\n",
147 cpu,
148 prev_nmi_count[cpu],
149 get_nmi_count(cpu));
150 per_cpu(wd_enabled, cpu) = 0;
151 atomic_dec(&nmi_active);
152 }
153 } 163 }
154 endflag = 1; 164 endflag = 1;
155 if (!atomic_read(&nmi_active)) { 165 if (!atomic_read(&nmi_active)) {
@@ -289,6 +299,15 @@ void acpi_nmi_disable(void)
289 on_each_cpu(__acpi_nmi_disable, NULL, 1); 299 on_each_cpu(__acpi_nmi_disable, NULL, 1);
290} 300}
291 301
302/*
303 * This function is called as soon the LAPIC NMI watchdog driver has everything
304 * in place and it's ready to check if the NMIs belong to the NMI watchdog
305 */
306void cpu_nmi_set_wd_enabled(void)
307{
308 __get_cpu_var(wd_enabled) = 1;
309}
310
292void setup_apic_nmi_watchdog(void *unused) 311void setup_apic_nmi_watchdog(void *unused)
293{ 312{
294 if (__get_cpu_var(wd_enabled)) 313 if (__get_cpu_var(wd_enabled))
@@ -301,8 +320,6 @@ void setup_apic_nmi_watchdog(void *unused)
301 320
302 switch (nmi_watchdog) { 321 switch (nmi_watchdog) {
303 case NMI_LOCAL_APIC: 322 case NMI_LOCAL_APIC:
304 /* enable it before to avoid race with handler */
305 __get_cpu_var(wd_enabled) = 1;
306 if (lapic_watchdog_init(nmi_hz) < 0) { 323 if (lapic_watchdog_init(nmi_hz) < 0) {
307 __get_cpu_var(wd_enabled) = 0; 324 __get_cpu_var(wd_enabled) = 0;
308 return; 325 return;
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index 2434467ddf72..4caff39078e0 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -73,7 +73,7 @@ static void __init smp_dump_qct(void)
73} 73}
74 74
75 75
76void __init numaq_tsc_disable(void) 76void __cpuinit numaq_tsc_disable(void)
77{ 77{
78 if (!found_numaq) 78 if (!found_numaq)
79 return; 79 return;
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 3e6672274807..7a13fac63a1f 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -190,12 +190,12 @@ EXPORT_SYMBOL_GPL(olpc_ec_cmd);
190static void __init platform_detect(void) 190static void __init platform_detect(void)
191{ 191{
192 size_t propsize; 192 size_t propsize;
193 u32 rev; 193 __be32 rev;
194 194
195 if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4, 195 if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4,
196 &propsize) || propsize != 4) { 196 &propsize) || propsize != 4) {
197 printk(KERN_ERR "ofw: getprop call failed!\n"); 197 printk(KERN_ERR "ofw: getprop call failed!\n");
198 rev = 0; 198 rev = cpu_to_be32(0);
199 } 199 }
200 olpc_platform_info.boardrev = be32_to_cpu(rev); 200 olpc_platform_info.boardrev = be32_to_cpu(rev);
201} 201}
@@ -203,7 +203,7 @@ static void __init platform_detect(void)
203static void __init platform_detect(void) 203static void __init platform_detect(void)
204{ 204{
205 /* stopgap until OFW support is added to the kernel */ 205 /* stopgap until OFW support is added to the kernel */
206 olpc_platform_info.boardrev = be32_to_cpu(0xc2); 206 olpc_platform_info.boardrev = 0xc2;
207} 207}
208#endif 208#endif
209 209
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 5744789a78f4..6b0bb73998dd 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -330,6 +330,7 @@ struct pv_cpu_ops pv_cpu_ops = {
330#endif 330#endif
331 .wbinvd = native_wbinvd, 331 .wbinvd = native_wbinvd,
332 .read_msr = native_read_msr_safe, 332 .read_msr = native_read_msr_safe,
333 .read_msr_amd = native_read_msr_amd_safe,
333 .write_msr = native_write_msr_safe, 334 .write_msr = native_write_msr_safe,
334 .read_tsc = native_read_tsc, 335 .read_tsc = native_read_tsc,
335 .read_pmc = native_read_pmc, 336 .read_pmc = native_read_pmc,
@@ -469,7 +470,7 @@ struct pv_lock_ops pv_lock_ops = {
469 .spin_unlock = __ticket_spin_unlock, 470 .spin_unlock = __ticket_spin_unlock,
470#endif 471#endif
471}; 472};
472EXPORT_SYMBOL_GPL(pv_lock_ops); 473EXPORT_SYMBOL(pv_lock_ops);
473 474
474EXPORT_SYMBOL_GPL(pv_time_ops); 475EXPORT_SYMBOL_GPL(pv_time_ops);
475EXPORT_SYMBOL (pv_cpu_ops); 476EXPORT_SYMBOL (pv_cpu_ops);
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 58262218781b..9fe644f4861d 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -23,7 +23,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
23 start = start_##ops##_##x; \ 23 start = start_##ops##_##x; \
24 end = end_##ops##_##x; \ 24 end = end_##ops##_##x; \
25 goto patch_site 25 goto patch_site
26 switch(type) { 26 switch (type) {
27 PATCH_SITE(pv_irq_ops, irq_disable); 27 PATCH_SITE(pv_irq_ops, irq_disable);
28 PATCH_SITE(pv_irq_ops, irq_enable); 28 PATCH_SITE(pv_irq_ops, irq_enable);
29 PATCH_SITE(pv_irq_ops, restore_fl); 29 PATCH_SITE(pv_irq_ops, restore_fl);
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 02d19328525d..080d1d27f37a 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -261,7 +261,7 @@ static void iommu_range_reserve(struct iommu_table *tbl,
261 badbit, tbl, start_addr, npages); 261 badbit, tbl, start_addr, npages);
262 } 262 }
263 263
264 set_bit_string(tbl->it_map, index, npages); 264 iommu_area_reserve(tbl->it_map, index, npages);
265 265
266 spin_unlock_irqrestore(&tbl->it_lock, flags); 266 spin_unlock_irqrestore(&tbl->it_lock, flags);
267} 267}
@@ -343,9 +343,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
343 /* were we called with bad_dma_address? */ 343 /* were we called with bad_dma_address? */
344 badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); 344 badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE);
345 if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { 345 if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) {
346 printk(KERN_ERR "Calgary: driver tried unmapping bad DMA " 346 WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA "
347 "address 0x%Lx\n", dma_addr); 347 "address 0x%Lx\n", dma_addr);
348 WARN_ON(1);
349 return; 348 return;
350 } 349 }
351 350
@@ -492,6 +491,8 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
492 npages = size >> PAGE_SHIFT; 491 npages = size >> PAGE_SHIFT;
493 order = get_order(size); 492 order = get_order(size);
494 493
494 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
495
495 /* alloc enough pages (and possibly more) */ 496 /* alloc enough pages (and possibly more) */
496 ret = (void *)__get_free_pages(flag, order); 497 ret = (void *)__get_free_pages(flag, order);
497 if (!ret) 498 if (!ret)
@@ -511,8 +512,22 @@ error:
511 return ret; 512 return ret;
512} 513}
513 514
515static void calgary_free_coherent(struct device *dev, size_t size,
516 void *vaddr, dma_addr_t dma_handle)
517{
518 unsigned int npages;
519 struct iommu_table *tbl = find_iommu_table(dev);
520
521 size = PAGE_ALIGN(size);
522 npages = size >> PAGE_SHIFT;
523
524 iommu_free(tbl, dma_handle, npages);
525 free_pages((unsigned long)vaddr, get_order(size));
526}
527
514static struct dma_mapping_ops calgary_dma_ops = { 528static struct dma_mapping_ops calgary_dma_ops = {
515 .alloc_coherent = calgary_alloc_coherent, 529 .alloc_coherent = calgary_alloc_coherent,
530 .free_coherent = calgary_free_coherent,
516 .map_single = calgary_map_single, 531 .map_single = calgary_map_single,
517 .unmap_single = calgary_unmap_single, 532 .unmap_single = calgary_unmap_single,
518 .map_sg = calgary_map_sg, 533 .map_sg = calgary_map_sg,
@@ -1269,13 +1284,15 @@ static inline int __init determine_tce_table_size(u64 ram)
1269static int __init build_detail_arrays(void) 1284static int __init build_detail_arrays(void)
1270{ 1285{
1271 unsigned long ptr; 1286 unsigned long ptr;
1272 int i, scal_detail_size, rio_detail_size; 1287 unsigned numnodes, i;
1288 int scal_detail_size, rio_detail_size;
1273 1289
1274 if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){ 1290 numnodes = rio_table_hdr->num_scal_dev;
1291 if (numnodes > MAX_NUMNODES){
1275 printk(KERN_WARNING 1292 printk(KERN_WARNING
1276 "Calgary: MAX_NUMNODES too low! Defined as %d, " 1293 "Calgary: MAX_NUMNODES too low! Defined as %d, "
1277 "but system has %d nodes.\n", 1294 "but system has %d nodes.\n",
1278 MAX_NUMNODES, rio_table_hdr->num_scal_dev); 1295 MAX_NUMNODES, numnodes);
1279 return -ENODEV; 1296 return -ENODEV;
1280 } 1297 }
1281 1298
@@ -1296,8 +1313,7 @@ static int __init build_detail_arrays(void)
1296 } 1313 }
1297 1314
1298 ptr = ((unsigned long)rio_table_hdr) + 3; 1315 ptr = ((unsigned long)rio_table_hdr) + 3;
1299 for (i = 0; i < rio_table_hdr->num_scal_dev; 1316 for (i = 0; i < numnodes; i++, ptr += scal_detail_size)
1300 i++, ptr += scal_detail_size)
1301 scal_devs[i] = (struct scal_detail *)ptr; 1317 scal_devs[i] = (struct scal_detail *)ptr;
1302 1318
1303 for (i = 0; i < rio_table_hdr->num_rio_dev; 1319 for (i = 0; i < rio_table_hdr->num_rio_dev;
@@ -1350,7 +1366,7 @@ static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
1350 * Function for kdump case. Get the tce tables from first kernel 1366 * Function for kdump case. Get the tce tables from first kernel
1351 * by reading the contents of the base adress register of calgary iommu 1367 * by reading the contents of the base adress register of calgary iommu
1352 */ 1368 */
1353static void get_tce_space_from_tar(void) 1369static void __init get_tce_space_from_tar(void)
1354{ 1370{
1355 int bus; 1371 int bus;
1356 void __iomem *target; 1372 void __iomem *target;
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 87d4d6964ec2..0a3824e837b4 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -41,11 +41,12 @@ EXPORT_SYMBOL(bad_dma_address);
41/* Dummy device used for NULL arguments (normally ISA). Better would 41/* Dummy device used for NULL arguments (normally ISA). Better would
42 be probably a smaller DMA mask, but this is bug-to-bug compatible 42 be probably a smaller DMA mask, but this is bug-to-bug compatible
43 to older i386. */ 43 to older i386. */
44struct device fallback_dev = { 44struct device x86_dma_fallback_dev = {
45 .bus_id = "fallback device", 45 .bus_id = "fallback device",
46 .coherent_dma_mask = DMA_32BIT_MASK, 46 .coherent_dma_mask = DMA_32BIT_MASK,
47 .dma_mask = &fallback_dev.coherent_dma_mask, 47 .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask,
48}; 48};
49EXPORT_SYMBOL(x86_dma_fallback_dev);
49 50
50int dma_set_mask(struct device *dev, u64 mask) 51int dma_set_mask(struct device *dev, u64 mask)
51{ 52{
@@ -82,7 +83,7 @@ void __init dma32_reserve_bootmem(void)
82 * using 512M as goal 83 * using 512M as goal
83 */ 84 */
84 align = 64ULL<<20; 85 align = 64ULL<<20;
85 size = round_up(dma32_bootmem_size, align); 86 size = roundup(dma32_bootmem_size, align);
86 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, 87 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
87 512ULL<<20); 88 512ULL<<20);
88 if (dma32_bootmem_ptr) 89 if (dma32_bootmem_ptr)
@@ -133,6 +134,37 @@ unsigned long iommu_num_pages(unsigned long addr, unsigned long len)
133EXPORT_SYMBOL(iommu_num_pages); 134EXPORT_SYMBOL(iommu_num_pages);
134#endif 135#endif
135 136
137void *dma_generic_alloc_coherent(struct device *dev, size_t size,
138 dma_addr_t *dma_addr, gfp_t flag)
139{
140 unsigned long dma_mask;
141 struct page *page;
142 dma_addr_t addr;
143
144 dma_mask = dma_alloc_coherent_mask(dev, flag);
145
146 flag |= __GFP_ZERO;
147again:
148 page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
149 if (!page)
150 return NULL;
151
152 addr = page_to_phys(page);
153 if (!is_buffer_dma_capable(dma_mask, addr, size)) {
154 __free_pages(page, get_order(size));
155
156 if (dma_mask < DMA_32BIT_MASK && !(flag & GFP_DMA)) {
157 flag = (flag & ~GFP_DMA32) | GFP_DMA;
158 goto again;
159 }
160
161 return NULL;
162 }
163
164 *dma_addr = addr;
165 return page_address(page);
166}
167
136/* 168/*
137 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter 169 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
138 * documentation. 170 * documentation.
@@ -241,147 +273,6 @@ int dma_supported(struct device *dev, u64 mask)
241} 273}
242EXPORT_SYMBOL(dma_supported); 274EXPORT_SYMBOL(dma_supported);
243 275
244/* Allocate DMA memory on node near device */
245static noinline struct page *
246dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
247{
248 int node;
249
250 node = dev_to_node(dev);
251
252 return alloc_pages_node(node, gfp, order);
253}
254
255/*
256 * Allocate memory for a coherent mapping.
257 */
258void *
259dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
260 gfp_t gfp)
261{
262 struct dma_mapping_ops *ops = get_dma_ops(dev);
263 void *memory = NULL;
264 struct page *page;
265 unsigned long dma_mask = 0;
266 dma_addr_t bus;
267 int noretry = 0;
268
269 /* ignore region specifiers */
270 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
271
272 if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
273 return memory;
274
275 if (!dev) {
276 dev = &fallback_dev;
277 gfp |= GFP_DMA;
278 }
279 dma_mask = dev->coherent_dma_mask;
280 if (dma_mask == 0)
281 dma_mask = (gfp & GFP_DMA) ? DMA_24BIT_MASK : DMA_32BIT_MASK;
282
283 /* Device not DMA able */
284 if (dev->dma_mask == NULL)
285 return NULL;
286
287 /* Don't invoke OOM killer or retry in lower 16MB DMA zone */
288 if (gfp & __GFP_DMA)
289 noretry = 1;
290
291#ifdef CONFIG_X86_64
292 /* Why <=? Even when the mask is smaller than 4GB it is often
293 larger than 16MB and in this case we have a chance of
294 finding fitting memory in the next higher zone first. If
295 not retry with true GFP_DMA. -AK */
296 if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
297 gfp |= GFP_DMA32;
298 if (dma_mask < DMA_32BIT_MASK)
299 noretry = 1;
300 }
301#endif
302
303 again:
304 page = dma_alloc_pages(dev,
305 noretry ? gfp | __GFP_NORETRY : gfp, get_order(size));
306 if (page == NULL)
307 return NULL;
308
309 {
310 int high, mmu;
311 bus = page_to_phys(page);
312 memory = page_address(page);
313 high = (bus + size) >= dma_mask;
314 mmu = high;
315 if (force_iommu && !(gfp & GFP_DMA))
316 mmu = 1;
317 else if (high) {
318 free_pages((unsigned long)memory,
319 get_order(size));
320
321 /* Don't use the 16MB ZONE_DMA unless absolutely
322 needed. It's better to use remapping first. */
323 if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
324 gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
325 goto again;
326 }
327
328 /* Let low level make its own zone decisions */
329 gfp &= ~(GFP_DMA32|GFP_DMA);
330
331 if (ops->alloc_coherent)
332 return ops->alloc_coherent(dev, size,
333 dma_handle, gfp);
334 return NULL;
335 }
336
337 memset(memory, 0, size);
338 if (!mmu) {
339 *dma_handle = bus;
340 return memory;
341 }
342 }
343
344 if (ops->alloc_coherent) {
345 free_pages((unsigned long)memory, get_order(size));
346 gfp &= ~(GFP_DMA|GFP_DMA32);
347 return ops->alloc_coherent(dev, size, dma_handle, gfp);
348 }
349
350 if (ops->map_simple) {
351 *dma_handle = ops->map_simple(dev, virt_to_phys(memory),
352 size,
353 PCI_DMA_BIDIRECTIONAL);
354 if (*dma_handle != bad_dma_address)
355 return memory;
356 }
357
358 if (panic_on_overflow)
359 panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",
360 (unsigned long)size);
361 free_pages((unsigned long)memory, get_order(size));
362 return NULL;
363}
364EXPORT_SYMBOL(dma_alloc_coherent);
365
366/*
367 * Unmap coherent memory.
368 * The caller must ensure that the device has finished accessing the mapping.
369 */
370void dma_free_coherent(struct device *dev, size_t size,
371 void *vaddr, dma_addr_t bus)
372{
373 struct dma_mapping_ops *ops = get_dma_ops(dev);
374
375 int order = get_order(size);
376 WARN_ON(irqs_disabled()); /* for portability */
377 if (dma_release_from_coherent(dev, order, vaddr))
378 return;
379 if (ops->unmap_single)
380 ops->unmap_single(dev, bus, size, 0);
381 free_pages((unsigned long)vaddr, order);
382}
383EXPORT_SYMBOL(dma_free_coherent);
384
385static int __init pci_iommu_init(void) 276static int __init pci_iommu_init(void)
386{ 277{
387 calgary_iommu_init(); 278 calgary_iommu_init();
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 49285f8fd4d5..145f1c83369f 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -27,8 +27,8 @@
27#include <linux/scatterlist.h> 27#include <linux/scatterlist.h>
28#include <linux/iommu-helper.h> 28#include <linux/iommu-helper.h>
29#include <linux/sysdev.h> 29#include <linux/sysdev.h>
30#include <linux/io.h>
30#include <asm/atomic.h> 31#include <asm/atomic.h>
31#include <asm/io.h>
32#include <asm/mtrr.h> 32#include <asm/mtrr.h>
33#include <asm/pgtable.h> 33#include <asm/pgtable.h>
34#include <asm/proto.h> 34#include <asm/proto.h>
@@ -80,9 +80,10 @@ AGPEXTERN int agp_memory_reserved;
80AGPEXTERN __u32 *agp_gatt_table; 80AGPEXTERN __u32 *agp_gatt_table;
81 81
82static unsigned long next_bit; /* protected by iommu_bitmap_lock */ 82static unsigned long next_bit; /* protected by iommu_bitmap_lock */
83static int need_flush; /* global flush state. set for each gart wrap */ 83static bool need_flush; /* global flush state. set for each gart wrap */
84 84
85static unsigned long alloc_iommu(struct device *dev, int size) 85static unsigned long alloc_iommu(struct device *dev, int size,
86 unsigned long align_mask)
86{ 87{
87 unsigned long offset, flags; 88 unsigned long offset, flags;
88 unsigned long boundary_size; 89 unsigned long boundary_size;
@@ -90,26 +91,27 @@ static unsigned long alloc_iommu(struct device *dev, int size)
90 91
91 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), 92 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
92 PAGE_SIZE) >> PAGE_SHIFT; 93 PAGE_SIZE) >> PAGE_SHIFT;
93 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, 94 boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1,
94 PAGE_SIZE) >> PAGE_SHIFT; 95 PAGE_SIZE) >> PAGE_SHIFT;
95 96
96 spin_lock_irqsave(&iommu_bitmap_lock, flags); 97 spin_lock_irqsave(&iommu_bitmap_lock, flags);
97 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit, 98 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit,
98 size, base_index, boundary_size, 0); 99 size, base_index, boundary_size, align_mask);
99 if (offset == -1) { 100 if (offset == -1) {
100 need_flush = 1; 101 need_flush = true;
101 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0, 102 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0,
102 size, base_index, boundary_size, 0); 103 size, base_index, boundary_size,
104 align_mask);
103 } 105 }
104 if (offset != -1) { 106 if (offset != -1) {
105 next_bit = offset+size; 107 next_bit = offset+size;
106 if (next_bit >= iommu_pages) { 108 if (next_bit >= iommu_pages) {
107 next_bit = 0; 109 next_bit = 0;
108 need_flush = 1; 110 need_flush = true;
109 } 111 }
110 } 112 }
111 if (iommu_fullflush) 113 if (iommu_fullflush)
112 need_flush = 1; 114 need_flush = true;
113 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 115 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
114 116
115 return offset; 117 return offset;
@@ -134,7 +136,7 @@ static void flush_gart(void)
134 spin_lock_irqsave(&iommu_bitmap_lock, flags); 136 spin_lock_irqsave(&iommu_bitmap_lock, flags);
135 if (need_flush) { 137 if (need_flush) {
136 k8_flush_garts(); 138 k8_flush_garts();
137 need_flush = 0; 139 need_flush = false;
138 } 140 }
139 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 141 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
140} 142}
@@ -173,7 +175,8 @@ static void dump_leak(void)
173 iommu_leak_pages); 175 iommu_leak_pages);
174 for (i = 0; i < iommu_leak_pages; i += 2) { 176 for (i = 0; i < iommu_leak_pages; i += 2) {
175 printk(KERN_DEBUG "%lu: ", iommu_pages-i); 177 printk(KERN_DEBUG "%lu: ", iommu_pages-i);
176 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], 0); 178 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i],
179 0);
177 printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' '); 180 printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' ');
178 } 181 }
179 printk(KERN_DEBUG "\n"); 182 printk(KERN_DEBUG "\n");
@@ -212,34 +215,24 @@ static void iommu_full(struct device *dev, size_t size, int dir)
212static inline int 215static inline int
213need_iommu(struct device *dev, unsigned long addr, size_t size) 216need_iommu(struct device *dev, unsigned long addr, size_t size)
214{ 217{
215 u64 mask = *dev->dma_mask; 218 return force_iommu ||
216 int high = addr + size > mask; 219 !is_buffer_dma_capable(*dev->dma_mask, addr, size);
217 int mmu = high;
218
219 if (force_iommu)
220 mmu = 1;
221
222 return mmu;
223} 220}
224 221
225static inline int 222static inline int
226nonforced_iommu(struct device *dev, unsigned long addr, size_t size) 223nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
227{ 224{
228 u64 mask = *dev->dma_mask; 225 return !is_buffer_dma_capable(*dev->dma_mask, addr, size);
229 int high = addr + size > mask;
230 int mmu = high;
231
232 return mmu;
233} 226}
234 227
235/* Map a single continuous physical area into the IOMMU. 228/* Map a single continuous physical area into the IOMMU.
236 * Caller needs to check if the iommu is needed and flush. 229 * Caller needs to check if the iommu is needed and flush.
237 */ 230 */
238static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, 231static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
239 size_t size, int dir) 232 size_t size, int dir, unsigned long align_mask)
240{ 233{
241 unsigned long npages = iommu_num_pages(phys_mem, size); 234 unsigned long npages = iommu_num_pages(phys_mem, size);
242 unsigned long iommu_page = alloc_iommu(dev, npages); 235 unsigned long iommu_page = alloc_iommu(dev, npages, align_mask);
243 int i; 236 int i;
244 237
245 if (iommu_page == -1) { 238 if (iommu_page == -1) {
@@ -259,16 +252,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
259 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); 252 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
260} 253}
261 254
262static dma_addr_t
263gart_map_simple(struct device *dev, phys_addr_t paddr, size_t size, int dir)
264{
265 dma_addr_t map = dma_map_area(dev, paddr, size, dir);
266
267 flush_gart();
268
269 return map;
270}
271
272/* Map a single area into the IOMMU */ 255/* Map a single area into the IOMMU */
273static dma_addr_t 256static dma_addr_t
274gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir) 257gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
@@ -276,12 +259,13 @@ gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
276 unsigned long bus; 259 unsigned long bus;
277 260
278 if (!dev) 261 if (!dev)
279 dev = &fallback_dev; 262 dev = &x86_dma_fallback_dev;
280 263
281 if (!need_iommu(dev, paddr, size)) 264 if (!need_iommu(dev, paddr, size))
282 return paddr; 265 return paddr;
283 266
284 bus = gart_map_simple(dev, paddr, size, dir); 267 bus = dma_map_area(dev, paddr, size, dir, 0);
268 flush_gart();
285 269
286 return bus; 270 return bus;
287} 271}
@@ -340,7 +324,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
340 unsigned long addr = sg_phys(s); 324 unsigned long addr = sg_phys(s);
341 325
342 if (nonforced_iommu(dev, addr, s->length)) { 326 if (nonforced_iommu(dev, addr, s->length)) {
343 addr = dma_map_area(dev, addr, s->length, dir); 327 addr = dma_map_area(dev, addr, s->length, dir, 0);
344 if (addr == bad_dma_address) { 328 if (addr == bad_dma_address) {
345 if (i > 0) 329 if (i > 0)
346 gart_unmap_sg(dev, sg, i, dir); 330 gart_unmap_sg(dev, sg, i, dir);
@@ -362,7 +346,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
362 int nelems, struct scatterlist *sout, 346 int nelems, struct scatterlist *sout,
363 unsigned long pages) 347 unsigned long pages)
364{ 348{
365 unsigned long iommu_start = alloc_iommu(dev, pages); 349 unsigned long iommu_start = alloc_iommu(dev, pages, 0);
366 unsigned long iommu_page = iommu_start; 350 unsigned long iommu_page = iommu_start;
367 struct scatterlist *s; 351 struct scatterlist *s;
368 int i; 352 int i;
@@ -427,7 +411,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
427 return 0; 411 return 0;
428 412
429 if (!dev) 413 if (!dev)
430 dev = &fallback_dev; 414 dev = &x86_dma_fallback_dev;
431 415
432 out = 0; 416 out = 0;
433 start = 0; 417 start = 0;
@@ -499,6 +483,46 @@ error:
499 return 0; 483 return 0;
500} 484}
501 485
486/* allocate and map a coherent mapping */
487static void *
488gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
489 gfp_t flag)
490{
491 dma_addr_t paddr;
492 unsigned long align_mask;
493 struct page *page;
494
495 if (force_iommu && !(flag & GFP_DMA)) {
496 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
497 page = alloc_pages(flag | __GFP_ZERO, get_order(size));
498 if (!page)
499 return NULL;
500
501 align_mask = (1UL << get_order(size)) - 1;
502 paddr = dma_map_area(dev, page_to_phys(page), size,
503 DMA_BIDIRECTIONAL, align_mask);
504
505 flush_gart();
506 if (paddr != bad_dma_address) {
507 *dma_addr = paddr;
508 return page_address(page);
509 }
510 __free_pages(page, get_order(size));
511 } else
512 return dma_generic_alloc_coherent(dev, size, dma_addr, flag);
513
514 return NULL;
515}
516
517/* free a coherent mapping */
518static void
519gart_free_coherent(struct device *dev, size_t size, void *vaddr,
520 dma_addr_t dma_addr)
521{
522 gart_unmap_single(dev, dma_addr, size, DMA_BIDIRECTIONAL);
523 free_pages((unsigned long)vaddr, get_order(size));
524}
525
502static int no_agp; 526static int no_agp;
503 527
504static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) 528static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
@@ -626,7 +650,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
626 struct pci_dev *dev; 650 struct pci_dev *dev;
627 void *gatt; 651 void *gatt;
628 int i, error; 652 int i, error;
629 unsigned long start_pfn, end_pfn;
630 653
631 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); 654 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
632 aper_size = aper_base = info->aper_size = 0; 655 aper_size = aper_base = info->aper_size = 0;
@@ -650,13 +673,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
650 info->aper_size = aper_size >> 20; 673 info->aper_size = aper_size >> 20;
651 674
652 gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); 675 gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32);
653 gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); 676 gatt = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
677 get_order(gatt_size));
654 if (!gatt) 678 if (!gatt)
655 panic("Cannot allocate GATT table"); 679 panic("Cannot allocate GATT table");
656 if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT)) 680 if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT))
657 panic("Could not set GART PTEs to uncacheable pages"); 681 panic("Could not set GART PTEs to uncacheable pages");
658 682
659 memset(gatt, 0, gatt_size);
660 agp_gatt_table = gatt; 683 agp_gatt_table = gatt;
661 684
662 enable_gart_translations(); 685 enable_gart_translations();
@@ -665,19 +688,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
665 if (!error) 688 if (!error)
666 error = sysdev_register(&device_gart); 689 error = sysdev_register(&device_gart);
667 if (error) 690 if (error)
668 panic("Could not register gart_sysdev -- would corrupt data on next suspend"); 691 panic("Could not register gart_sysdev -- "
692 "would corrupt data on next suspend");
669 693
670 flush_gart(); 694 flush_gart();
671 695
672 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", 696 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n",
673 aper_base, aper_size>>10); 697 aper_base, aper_size>>10);
674 698
675 /* need to map that range */
676 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
677 if (end_pfn > max_low_pfn_mapped) {
678 start_pfn = (aper_base>>PAGE_SHIFT);
679 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
680 }
681 return 0; 699 return 0;
682 700
683 nommu: 701 nommu:
@@ -687,20 +705,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
687 return -1; 705 return -1;
688} 706}
689 707
690extern int agp_amd64_init(void);
691
692static struct dma_mapping_ops gart_dma_ops = { 708static struct dma_mapping_ops gart_dma_ops = {
693 .map_single = gart_map_single, 709 .map_single = gart_map_single,
694 .map_simple = gart_map_simple,
695 .unmap_single = gart_unmap_single, 710 .unmap_single = gart_unmap_single,
696 .sync_single_for_cpu = NULL,
697 .sync_single_for_device = NULL,
698 .sync_single_range_for_cpu = NULL,
699 .sync_single_range_for_device = NULL,
700 .sync_sg_for_cpu = NULL,
701 .sync_sg_for_device = NULL,
702 .map_sg = gart_map_sg, 711 .map_sg = gart_map_sg,
703 .unmap_sg = gart_unmap_sg, 712 .unmap_sg = gart_unmap_sg,
713 .alloc_coherent = gart_alloc_coherent,
714 .free_coherent = gart_free_coherent,
704}; 715};
705 716
706void gart_iommu_shutdown(void) 717void gart_iommu_shutdown(void)
@@ -727,7 +738,8 @@ void __init gart_iommu_init(void)
727{ 738{
728 struct agp_kern_info info; 739 struct agp_kern_info info;
729 unsigned long iommu_start; 740 unsigned long iommu_start;
730 unsigned long aper_size; 741 unsigned long aper_base, aper_size;
742 unsigned long start_pfn, end_pfn;
731 unsigned long scratch; 743 unsigned long scratch;
732 long i; 744 long i;
733 745
@@ -759,30 +771,35 @@ void __init gart_iommu_init(void)
759 (no_agp && init_k8_gatt(&info) < 0)) { 771 (no_agp && init_k8_gatt(&info) < 0)) {
760 if (max_pfn > MAX_DMA32_PFN) { 772 if (max_pfn > MAX_DMA32_PFN) {
761 printk(KERN_WARNING "More than 4GB of memory " 773 printk(KERN_WARNING "More than 4GB of memory "
762 "but GART IOMMU not available.\n" 774 "but GART IOMMU not available.\n");
763 KERN_WARNING "falling back to iommu=soft.\n"); 775 printk(KERN_WARNING "falling back to iommu=soft.\n");
764 } 776 }
765 return; 777 return;
766 } 778 }
767 779
780 /* need to map that range */
781 aper_size = info.aper_size << 20;
782 aper_base = info.aper_base;
783 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
784 if (end_pfn > max_low_pfn_mapped) {
785 start_pfn = (aper_base>>PAGE_SHIFT);
786 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
787 }
788
768 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); 789 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
769 aper_size = info.aper_size * 1024 * 1024;
770 iommu_size = check_iommu_size(info.aper_base, aper_size); 790 iommu_size = check_iommu_size(info.aper_base, aper_size);
771 iommu_pages = iommu_size >> PAGE_SHIFT; 791 iommu_pages = iommu_size >> PAGE_SHIFT;
772 792
773 iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL, 793 iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO,
774 get_order(iommu_pages/8)); 794 get_order(iommu_pages/8));
775 if (!iommu_gart_bitmap) 795 if (!iommu_gart_bitmap)
776 panic("Cannot allocate iommu bitmap\n"); 796 panic("Cannot allocate iommu bitmap\n");
777 memset(iommu_gart_bitmap, 0, iommu_pages/8);
778 797
779#ifdef CONFIG_IOMMU_LEAK 798#ifdef CONFIG_IOMMU_LEAK
780 if (leak_trace) { 799 if (leak_trace) {
781 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, 800 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
782 get_order(iommu_pages*sizeof(void *))); 801 get_order(iommu_pages*sizeof(void *)));
783 if (iommu_leak_tab) 802 if (!iommu_leak_tab)
784 memset(iommu_leak_tab, 0, iommu_pages * 8);
785 else
786 printk(KERN_DEBUG 803 printk(KERN_DEBUG
787 "PCI-DMA: Cannot allocate leak trace area\n"); 804 "PCI-DMA: Cannot allocate leak trace area\n");
788 } 805 }
@@ -792,7 +809,7 @@ void __init gart_iommu_init(void)
792 * Out of IOMMU space handling. 809 * Out of IOMMU space handling.
793 * Reserve some invalid pages at the beginning of the GART. 810 * Reserve some invalid pages at the beginning of the GART.
794 */ 811 */
795 set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); 812 iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
796 813
797 agp_memory_reserved = iommu_size; 814 agp_memory_reserved = iommu_size;
798 printk(KERN_INFO 815 printk(KERN_INFO
@@ -850,7 +867,8 @@ void __init gart_parse_options(char *p)
850 if (!strncmp(p, "leak", 4)) { 867 if (!strncmp(p, "leak", 4)) {
851 leak_trace = 1; 868 leak_trace = 1;
852 p += 4; 869 p += 4;
853 if (*p == '=') ++p; 870 if (*p == '=')
871 ++p;
854 if (isdigit(*p) && get_option(&p, &arg)) 872 if (isdigit(*p) && get_option(&p, &arg))
855 iommu_leak_pages = arg; 873 iommu_leak_pages = arg;
856 } 874 }
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 3f91f71cdc3e..c70ab5a5d4c8 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -14,7 +14,7 @@
14static int 14static int
15check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) 15check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
16{ 16{
17 if (hwdev && bus + size > *hwdev->dma_mask) { 17 if (hwdev && !is_buffer_dma_capable(*hwdev->dma_mask, bus, size)) {
18 if (*hwdev->dma_mask >= DMA_32BIT_MASK) 18 if (*hwdev->dma_mask >= DMA_32BIT_MASK)
19 printk(KERN_ERR 19 printk(KERN_ERR
20 "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", 20 "nommu_%s: overflow %Lx+%zu of device mask %Lx\n",
@@ -72,7 +72,15 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
72 return nents; 72 return nents;
73} 73}
74 74
75static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
76 dma_addr_t dma_addr)
77{
78 free_pages((unsigned long)vaddr, get_order(size));
79}
80
75struct dma_mapping_ops nommu_dma_ops = { 81struct dma_mapping_ops nommu_dma_ops = {
82 .alloc_coherent = dma_generic_alloc_coherent,
83 .free_coherent = nommu_free_coherent,
76 .map_single = nommu_map_single, 84 .map_single = nommu_map_single,
77 .map_sg = nommu_map_sg, 85 .map_sg = nommu_map_sg,
78 .is_phys = 1, 86 .is_phys = 1,
diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c
index bc1f2d3ea277..a311ffcaad16 100644
--- a/arch/x86/kernel/pcspeaker.c
+++ b/arch/x86/kernel/pcspeaker.c
@@ -1,20 +1,13 @@
1#include <linux/platform_device.h> 1#include <linux/platform_device.h>
2#include <linux/errno.h> 2#include <linux/err.h>
3#include <linux/init.h> 3#include <linux/init.h>
4 4
5static __init int add_pcspkr(void) 5static __init int add_pcspkr(void)
6{ 6{
7 struct platform_device *pd; 7 struct platform_device *pd;
8 int ret;
9 8
10 pd = platform_device_alloc("pcspkr", -1); 9 pd = platform_device_register_simple("pcspkr", -1, NULL, 0);
11 if (!pd)
12 return -ENOMEM;
13 10
14 ret = platform_device_add(pd); 11 return IS_ERR(pd) ? PTR_ERR(pd) : 0;
15 if (ret)
16 platform_device_put(pd);
17
18 return ret;
19} 12}
20device_initcall(add_pcspkr); 13device_initcall(add_pcspkr);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 7fc4d5b0a6a0..ec7a2ba9bce8 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -185,7 +185,8 @@ static void mwait_idle(void)
185static void poll_idle(void) 185static void poll_idle(void)
186{ 186{
187 local_irq_enable(); 187 local_irq_enable();
188 cpu_relax(); 188 while (!need_resched())
189 cpu_relax();
189} 190}
190 191
191/* 192/*
@@ -246,6 +247,14 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
246 return 1; 247 return 1;
247} 248}
248 249
250static cpumask_t c1e_mask = CPU_MASK_NONE;
251static int c1e_detected;
252
253void c1e_remove_cpu(int cpu)
254{
255 cpu_clear(cpu, c1e_mask);
256}
257
249/* 258/*
250 * C1E aware idle routine. We check for C1E active in the interrupt 259 * C1E aware idle routine. We check for C1E active in the interrupt
251 * pending message MSR. If we detect C1E, then we handle it the same 260 * pending message MSR. If we detect C1E, then we handle it the same
@@ -253,9 +262,6 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
253 */ 262 */
254static void c1e_idle(void) 263static void c1e_idle(void)
255{ 264{
256 static cpumask_t c1e_mask = CPU_MASK_NONE;
257 static int c1e_detected;
258
259 if (need_resched()) 265 if (need_resched())
260 return; 266 return;
261 267
@@ -265,8 +271,10 @@ static void c1e_idle(void)
265 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); 271 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
266 if (lo & K8_INTP_C1E_ACTIVE_MASK) { 272 if (lo & K8_INTP_C1E_ACTIVE_MASK) {
267 c1e_detected = 1; 273 c1e_detected = 1;
268 mark_tsc_unstable("TSC halt in C1E"); 274 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
269 printk(KERN_INFO "System has C1E enabled\n"); 275 mark_tsc_unstable("TSC halt in AMD C1E");
276 printk(KERN_INFO "System has AMD C1E enabled\n");
277 set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
270 } 278 }
271 } 279 }
272 280
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 7b6e44a7c624..205188db9626 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -37,6 +37,7 @@
37#include <linux/tick.h> 37#include <linux/tick.h>
38#include <linux/percpu.h> 38#include <linux/percpu.h>
39#include <linux/prctl.h> 39#include <linux/prctl.h>
40#include <linux/dmi.h>
40 41
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42#include <asm/pgtable.h> 43#include <asm/pgtable.h>
@@ -55,6 +56,7 @@
55#include <asm/tlbflush.h> 56#include <asm/tlbflush.h>
56#include <asm/cpu.h> 57#include <asm/cpu.h>
57#include <asm/kdebug.h> 58#include <asm/kdebug.h>
59#include <asm/idle.h>
58#include <asm/syscalls.h> 60#include <asm/syscalls.h>
59#include <asm/smp.h> 61#include <asm/smp.h>
60 62
@@ -90,6 +92,7 @@ static void cpu_exit_clear(void)
90 cpu_clear(cpu, cpu_callin_map); 92 cpu_clear(cpu, cpu_callin_map);
91 93
92 numa_remove_cpu(cpu); 94 numa_remove_cpu(cpu);
95 c1e_remove_cpu(cpu);
93} 96}
94 97
95/* We don't actually take CPU down, just spin without interrupts. */ 98/* We don't actually take CPU down, just spin without interrupts. */
@@ -97,7 +100,6 @@ static inline void play_dead(void)
97{ 100{
98 /* This must be done before dead CPU ack */ 101 /* This must be done before dead CPU ack */
99 cpu_exit_clear(); 102 cpu_exit_clear();
100 wbinvd();
101 mb(); 103 mb();
102 /* Ack it */ 104 /* Ack it */
103 __get_cpu_var(cpu_state) = CPU_DEAD; 105 __get_cpu_var(cpu_state) = CPU_DEAD;
@@ -106,8 +108,8 @@ static inline void play_dead(void)
106 * With physical CPU hotplug, we should halt the cpu 108 * With physical CPU hotplug, we should halt the cpu
107 */ 109 */
108 local_irq_disable(); 110 local_irq_disable();
109 while (1) 111 /* mask all interrupts, flush any and all caches, and halt */
110 halt(); 112 wbinvd_halt();
111} 113}
112#else 114#else
113static inline void play_dead(void) 115static inline void play_dead(void)
@@ -162,6 +164,7 @@ void __show_registers(struct pt_regs *regs, int all)
162 unsigned long d0, d1, d2, d3, d6, d7; 164 unsigned long d0, d1, d2, d3, d6, d7;
163 unsigned long sp; 165 unsigned long sp;
164 unsigned short ss, gs; 166 unsigned short ss, gs;
167 const char *board;
165 168
166 if (user_mode_vm(regs)) { 169 if (user_mode_vm(regs)) {
167 sp = regs->sp; 170 sp = regs->sp;
@@ -174,11 +177,15 @@ void __show_registers(struct pt_regs *regs, int all)
174 } 177 }
175 178
176 printk("\n"); 179 printk("\n");
177 printk("Pid: %d, comm: %s %s (%s %.*s)\n", 180
181 board = dmi_get_system_info(DMI_PRODUCT_NAME);
182 if (!board)
183 board = "";
184 printk("Pid: %d, comm: %s %s (%s %.*s) %s\n",
178 task_pid_nr(current), current->comm, 185 task_pid_nr(current), current->comm,
179 print_tainted(), init_utsname()->release, 186 print_tainted(), init_utsname()->release,
180 (int)strcspn(init_utsname()->version, " "), 187 (int)strcspn(init_utsname()->version, " "),
181 init_utsname()->version); 188 init_utsname()->version, board);
182 189
183 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", 190 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
184 (u16)regs->cs, regs->ip, regs->flags, 191 (u16)regs->cs, regs->ip, regs->flags,
@@ -278,6 +285,14 @@ void exit_thread(void)
278 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; 285 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
279 put_cpu(); 286 put_cpu();
280 } 287 }
288#ifdef CONFIG_X86_DS
289 /* Free any DS contexts that have not been properly released. */
290 if (unlikely(current->thread.ds_ctx)) {
291 /* we clear debugctl to make sure DS is not used. */
292 update_debugctlmsr(0);
293 ds_free(current->thread.ds_ctx);
294 }
295#endif /* CONFIG_X86_DS */
281} 296}
282 297
283void flush_thread(void) 298void flush_thread(void)
@@ -439,6 +454,35 @@ int set_tsc_mode(unsigned int val)
439 return 0; 454 return 0;
440} 455}
441 456
457#ifdef CONFIG_X86_DS
458static int update_debugctl(struct thread_struct *prev,
459 struct thread_struct *next, unsigned long debugctl)
460{
461 unsigned long ds_prev = 0;
462 unsigned long ds_next = 0;
463
464 if (prev->ds_ctx)
465 ds_prev = (unsigned long)prev->ds_ctx->ds;
466 if (next->ds_ctx)
467 ds_next = (unsigned long)next->ds_ctx->ds;
468
469 if (ds_next != ds_prev) {
470 /* we clear debugctl to make sure DS
471 * is not in use when we change it */
472 debugctl = 0;
473 update_debugctlmsr(0);
474 wrmsr(MSR_IA32_DS_AREA, ds_next, 0);
475 }
476 return debugctl;
477}
478#else
479static int update_debugctl(struct thread_struct *prev,
480 struct thread_struct *next, unsigned long debugctl)
481{
482 return debugctl;
483}
484#endif /* CONFIG_X86_DS */
485
442static noinline void 486static noinline void
443__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, 487__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
444 struct tss_struct *tss) 488 struct tss_struct *tss)
@@ -449,14 +493,7 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
449 prev = &prev_p->thread; 493 prev = &prev_p->thread;
450 next = &next_p->thread; 494 next = &next_p->thread;
451 495
452 debugctl = prev->debugctlmsr; 496 debugctl = update_debugctl(prev, next, prev->debugctlmsr);
453 if (next->ds_area_msr != prev->ds_area_msr) {
454 /* we clear debugctl to make sure DS
455 * is not in use when we change it */
456 debugctl = 0;
457 update_debugctlmsr(0);
458 wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
459 }
460 497
461 if (next->debugctlmsr != debugctl) 498 if (next->debugctlmsr != debugctl)
462 update_debugctlmsr(next->debugctlmsr); 499 update_debugctlmsr(next->debugctlmsr);
@@ -480,13 +517,13 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
480 hard_enable_TSC(); 517 hard_enable_TSC();
481 } 518 }
482 519
483#ifdef X86_BTS 520#ifdef CONFIG_X86_PTRACE_BTS
484 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) 521 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
485 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); 522 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
486 523
487 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) 524 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
488 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); 525 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
489#endif 526#endif /* CONFIG_X86_PTRACE_BTS */
490 527
491 528
492 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { 529 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 87d7dfdcf46c..2a8ccb9238b4 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -37,11 +37,11 @@
37#include <linux/kdebug.h> 37#include <linux/kdebug.h>
38#include <linux/tick.h> 38#include <linux/tick.h>
39#include <linux/prctl.h> 39#include <linux/prctl.h>
40#include <linux/uaccess.h>
41#include <linux/io.h>
40 42
41#include <asm/uaccess.h>
42#include <asm/pgtable.h> 43#include <asm/pgtable.h>
43#include <asm/system.h> 44#include <asm/system.h>
44#include <asm/io.h>
45#include <asm/processor.h> 45#include <asm/processor.h>
46#include <asm/i387.h> 46#include <asm/i387.h>
47#include <asm/mmu_context.h> 47#include <asm/mmu_context.h>
@@ -89,19 +89,20 @@ void exit_idle(void)
89#ifdef CONFIG_HOTPLUG_CPU 89#ifdef CONFIG_HOTPLUG_CPU
90DECLARE_PER_CPU(int, cpu_state); 90DECLARE_PER_CPU(int, cpu_state);
91 91
92#include <asm/nmi.h> 92#include <linux/nmi.h>
93/* We halt the CPU with physical CPU hotplug */ 93/* We halt the CPU with physical CPU hotplug */
94static inline void play_dead(void) 94static inline void play_dead(void)
95{ 95{
96 idle_task_exit(); 96 idle_task_exit();
97 wbinvd(); 97 c1e_remove_cpu(raw_smp_processor_id());
98
98 mb(); 99 mb();
99 /* Ack it */ 100 /* Ack it */
100 __get_cpu_var(cpu_state) = CPU_DEAD; 101 __get_cpu_var(cpu_state) = CPU_DEAD;
101 102
102 local_irq_disable(); 103 local_irq_disable();
103 while (1) 104 /* mask all interrupts, flush any and all caches, and halt */
104 halt(); 105 wbinvd_halt();
105} 106}
106#else 107#else
107static inline void play_dead(void) 108static inline void play_dead(void)
@@ -153,7 +154,7 @@ void cpu_idle(void)
153} 154}
154 155
155/* Prints also some state that isn't saved in the pt_regs */ 156/* Prints also some state that isn't saved in the pt_regs */
156void __show_regs(struct pt_regs * regs) 157void __show_regs(struct pt_regs *regs)
157{ 158{
158 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; 159 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
159 unsigned long d0, d1, d2, d3, d6, d7; 160 unsigned long d0, d1, d2, d3, d6, d7;
@@ -162,59 +163,61 @@ void __show_regs(struct pt_regs * regs)
162 163
163 printk("\n"); 164 printk("\n");
164 print_modules(); 165 print_modules();
165 printk("Pid: %d, comm: %.20s %s %s %.*s\n", 166 printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n",
166 current->pid, current->comm, print_tainted(), 167 current->pid, current->comm, print_tainted(),
167 init_utsname()->release, 168 init_utsname()->release,
168 (int)strcspn(init_utsname()->version, " "), 169 (int)strcspn(init_utsname()->version, " "),
169 init_utsname()->version); 170 init_utsname()->version);
170 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); 171 printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
171 printk_address(regs->ip, 1); 172 printk_address(regs->ip, 1);
172 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, 173 printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
173 regs->flags); 174 regs->sp, regs->flags);
174 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", 175 printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
175 regs->ax, regs->bx, regs->cx); 176 regs->ax, regs->bx, regs->cx);
176 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", 177 printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
177 regs->dx, regs->si, regs->di); 178 regs->dx, regs->si, regs->di);
178 printk("RBP: %016lx R08: %016lx R09: %016lx\n", 179 printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
179 regs->bp, regs->r8, regs->r9); 180 regs->bp, regs->r8, regs->r9);
180 printk("R10: %016lx R11: %016lx R12: %016lx\n", 181 printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
181 regs->r10, regs->r11, regs->r12); 182 regs->r10, regs->r11, regs->r12);
182 printk("R13: %016lx R14: %016lx R15: %016lx\n", 183 printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
183 regs->r13, regs->r14, regs->r15); 184 regs->r13, regs->r14, regs->r15);
184 185
185 asm("movl %%ds,%0" : "=r" (ds)); 186 asm("movl %%ds,%0" : "=r" (ds));
186 asm("movl %%cs,%0" : "=r" (cs)); 187 asm("movl %%cs,%0" : "=r" (cs));
187 asm("movl %%es,%0" : "=r" (es)); 188 asm("movl %%es,%0" : "=r" (es));
188 asm("movl %%fs,%0" : "=r" (fsindex)); 189 asm("movl %%fs,%0" : "=r" (fsindex));
189 asm("movl %%gs,%0" : "=r" (gsindex)); 190 asm("movl %%gs,%0" : "=r" (gsindex));
190 191
191 rdmsrl(MSR_FS_BASE, fs); 192 rdmsrl(MSR_FS_BASE, fs);
192 rdmsrl(MSR_GS_BASE, gs); 193 rdmsrl(MSR_GS_BASE, gs);
193 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); 194 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
194 195
195 cr0 = read_cr0(); 196 cr0 = read_cr0();
196 cr2 = read_cr2(); 197 cr2 = read_cr2();
197 cr3 = read_cr3(); 198 cr3 = read_cr3();
198 cr4 = read_cr4(); 199 cr4 = read_cr4();
199 200
200 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 201 printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
201 fs,fsindex,gs,gsindex,shadowgs); 202 fs, fsindex, gs, gsindex, shadowgs);
202 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); 203 printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
203 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); 204 es, cr0);
205 printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
206 cr4);
204 207
205 get_debugreg(d0, 0); 208 get_debugreg(d0, 0);
206 get_debugreg(d1, 1); 209 get_debugreg(d1, 1);
207 get_debugreg(d2, 2); 210 get_debugreg(d2, 2);
208 printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); 211 printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
209 get_debugreg(d3, 3); 212 get_debugreg(d3, 3);
210 get_debugreg(d6, 6); 213 get_debugreg(d6, 6);
211 get_debugreg(d7, 7); 214 get_debugreg(d7, 7);
212 printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); 215 printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
213} 216}
214 217
215void show_regs(struct pt_regs *regs) 218void show_regs(struct pt_regs *regs)
216{ 219{
217 printk("CPU %d:", smp_processor_id()); 220 printk(KERN_INFO "CPU %d:", smp_processor_id());
218 __show_regs(regs); 221 __show_regs(regs);
219 show_trace(NULL, regs, (void *)(regs + 1), regs->bp); 222 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
220} 223}
@@ -240,6 +243,14 @@ void exit_thread(void)
240 t->io_bitmap_max = 0; 243 t->io_bitmap_max = 0;
241 put_cpu(); 244 put_cpu();
242 } 245 }
246#ifdef CONFIG_X86_DS
247 /* Free any DS contexts that have not been properly released. */
248 if (unlikely(t->ds_ctx)) {
249 /* we clear debugctl to make sure DS is not used. */
250 update_debugctlmsr(0);
251 ds_free(t->ds_ctx);
252 }
253#endif /* CONFIG_X86_DS */
243} 254}
244 255
245void flush_thread(void) 256void flush_thread(void)
@@ -315,10 +326,10 @@ void prepare_to_copy(struct task_struct *tsk)
315 326
316int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, 327int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
317 unsigned long unused, 328 unsigned long unused,
318 struct task_struct * p, struct pt_regs * regs) 329 struct task_struct *p, struct pt_regs *regs)
319{ 330{
320 int err; 331 int err;
321 struct pt_regs * childregs; 332 struct pt_regs *childregs;
322 struct task_struct *me = current; 333 struct task_struct *me = current;
323 334
324 childregs = ((struct pt_regs *) 335 childregs = ((struct pt_regs *)
@@ -363,10 +374,10 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
363 if (test_thread_flag(TIF_IA32)) 374 if (test_thread_flag(TIF_IA32))
364 err = do_set_thread_area(p, -1, 375 err = do_set_thread_area(p, -1,
365 (struct user_desc __user *)childregs->si, 0); 376 (struct user_desc __user *)childregs->si, 0);
366 else 377 else
367#endif 378#endif
368 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); 379 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
369 if (err) 380 if (err)
370 goto out; 381 goto out;
371 } 382 }
372 err = 0; 383 err = 0;
@@ -473,13 +484,27 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
473 next = &next_p->thread; 484 next = &next_p->thread;
474 485
475 debugctl = prev->debugctlmsr; 486 debugctl = prev->debugctlmsr;
476 if (next->ds_area_msr != prev->ds_area_msr) { 487
477 /* we clear debugctl to make sure DS 488#ifdef CONFIG_X86_DS
478 * is not in use when we change it */ 489 {
479 debugctl = 0; 490 unsigned long ds_prev = 0, ds_next = 0;
480 update_debugctlmsr(0); 491
481 wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr); 492 if (prev->ds_ctx)
493 ds_prev = (unsigned long)prev->ds_ctx->ds;
494 if (next->ds_ctx)
495 ds_next = (unsigned long)next->ds_ctx->ds;
496
497 if (ds_next != ds_prev) {
498 /*
499 * We clear debugctl to make sure DS
500 * is not in use when we change it:
501 */
502 debugctl = 0;
503 update_debugctlmsr(0);
504 wrmsrl(MSR_IA32_DS_AREA, ds_next);
505 }
482 } 506 }
507#endif /* CONFIG_X86_DS */
483 508
484 if (next->debugctlmsr != debugctl) 509 if (next->debugctlmsr != debugctl)
485 update_debugctlmsr(next->debugctlmsr); 510 update_debugctlmsr(next->debugctlmsr);
@@ -517,13 +542,13 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
517 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); 542 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
518 } 543 }
519 544
520#ifdef X86_BTS 545#ifdef CONFIG_X86_PTRACE_BTS
521 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) 546 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
522 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); 547 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
523 548
524 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) 549 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
525 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); 550 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
526#endif 551#endif /* CONFIG_X86_PTRACE_BTS */
527} 552}
528 553
529/* 554/*
@@ -545,7 +570,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
545 unsigned fsindex, gsindex; 570 unsigned fsindex, gsindex;
546 571
547 /* we're going to use this soon, after a few expensive things */ 572 /* we're going to use this soon, after a few expensive things */
548 if (next_p->fpu_counter>5) 573 if (next_p->fpu_counter > 5)
549 prefetch(next->xstate); 574 prefetch(next->xstate);
550 575
551 /* 576 /*
@@ -553,13 +578,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
553 */ 578 */
554 load_sp0(tss, next); 579 load_sp0(tss, next);
555 580
556 /* 581 /*
557 * Switch DS and ES. 582 * Switch DS and ES.
558 * This won't pick up thread selector changes, but I guess that is ok. 583 * This won't pick up thread selector changes, but I guess that is ok.
559 */ 584 */
560 savesegment(es, prev->es); 585 savesegment(es, prev->es);
561 if (unlikely(next->es | prev->es)) 586 if (unlikely(next->es | prev->es))
562 loadsegment(es, next->es); 587 loadsegment(es, next->es);
563 588
564 savesegment(ds, prev->ds); 589 savesegment(ds, prev->ds);
565 if (unlikely(next->ds | prev->ds)) 590 if (unlikely(next->ds | prev->ds))
@@ -585,7 +610,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
585 */ 610 */
586 arch_leave_lazy_cpu_mode(); 611 arch_leave_lazy_cpu_mode();
587 612
588 /* 613 /*
589 * Switch FS and GS. 614 * Switch FS and GS.
590 * 615 *
591 * Segment register != 0 always requires a reload. Also 616 * Segment register != 0 always requires a reload. Also
@@ -594,13 +619,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
594 */ 619 */
595 if (unlikely(fsindex | next->fsindex | prev->fs)) { 620 if (unlikely(fsindex | next->fsindex | prev->fs)) {
596 loadsegment(fs, next->fsindex); 621 loadsegment(fs, next->fsindex);
597 /* 622 /*
598 * Check if the user used a selector != 0; if yes 623 * Check if the user used a selector != 0; if yes
599 * clear 64bit base, since overloaded base is always 624 * clear 64bit base, since overloaded base is always
600 * mapped to the Null selector 625 * mapped to the Null selector
601 */ 626 */
602 if (fsindex) 627 if (fsindex)
603 prev->fs = 0; 628 prev->fs = 0;
604 } 629 }
605 /* when next process has a 64bit base use it */ 630 /* when next process has a 64bit base use it */
606 if (next->fs) 631 if (next->fs)
@@ -610,7 +635,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
610 if (unlikely(gsindex | next->gsindex | prev->gs)) { 635 if (unlikely(gsindex | next->gsindex | prev->gs)) {
611 load_gs_index(next->gsindex); 636 load_gs_index(next->gsindex);
612 if (gsindex) 637 if (gsindex)
613 prev->gs = 0; 638 prev->gs = 0;
614 } 639 }
615 if (next->gs) 640 if (next->gs)
616 wrmsrl(MSR_KERNEL_GS_BASE, next->gs); 641 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
@@ -619,12 +644,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
619 /* Must be after DS reload */ 644 /* Must be after DS reload */
620 unlazy_fpu(prev_p); 645 unlazy_fpu(prev_p);
621 646
622 /* 647 /*
623 * Switch the PDA and FPU contexts. 648 * Switch the PDA and FPU contexts.
624 */ 649 */
625 prev->usersp = read_pda(oldrsp); 650 prev->usersp = read_pda(oldrsp);
626 write_pda(oldrsp, next->usersp); 651 write_pda(oldrsp, next->usersp);
627 write_pda(pcurrent, next_p); 652 write_pda(pcurrent, next_p);
628 653
629 write_pda(kernelstack, 654 write_pda(kernelstack,
630 (unsigned long)task_stack_page(next_p) + 655 (unsigned long)task_stack_page(next_p) +
@@ -665,7 +690,7 @@ long sys_execve(char __user *name, char __user * __user *argv,
665 char __user * __user *envp, struct pt_regs *regs) 690 char __user * __user *envp, struct pt_regs *regs)
666{ 691{
667 long error; 692 long error;
668 char * filename; 693 char *filename;
669 694
670 filename = getname(name); 695 filename = getname(name);
671 error = PTR_ERR(filename); 696 error = PTR_ERR(filename);
@@ -723,55 +748,55 @@ asmlinkage long sys_vfork(struct pt_regs *regs)
723unsigned long get_wchan(struct task_struct *p) 748unsigned long get_wchan(struct task_struct *p)
724{ 749{
725 unsigned long stack; 750 unsigned long stack;
726 u64 fp,ip; 751 u64 fp, ip;
727 int count = 0; 752 int count = 0;
728 753
729 if (!p || p == current || p->state==TASK_RUNNING) 754 if (!p || p == current || p->state == TASK_RUNNING)
730 return 0; 755 return 0;
731 stack = (unsigned long)task_stack_page(p); 756 stack = (unsigned long)task_stack_page(p);
732 if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE) 757 if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
733 return 0; 758 return 0;
734 fp = *(u64 *)(p->thread.sp); 759 fp = *(u64 *)(p->thread.sp);
735 do { 760 do {
736 if (fp < (unsigned long)stack || 761 if (fp < (unsigned long)stack ||
737 fp > (unsigned long)stack+THREAD_SIZE) 762 fp > (unsigned long)stack+THREAD_SIZE)
738 return 0; 763 return 0;
739 ip = *(u64 *)(fp+8); 764 ip = *(u64 *)(fp+8);
740 if (!in_sched_functions(ip)) 765 if (!in_sched_functions(ip))
741 return ip; 766 return ip;
742 fp = *(u64 *)fp; 767 fp = *(u64 *)fp;
743 } while (count++ < 16); 768 } while (count++ < 16);
744 return 0; 769 return 0;
745} 770}
746 771
747long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) 772long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
748{ 773{
749 int ret = 0; 774 int ret = 0;
750 int doit = task == current; 775 int doit = task == current;
751 int cpu; 776 int cpu;
752 777
753 switch (code) { 778 switch (code) {
754 case ARCH_SET_GS: 779 case ARCH_SET_GS:
755 if (addr >= TASK_SIZE_OF(task)) 780 if (addr >= TASK_SIZE_OF(task))
756 return -EPERM; 781 return -EPERM;
757 cpu = get_cpu(); 782 cpu = get_cpu();
758 /* handle small bases via the GDT because that's faster to 783 /* handle small bases via the GDT because that's faster to
759 switch. */ 784 switch. */
760 if (addr <= 0xffffffff) { 785 if (addr <= 0xffffffff) {
761 set_32bit_tls(task, GS_TLS, addr); 786 set_32bit_tls(task, GS_TLS, addr);
762 if (doit) { 787 if (doit) {
763 load_TLS(&task->thread, cpu); 788 load_TLS(&task->thread, cpu);
764 load_gs_index(GS_TLS_SEL); 789 load_gs_index(GS_TLS_SEL);
765 } 790 }
766 task->thread.gsindex = GS_TLS_SEL; 791 task->thread.gsindex = GS_TLS_SEL;
767 task->thread.gs = 0; 792 task->thread.gs = 0;
768 } else { 793 } else {
769 task->thread.gsindex = 0; 794 task->thread.gsindex = 0;
770 task->thread.gs = addr; 795 task->thread.gs = addr;
771 if (doit) { 796 if (doit) {
772 load_gs_index(0); 797 load_gs_index(0);
773 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); 798 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
774 } 799 }
775 } 800 }
776 put_cpu(); 801 put_cpu();
777 break; 802 break;
@@ -825,8 +850,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
825 rdmsrl(MSR_KERNEL_GS_BASE, base); 850 rdmsrl(MSR_KERNEL_GS_BASE, base);
826 else 851 else
827 base = task->thread.gs; 852 base = task->thread.gs;
828 } 853 } else
829 else
830 base = task->thread.gs; 854 base = task->thread.gs;
831 ret = put_user(base, (unsigned long __user *)addr); 855 ret = put_user(base, (unsigned long __user *)addr);
832 break; 856 break;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index fc3e8dcd9da6..e375b658efc3 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -14,6 +14,7 @@
14#include <linux/errno.h> 14#include <linux/errno.h>
15#include <linux/ptrace.h> 15#include <linux/ptrace.h>
16#include <linux/regset.h> 16#include <linux/regset.h>
17#include <linux/tracehook.h>
17#include <linux/user.h> 18#include <linux/user.h>
18#include <linux/elf.h> 19#include <linux/elf.h>
19#include <linux/security.h> 20#include <linux/security.h>
@@ -554,45 +555,115 @@ static int ptrace_set_debugreg(struct task_struct *child,
554 return 0; 555 return 0;
555} 556}
556 557
557#ifdef X86_BTS 558#ifdef CONFIG_X86_PTRACE_BTS
559/*
560 * The configuration for a particular BTS hardware implementation.
561 */
562struct bts_configuration {
563 /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */
564 unsigned char sizeof_bts;
565 /* the size of a field in the BTS record in bytes */
566 unsigned char sizeof_field;
567 /* a bitmask to enable/disable BTS in DEBUGCTL MSR */
568 unsigned long debugctl_mask;
569};
570static struct bts_configuration bts_cfg;
571
572#define BTS_MAX_RECORD_SIZE (8 * 3)
573
574
575/*
576 * Branch Trace Store (BTS) uses the following format. Different
577 * architectures vary in the size of those fields.
578 * - source linear address
579 * - destination linear address
580 * - flags
581 *
582 * Later architectures use 64bit pointers throughout, whereas earlier
583 * architectures use 32bit pointers in 32bit mode.
584 *
585 * We compute the base address for the first 8 fields based on:
586 * - the field size stored in the DS configuration
587 * - the relative field position
588 *
589 * In order to store additional information in the BTS buffer, we use
590 * a special source address to indicate that the record requires
591 * special interpretation.
592 *
593 * Netburst indicated via a bit in the flags field whether the branch
594 * was predicted; this is ignored.
595 */
596
597enum bts_field {
598 bts_from = 0,
599 bts_to,
600 bts_flags,
601
602 bts_escape = (unsigned long)-1,
603 bts_qual = bts_to,
604 bts_jiffies = bts_flags
605};
606
607static inline unsigned long bts_get(const char *base, enum bts_field field)
608{
609 base += (bts_cfg.sizeof_field * field);
610 return *(unsigned long *)base;
611}
558 612
559static int ptrace_bts_get_size(struct task_struct *child) 613static inline void bts_set(char *base, enum bts_field field, unsigned long val)
560{ 614{
561 if (!child->thread.ds_area_msr) 615 base += (bts_cfg.sizeof_field * field);;
562 return -ENXIO; 616 (*(unsigned long *)base) = val;
617}
563 618
564 return ds_get_bts_index((void *)child->thread.ds_area_msr); 619/*
620 * Translate a BTS record from the raw format into the bts_struct format
621 *
622 * out (out): bts_struct interpretation
623 * raw: raw BTS record
624 */
625static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw)
626{
627 memset(out, 0, sizeof(*out));
628 if (bts_get(raw, bts_from) == bts_escape) {
629 out->qualifier = bts_get(raw, bts_qual);
630 out->variant.jiffies = bts_get(raw, bts_jiffies);
631 } else {
632 out->qualifier = BTS_BRANCH;
633 out->variant.lbr.from_ip = bts_get(raw, bts_from);
634 out->variant.lbr.to_ip = bts_get(raw, bts_to);
635 }
565} 636}
566 637
567static int ptrace_bts_read_record(struct task_struct *child, 638static int ptrace_bts_read_record(struct task_struct *child, size_t index,
568 long index,
569 struct bts_struct __user *out) 639 struct bts_struct __user *out)
570{ 640{
571 struct bts_struct ret; 641 struct bts_struct ret;
572 int retval; 642 const void *bts_record;
573 int bts_end; 643 size_t bts_index, bts_end;
574 int bts_index; 644 int error;
575
576 if (!child->thread.ds_area_msr)
577 return -ENXIO;
578 645
579 if (index < 0) 646 error = ds_get_bts_end(child, &bts_end);
580 return -EINVAL; 647 if (error < 0)
648 return error;
581 649
582 bts_end = ds_get_bts_end((void *)child->thread.ds_area_msr);
583 if (bts_end <= index) 650 if (bts_end <= index)
584 return -EINVAL; 651 return -EINVAL;
585 652
653 error = ds_get_bts_index(child, &bts_index);
654 if (error < 0)
655 return error;
656
586 /* translate the ptrace bts index into the ds bts index */ 657 /* translate the ptrace bts index into the ds bts index */
587 bts_index = ds_get_bts_index((void *)child->thread.ds_area_msr); 658 bts_index += bts_end - (index + 1);
588 bts_index -= (index + 1); 659 if (bts_end <= bts_index)
589 if (bts_index < 0) 660 bts_index -= bts_end;
590 bts_index += bts_end;
591 661
592 retval = ds_read_bts((void *)child->thread.ds_area_msr, 662 error = ds_access_bts(child, bts_index, &bts_record);
593 bts_index, &ret); 663 if (error < 0)
594 if (retval < 0) 664 return error;
595 return retval; 665
666 ptrace_bts_translate_record(&ret, bts_record);
596 667
597 if (copy_to_user(out, &ret, sizeof(ret))) 668 if (copy_to_user(out, &ret, sizeof(ret)))
598 return -EFAULT; 669 return -EFAULT;
@@ -600,101 +671,106 @@ static int ptrace_bts_read_record(struct task_struct *child,
600 return sizeof(ret); 671 return sizeof(ret);
601} 672}
602 673
603static int ptrace_bts_clear(struct task_struct *child)
604{
605 if (!child->thread.ds_area_msr)
606 return -ENXIO;
607
608 return ds_clear((void *)child->thread.ds_area_msr);
609}
610
611static int ptrace_bts_drain(struct task_struct *child, 674static int ptrace_bts_drain(struct task_struct *child,
612 long size, 675 long size,
613 struct bts_struct __user *out) 676 struct bts_struct __user *out)
614{ 677{
615 int end, i; 678 struct bts_struct ret;
616 void *ds = (void *)child->thread.ds_area_msr; 679 const unsigned char *raw;
617 680 size_t end, i;
618 if (!ds) 681 int error;
619 return -ENXIO;
620 682
621 end = ds_get_bts_index(ds); 683 error = ds_get_bts_index(child, &end);
622 if (end <= 0) 684 if (error < 0)
623 return end; 685 return error;
624 686
625 if (size < (end * sizeof(struct bts_struct))) 687 if (size < (end * sizeof(struct bts_struct)))
626 return -EIO; 688 return -EIO;
627 689
628 for (i = 0; i < end; i++, out++) { 690 error = ds_access_bts(child, 0, (const void **)&raw);
629 struct bts_struct ret; 691 if (error < 0)
630 int retval; 692 return error;
631 693
632 retval = ds_read_bts(ds, i, &ret); 694 for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) {
633 if (retval < 0) 695 ptrace_bts_translate_record(&ret, raw);
634 return retval;
635 696
636 if (copy_to_user(out, &ret, sizeof(ret))) 697 if (copy_to_user(out, &ret, sizeof(ret)))
637 return -EFAULT; 698 return -EFAULT;
638 } 699 }
639 700
640 ds_clear(ds); 701 error = ds_clear_bts(child);
702 if (error < 0)
703 return error;
641 704
642 return end; 705 return end;
643} 706}
644 707
708static void ptrace_bts_ovfl(struct task_struct *child)
709{
710 send_sig(child->thread.bts_ovfl_signal, child, 0);
711}
712
645static int ptrace_bts_config(struct task_struct *child, 713static int ptrace_bts_config(struct task_struct *child,
646 long cfg_size, 714 long cfg_size,
647 const struct ptrace_bts_config __user *ucfg) 715 const struct ptrace_bts_config __user *ucfg)
648{ 716{
649 struct ptrace_bts_config cfg; 717 struct ptrace_bts_config cfg;
650 int bts_size, ret = 0; 718 int error = 0;
651 void *ds; 719
720 error = -EOPNOTSUPP;
721 if (!bts_cfg.sizeof_bts)
722 goto errout;
652 723
724 error = -EIO;
653 if (cfg_size < sizeof(cfg)) 725 if (cfg_size < sizeof(cfg))
654 return -EIO; 726 goto errout;
655 727
728 error = -EFAULT;
656 if (copy_from_user(&cfg, ucfg, sizeof(cfg))) 729 if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
657 return -EFAULT; 730 goto errout;
658 731
659 if ((int)cfg.size < 0) 732 error = -EINVAL;
660 return -EINVAL; 733 if ((cfg.flags & PTRACE_BTS_O_SIGNAL) &&
734 !(cfg.flags & PTRACE_BTS_O_ALLOC))
735 goto errout;
661 736
662 bts_size = 0; 737 if (cfg.flags & PTRACE_BTS_O_ALLOC) {
663 ds = (void *)child->thread.ds_area_msr; 738 ds_ovfl_callback_t ovfl = NULL;
664 if (ds) { 739 unsigned int sig = 0;
665 bts_size = ds_get_bts_size(ds); 740
666 if (bts_size < 0) 741 /* we ignore the error in case we were not tracing child */
667 return bts_size; 742 (void)ds_release_bts(child);
668 }
669 cfg.size = PAGE_ALIGN(cfg.size);
670 743
671 if (bts_size != cfg.size) { 744 if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
672 ret = ptrace_bts_realloc(child, cfg.size, 745 if (!cfg.signal)
673 cfg.flags & PTRACE_BTS_O_CUT_SIZE); 746 goto errout;
674 if (ret < 0) 747
748 sig = cfg.signal;
749 ovfl = ptrace_bts_ovfl;
750 }
751
752 error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl);
753 if (error < 0)
675 goto errout; 754 goto errout;
676 755
677 ds = (void *)child->thread.ds_area_msr; 756 child->thread.bts_ovfl_signal = sig;
678 } 757 }
679 758
680 if (cfg.flags & PTRACE_BTS_O_SIGNAL) 759 error = -EINVAL;
681 ret = ds_set_overflow(ds, DS_O_SIGNAL); 760 if (!child->thread.ds_ctx && cfg.flags)
682 else
683 ret = ds_set_overflow(ds, DS_O_WRAP);
684 if (ret < 0)
685 goto errout; 761 goto errout;
686 762
687 if (cfg.flags & PTRACE_BTS_O_TRACE) 763 if (cfg.flags & PTRACE_BTS_O_TRACE)
688 child->thread.debugctlmsr |= ds_debugctl_mask(); 764 child->thread.debugctlmsr |= bts_cfg.debugctl_mask;
689 else 765 else
690 child->thread.debugctlmsr &= ~ds_debugctl_mask(); 766 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
691 767
692 if (cfg.flags & PTRACE_BTS_O_SCHED) 768 if (cfg.flags & PTRACE_BTS_O_SCHED)
693 set_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 769 set_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
694 else 770 else
695 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 771 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
696 772
697 ret = sizeof(cfg); 773 error = sizeof(cfg);
698 774
699out: 775out:
700 if (child->thread.debugctlmsr) 776 if (child->thread.debugctlmsr)
@@ -702,10 +778,10 @@ out:
702 else 778 else
703 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 779 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
704 780
705 return ret; 781 return error;
706 782
707errout: 783errout:
708 child->thread.debugctlmsr &= ~ds_debugctl_mask(); 784 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
709 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 785 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
710 goto out; 786 goto out;
711} 787}
@@ -714,29 +790,40 @@ static int ptrace_bts_status(struct task_struct *child,
714 long cfg_size, 790 long cfg_size,
715 struct ptrace_bts_config __user *ucfg) 791 struct ptrace_bts_config __user *ucfg)
716{ 792{
717 void *ds = (void *)child->thread.ds_area_msr;
718 struct ptrace_bts_config cfg; 793 struct ptrace_bts_config cfg;
794 size_t end;
795 const void *base, *max;
796 int error;
719 797
720 if (cfg_size < sizeof(cfg)) 798 if (cfg_size < sizeof(cfg))
721 return -EIO; 799 return -EIO;
722 800
723 memset(&cfg, 0, sizeof(cfg)); 801 error = ds_get_bts_end(child, &end);
802 if (error < 0)
803 return error;
724 804
725 if (ds) { 805 error = ds_access_bts(child, /* index = */ 0, &base);
726 cfg.size = ds_get_bts_size(ds); 806 if (error < 0)
807 return error;
727 808
728 if (ds_get_overflow(ds) == DS_O_SIGNAL) 809 error = ds_access_bts(child, /* index = */ end, &max);
729 cfg.flags |= PTRACE_BTS_O_SIGNAL; 810 if (error < 0)
811 return error;
730 812
731 if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && 813 memset(&cfg, 0, sizeof(cfg));
732 child->thread.debugctlmsr & ds_debugctl_mask()) 814 cfg.size = (max - base);
733 cfg.flags |= PTRACE_BTS_O_TRACE; 815 cfg.signal = child->thread.bts_ovfl_signal;
816 cfg.bts_size = sizeof(struct bts_struct);
734 817
735 if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) 818 if (cfg.signal)
736 cfg.flags |= PTRACE_BTS_O_SCHED; 819 cfg.flags |= PTRACE_BTS_O_SIGNAL;
737 }
738 820
739 cfg.bts_size = sizeof(struct bts_struct); 821 if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) &&
822 child->thread.debugctlmsr & bts_cfg.debugctl_mask)
823 cfg.flags |= PTRACE_BTS_O_TRACE;
824
825 if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS))
826 cfg.flags |= PTRACE_BTS_O_SCHED;
740 827
741 if (copy_to_user(ucfg, &cfg, sizeof(cfg))) 828 if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
742 return -EFAULT; 829 return -EFAULT;
@@ -744,89 +831,38 @@ static int ptrace_bts_status(struct task_struct *child,
744 return sizeof(cfg); 831 return sizeof(cfg);
745} 832}
746 833
747
748static int ptrace_bts_write_record(struct task_struct *child, 834static int ptrace_bts_write_record(struct task_struct *child,
749 const struct bts_struct *in) 835 const struct bts_struct *in)
750{ 836{
751 int retval; 837 unsigned char bts_record[BTS_MAX_RECORD_SIZE];
752 838
753 if (!child->thread.ds_area_msr) 839 BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts);
754 return -ENXIO;
755 840
756 retval = ds_write_bts((void *)child->thread.ds_area_msr, in); 841 memset(bts_record, 0, bts_cfg.sizeof_bts);
757 if (retval) 842 switch (in->qualifier) {
758 return retval; 843 case BTS_INVALID:
844 break;
759 845
760 return sizeof(*in); 846 case BTS_BRANCH:
761} 847 bts_set(bts_record, bts_from, in->variant.lbr.from_ip);
848 bts_set(bts_record, bts_to, in->variant.lbr.to_ip);
849 break;
762 850
763static int ptrace_bts_realloc(struct task_struct *child, 851 case BTS_TASK_ARRIVES:
764 int size, int reduce_size) 852 case BTS_TASK_DEPARTS:
765{ 853 bts_set(bts_record, bts_from, bts_escape);
766 unsigned long rlim, vm; 854 bts_set(bts_record, bts_qual, in->qualifier);
767 int ret, old_size; 855 bts_set(bts_record, bts_jiffies, in->variant.jiffies);
856 break;
768 857
769 if (size < 0) 858 default:
770 return -EINVAL; 859 return -EINVAL;
771
772 old_size = ds_get_bts_size((void *)child->thread.ds_area_msr);
773 if (old_size < 0)
774 return old_size;
775
776 ret = ds_free((void **)&child->thread.ds_area_msr);
777 if (ret < 0)
778 goto out;
779
780 size >>= PAGE_SHIFT;
781 old_size >>= PAGE_SHIFT;
782
783 current->mm->total_vm -= old_size;
784 current->mm->locked_vm -= old_size;
785
786 if (size == 0)
787 goto out;
788
789 rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
790 vm = current->mm->total_vm + size;
791 if (rlim < vm) {
792 ret = -ENOMEM;
793
794 if (!reduce_size)
795 goto out;
796
797 size = rlim - current->mm->total_vm;
798 if (size <= 0)
799 goto out;
800 }
801
802 rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
803 vm = current->mm->locked_vm + size;
804 if (rlim < vm) {
805 ret = -ENOMEM;
806
807 if (!reduce_size)
808 goto out;
809
810 size = rlim - current->mm->locked_vm;
811 if (size <= 0)
812 goto out;
813 } 860 }
814 861
815 ret = ds_allocate((void **)&child->thread.ds_area_msr, 862 /* The writing task will be the switched-to task on a context
816 size << PAGE_SHIFT); 863 * switch. It needs to write into the switched-from task's BTS
817 if (ret < 0) 864 * buffer. */
818 goto out; 865 return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts);
819
820 current->mm->total_vm += size;
821 current->mm->locked_vm += size;
822
823out:
824 if (child->thread.ds_area_msr)
825 set_tsk_thread_flag(child, TIF_DS_AREA_MSR);
826 else
827 clear_tsk_thread_flag(child, TIF_DS_AREA_MSR);
828
829 return ret;
830} 866}
831 867
832void ptrace_bts_take_timestamp(struct task_struct *tsk, 868void ptrace_bts_take_timestamp(struct task_struct *tsk,
@@ -839,7 +875,66 @@ void ptrace_bts_take_timestamp(struct task_struct *tsk,
839 875
840 ptrace_bts_write_record(tsk, &rec); 876 ptrace_bts_write_record(tsk, &rec);
841} 877}
842#endif /* X86_BTS */ 878
879static const struct bts_configuration bts_cfg_netburst = {
880 .sizeof_bts = sizeof(long) * 3,
881 .sizeof_field = sizeof(long),
882 .debugctl_mask = (1<<2)|(1<<3)|(1<<5)
883};
884
885static const struct bts_configuration bts_cfg_pentium_m = {
886 .sizeof_bts = sizeof(long) * 3,
887 .sizeof_field = sizeof(long),
888 .debugctl_mask = (1<<6)|(1<<7)
889};
890
891static const struct bts_configuration bts_cfg_core2 = {
892 .sizeof_bts = 8 * 3,
893 .sizeof_field = 8,
894 .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
895};
896
897static inline void bts_configure(const struct bts_configuration *cfg)
898{
899 bts_cfg = *cfg;
900}
901
902void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c)
903{
904 switch (c->x86) {
905 case 0x6:
906 switch (c->x86_model) {
907 case 0xD:
908 case 0xE: /* Pentium M */
909 bts_configure(&bts_cfg_pentium_m);
910 break;
911 case 0xF: /* Core2 */
912 case 0x1C: /* Atom */
913 bts_configure(&bts_cfg_core2);
914 break;
915 default:
916 /* sorry, don't know about them */
917 break;
918 }
919 break;
920 case 0xF:
921 switch (c->x86_model) {
922 case 0x0:
923 case 0x1:
924 case 0x2: /* Netburst */
925 bts_configure(&bts_cfg_netburst);
926 break;
927 default:
928 /* sorry, don't know about them */
929 break;
930 }
931 break;
932 default:
933 /* sorry, don't know about them */
934 break;
935 }
936}
937#endif /* CONFIG_X86_PTRACE_BTS */
843 938
844/* 939/*
845 * Called by kernel/ptrace.c when detaching.. 940 * Called by kernel/ptrace.c when detaching..
@@ -852,15 +947,15 @@ void ptrace_disable(struct task_struct *child)
852#ifdef TIF_SYSCALL_EMU 947#ifdef TIF_SYSCALL_EMU
853 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); 948 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
854#endif 949#endif
855 if (child->thread.ds_area_msr) { 950#ifdef CONFIG_X86_PTRACE_BTS
856#ifdef X86_BTS 951 (void)ds_release_bts(child);
857 ptrace_bts_realloc(child, 0, 0); 952
858#endif 953 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
859 child->thread.debugctlmsr &= ~ds_debugctl_mask(); 954 if (!child->thread.debugctlmsr)
860 if (!child->thread.debugctlmsr) 955 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
861 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 956
862 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 957 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
863 } 958#endif /* CONFIG_X86_PTRACE_BTS */
864} 959}
865 960
866#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 961#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
@@ -980,7 +1075,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
980 /* 1075 /*
981 * These bits need more cooking - not enabled yet: 1076 * These bits need more cooking - not enabled yet:
982 */ 1077 */
983#ifdef X86_BTS 1078#ifdef CONFIG_X86_PTRACE_BTS
984 case PTRACE_BTS_CONFIG: 1079 case PTRACE_BTS_CONFIG:
985 ret = ptrace_bts_config 1080 ret = ptrace_bts_config
986 (child, data, (struct ptrace_bts_config __user *)addr); 1081 (child, data, (struct ptrace_bts_config __user *)addr);
@@ -992,7 +1087,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
992 break; 1087 break;
993 1088
994 case PTRACE_BTS_SIZE: 1089 case PTRACE_BTS_SIZE:
995 ret = ptrace_bts_get_size(child); 1090 ret = ds_get_bts_index(child, /* pos = */ NULL);
996 break; 1091 break;
997 1092
998 case PTRACE_BTS_GET: 1093 case PTRACE_BTS_GET:
@@ -1001,14 +1096,14 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
1001 break; 1096 break;
1002 1097
1003 case PTRACE_BTS_CLEAR: 1098 case PTRACE_BTS_CLEAR:
1004 ret = ptrace_bts_clear(child); 1099 ret = ds_clear_bts(child);
1005 break; 1100 break;
1006 1101
1007 case PTRACE_BTS_DRAIN: 1102 case PTRACE_BTS_DRAIN:
1008 ret = ptrace_bts_drain 1103 ret = ptrace_bts_drain
1009 (child, data, (struct bts_struct __user *) addr); 1104 (child, data, (struct bts_struct __user *) addr);
1010 break; 1105 break;
1011#endif 1106#endif /* CONFIG_X86_PTRACE_BTS */
1012 1107
1013 default: 1108 default:
1014 ret = ptrace_request(child, request, addr, data); 1109 ret = ptrace_request(child, request, addr, data);
@@ -1375,30 +1470,6 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1375 force_sig_info(SIGTRAP, &info, tsk); 1470 force_sig_info(SIGTRAP, &info, tsk);
1376} 1471}
1377 1472
1378static void syscall_trace(struct pt_regs *regs)
1379{
1380 if (!(current->ptrace & PT_PTRACED))
1381 return;
1382
1383#if 0
1384 printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n",
1385 current->comm,
1386 regs->ip, regs->sp, regs->ax, regs->orig_ax, __builtin_return_address(0),
1387 current_thread_info()->flags, current->ptrace);
1388#endif
1389
1390 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
1391 ? 0x80 : 0));
1392 /*
1393 * this isn't the same as continuing with a signal, but it will do
1394 * for normal use. strace only continues with a signal if the
1395 * stopping signal is not SIGTRAP. -brl
1396 */
1397 if (current->exit_code) {
1398 send_sig(current->exit_code, current, 1);
1399 current->exit_code = 0;
1400 }
1401}
1402 1473
1403#ifdef CONFIG_X86_32 1474#ifdef CONFIG_X86_32
1404# define IS_IA32 1 1475# define IS_IA32 1
@@ -1432,8 +1503,9 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
1432 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) 1503 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
1433 ret = -1L; 1504 ret = -1L;
1434 1505
1435 if (ret || test_thread_flag(TIF_SYSCALL_TRACE)) 1506 if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
1436 syscall_trace(regs); 1507 tracehook_report_syscall_entry(regs))
1508 ret = -1L;
1437 1509
1438 if (unlikely(current->audit_context)) { 1510 if (unlikely(current->audit_context)) {
1439 if (IS_IA32) 1511 if (IS_IA32)
@@ -1459,7 +1531,7 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
1459 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); 1531 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1460 1532
1461 if (test_thread_flag(TIF_SYSCALL_TRACE)) 1533 if (test_thread_flag(TIF_SYSCALL_TRACE))
1462 syscall_trace(regs); 1534 tracehook_report_syscall_exit(regs, 0);
1463 1535
1464 /* 1536 /*
1465 * If TIF_SYSCALL_EMU is set, we only get here because of 1537 * If TIF_SYSCALL_EMU is set, we only get here because of
@@ -1475,6 +1547,6 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
1475 * system call instruction. 1547 * system call instruction.
1476 */ 1548 */
1477 if (test_thread_flag(TIF_SINGLESTEP) && 1549 if (test_thread_flag(TIF_SINGLESTEP) &&
1478 (current->ptrace & PT_PTRACED)) 1550 tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL))
1479 send_sigtrap(current, regs, 0); 1551 send_sigtrap(current, regs, 0);
1480} 1552}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 724adfc63cb9..f4c93f1cfc19 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -29,7 +29,11 @@ EXPORT_SYMBOL(pm_power_off);
29 29
30static const struct desc_ptr no_idt = {}; 30static const struct desc_ptr no_idt = {};
31static int reboot_mode; 31static int reboot_mode;
32enum reboot_type reboot_type = BOOT_KBD; 32/*
33 * Keyboard reset and triple fault may result in INIT, not RESET, which
34 * doesn't work when we're in vmx root mode. Try ACPI first.
35 */
36enum reboot_type reboot_type = BOOT_ACPI;
33int reboot_force; 37int reboot_force;
34 38
35#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) 39#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index 703310a99023..6f50664b2ba5 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -20,10 +20,11 @@
20#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) 20#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
21#define PAE_PGD_ATTR (_PAGE_PRESENT) 21#define PAE_PGD_ATTR (_PAGE_PRESENT)
22 22
23/* control_page + PAGE_SIZE/2 ~ control_page + PAGE_SIZE * 3/4 are 23/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE
24 * used to save some data for jumping back 24 * ~ control_page + PAGE_SIZE are used as data storage and stack for
25 * jumping back
25 */ 26 */
26#define DATA(offset) (PAGE_SIZE/2+(offset)) 27#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
27 28
28/* Minimal CPU state */ 29/* Minimal CPU state */
29#define ESP DATA(0x0) 30#define ESP DATA(0x0)
@@ -376,3 +377,6 @@ swap_pages:
376 popl %ebx 377 popl %ebx
377 popl %ebp 378 popl %ebp
378 ret 379 ret
380
381 .globl kexec_control_code_size
382.set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 59f07e14d083..46c98efbbf8d 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -223,6 +223,9 @@ unsigned long saved_video_mode;
223#define RAMDISK_LOAD_FLAG 0x4000 223#define RAMDISK_LOAD_FLAG 0x4000
224 224
225static char __initdata command_line[COMMAND_LINE_SIZE]; 225static char __initdata command_line[COMMAND_LINE_SIZE];
226#ifdef CONFIG_CMDLINE_BOOL
227static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
228#endif
226 229
227#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) 230#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
228struct edd edd; 231struct edd edd;
@@ -445,7 +448,7 @@ static void __init reserve_early_setup_data(void)
445 * @size: Size of the crashkernel memory to reserve. 448 * @size: Size of the crashkernel memory to reserve.
446 * Returns the base address on success, and -1ULL on failure. 449 * Returns the base address on success, and -1ULL on failure.
447 */ 450 */
448unsigned long long find_and_reserve_crashkernel(unsigned long long size) 451unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
449{ 452{
450 const unsigned long long alignment = 16<<20; /* 16M */ 453 const unsigned long long alignment = 16<<20; /* 16M */
451 unsigned long long start = 0LL; 454 unsigned long long start = 0LL;
@@ -604,14 +607,6 @@ void __init setup_arch(char **cmdline_p)
604 early_cpu_init(); 607 early_cpu_init();
605 early_ioremap_init(); 608 early_ioremap_init();
606 609
607#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
608 /*
609 * Must be before kernel pagetables are setup
610 * or fixmap area is touched.
611 */
612 vmi_init();
613#endif
614
615 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); 610 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
616 screen_info = boot_params.screen_info; 611 screen_info = boot_params.screen_info;
617 edid_info = boot_params.edid_info; 612 edid_info = boot_params.edid_info;
@@ -673,11 +668,36 @@ void __init setup_arch(char **cmdline_p)
673 bss_resource.start = virt_to_phys(&__bss_start); 668 bss_resource.start = virt_to_phys(&__bss_start);
674 bss_resource.end = virt_to_phys(&__bss_stop)-1; 669 bss_resource.end = virt_to_phys(&__bss_stop)-1;
675 670
671#ifdef CONFIG_CMDLINE_BOOL
672#ifdef CONFIG_CMDLINE_OVERRIDE
673 strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
674#else
675 if (builtin_cmdline[0]) {
676 /* append boot loader cmdline to builtin */
677 strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
678 strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
679 strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
680 }
681#endif
682#endif
683
676 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); 684 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
677 *cmdline_p = command_line; 685 *cmdline_p = command_line;
678 686
679 parse_early_param(); 687 parse_early_param();
680 688
689#ifdef CONFIG_X86_64
690 check_efer();
691#endif
692
693#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
694 /*
695 * Must be before kernel pagetables are setup
696 * or fixmap area is touched.
697 */
698 vmi_init();
699#endif
700
681 /* after early param, so could get panic from serial */ 701 /* after early param, so could get panic from serial */
682 reserve_early_setup_data(); 702 reserve_early_setup_data();
683 703
@@ -738,7 +758,6 @@ void __init setup_arch(char **cmdline_p)
738#else 758#else
739 num_physpages = max_pfn; 759 num_physpages = max_pfn;
740 760
741 check_efer();
742 if (cpu_has_x2apic) 761 if (cpu_has_x2apic)
743 check_x2apic(); 762 check_x2apic();
744 763
diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h
index 72bbb519d2dc..8b4956e800ac 100644
--- a/arch/x86/kernel/sigframe.h
+++ b/arch/x86/kernel/sigframe.h
@@ -24,4 +24,9 @@ struct rt_sigframe {
24 struct ucontext uc; 24 struct ucontext uc;
25 struct siginfo info; 25 struct siginfo info;
26}; 26};
27
28int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
29 sigset_t *set, struct pt_regs *regs);
30int ia32_setup_frame(int sig, struct k_sigaction *ka,
31 sigset_t *set, struct pt_regs *regs);
27#endif 32#endif
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index 0c727f64e79b..2a2435d3037d 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -17,6 +17,7 @@
17#include <linux/errno.h> 17#include <linux/errno.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/wait.h> 19#include <linux/wait.h>
20#include <linux/tracehook.h>
20#include <linux/elf.h> 21#include <linux/elf.h>
21#include <linux/smp.h> 22#include <linux/smp.h>
22#include <linux/mm.h> 23#include <linux/mm.h>
@@ -559,8 +560,6 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
559 * handler too. 560 * handler too.
560 */ 561 */
561 regs->flags &= ~X86_EFLAGS_TF; 562 regs->flags &= ~X86_EFLAGS_TF;
562 if (test_thread_flag(TIF_SINGLESTEP))
563 ptrace_notify(SIGTRAP);
564 563
565 spin_lock_irq(&current->sighand->siglock); 564 spin_lock_irq(&current->sighand->siglock);
566 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask); 565 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
@@ -569,6 +568,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
569 recalc_sigpending(); 568 recalc_sigpending();
570 spin_unlock_irq(&current->sighand->siglock); 569 spin_unlock_irq(&current->sighand->siglock);
571 570
571 tracehook_signal_handler(sig, info, ka, regs,
572 test_thread_flag(TIF_SINGLESTEP));
573
572 return 0; 574 return 0;
573} 575}
574 576
@@ -662,5 +664,10 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
662 if (thread_info_flags & _TIF_SIGPENDING) 664 if (thread_info_flags & _TIF_SIGPENDING)
663 do_signal(regs); 665 do_signal(regs);
664 666
667 if (thread_info_flags & _TIF_NOTIFY_RESUME) {
668 clear_thread_flag(TIF_NOTIFY_RESUME);
669 tracehook_notify_resume(regs);
670 }
671
665 clear_thread_flag(TIF_IRET); 672 clear_thread_flag(TIF_IRET);
666} 673}
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index 2f1464050059..694aa888bb19 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -15,17 +15,20 @@
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/wait.h> 16#include <linux/wait.h>
17#include <linux/ptrace.h> 17#include <linux/ptrace.h>
18#include <linux/tracehook.h>
18#include <linux/unistd.h> 19#include <linux/unistd.h>
19#include <linux/stddef.h> 20#include <linux/stddef.h>
20#include <linux/personality.h> 21#include <linux/personality.h>
21#include <linux/compiler.h> 22#include <linux/compiler.h>
23#include <linux/uaccess.h>
24
22#include <asm/processor.h> 25#include <asm/processor.h>
23#include <asm/ucontext.h> 26#include <asm/ucontext.h>
24#include <asm/uaccess.h>
25#include <asm/i387.h> 27#include <asm/i387.h>
26#include <asm/proto.h> 28#include <asm/proto.h>
27#include <asm/ia32_unistd.h> 29#include <asm/ia32_unistd.h>
28#include <asm/mce.h> 30#include <asm/mce.h>
31#include <asm/syscall.h>
29#include <asm/syscalls.h> 32#include <asm/syscalls.h>
30#include "sigframe.h" 33#include "sigframe.h"
31 34
@@ -42,11 +45,6 @@
42# define FIX_EFLAGS __FIX_EFLAGS 45# define FIX_EFLAGS __FIX_EFLAGS
43#endif 46#endif
44 47
45int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
46 sigset_t *set, struct pt_regs * regs);
47int ia32_setup_frame(int sig, struct k_sigaction *ka,
48 sigset_t *set, struct pt_regs * regs);
49
50asmlinkage long 48asmlinkage long
51sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, 49sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
52 struct pt_regs *regs) 50 struct pt_regs *regs)
@@ -129,7 +127,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
129 /* Always make any pending restarted system calls return -EINTR */ 127 /* Always make any pending restarted system calls return -EINTR */
130 current_thread_info()->restart_block.fn = do_no_restart_syscall; 128 current_thread_info()->restart_block.fn = do_no_restart_syscall;
131 129
132#define COPY(x) err |= __get_user(regs->x, &sc->x) 130#define COPY(x) (err |= __get_user(regs->x, &sc->x))
133 131
134 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); 132 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
135 COPY(dx); COPY(cx); COPY(ip); 133 COPY(dx); COPY(cx); COPY(ip);
@@ -159,7 +157,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
159 } 157 }
160 158
161 { 159 {
162 struct _fpstate __user * buf; 160 struct _fpstate __user *buf;
163 err |= __get_user(buf, &sc->fpstate); 161 err |= __get_user(buf, &sc->fpstate);
164 162
165 if (buf) { 163 if (buf) {
@@ -199,7 +197,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
199 current->blocked = set; 197 current->blocked = set;
200 recalc_sigpending(); 198 recalc_sigpending();
201 spin_unlock_irq(&current->sighand->siglock); 199 spin_unlock_irq(&current->sighand->siglock);
202 200
203 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 201 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
204 goto badframe; 202 goto badframe;
205 203
@@ -209,16 +207,17 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
209 return ax; 207 return ax;
210 208
211badframe: 209badframe:
212 signal_fault(regs,frame,"sigreturn"); 210 signal_fault(regs, frame, "sigreturn");
213 return 0; 211 return 0;
214} 212}
215 213
216/* 214/*
217 * Set up a signal frame. 215 * Set up a signal frame.
218 */ 216 */
219 217
220static inline int 218static inline int
221setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me) 219setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
220 unsigned long mask, struct task_struct *me)
222{ 221{
223 int err = 0; 222 int err = 0;
224 223
@@ -274,35 +273,35 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
274} 273}
275 274
276static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 275static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
277 sigset_t *set, struct pt_regs * regs) 276 sigset_t *set, struct pt_regs *regs)
278{ 277{
279 struct rt_sigframe __user *frame; 278 struct rt_sigframe __user *frame;
280 struct _fpstate __user *fp = NULL; 279 struct _fpstate __user *fp = NULL;
281 int err = 0; 280 int err = 0;
282 struct task_struct *me = current; 281 struct task_struct *me = current;
283 282
284 if (used_math()) { 283 if (used_math()) {
285 fp = get_stack(ka, regs, sizeof(struct _fpstate)); 284 fp = get_stack(ka, regs, sizeof(struct _fpstate));
286 frame = (void __user *)round_down( 285 frame = (void __user *)round_down(
287 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; 286 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
288 287
289 if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate))) 288 if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate)))
290 goto give_sigsegv; 289 goto give_sigsegv;
291 290
292 if (save_i387(fp) < 0) 291 if (save_i387(fp) < 0)
293 err |= -1; 292 err |= -1;
294 } else 293 } else
295 frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; 294 frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
296 295
297 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 296 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
298 goto give_sigsegv; 297 goto give_sigsegv;
299 298
300 if (ka->sa.sa_flags & SA_SIGINFO) { 299 if (ka->sa.sa_flags & SA_SIGINFO) {
301 err |= copy_siginfo_to_user(&frame->info, info); 300 err |= copy_siginfo_to_user(&frame->info, info);
302 if (err) 301 if (err)
303 goto give_sigsegv; 302 goto give_sigsegv;
304 } 303 }
305 304
306 /* Create the ucontext. */ 305 /* Create the ucontext. */
307 err |= __put_user(0, &frame->uc.uc_flags); 306 err |= __put_user(0, &frame->uc.uc_flags);
308 err |= __put_user(0, &frame->uc.uc_link); 307 err |= __put_user(0, &frame->uc.uc_link);
@@ -312,9 +311,9 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
312 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); 311 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
313 err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me); 312 err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
314 err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate); 313 err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
315 if (sizeof(*set) == 16) { 314 if (sizeof(*set) == 16) {
316 __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); 315 __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
317 __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); 316 __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
318 } else 317 } else
319 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); 318 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
320 319
@@ -325,7 +324,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
325 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); 324 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
326 } else { 325 } else {
327 /* could use a vstub here */ 326 /* could use a vstub here */
328 goto give_sigsegv; 327 goto give_sigsegv;
329 } 328 }
330 329
331 if (err) 330 if (err)
@@ -333,7 +332,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
333 332
334 /* Set up registers for signal handler */ 333 /* Set up registers for signal handler */
335 regs->di = sig; 334 regs->di = sig;
336 /* In case the signal handler was declared without prototypes */ 335 /* In case the signal handler was declared without prototypes */
337 regs->ax = 0; 336 regs->ax = 0;
338 337
339 /* This also works for non SA_SIGINFO handlers because they expect the 338 /* This also works for non SA_SIGINFO handlers because they expect the
@@ -356,37 +355,8 @@ give_sigsegv:
356} 355}
357 356
358/* 357/*
359 * Return -1L or the syscall number that @regs is executing.
360 */
361static long current_syscall(struct pt_regs *regs)
362{
363 /*
364 * We always sign-extend a -1 value being set here,
365 * so this is always either -1L or a syscall number.
366 */
367 return regs->orig_ax;
368}
369
370/*
371 * Return a value that is -EFOO if the system call in @regs->orig_ax
372 * returned an error. This only works for @regs from @current.
373 */
374static long current_syscall_ret(struct pt_regs *regs)
375{
376#ifdef CONFIG_IA32_EMULATION
377 if (test_thread_flag(TIF_IA32))
378 /*
379 * Sign-extend the value so (int)-EFOO becomes (long)-EFOO
380 * and will match correctly in comparisons.
381 */
382 return (int) regs->ax;
383#endif
384 return regs->ax;
385}
386
387/*
388 * OK, we're invoking a handler 358 * OK, we're invoking a handler
389 */ 359 */
390 360
391static int 361static int
392handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, 362handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
@@ -395,9 +365,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
395 int ret; 365 int ret;
396 366
397 /* Are we from a system call? */ 367 /* Are we from a system call? */
398 if (current_syscall(regs) >= 0) { 368 if (syscall_get_nr(current, regs) >= 0) {
399 /* If so, check system call restarting.. */ 369 /* If so, check system call restarting.. */
400 switch (current_syscall_ret(regs)) { 370 switch (syscall_get_error(current, regs)) {
401 case -ERESTART_RESTARTBLOCK: 371 case -ERESTART_RESTARTBLOCK:
402 case -ERESTARTNOHAND: 372 case -ERESTARTNOHAND:
403 regs->ax = -EINTR; 373 regs->ax = -EINTR;
@@ -430,7 +400,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
430 ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs); 400 ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs);
431 else 401 else
432 ret = ia32_setup_frame(sig, ka, oldset, regs); 402 ret = ia32_setup_frame(sig, ka, oldset, regs);
433 } else 403 } else
434#endif 404#endif
435 ret = setup_rt_frame(sig, ka, info, oldset, regs); 405 ret = setup_rt_frame(sig, ka, info, oldset, regs);
436 406
@@ -454,15 +424,16 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
454 * handler too. 424 * handler too.
455 */ 425 */
456 regs->flags &= ~X86_EFLAGS_TF; 426 regs->flags &= ~X86_EFLAGS_TF;
457 if (test_thread_flag(TIF_SINGLESTEP))
458 ptrace_notify(SIGTRAP);
459 427
460 spin_lock_irq(&current->sighand->siglock); 428 spin_lock_irq(&current->sighand->siglock);
461 sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask); 429 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
462 if (!(ka->sa.sa_flags & SA_NODEFER)) 430 if (!(ka->sa.sa_flags & SA_NODEFER))
463 sigaddset(&current->blocked,sig); 431 sigaddset(&current->blocked, sig);
464 recalc_sigpending(); 432 recalc_sigpending();
465 spin_unlock_irq(&current->sighand->siglock); 433 spin_unlock_irq(&current->sighand->siglock);
434
435 tracehook_signal_handler(sig, info, ka, regs,
436 test_thread_flag(TIF_SINGLESTEP));
466 } 437 }
467 438
468 return ret; 439 return ret;
@@ -519,9 +490,9 @@ static void do_signal(struct pt_regs *regs)
519 } 490 }
520 491
521 /* Did we come from a system call? */ 492 /* Did we come from a system call? */
522 if (current_syscall(regs) >= 0) { 493 if (syscall_get_nr(current, regs) >= 0) {
523 /* Restart the system call - no handlers present */ 494 /* Restart the system call - no handlers present */
524 switch (current_syscall_ret(regs)) { 495 switch (syscall_get_error(current, regs)) {
525 case -ERESTARTNOHAND: 496 case -ERESTARTNOHAND:
526 case -ERESTARTSYS: 497 case -ERESTARTSYS:
527 case -ERESTARTNOINTR: 498 case -ERESTARTNOINTR:
@@ -559,17 +530,23 @@ void do_notify_resume(struct pt_regs *regs, void *unused,
559 /* deal with pending signal delivery */ 530 /* deal with pending signal delivery */
560 if (thread_info_flags & _TIF_SIGPENDING) 531 if (thread_info_flags & _TIF_SIGPENDING)
561 do_signal(regs); 532 do_signal(regs);
533
534 if (thread_info_flags & _TIF_NOTIFY_RESUME) {
535 clear_thread_flag(TIF_NOTIFY_RESUME);
536 tracehook_notify_resume(regs);
537 }
562} 538}
563 539
564void signal_fault(struct pt_regs *regs, void __user *frame, char *where) 540void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
565{ 541{
566 struct task_struct *me = current; 542 struct task_struct *me = current;
567 if (show_unhandled_signals && printk_ratelimit()) { 543 if (show_unhandled_signals && printk_ratelimit()) {
568 printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", 544 printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
569 me->comm,me->pid,where,frame,regs->ip,regs->sp,regs->orig_ax); 545 me->comm, me->pid, where, frame, regs->ip,
546 regs->sp, regs->orig_ax);
570 print_vma_addr(" in ", regs->ip); 547 print_vma_addr(" in ", regs->ip);
571 printk("\n"); 548 printk("\n");
572 } 549 }
573 550
574 force_sig(SIGSEGV, me); 551 force_sig(SIGSEGV, me);
575} 552}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 0133a952d11f..2ff0bbcd5bd1 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -749,6 +749,14 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
749} 749}
750 750
751#ifdef CONFIG_X86_64 751#ifdef CONFIG_X86_64
752
753/* __ref because it's safe to call free_bootmem when after_bootmem == 0. */
754static void __ref free_bootmem_pda(struct x8664_pda *oldpda)
755{
756 if (!after_bootmem)
757 free_bootmem((unsigned long)oldpda, sizeof(*oldpda));
758}
759
752/* 760/*
753 * Allocate node local memory for the AP pda. 761 * Allocate node local memory for the AP pda.
754 * 762 *
@@ -777,8 +785,7 @@ int __cpuinit get_local_pda(int cpu)
777 785
778 if (oldpda) { 786 if (oldpda) {
779 memcpy(newpda, oldpda, size); 787 memcpy(newpda, oldpda, size);
780 if (!after_bootmem) 788 free_bootmem_pda(oldpda);
781 free_bootmem((unsigned long)oldpda, size);
782 } 789 }
783 790
784 newpda->in_bootmem = 0; 791 newpda->in_bootmem = 0;
@@ -987,17 +994,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
987 flush_tlb_all(); 994 flush_tlb_all();
988 low_mappings = 1; 995 low_mappings = 1;
989 996
990#ifdef CONFIG_X86_PC
991 if (def_to_bigsmp && apicid > 8) {
992 printk(KERN_WARNING
993 "More than 8 CPUs detected - skipping them.\n"
994 "Use CONFIG_X86_GENERICARCH and CONFIG_X86_BIGSMP.\n");
995 err = -1;
996 } else
997 err = do_boot_cpu(apicid, cpu);
998#else
999 err = do_boot_cpu(apicid, cpu); 997 err = do_boot_cpu(apicid, cpu);
1000#endif
1001 998
1002 zap_low_mappings(); 999 zap_low_mappings();
1003 low_mappings = 0; 1000 low_mappings = 0;
@@ -1051,6 +1048,34 @@ static __init void disable_smp(void)
1051static int __init smp_sanity_check(unsigned max_cpus) 1048static int __init smp_sanity_check(unsigned max_cpus)
1052{ 1049{
1053 preempt_disable(); 1050 preempt_disable();
1051
1052#if defined(CONFIG_X86_PC) && defined(CONFIG_X86_32)
1053 if (def_to_bigsmp && nr_cpu_ids > 8) {
1054 unsigned int cpu;
1055 unsigned nr;
1056
1057 printk(KERN_WARNING
1058 "More than 8 CPUs detected - skipping them.\n"
1059 "Use CONFIG_X86_GENERICARCH and CONFIG_X86_BIGSMP.\n");
1060
1061 nr = 0;
1062 for_each_present_cpu(cpu) {
1063 if (nr >= 8)
1064 cpu_clear(cpu, cpu_present_map);
1065 nr++;
1066 }
1067
1068 nr = 0;
1069 for_each_possible_cpu(cpu) {
1070 if (nr >= 8)
1071 cpu_clear(cpu, cpu_possible_map);
1072 nr++;
1073 }
1074
1075 nr_cpu_ids = 8;
1076 }
1077#endif
1078
1054 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { 1079 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
1055 printk(KERN_WARNING "weird, boot CPU (#%d) not listed" 1080 printk(KERN_WARNING "weird, boot CPU (#%d) not listed"
1056 "by the BIOS.\n", hard_smp_processor_id()); 1081 "by the BIOS.\n", hard_smp_processor_id());
@@ -1196,6 +1221,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1196 printk(KERN_INFO "CPU%d: ", 0); 1221 printk(KERN_INFO "CPU%d: ", 0);
1197 print_cpu_info(&cpu_data(0)); 1222 print_cpu_info(&cpu_data(0));
1198 setup_boot_clock(); 1223 setup_boot_clock();
1224
1225 if (is_uv_system())
1226 uv_system_init();
1199out: 1227out:
1200 preempt_enable(); 1228 preempt_enable();
1201} 1229}
@@ -1285,16 +1313,13 @@ __init void prefill_possible_map(void)
1285 if (!num_processors) 1313 if (!num_processors)
1286 num_processors = 1; 1314 num_processors = 1;
1287 1315
1288#ifdef CONFIG_HOTPLUG_CPU
1289 if (additional_cpus == -1) { 1316 if (additional_cpus == -1) {
1290 if (disabled_cpus > 0) 1317 if (disabled_cpus > 0)
1291 additional_cpus = disabled_cpus; 1318 additional_cpus = disabled_cpus;
1292 else 1319 else
1293 additional_cpus = 0; 1320 additional_cpus = 0;
1294 } 1321 }
1295#else 1322
1296 additional_cpus = 0;
1297#endif
1298 possible = num_processors + additional_cpus; 1323 possible = num_processors + additional_cpus;
1299 if (possible > NR_CPUS) 1324 if (possible > NR_CPUS)
1300 possible = NR_CPUS; 1325 possible = NR_CPUS;
@@ -1386,17 +1411,3 @@ void __cpu_die(unsigned int cpu)
1386 BUG(); 1411 BUG();
1387} 1412}
1388#endif 1413#endif
1389
1390/*
1391 * If the BIOS enumerates physical processors before logical,
1392 * maxcpus=N at enumeration-time can be used to disable HT.
1393 */
1394static int __init parse_maxcpus(char *arg)
1395{
1396 extern unsigned int maxcpus;
1397
1398 if (arg)
1399 maxcpus = simple_strtoul(arg, NULL, 0);
1400 return 0;
1401}
1402early_param("maxcpus", parse_maxcpus);
diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c
index 99941b37eca0..397e309839dd 100644
--- a/arch/x86/kernel/smpcommon.c
+++ b/arch/x86/kernel/smpcommon.c
@@ -8,18 +8,21 @@
8DEFINE_PER_CPU(unsigned long, this_cpu_off); 8DEFINE_PER_CPU(unsigned long, this_cpu_off);
9EXPORT_PER_CPU_SYMBOL(this_cpu_off); 9EXPORT_PER_CPU_SYMBOL(this_cpu_off);
10 10
11/* Initialize the CPU's GDT. This is either the boot CPU doing itself 11/*
12 (still using the master per-cpu area), or a CPU doing it for a 12 * Initialize the CPU's GDT. This is either the boot CPU doing itself
13 secondary which will soon come up. */ 13 * (still using the master per-cpu area), or a CPU doing it for a
14 * secondary which will soon come up.
15 */
14__cpuinit void init_gdt(int cpu) 16__cpuinit void init_gdt(int cpu)
15{ 17{
16 struct desc_struct *gdt = get_cpu_gdt_table(cpu); 18 struct desc_struct gdt;
17 19
18 pack_descriptor(&gdt[GDT_ENTRY_PERCPU], 20 pack_descriptor(&gdt, __per_cpu_offset[cpu], 0xFFFFF,
19 __per_cpu_offset[cpu], 0xFFFFF,
20 0x2 | DESCTYPE_S, 0x8); 21 0x2 | DESCTYPE_S, 0x8);
22 gdt.s = 1;
21 23
22 gdt[GDT_ENTRY_PERCPU].s = 1; 24 write_gdt_entry(get_cpu_gdt_table(cpu),
25 GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
23 26
24 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu]; 27 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
25 per_cpu(cpu_number, cpu) = cpu; 28 per_cpu(cpu_number, cpu) = cpu;
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index c9288c883e20..6bc211accf08 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -13,16 +13,17 @@
13#include <linux/utsname.h> 13#include <linux/utsname.h>
14#include <linux/personality.h> 14#include <linux/personality.h>
15#include <linux/random.h> 15#include <linux/random.h>
16#include <linux/uaccess.h>
16 17
17#include <asm/uaccess.h>
18#include <asm/ia32.h> 18#include <asm/ia32.h>
19#include <asm/syscalls.h> 19#include <asm/syscalls.h>
20 20
21asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, 21asmlinkage long sys_mmap(unsigned long addr, unsigned long len,
22 unsigned long fd, unsigned long off) 22 unsigned long prot, unsigned long flags,
23 unsigned long fd, unsigned long off)
23{ 24{
24 long error; 25 long error;
25 struct file * file; 26 struct file *file;
26 27
27 error = -EINVAL; 28 error = -EINVAL;
28 if (off & ~PAGE_MASK) 29 if (off & ~PAGE_MASK)
@@ -57,9 +58,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
57 unmapped base down for this case. This can give 58 unmapped base down for this case. This can give
58 conflicts with the heap, but we assume that glibc 59 conflicts with the heap, but we assume that glibc
59 malloc knows how to fall back to mmap. Give it 1GB 60 malloc knows how to fall back to mmap. Give it 1GB
60 of playground for now. -AK */ 61 of playground for now. -AK */
61 *begin = 0x40000000; 62 *begin = 0x40000000;
62 *end = 0x80000000; 63 *end = 0x80000000;
63 if (current->flags & PF_RANDOMIZE) { 64 if (current->flags & PF_RANDOMIZE) {
64 new_begin = randomize_range(*begin, *begin + 0x02000000, 0); 65 new_begin = randomize_range(*begin, *begin + 0x02000000, 0);
65 if (new_begin) 66 if (new_begin)
@@ -67,9 +68,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
67 } 68 }
68 } else { 69 } else {
69 *begin = TASK_UNMAPPED_BASE; 70 *begin = TASK_UNMAPPED_BASE;
70 *end = TASK_SIZE; 71 *end = TASK_SIZE;
71 } 72 }
72} 73}
73 74
74unsigned long 75unsigned long
75arch_get_unmapped_area(struct file *filp, unsigned long addr, 76arch_get_unmapped_area(struct file *filp, unsigned long addr,
@@ -79,11 +80,11 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
79 struct vm_area_struct *vma; 80 struct vm_area_struct *vma;
80 unsigned long start_addr; 81 unsigned long start_addr;
81 unsigned long begin, end; 82 unsigned long begin, end;
82 83
83 if (flags & MAP_FIXED) 84 if (flags & MAP_FIXED)
84 return addr; 85 return addr;
85 86
86 find_start_end(flags, &begin, &end); 87 find_start_end(flags, &begin, &end);
87 88
88 if (len > end) 89 if (len > end)
89 return -ENOMEM; 90 return -ENOMEM;
@@ -97,12 +98,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
97 } 98 }
98 if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32)) 99 if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32))
99 && len <= mm->cached_hole_size) { 100 && len <= mm->cached_hole_size) {
100 mm->cached_hole_size = 0; 101 mm->cached_hole_size = 0;
101 mm->free_area_cache = begin; 102 mm->free_area_cache = begin;
102 } 103 }
103 addr = mm->free_area_cache; 104 addr = mm->free_area_cache;
104 if (addr < begin) 105 if (addr < begin)
105 addr = begin; 106 addr = begin;
106 start_addr = addr; 107 start_addr = addr;
107 108
108full_search: 109full_search:
@@ -128,7 +129,7 @@ full_search:
128 return addr; 129 return addr;
129 } 130 }
130 if (addr + mm->cached_hole_size < vma->vm_start) 131 if (addr + mm->cached_hole_size < vma->vm_start)
131 mm->cached_hole_size = vma->vm_start - addr; 132 mm->cached_hole_size = vma->vm_start - addr;
132 133
133 addr = vma->vm_end; 134 addr = vma->vm_end;
134 } 135 }
@@ -178,7 +179,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
178 vma = find_vma(mm, addr-len); 179 vma = find_vma(mm, addr-len);
179 if (!vma || addr <= vma->vm_start) 180 if (!vma || addr <= vma->vm_start)
180 /* remember the address as a hint for next time */ 181 /* remember the address as a hint for next time */
181 return (mm->free_area_cache = addr-len); 182 return mm->free_area_cache = addr-len;
182 } 183 }
183 184
184 if (mm->mmap_base < len) 185 if (mm->mmap_base < len)
@@ -195,7 +196,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
195 vma = find_vma(mm, addr); 196 vma = find_vma(mm, addr);
196 if (!vma || addr+len <= vma->vm_start) 197 if (!vma || addr+len <= vma->vm_start)
197 /* remember the address as a hint for next time */ 198 /* remember the address as a hint for next time */
198 return (mm->free_area_cache = addr); 199 return mm->free_area_cache = addr;
199 200
200 /* remember the largest hole we saw so far */ 201 /* remember the largest hole we saw so far */
201 if (addr + mm->cached_hole_size < vma->vm_start) 202 if (addr + mm->cached_hole_size < vma->vm_start)
@@ -225,13 +226,13 @@ bottomup:
225} 226}
226 227
227 228
228asmlinkage long sys_uname(struct new_utsname __user * name) 229asmlinkage long sys_uname(struct new_utsname __user *name)
229{ 230{
230 int err; 231 int err;
231 down_read(&uts_sem); 232 down_read(&uts_sem);
232 err = copy_to_user(name, utsname(), sizeof (*name)); 233 err = copy_to_user(name, utsname(), sizeof(*name));
233 up_read(&uts_sem); 234 up_read(&uts_sem);
234 if (personality(current->personality) == PER_LINUX32) 235 if (personality(current->personality) == PER_LINUX32)
235 err |= copy_to_user(&name->machine, "i686", 5); 236 err |= copy_to_user(&name->machine, "i686", 5);
236 return err ? -EFAULT : 0; 237 return err ? -EFAULT : 0;
237} 238}
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index d0fbb7712ab0..8b8c0d6640fa 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -17,6 +17,7 @@
17#include <asm/genapic.h> 17#include <asm/genapic.h>
18#include <asm/idle.h> 18#include <asm/idle.h>
19#include <asm/tsc.h> 19#include <asm/tsc.h>
20#include <asm/irq_vectors.h>
20 21
21#include <mach_apic.h> 22#include <mach_apic.h>
22 23
@@ -783,7 +784,7 @@ static int __init uv_bau_init(void)
783 uv_init_blade(blade, node, cur_cpu); 784 uv_init_blade(blade, node, cur_cpu);
784 cur_cpu += uv_blade_nr_possible_cpus(blade); 785 cur_cpu += uv_blade_nr_possible_cpus(blade);
785 } 786 }
786 set_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1); 787 alloc_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1);
787 uv_enable_timeouts(); 788 uv_enable_timeouts();
788 789
789 return 0; 790 return 0;
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index 513caaca7115..7a31f104bef9 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -32,6 +32,8 @@
32#include <linux/bug.h> 32#include <linux/bug.h>
33#include <linux/nmi.h> 33#include <linux/nmi.h>
34#include <linux/mm.h> 34#include <linux/mm.h>
35#include <linux/smp.h>
36#include <linux/io.h>
35 37
36#if defined(CONFIG_EDAC) 38#if defined(CONFIG_EDAC)
37#include <linux/edac.h> 39#include <linux/edac.h>
@@ -45,9 +47,6 @@
45#include <asm/unwind.h> 47#include <asm/unwind.h>
46#include <asm/desc.h> 48#include <asm/desc.h>
47#include <asm/i387.h> 49#include <asm/i387.h>
48#include <asm/nmi.h>
49#include <asm/smp.h>
50#include <asm/io.h>
51#include <asm/pgalloc.h> 50#include <asm/pgalloc.h>
52#include <asm/proto.h> 51#include <asm/proto.h>
53#include <asm/pda.h> 52#include <asm/pda.h>
@@ -85,7 +84,8 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
85 84
86void printk_address(unsigned long address, int reliable) 85void printk_address(unsigned long address, int reliable)
87{ 86{
88 printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address); 87 printk(" [<%016lx>] %s%pS\n",
88 address, reliable ? "" : "? ", (void *) address);
89} 89}
90 90
91static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 91static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
@@ -98,7 +98,8 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
98 [STACKFAULT_STACK - 1] = "#SS", 98 [STACKFAULT_STACK - 1] = "#SS",
99 [MCE_STACK - 1] = "#MC", 99 [MCE_STACK - 1] = "#MC",
100#if DEBUG_STKSZ > EXCEPTION_STKSZ 100#if DEBUG_STKSZ > EXCEPTION_STKSZ
101 [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" 101 [N_EXCEPTION_STACKS ...
102 N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
102#endif 103#endif
103 }; 104 };
104 unsigned k; 105 unsigned k;
@@ -163,7 +164,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
163} 164}
164 165
165/* 166/*
166 * x86-64 can have up to three kernel stacks: 167 * x86-64 can have up to three kernel stacks:
167 * process stack 168 * process stack
168 * interrupt stack 169 * interrupt stack
169 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack 170 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
@@ -219,7 +220,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
219 const struct stacktrace_ops *ops, void *data) 220 const struct stacktrace_ops *ops, void *data)
220{ 221{
221 const unsigned cpu = get_cpu(); 222 const unsigned cpu = get_cpu();
222 unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr; 223 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
223 unsigned used = 0; 224 unsigned used = 0;
224 struct thread_info *tinfo; 225 struct thread_info *tinfo;
225 226
@@ -237,7 +238,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
237 if (!bp) { 238 if (!bp) {
238 if (task == current) { 239 if (task == current) {
239 /* Grab bp right from our regs */ 240 /* Grab bp right from our regs */
240 asm("movq %%rbp, %0" : "=r" (bp) :); 241 asm("movq %%rbp, %0" : "=r" (bp) : );
241 } else { 242 } else {
242 /* bp is the last reg pushed by switch_to */ 243 /* bp is the last reg pushed by switch_to */
243 bp = *(unsigned long *) task->thread.sp; 244 bp = *(unsigned long *) task->thread.sp;
@@ -339,9 +340,8 @@ static void
339show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 340show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
340 unsigned long *stack, unsigned long bp, char *log_lvl) 341 unsigned long *stack, unsigned long bp, char *log_lvl)
341{ 342{
342 printk("\nCall Trace:\n"); 343 printk("Call Trace:\n");
343 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); 344 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
344 printk("\n");
345} 345}
346 346
347void show_trace(struct task_struct *task, struct pt_regs *regs, 347void show_trace(struct task_struct *task, struct pt_regs *regs,
@@ -357,11 +357,15 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
357 unsigned long *stack; 357 unsigned long *stack;
358 int i; 358 int i;
359 const int cpu = smp_processor_id(); 359 const int cpu = smp_processor_id();
360 unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr); 360 unsigned long *irqstack_end =
361 unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); 361 (unsigned long *) (cpu_pda(cpu)->irqstackptr);
362 unsigned long *irqstack =
363 (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
362 364
363 // debugging aid: "show_stack(NULL, NULL);" prints the 365 /*
364 // back trace for this cpu. 366 * debugging aid: "show_stack(NULL, NULL);" prints the
367 * back trace for this cpu.
368 */
365 369
366 if (sp == NULL) { 370 if (sp == NULL) {
367 if (task) 371 if (task)
@@ -386,6 +390,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
386 printk(" %016lx", *stack++); 390 printk(" %016lx", *stack++);
387 touch_nmi_watchdog(); 391 touch_nmi_watchdog();
388 } 392 }
393 printk("\n");
389 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 394 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
390} 395}
391 396
@@ -404,7 +409,7 @@ void dump_stack(void)
404 409
405#ifdef CONFIG_FRAME_POINTER 410#ifdef CONFIG_FRAME_POINTER
406 if (!bp) 411 if (!bp)
407 asm("movq %%rbp, %0" : "=r" (bp):); 412 asm("movq %%rbp, %0" : "=r" (bp) : );
408#endif 413#endif
409 414
410 printk("Pid: %d, comm: %.20s %s %s %.*s\n", 415 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
@@ -414,7 +419,6 @@ void dump_stack(void)
414 init_utsname()->version); 419 init_utsname()->version);
415 show_trace(NULL, NULL, &stack, bp); 420 show_trace(NULL, NULL, &stack, bp);
416} 421}
417
418EXPORT_SYMBOL(dump_stack); 422EXPORT_SYMBOL(dump_stack);
419 423
420void show_registers(struct pt_regs *regs) 424void show_registers(struct pt_regs *regs)
@@ -443,7 +447,6 @@ void show_registers(struct pt_regs *regs)
443 printk("Stack: "); 447 printk("Stack: ");
444 show_stack_log_lvl(NULL, regs, (unsigned long *)sp, 448 show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
445 regs->bp, ""); 449 regs->bp, "");
446 printk("\n");
447 450
448 printk(KERN_EMERG "Code: "); 451 printk(KERN_EMERG "Code: ");
449 452
@@ -493,7 +496,7 @@ unsigned __kprobes long oops_begin(void)
493 raw_local_irq_save(flags); 496 raw_local_irq_save(flags);
494 cpu = smp_processor_id(); 497 cpu = smp_processor_id();
495 if (!__raw_spin_trylock(&die_lock)) { 498 if (!__raw_spin_trylock(&die_lock)) {
496 if (cpu == die_owner) 499 if (cpu == die_owner)
497 /* nested oops. should stop eventually */; 500 /* nested oops. should stop eventually */;
498 else 501 else
499 __raw_spin_lock(&die_lock); 502 __raw_spin_lock(&die_lock);
@@ -638,7 +641,7 @@ kernel_trap:
638} 641}
639 642
640#define DO_ERROR(trapnr, signr, str, name) \ 643#define DO_ERROR(trapnr, signr, str, name) \
641asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ 644asmlinkage void do_##name(struct pt_regs *regs, long error_code) \
642{ \ 645{ \
643 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 646 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
644 == NOTIFY_STOP) \ 647 == NOTIFY_STOP) \
@@ -648,7 +651,7 @@ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
648} 651}
649 652
650#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ 653#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
651asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ 654asmlinkage void do_##name(struct pt_regs *regs, long error_code) \
652{ \ 655{ \
653 siginfo_t info; \ 656 siginfo_t info; \
654 info.si_signo = signr; \ 657 info.si_signo = signr; \
@@ -683,7 +686,7 @@ asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
683 preempt_conditional_cli(regs); 686 preempt_conditional_cli(regs);
684} 687}
685 688
686asmlinkage void do_double_fault(struct pt_regs * regs, long error_code) 689asmlinkage void do_double_fault(struct pt_regs *regs, long error_code)
687{ 690{
688 static const char str[] = "double fault"; 691 static const char str[] = "double fault";
689 struct task_struct *tsk = current; 692 struct task_struct *tsk = current;
@@ -778,9 +781,10 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
778} 781}
779 782
780static notrace __kprobes void 783static notrace __kprobes void
781unknown_nmi_error(unsigned char reason, struct pt_regs * regs) 784unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
782{ 785{
783 if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) 786 if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) ==
787 NOTIFY_STOP)
784 return; 788 return;
785 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", 789 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
786 reason); 790 reason);
@@ -882,7 +886,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
882 else if (user_mode(eregs)) 886 else if (user_mode(eregs))
883 regs = task_pt_regs(current); 887 regs = task_pt_regs(current);
884 /* Exception from kernel and interrupts are enabled. Move to 888 /* Exception from kernel and interrupts are enabled. Move to
885 kernel process stack. */ 889 kernel process stack. */
886 else if (eregs->flags & X86_EFLAGS_IF) 890 else if (eregs->flags & X86_EFLAGS_IF)
887 regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); 891 regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
888 if (eregs != regs) 892 if (eregs != regs)
@@ -891,7 +895,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
891} 895}
892 896
893/* runs on IST stack. */ 897/* runs on IST stack. */
894asmlinkage void __kprobes do_debug(struct pt_regs * regs, 898asmlinkage void __kprobes do_debug(struct pt_regs *regs,
895 unsigned long error_code) 899 unsigned long error_code)
896{ 900{
897 struct task_struct *tsk = current; 901 struct task_struct *tsk = current;
@@ -1035,7 +1039,7 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs)
1035 1039
1036asmlinkage void bad_intr(void) 1040asmlinkage void bad_intr(void)
1037{ 1041{
1038 printk("bad interrupt"); 1042 printk("bad interrupt");
1039} 1043}
1040 1044
1041asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) 1045asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
@@ -1047,7 +1051,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
1047 1051
1048 conditional_sti(regs); 1052 conditional_sti(regs);
1049 if (!user_mode(regs) && 1053 if (!user_mode(regs) &&
1050 kernel_math_error(regs, "kernel simd math error", 19)) 1054 kernel_math_error(regs, "kernel simd math error", 19))
1051 return; 1055 return;
1052 1056
1053 /* 1057 /*
@@ -1092,7 +1096,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
1092 force_sig_info(SIGFPE, &info, task); 1096 force_sig_info(SIGFPE, &info, task);
1093} 1097}
1094 1098
1095asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs) 1099asmlinkage void do_spurious_interrupt_bug(struct pt_regs *regs)
1096{ 1100{
1097} 1101}
1098 1102
@@ -1149,8 +1153,10 @@ void __init trap_init(void)
1149 set_intr_gate(0, &divide_error); 1153 set_intr_gate(0, &divide_error);
1150 set_intr_gate_ist(1, &debug, DEBUG_STACK); 1154 set_intr_gate_ist(1, &debug, DEBUG_STACK);
1151 set_intr_gate_ist(2, &nmi, NMI_STACK); 1155 set_intr_gate_ist(2, &nmi, NMI_STACK);
1152 set_system_gate_ist(3, &int3, DEBUG_STACK); /* int3 can be called from all */ 1156 /* int3 can be called from all */
1153 set_system_gate(4, &overflow); /* int4 can be called from all */ 1157 set_system_gate_ist(3, &int3, DEBUG_STACK);
1158 /* int4 can be called from all */
1159 set_system_gate(4, &overflow);
1154 set_intr_gate(5, &bounds); 1160 set_intr_gate(5, &bounds);
1155 set_intr_gate(6, &invalid_op); 1161 set_intr_gate(6, &invalid_op);
1156 set_intr_gate(7, &device_not_available); 1162 set_intr_gate(7, &device_not_available);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 7603c0553909..161bb850fc47 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -104,7 +104,7 @@ __setup("notsc", notsc_setup);
104/* 104/*
105 * Read TSC and the reference counters. Take care of SMI disturbance 105 * Read TSC and the reference counters. Take care of SMI disturbance
106 */ 106 */
107static u64 __init tsc_read_refs(u64 *pm, u64 *hpet) 107static u64 tsc_read_refs(u64 *p, int hpet)
108{ 108{
109 u64 t1, t2; 109 u64 t1, t2;
110 int i; 110 int i;
@@ -112,9 +112,9 @@ static u64 __init tsc_read_refs(u64 *pm, u64 *hpet)
112 for (i = 0; i < MAX_RETRIES; i++) { 112 for (i = 0; i < MAX_RETRIES; i++) {
113 t1 = get_cycles(); 113 t1 = get_cycles();
114 if (hpet) 114 if (hpet)
115 *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; 115 *p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
116 else 116 else
117 *pm = acpi_pm_read_early(); 117 *p = acpi_pm_read_early();
118 t2 = get_cycles(); 118 t2 = get_cycles();
119 if ((t2 - t1) < SMI_TRESHOLD) 119 if ((t2 - t1) < SMI_TRESHOLD)
120 return t2; 120 return t2;
@@ -122,80 +122,390 @@ static u64 __init tsc_read_refs(u64 *pm, u64 *hpet)
122 return ULLONG_MAX; 122 return ULLONG_MAX;
123} 123}
124 124
125/** 125/*
126 * native_calibrate_tsc - calibrate the tsc on boot 126 * Calculate the TSC frequency from HPET reference
127 */ 127 */
128unsigned long native_calibrate_tsc(void) 128static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2)
129{ 129{
130 unsigned long flags; 130 u64 tmp;
131 u64 tsc1, tsc2, tr1, tr2, delta, pm1, pm2, hpet1, hpet2;
132 int hpet = is_hpet_enabled();
133 unsigned int tsc_khz_val = 0;
134 131
135 local_irq_save(flags); 132 if (hpet2 < hpet1)
133 hpet2 += 0x100000000ULL;
134 hpet2 -= hpet1;
135 tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
136 do_div(tmp, 1000000);
137 do_div(deltatsc, tmp);
138
139 return (unsigned long) deltatsc;
140}
141
142/*
143 * Calculate the TSC frequency from PMTimer reference
144 */
145static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
146{
147 u64 tmp;
136 148
137 tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL); 149 if (!pm1 && !pm2)
150 return ULONG_MAX;
151
152 if (pm2 < pm1)
153 pm2 += (u64)ACPI_PM_OVRRUN;
154 pm2 -= pm1;
155 tmp = pm2 * 1000000000LL;
156 do_div(tmp, PMTMR_TICKS_PER_SEC);
157 do_div(deltatsc, tmp);
158
159 return (unsigned long) deltatsc;
160}
161
162#define CAL_MS 10
163#define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS))
164#define CAL_PIT_LOOPS 1000
165
166#define CAL2_MS 50
167#define CAL2_LATCH (CLOCK_TICK_RATE / (1000 / CAL2_MS))
168#define CAL2_PIT_LOOPS 5000
169
170
171/*
172 * Try to calibrate the TSC against the Programmable
173 * Interrupt Timer and return the frequency of the TSC
174 * in kHz.
175 *
176 * Return ULONG_MAX on failure to calibrate.
177 */
178static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
179{
180 u64 tsc, t1, t2, delta;
181 unsigned long tscmin, tscmax;
182 int pitcnt;
138 183
184 /* Set the Gate high, disable speaker */
139 outb((inb(0x61) & ~0x02) | 0x01, 0x61); 185 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
140 186
187 /*
188 * Setup CTC channel 2* for mode 0, (interrupt on terminal
189 * count mode), binary count. Set the latch register to 50ms
190 * (LSB then MSB) to begin countdown.
191 */
141 outb(0xb0, 0x43); 192 outb(0xb0, 0x43);
142 outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); 193 outb(latch & 0xff, 0x42);
143 outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); 194 outb(latch >> 8, 0x42);
144 tr1 = get_cycles(); 195
145 while ((inb(0x61) & 0x20) == 0); 196 tsc = t1 = t2 = get_cycles();
146 tr2 = get_cycles(); 197
198 pitcnt = 0;
199 tscmax = 0;
200 tscmin = ULONG_MAX;
201 while ((inb(0x61) & 0x20) == 0) {
202 t2 = get_cycles();
203 delta = t2 - tsc;
204 tsc = t2;
205 if ((unsigned long) delta < tscmin)
206 tscmin = (unsigned int) delta;
207 if ((unsigned long) delta > tscmax)
208 tscmax = (unsigned int) delta;
209 pitcnt++;
210 }
211
212 /*
213 * Sanity checks:
214 *
215 * If we were not able to read the PIT more than loopmin
216 * times, then we have been hit by a massive SMI
217 *
218 * If the maximum is 10 times larger than the minimum,
219 * then we got hit by an SMI as well.
220 */
221 if (pitcnt < loopmin || tscmax > 10 * tscmin)
222 return ULONG_MAX;
223
224 /* Calculate the PIT value */
225 delta = t2 - t1;
226 do_div(delta, ms);
227 return delta;
228}
147 229
148 tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); 230/*
231 * This reads the current MSB of the PIT counter, and
232 * checks if we are running on sufficiently fast and
233 * non-virtualized hardware.
234 *
235 * Our expectations are:
236 *
237 * - the PIT is running at roughly 1.19MHz
238 *
239 * - each IO is going to take about 1us on real hardware,
240 * but we allow it to be much faster (by a factor of 10) or
241 * _slightly_ slower (ie we allow up to a 2us read+counter
242 * update - anything else implies an unacceptably slow CPU
243 * or PIT for the fast calibration to work.
244 *
245 * - with 256 PIT ticks to read the value, we have 214us to
246 * see the same MSB (and overhead like doing a single TSC
247 * read per MSB value etc).
248 *
249 * - We're doing 2 reads per loop (LSB, MSB), and we expect
250 * them each to take about a microsecond on real hardware.
251 * So we expect a count value of around 100. But we'll be
252 * generous, and accept anything over 50.
253 *
254 * - if the PIT is stuck, and we see *many* more reads, we
255 * return early (and the next caller of pit_expect_msb()
256 * then consider it a failure when they don't see the
257 * next expected value).
258 *
259 * These expectations mean that we know that we have seen the
260 * transition from one expected value to another with a fairly
261 * high accuracy, and we didn't miss any events. We can thus
262 * use the TSC value at the transitions to calculate a pretty
263 * good value for the TSC frequency.
264 */
265static inline int pit_expect_msb(unsigned char val)
266{
267 int count = 0;
149 268
269 for (count = 0; count < 50000; count++) {
270 /* Ignore LSB */
271 inb(0x42);
272 if (inb(0x42) != val)
273 break;
274 }
275 return count > 50;
276}
277
278/*
279 * How many MSB values do we want to see? We aim for a
280 * 15ms calibration, which assuming a 2us counter read
281 * error should give us roughly 150 ppm precision for
282 * the calibration.
283 */
284#define QUICK_PIT_MS 15
285#define QUICK_PIT_ITERATIONS (QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
286
287static unsigned long quick_pit_calibrate(void)
288{
289 /* Set the Gate high, disable speaker */
290 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
291
292 /*
293 * Counter 2, mode 0 (one-shot), binary count
294 *
295 * NOTE! Mode 2 decrements by two (and then the
296 * output is flipped each time, giving the same
297 * final output frequency as a decrement-by-one),
298 * so mode 0 is much better when looking at the
299 * individual counts.
300 */
301 outb(0xb0, 0x43);
302
303 /* Start at 0xffff */
304 outb(0xff, 0x42);
305 outb(0xff, 0x42);
306
307 if (pit_expect_msb(0xff)) {
308 int i;
309 u64 t1, t2, delta;
310 unsigned char expect = 0xfe;
311
312 t1 = get_cycles();
313 for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) {
314 if (!pit_expect_msb(expect))
315 goto failed;
316 }
317 t2 = get_cycles();
318
319 /*
320 * Make sure we can rely on the second TSC timestamp:
321 */
322 if (!pit_expect_msb(expect))
323 goto failed;
324
325 /*
326 * Ok, if we get here, then we've seen the
327 * MSB of the PIT decrement QUICK_PIT_ITERATIONS
328 * times, and each MSB had many hits, so we never
329 * had any sudden jumps.
330 *
331 * As a result, we can depend on there not being
332 * any odd delays anywhere, and the TSC reads are
333 * reliable.
334 *
335 * kHz = ticks / time-in-seconds / 1000;
336 * kHz = (t2 - t1) / (QPI * 256 / PIT_TICK_RATE) / 1000
337 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (QPI * 256 * 1000)
338 */
339 delta = (t2 - t1)*PIT_TICK_RATE;
340 do_div(delta, QUICK_PIT_ITERATIONS*256*1000);
341 printk("Fast TSC calibration using PIT\n");
342 return delta;
343 }
344failed:
345 return 0;
346}
347
348/**
349 * native_calibrate_tsc - calibrate the tsc on boot
350 */
351unsigned long native_calibrate_tsc(void)
352{
353 u64 tsc1, tsc2, delta, ref1, ref2;
354 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
355 unsigned long flags, latch, ms, fast_calibrate;
356 int hpet = is_hpet_enabled(), i, loopmin;
357
358 local_irq_save(flags);
359 fast_calibrate = quick_pit_calibrate();
150 local_irq_restore(flags); 360 local_irq_restore(flags);
361 if (fast_calibrate)
362 return fast_calibrate;
151 363
152 /* 364 /*
153 * Preset the result with the raw and inaccurate PIT 365 * Run 3 calibration loops to get the lowest frequency value
154 * calibration value 366 * (the best estimate). We use two different calibration modes
367 * here:
368 *
369 * 1) PIT loop. We set the PIT Channel 2 to oneshot mode and
370 * load a timeout of 50ms. We read the time right after we
371 * started the timer and wait until the PIT count down reaches
372 * zero. In each wait loop iteration we read the TSC and check
373 * the delta to the previous read. We keep track of the min
374 * and max values of that delta. The delta is mostly defined
375 * by the IO time of the PIT access, so we can detect when a
376 * SMI/SMM disturbance happened between the two reads. If the
377 * maximum time is significantly larger than the minimum time,
378 * then we discard the result and have another try.
379 *
380 * 2) Reference counter. If available we use the HPET or the
381 * PMTIMER as a reference to check the sanity of that value.
382 * We use separate TSC readouts and check inside of the
383 * reference read for a SMI/SMM disturbance. We discard
384 * disturbed values here as well. We do that around the PIT
385 * calibration delay loop as we have to wait for a certain
386 * amount of time anyway.
155 */ 387 */
156 delta = (tr2 - tr1); 388
157 do_div(delta, 50); 389 /* Preset PIT loop values */
158 tsc_khz_val = delta; 390 latch = CAL_LATCH;
159 391 ms = CAL_MS;
160 /* hpet or pmtimer available ? */ 392 loopmin = CAL_PIT_LOOPS;
161 if (!hpet && !pm1 && !pm2) { 393
162 printk(KERN_INFO "TSC calibrated against PIT\n"); 394 for (i = 0; i < 3; i++) {
163 goto out; 395 unsigned long tsc_pit_khz;
396
397 /*
398 * Read the start value and the reference count of
399 * hpet/pmtimer when available. Then do the PIT
400 * calibration, which will take at least 50ms, and
401 * read the end value.
402 */
403 local_irq_save(flags);
404 tsc1 = tsc_read_refs(&ref1, hpet);
405 tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin);
406 tsc2 = tsc_read_refs(&ref2, hpet);
407 local_irq_restore(flags);
408
409 /* Pick the lowest PIT TSC calibration so far */
410 tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
411
412 /* hpet or pmtimer available ? */
413 if (!hpet && !ref1 && !ref2)
414 continue;
415
416 /* Check, whether the sampling was disturbed by an SMI */
417 if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX)
418 continue;
419
420 tsc2 = (tsc2 - tsc1) * 1000000LL;
421 if (hpet)
422 tsc2 = calc_hpet_ref(tsc2, ref1, ref2);
423 else
424 tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2);
425
426 tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);
427
428 /* Check the reference deviation */
429 delta = ((u64) tsc_pit_min) * 100;
430 do_div(delta, tsc_ref_min);
431
432 /*
433 * If both calibration results are inside a 10% window
434 * then we can be sure, that the calibration
435 * succeeded. We break out of the loop right away. We
436 * use the reference value, as it is more precise.
437 */
438 if (delta >= 90 && delta <= 110) {
439 printk(KERN_INFO
440 "TSC: PIT calibration matches %s. %d loops\n",
441 hpet ? "HPET" : "PMTIMER", i + 1);
442 return tsc_ref_min;
443 }
444
445 /*
446 * Check whether PIT failed more than once. This
447 * happens in virtualized environments. We need to
448 * give the virtual PC a slightly longer timeframe for
449 * the HPET/PMTIMER to make the result precise.
450 */
451 if (i == 1 && tsc_pit_min == ULONG_MAX) {
452 latch = CAL2_LATCH;
453 ms = CAL2_MS;
454 loopmin = CAL2_PIT_LOOPS;
455 }
164 } 456 }
165 457
166 /* Check, whether the sampling was disturbed by an SMI */ 458 /*
167 if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) { 459 * Now check the results.
168 printk(KERN_WARNING "TSC calibration disturbed by SMI, " 460 */
169 "using PIT calibration result\n"); 461 if (tsc_pit_min == ULONG_MAX) {
170 goto out; 462 /* PIT gave no useful value */
463 printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n");
464
465 /* We don't have an alternative source, disable TSC */
466 if (!hpet && !ref1 && !ref2) {
467 printk("TSC: No reference (HPET/PMTIMER) available\n");
468 return 0;
469 }
470
471 /* The alternative source failed as well, disable TSC */
472 if (tsc_ref_min == ULONG_MAX) {
473 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration "
474 "failed.\n");
475 return 0;
476 }
477
478 /* Use the alternative source */
479 printk(KERN_INFO "TSC: using %s reference calibration\n",
480 hpet ? "HPET" : "PMTIMER");
481
482 return tsc_ref_min;
171 } 483 }
172 484
173 tsc2 = (tsc2 - tsc1) * 1000000LL; 485 /* We don't have an alternative source, use the PIT calibration value */
174 486 if (!hpet && !ref1 && !ref2) {
175 if (hpet) { 487 printk(KERN_INFO "TSC: Using PIT calibration value\n");
176 printk(KERN_INFO "TSC calibrated against HPET\n"); 488 return tsc_pit_min;
177 if (hpet2 < hpet1)
178 hpet2 += 0x100000000ULL;
179 hpet2 -= hpet1;
180 tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
181 do_div(tsc1, 1000000);
182 } else {
183 printk(KERN_INFO "TSC calibrated against PM_TIMER\n");
184 if (pm2 < pm1)
185 pm2 += (u64)ACPI_PM_OVRRUN;
186 pm2 -= pm1;
187 tsc1 = pm2 * 1000000000LL;
188 do_div(tsc1, PMTMR_TICKS_PER_SEC);
189 } 489 }
190 490
191 do_div(tsc2, tsc1); 491 /* The alternative source failed, use the PIT calibration value */
192 tsc_khz_val = tsc2; 492 if (tsc_ref_min == ULONG_MAX) {
493 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. "
494 "Using PIT calibration\n");
495 return tsc_pit_min;
496 }
193 497
194out: 498 /*
195 return tsc_khz_val; 499 * The calibration values differ too much. In doubt, we use
500 * the PIT value as we know that there are PMTIMERs around
501 * running at double speed. At least we let the user know:
502 */
503 printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
504 hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
505 printk(KERN_INFO "TSC: Using PIT calibration value\n");
506 return tsc_pit_min;
196} 507}
197 508
198
199#ifdef CONFIG_X86_32 509#ifdef CONFIG_X86_32
200/* Only called from the Powernow K7 cpu freq driver */ 510/* Only called from the Powernow K7 cpu freq driver */
201int recalibrate_cpu_khz(void) 511int recalibrate_cpu_khz(void)
@@ -314,7 +624,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
314 mark_tsc_unstable("cpufreq changes"); 624 mark_tsc_unstable("cpufreq changes");
315 } 625 }
316 626
317 set_cyc2ns_scale(tsc_khz_ref, freq->cpu); 627 set_cyc2ns_scale(tsc_khz, freq->cpu);
318 628
319 return 0; 629 return 0;
320} 630}
@@ -325,6 +635,10 @@ static struct notifier_block time_cpufreq_notifier_block = {
325 635
326static int __init cpufreq_tsc(void) 636static int __init cpufreq_tsc(void)
327{ 637{
638 if (!cpu_has_tsc)
639 return 0;
640 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
641 return 0;
328 cpufreq_register_notifier(&time_cpufreq_notifier_block, 642 cpufreq_register_notifier(&time_cpufreq_notifier_block,
329 CPUFREQ_TRANSITION_NOTIFIER); 643 CPUFREQ_TRANSITION_NOTIFIER);
330 return 0; 644 return 0;
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 0577825cf89b..9ffb01c31c40 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -88,11 +88,9 @@ static __cpuinit void check_tsc_warp(void)
88 __raw_spin_unlock(&sync_lock); 88 __raw_spin_unlock(&sync_lock);
89 } 89 }
90 } 90 }
91 if (!(now-start)) { 91 WARN(!(now-start),
92 printk("Warning: zero tsc calibration delta: %Ld [max: %Ld]\n", 92 "Warning: zero tsc calibration delta: %Ld [max: %Ld]\n",
93 now-start, end-start); 93 now-start, end-start);
94 WARN_ON(1);
95 }
96} 94}
97 95
98/* 96/*
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 41e01b145c48..61a97e616f70 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -25,45 +25,31 @@
25#include <asm/visws/cobalt.h> 25#include <asm/visws/cobalt.h>
26#include <asm/visws/piix4.h> 26#include <asm/visws/piix4.h>
27#include <asm/arch_hooks.h> 27#include <asm/arch_hooks.h>
28#include <asm/io_apic.h>
28#include <asm/fixmap.h> 29#include <asm/fixmap.h>
29#include <asm/reboot.h> 30#include <asm/reboot.h>
30#include <asm/setup.h> 31#include <asm/setup.h>
31#include <asm/e820.h> 32#include <asm/e820.h>
32#include <asm/smp.h>
33#include <asm/io.h> 33#include <asm/io.h>
34 34
35#include <mach_ipi.h> 35#include <mach_ipi.h>
36 36
37#include "mach_apic.h" 37#include "mach_apic.h"
38 38
39#include <linux/init.h>
40#include <linux/smp.h>
41
42#include <linux/kernel_stat.h> 39#include <linux/kernel_stat.h>
43#include <linux/interrupt.h>
44#include <linux/init.h>
45 40
46#include <asm/io.h>
47#include <asm/apic.h>
48#include <asm/i8259.h> 41#include <asm/i8259.h>
49#include <asm/irq_vectors.h> 42#include <asm/irq_vectors.h>
50#include <asm/visws/cobalt.h>
51#include <asm/visws/lithium.h> 43#include <asm/visws/lithium.h>
52#include <asm/visws/piix4.h>
53 44
54#include <linux/sched.h> 45#include <linux/sched.h>
55#include <linux/kernel.h> 46#include <linux/kernel.h>
56#include <linux/init.h>
57#include <linux/pci.h> 47#include <linux/pci.h>
58#include <linux/pci_ids.h> 48#include <linux/pci_ids.h>
59 49
60extern int no_broadcast; 50extern int no_broadcast;
61 51
62#include <asm/io.h>
63#include <asm/apic.h> 52#include <asm/apic.h>
64#include <asm/arch_hooks.h>
65#include <asm/visws/cobalt.h>
66#include <asm/visws/lithium.h>
67 53
68char visws_board_type = -1; 54char visws_board_type = -1;
69char visws_board_rev = -1; 55char visws_board_rev = -1;
@@ -184,8 +170,6 @@ static int __init visws_get_smp_config(unsigned int early)
184 return 1; 170 return 1;
185} 171}
186 172
187extern unsigned int __cpuinitdata maxcpus;
188
189/* 173/*
190 * The Visual Workstation is Intel MP compliant in the hardware 174 * The Visual Workstation is Intel MP compliant in the hardware
191 * sense, but it doesn't have a BIOS(-configuration table). 175 * sense, but it doesn't have a BIOS(-configuration table).
@@ -244,8 +228,8 @@ static int __init visws_find_smp_config(unsigned int reserve)
244 ncpus = CO_CPU_MAX; 228 ncpus = CO_CPU_MAX;
245 } 229 }
246 230
247 if (ncpus > maxcpus) 231 if (ncpus > setup_max_cpus)
248 ncpus = maxcpus; 232 ncpus = setup_max_cpus;
249 233
250#ifdef CONFIG_X86_LOCAL_APIC 234#ifdef CONFIG_X86_LOCAL_APIC
251 smp_found_config = 1; 235 smp_found_config = 1;
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 61531d5c9507..8b6c393ab9fd 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -235,7 +235,7 @@ static void vmi_write_ldt_entry(struct desc_struct *dt, int entry,
235 const void *desc) 235 const void *desc)
236{ 236{
237 u32 *ldt_entry = (u32 *)desc; 237 u32 *ldt_entry = (u32 *)desc;
238 vmi_ops.write_idt_entry(dt, entry, ldt_entry[0], ldt_entry[1]); 238 vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
239} 239}
240 240
241static void vmi_load_sp0(struct tss_struct *tss, 241static void vmi_load_sp0(struct tss_struct *tss,
@@ -393,13 +393,13 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
393} 393}
394#endif 394#endif
395 395
396static void vmi_allocate_pte(struct mm_struct *mm, u32 pfn) 396static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
397{ 397{
398 vmi_set_page_type(pfn, VMI_PAGE_L1); 398 vmi_set_page_type(pfn, VMI_PAGE_L1);
399 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); 399 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
400} 400}
401 401
402static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn) 402static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
403{ 403{
404 /* 404 /*
405 * This call comes in very early, before mem_map is setup. 405 * This call comes in very early, before mem_map is setup.
@@ -410,20 +410,20 @@ static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn)
410 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); 410 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
411} 411}
412 412
413static void vmi_allocate_pmd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count) 413static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
414{ 414{
415 vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE); 415 vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
416 vmi_check_page_type(clonepfn, VMI_PAGE_L2); 416 vmi_check_page_type(clonepfn, VMI_PAGE_L2);
417 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); 417 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
418} 418}
419 419
420static void vmi_release_pte(u32 pfn) 420static void vmi_release_pte(unsigned long pfn)
421{ 421{
422 vmi_ops.release_page(pfn, VMI_PAGE_L1); 422 vmi_ops.release_page(pfn, VMI_PAGE_L1);
423 vmi_set_page_type(pfn, VMI_PAGE_NORMAL); 423 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
424} 424}
425 425
426static void vmi_release_pmd(u32 pfn) 426static void vmi_release_pmd(unsigned long pfn)
427{ 427{
428 vmi_ops.release_page(pfn, VMI_PAGE_L2); 428 vmi_ops.release_page(pfn, VMI_PAGE_L2);
429 vmi_set_page_type(pfn, VMI_PAGE_NORMAL); 429 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index cdb2363697d2..af5bdad84604 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -209,3 +209,11 @@ SECTIONS
209 209
210 DWARF_DEBUG 210 DWARF_DEBUG
211} 211}
212
213#ifdef CONFIG_KEXEC
214/* Link time checks */
215#include <asm/kexec.h>
216
217ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
218 "kexec control code size is too big")
219#endif
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 0c029e8959c7..7766d36983fc 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -61,7 +61,7 @@ static void vsmp_irq_enable(void)
61 native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC)); 61 native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
62} 62}
63 63
64static unsigned __init vsmp_patch(u8 type, u16 clobbers, void *ibuf, 64static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf,
65 unsigned long addr, unsigned len) 65 unsigned long addr, unsigned len)
66{ 66{
67 switch (type) { 67 switch (type) {