aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2008-10-12 05:32:17 -0400
committerIngo Molnar <mingo@elte.hu>2008-10-12 05:32:17 -0400
commit206855c321adee56db3946ca09a5887cddb9d598 (patch)
tree13a2729d4d0e37170552bd9ad3c6bba71ba0c55c /arch/x86/kernel
parente8d3f455de4f42d4bab2f6f1aeb2cf3bd18eb508 (diff)
parentcb58ffc3889f0545628f138f849e759a331b8ddc (diff)
Merge branch 'x86/urgent' into core/signal
Conflicts: arch/x86/kernel/signal_64.c
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/acpi/boot.c20
-rw-r--r--arch/x86/kernel/alternative.c8
-rw-r--r--arch/x86/kernel/amd_iommu.c350
-rw-r--r--arch/x86/kernel/amd_iommu_init.c194
-rw-r--r--arch/x86/kernel/aperture_64.c6
-rw-r--r--arch/x86/kernel/apic_32.c396
-rw-r--r--arch/x86/kernel/apic_64.c403
-rw-r--r--arch/x86/kernel/apm_32.c3
-rw-r--r--arch/x86/kernel/cpu/Makefile12
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c88
-rw-r--r--arch/x86/kernel/cpu/amd.c548
-rw-r--r--arch/x86/kernel/cpu/amd_64.c224
-rw-r--r--arch/x86/kernel/cpu/centaur.c4
-rw-r--r--arch/x86/kernel/cpu/centaur_64.c6
-rw-r--r--arch/x86/kernel/cpu/common.c987
-rw-r--r--arch/x86/kernel/cpu/common_64.c714
-rw-r--r--arch/x86/kernel/cpu/cpu.h19
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c13
-rw-r--r--arch/x86/kernel/cpu/cpufreq/elanfreq.c42
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k6.c41
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c2
-rw-r--r--arch/x86/kernel/cpu/cyrix.c7
-rw-r--r--arch/x86/kernel/cpu/intel.c294
-rw-r--r--arch/x86/kernel/cpu/intel_64.c95
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c7
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c4
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c274
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c86
-rw-r--r--arch/x86/kernel/cpu/transmeta.c32
-rw-r--r--arch/x86/kernel/cpu/umc.c3
-rw-r--r--arch/x86/kernel/crash_dump_64.c6
-rw-r--r--arch/x86/kernel/ds.c954
-rw-r--r--arch/x86/kernel/e820.c28
-rw-r--r--arch/x86/kernel/early-quirks.c18
-rw-r--r--arch/x86/kernel/efi.c6
-rw-r--r--arch/x86/kernel/entry_64.S4
-rw-r--r--arch/x86/kernel/es7000_32.c345
-rw-r--r--arch/x86/kernel/genx2apic_cluster.c7
-rw-r--r--arch/x86/kernel/genx2apic_phys.c7
-rw-r--r--arch/x86/kernel/genx2apic_uv_x.c1
-rw-r--r--arch/x86/kernel/head64.c5
-rw-r--r--arch/x86/kernel/head_32.S34
-rw-r--r--arch/x86/kernel/head_64.S4
-rw-r--r--arch/x86/kernel/io_apic_32.c37
-rw-r--r--arch/x86/kernel/io_apic_64.c31
-rw-r--r--arch/x86/kernel/irqinit_32.c49
-rw-r--r--arch/x86/kernel/k8.c5
-rw-r--r--arch/x86/kernel/kgdb.c50
-rw-r--r--arch/x86/kernel/kvm.c2
-rw-r--r--arch/x86/kernel/nmi.c11
-rw-r--r--arch/x86/kernel/olpc.c6
-rw-r--r--arch/x86/kernel/paravirt.c1
-rw-r--r--arch/x86/kernel/pci-calgary_64.c18
-rw-r--r--arch/x86/kernel/pci-dma.c177
-rw-r--r--arch/x86/kernel/pci-gart_64.c162
-rw-r--r--arch/x86/kernel/pci-nommu.c10
-rw-r--r--arch/x86/kernel/pcspeaker.c13
-rw-r--r--arch/x86/kernel/process.c21
-rw-r--r--arch/x86/kernel/process_32.c62
-rw-r--r--arch/x86/kernel/process_64.c44
-rw-r--r--arch/x86/kernel/ptrace.c444
-rw-r--r--arch/x86/kernel/reboot.c6
-rw-r--r--arch/x86/kernel/setup.c16
-rw-r--r--arch/x86/kernel/setup_percpu.c9
-rw-r--r--arch/x86/kernel/smpboot.c1
-rw-r--r--arch/x86/kernel/traps_64.c5
-rw-r--r--arch/x86/kernel/tsc.c290
-rw-r--r--arch/x86/kernel/vmi_32.c12
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S9
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S9
-rw-r--r--arch/x86/kernel/vsmp_64.c2
74 files changed, 4735 insertions, 3073 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d6ea91abaebc..c9be69fedb70 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -69,6 +69,7 @@ obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
69obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o 69obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
70obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o 70obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
71obj-$(CONFIG_X86_NUMAQ) += numaq_32.o 71obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
72obj-$(CONFIG_X86_ES7000) += es7000_32.o
72obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o 73obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o
73obj-y += vsmp_64.o 74obj-y += vsmp_64.o
74obj-$(CONFIG_KPROBES) += kprobes.o 75obj-$(CONFIG_KPROBES) += kprobes.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 267e684f33a7..c2ac1b4515a0 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -252,10 +252,8 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled)
252 return; 252 return;
253 } 253 }
254 254
255#ifdef CONFIG_X86_32
256 if (boot_cpu_physical_apicid != -1U) 255 if (boot_cpu_physical_apicid != -1U)
257 ver = apic_version[boot_cpu_physical_apicid]; 256 ver = apic_version[boot_cpu_physical_apicid];
258#endif
259 257
260 generic_processor_info(id, ver); 258 generic_processor_info(id, ver);
261} 259}
@@ -775,10 +773,8 @@ static void __init acpi_register_lapic_address(unsigned long address)
775 set_fixmap_nocache(FIX_APIC_BASE, address); 773 set_fixmap_nocache(FIX_APIC_BASE, address);
776 if (boot_cpu_physical_apicid == -1U) { 774 if (boot_cpu_physical_apicid == -1U) {
777 boot_cpu_physical_apicid = read_apic_id(); 775 boot_cpu_physical_apicid = read_apic_id();
778#ifdef CONFIG_X86_32
779 apic_version[boot_cpu_physical_apicid] = 776 apic_version[boot_cpu_physical_apicid] =
780 GET_APIC_VERSION(apic_read(APIC_LVR)); 777 GET_APIC_VERSION(apic_read(APIC_LVR));
781#endif
782 } 778 }
783} 779}
784 780
@@ -1606,6 +1602,14 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
1606 */ 1602 */
1607 { 1603 {
1608 .callback = dmi_ignore_irq0_timer_override, 1604 .callback = dmi_ignore_irq0_timer_override,
1605 .ident = "HP nx6115 laptop",
1606 .matches = {
1607 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
1608 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6115"),
1609 },
1610 },
1611 {
1612 .callback = dmi_ignore_irq0_timer_override,
1609 .ident = "HP NX6125 laptop", 1613 .ident = "HP NX6125 laptop",
1610 .matches = { 1614 .matches = {
1611 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), 1615 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
@@ -1620,6 +1624,14 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
1620 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"), 1624 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"),
1621 }, 1625 },
1622 }, 1626 },
1627 {
1628 .callback = dmi_ignore_irq0_timer_override,
1629 .ident = "HP 6715b laptop",
1630 .matches = {
1631 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
1632 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"),
1633 },
1634 },
1623 {} 1635 {}
1624}; 1636};
1625 1637
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 65a0c1b48696..fb04e49776ba 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -231,25 +231,25 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
231 continue; 231 continue;
232 if (*ptr > text_end) 232 if (*ptr > text_end)
233 continue; 233 continue;
234 text_poke(*ptr, ((unsigned char []){0xf0}), 1); /* add lock prefix */ 234 /* turn DS segment override prefix into lock prefix */
235 text_poke(*ptr, ((unsigned char []){0xf0}), 1);
235 }; 236 };
236} 237}
237 238
238static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) 239static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
239{ 240{
240 u8 **ptr; 241 u8 **ptr;
241 char insn[1];
242 242
243 if (noreplace_smp) 243 if (noreplace_smp)
244 return; 244 return;
245 245
246 add_nops(insn, 1);
247 for (ptr = start; ptr < end; ptr++) { 246 for (ptr = start; ptr < end; ptr++) {
248 if (*ptr < text) 247 if (*ptr < text)
249 continue; 248 continue;
250 if (*ptr > text_end) 249 if (*ptr > text_end)
251 continue; 250 continue;
252 text_poke(*ptr, insn, 1); 251 /* turn lock prefix into DS segment override prefix */
252 text_poke(*ptr, ((unsigned char []){0x3E}), 1);
253 }; 253 };
254} 254}
255 255
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 69b4d060b21c..34e4d112b1ef 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -33,6 +33,10 @@
33 33
34static DEFINE_RWLOCK(amd_iommu_devtable_lock); 34static DEFINE_RWLOCK(amd_iommu_devtable_lock);
35 35
36/* A list of preallocated protection domains */
37static LIST_HEAD(iommu_pd_list);
38static DEFINE_SPINLOCK(iommu_pd_list_lock);
39
36/* 40/*
37 * general struct to manage commands send to an IOMMU 41 * general struct to manage commands send to an IOMMU
38 */ 42 */
@@ -51,6 +55,102 @@ static int iommu_has_npcache(struct amd_iommu *iommu)
51 55
52/**************************************************************************** 56/****************************************************************************
53 * 57 *
58 * Interrupt handling functions
59 *
60 ****************************************************************************/
61
62static void iommu_print_event(void *__evt)
63{
64 u32 *event = __evt;
65 int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK;
66 int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
67 int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
68 int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
69 u64 address = (u64)(((u64)event[3]) << 32) | event[2];
70
71 printk(KERN_ERR "AMD IOMMU: Event logged [");
72
73 switch (type) {
74 case EVENT_TYPE_ILL_DEV:
75 printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x "
76 "address=0x%016llx flags=0x%04x]\n",
77 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
78 address, flags);
79 break;
80 case EVENT_TYPE_IO_FAULT:
81 printk("IO_PAGE_FAULT device=%02x:%02x.%x "
82 "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
83 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
84 domid, address, flags);
85 break;
86 case EVENT_TYPE_DEV_TAB_ERR:
87 printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
88 "address=0x%016llx flags=0x%04x]\n",
89 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
90 address, flags);
91 break;
92 case EVENT_TYPE_PAGE_TAB_ERR:
93 printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
94 "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
95 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
96 domid, address, flags);
97 break;
98 case EVENT_TYPE_ILL_CMD:
99 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
100 break;
101 case EVENT_TYPE_CMD_HARD_ERR:
102 printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
103 "flags=0x%04x]\n", address, flags);
104 break;
105 case EVENT_TYPE_IOTLB_INV_TO:
106 printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x "
107 "address=0x%016llx]\n",
108 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
109 address);
110 break;
111 case EVENT_TYPE_INV_DEV_REQ:
112 printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x "
113 "address=0x%016llx flags=0x%04x]\n",
114 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
115 address, flags);
116 break;
117 default:
118 printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type);
119 }
120}
121
122static void iommu_poll_events(struct amd_iommu *iommu)
123{
124 u32 head, tail;
125 unsigned long flags;
126
127 spin_lock_irqsave(&iommu->lock, flags);
128
129 head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
130 tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
131
132 while (head != tail) {
133 iommu_print_event(iommu->evt_buf + head);
134 head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
135 }
136
137 writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
138
139 spin_unlock_irqrestore(&iommu->lock, flags);
140}
141
142irqreturn_t amd_iommu_int_handler(int irq, void *data)
143{
144 struct amd_iommu *iommu;
145
146 list_for_each_entry(iommu, &amd_iommu_list, list)
147 iommu_poll_events(iommu);
148
149 return IRQ_HANDLED;
150}
151
152/****************************************************************************
153 *
54 * IOMMU command queuing functions 154 * IOMMU command queuing functions
55 * 155 *
56 ****************************************************************************/ 156 ****************************************************************************/
@@ -101,10 +201,10 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
101 */ 201 */
102static int iommu_completion_wait(struct amd_iommu *iommu) 202static int iommu_completion_wait(struct amd_iommu *iommu)
103{ 203{
104 int ret, ready = 0; 204 int ret = 0, ready = 0;
105 unsigned status = 0; 205 unsigned status = 0;
106 struct iommu_cmd cmd; 206 struct iommu_cmd cmd;
107 unsigned long i = 0; 207 unsigned long flags, i = 0;
108 208
109 memset(&cmd, 0, sizeof(cmd)); 209 memset(&cmd, 0, sizeof(cmd));
110 cmd.data[0] = CMD_COMPL_WAIT_INT_MASK; 210 cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
@@ -112,10 +212,12 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
112 212
113 iommu->need_sync = 0; 213 iommu->need_sync = 0;
114 214
115 ret = iommu_queue_command(iommu, &cmd); 215 spin_lock_irqsave(&iommu->lock, flags);
216
217 ret = __iommu_queue_command(iommu, &cmd);
116 218
117 if (ret) 219 if (ret)
118 return ret; 220 goto out;
119 221
120 while (!ready && (i < EXIT_LOOP_COUNT)) { 222 while (!ready && (i < EXIT_LOOP_COUNT)) {
121 ++i; 223 ++i;
@@ -130,6 +232,8 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
130 232
131 if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit())) 233 if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit()))
132 printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n"); 234 printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n");
235out:
236 spin_unlock_irqrestore(&iommu->lock, flags);
133 237
134 return 0; 238 return 0;
135} 239}
@@ -140,6 +244,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
140static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) 244static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
141{ 245{
142 struct iommu_cmd cmd; 246 struct iommu_cmd cmd;
247 int ret;
143 248
144 BUG_ON(iommu == NULL); 249 BUG_ON(iommu == NULL);
145 250
@@ -147,9 +252,11 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
147 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); 252 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
148 cmd.data[0] = devid; 253 cmd.data[0] = devid;
149 254
255 ret = iommu_queue_command(iommu, &cmd);
256
150 iommu->need_sync = 1; 257 iommu->need_sync = 1;
151 258
152 return iommu_queue_command(iommu, &cmd); 259 return ret;
153} 260}
154 261
155/* 262/*
@@ -159,6 +266,7 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
159 u64 address, u16 domid, int pde, int s) 266 u64 address, u16 domid, int pde, int s)
160{ 267{
161 struct iommu_cmd cmd; 268 struct iommu_cmd cmd;
269 int ret;
162 270
163 memset(&cmd, 0, sizeof(cmd)); 271 memset(&cmd, 0, sizeof(cmd));
164 address &= PAGE_MASK; 272 address &= PAGE_MASK;
@@ -171,9 +279,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
171 if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ 279 if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
172 cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; 280 cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
173 281
282 ret = iommu_queue_command(iommu, &cmd);
283
174 iommu->need_sync = 1; 284 iommu->need_sync = 1;
175 285
176 return iommu_queue_command(iommu, &cmd); 286 return ret;
177} 287}
178 288
179/* 289/*
@@ -203,6 +313,14 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
203 return 0; 313 return 0;
204} 314}
205 315
316/* Flush the whole IO/TLB for a given protection domain */
317static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
318{
319 u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
320
321 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1);
322}
323
206/**************************************************************************** 324/****************************************************************************
207 * 325 *
208 * The functions below are used the create the page table mappings for 326 * The functions below are used the create the page table mappings for
@@ -362,11 +480,6 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
362 * efficient allocator. 480 * efficient allocator.
363 * 481 *
364 ****************************************************************************/ 482 ****************************************************************************/
365static unsigned long dma_mask_to_pages(unsigned long mask)
366{
367 return (mask >> PAGE_SHIFT) +
368 (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT);
369}
370 483
371/* 484/*
372 * The address allocator core function. 485 * The address allocator core function.
@@ -375,25 +488,31 @@ static unsigned long dma_mask_to_pages(unsigned long mask)
375 */ 488 */
376static unsigned long dma_ops_alloc_addresses(struct device *dev, 489static unsigned long dma_ops_alloc_addresses(struct device *dev,
377 struct dma_ops_domain *dom, 490 struct dma_ops_domain *dom,
378 unsigned int pages) 491 unsigned int pages,
492 unsigned long align_mask,
493 u64 dma_mask)
379{ 494{
380 unsigned long limit = dma_mask_to_pages(*dev->dma_mask); 495 unsigned long limit;
381 unsigned long address; 496 unsigned long address;
382 unsigned long size = dom->aperture_size >> PAGE_SHIFT;
383 unsigned long boundary_size; 497 unsigned long boundary_size;
384 498
385 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, 499 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
386 PAGE_SIZE) >> PAGE_SHIFT; 500 PAGE_SIZE) >> PAGE_SHIFT;
387 limit = limit < size ? limit : size; 501 limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0,
502 dma_mask >> PAGE_SHIFT);
388 503
389 if (dom->next_bit >= limit) 504 if (dom->next_bit >= limit) {
390 dom->next_bit = 0; 505 dom->next_bit = 0;
506 dom->need_flush = true;
507 }
391 508
392 address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages, 509 address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
393 0 , boundary_size, 0); 510 0 , boundary_size, align_mask);
394 if (address == -1) 511 if (address == -1) {
395 address = iommu_area_alloc(dom->bitmap, limit, 0, pages, 512 address = iommu_area_alloc(dom->bitmap, limit, 0, pages,
396 0, boundary_size, 0); 513 0, boundary_size, align_mask);
514 dom->need_flush = true;
515 }
397 516
398 if (likely(address != -1)) { 517 if (likely(address != -1)) {
399 dom->next_bit = address + pages; 518 dom->next_bit = address + pages;
@@ -459,7 +578,7 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
459 if (start_page + pages > last_page) 578 if (start_page + pages > last_page)
460 pages = last_page - start_page; 579 pages = last_page - start_page;
461 580
462 set_bit_string(dom->bitmap, start_page, pages); 581 iommu_area_reserve(dom->bitmap, start_page, pages);
463} 582}
464 583
465static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) 584static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
@@ -553,6 +672,9 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
553 dma_dom->bitmap[0] = 1; 672 dma_dom->bitmap[0] = 1;
554 dma_dom->next_bit = 0; 673 dma_dom->next_bit = 0;
555 674
675 dma_dom->need_flush = false;
676 dma_dom->target_dev = 0xffff;
677
556 /* Intialize the exclusion range if necessary */ 678 /* Intialize the exclusion range if necessary */
557 if (iommu->exclusion_start && 679 if (iommu->exclusion_start &&
558 iommu->exclusion_start < dma_dom->aperture_size) { 680 iommu->exclusion_start < dma_dom->aperture_size) {
@@ -623,12 +745,13 @@ static void set_device_domain(struct amd_iommu *iommu,
623 745
624 u64 pte_root = virt_to_phys(domain->pt_root); 746 u64 pte_root = virt_to_phys(domain->pt_root);
625 747
626 pte_root |= (domain->mode & 0x07) << 9; 748 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
627 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | 2; 749 << DEV_ENTRY_MODE_SHIFT;
750 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
628 751
629 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 752 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
630 amd_iommu_dev_table[devid].data[0] = pte_root; 753 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
631 amd_iommu_dev_table[devid].data[1] = pte_root >> 32; 754 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
632 amd_iommu_dev_table[devid].data[2] = domain->id; 755 amd_iommu_dev_table[devid].data[2] = domain->id;
633 756
634 amd_iommu_pd_table[devid] = domain; 757 amd_iommu_pd_table[devid] = domain;
@@ -646,6 +769,45 @@ static void set_device_domain(struct amd_iommu *iommu,
646 *****************************************************************************/ 769 *****************************************************************************/
647 770
648/* 771/*
772 * This function checks if the driver got a valid device from the caller to
773 * avoid dereferencing invalid pointers.
774 */
775static bool check_device(struct device *dev)
776{
777 if (!dev || !dev->dma_mask)
778 return false;
779
780 return true;
781}
782
783/*
784 * In this function the list of preallocated protection domains is traversed to
785 * find the domain for a specific device
786 */
787static struct dma_ops_domain *find_protection_domain(u16 devid)
788{
789 struct dma_ops_domain *entry, *ret = NULL;
790 unsigned long flags;
791
792 if (list_empty(&iommu_pd_list))
793 return NULL;
794
795 spin_lock_irqsave(&iommu_pd_list_lock, flags);
796
797 list_for_each_entry(entry, &iommu_pd_list, list) {
798 if (entry->target_dev == devid) {
799 ret = entry;
800 list_del(&ret->list);
801 break;
802 }
803 }
804
805 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
806
807 return ret;
808}
809
810/*
649 * In the dma_ops path we only have the struct device. This function 811 * In the dma_ops path we only have the struct device. This function
650 * finds the corresponding IOMMU, the protection domain and the 812 * finds the corresponding IOMMU, the protection domain and the
651 * requestor id for a given device. 813 * requestor id for a given device.
@@ -661,27 +823,30 @@ static int get_device_resources(struct device *dev,
661 struct pci_dev *pcidev; 823 struct pci_dev *pcidev;
662 u16 _bdf; 824 u16 _bdf;
663 825
664 BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask); 826 *iommu = NULL;
827 *domain = NULL;
828 *bdf = 0xffff;
829
830 if (dev->bus != &pci_bus_type)
831 return 0;
665 832
666 pcidev = to_pci_dev(dev); 833 pcidev = to_pci_dev(dev);
667 _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); 834 _bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
668 835
669 /* device not translated by any IOMMU in the system? */ 836 /* device not translated by any IOMMU in the system? */
670 if (_bdf > amd_iommu_last_bdf) { 837 if (_bdf > amd_iommu_last_bdf)
671 *iommu = NULL;
672 *domain = NULL;
673 *bdf = 0xffff;
674 return 0; 838 return 0;
675 }
676 839
677 *bdf = amd_iommu_alias_table[_bdf]; 840 *bdf = amd_iommu_alias_table[_bdf];
678 841
679 *iommu = amd_iommu_rlookup_table[*bdf]; 842 *iommu = amd_iommu_rlookup_table[*bdf];
680 if (*iommu == NULL) 843 if (*iommu == NULL)
681 return 0; 844 return 0;
682 dma_dom = (*iommu)->default_dom;
683 *domain = domain_for_device(*bdf); 845 *domain = domain_for_device(*bdf);
684 if (*domain == NULL) { 846 if (*domain == NULL) {
847 dma_dom = find_protection_domain(*bdf);
848 if (!dma_dom)
849 dma_dom = (*iommu)->default_dom;
685 *domain = &dma_dom->domain; 850 *domain = &dma_dom->domain;
686 set_device_domain(*iommu, *domain, *bdf); 851 set_device_domain(*iommu, *domain, *bdf);
687 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " 852 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
@@ -760,17 +925,24 @@ static dma_addr_t __map_single(struct device *dev,
760 struct dma_ops_domain *dma_dom, 925 struct dma_ops_domain *dma_dom,
761 phys_addr_t paddr, 926 phys_addr_t paddr,
762 size_t size, 927 size_t size,
763 int dir) 928 int dir,
929 bool align,
930 u64 dma_mask)
764{ 931{
765 dma_addr_t offset = paddr & ~PAGE_MASK; 932 dma_addr_t offset = paddr & ~PAGE_MASK;
766 dma_addr_t address, start; 933 dma_addr_t address, start;
767 unsigned int pages; 934 unsigned int pages;
935 unsigned long align_mask = 0;
768 int i; 936 int i;
769 937
770 pages = iommu_num_pages(paddr, size); 938 pages = iommu_num_pages(paddr, size);
771 paddr &= PAGE_MASK; 939 paddr &= PAGE_MASK;
772 940
773 address = dma_ops_alloc_addresses(dev, dma_dom, pages); 941 if (align)
942 align_mask = (1UL << get_order(size)) - 1;
943
944 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
945 dma_mask);
774 if (unlikely(address == bad_dma_address)) 946 if (unlikely(address == bad_dma_address))
775 goto out; 947 goto out;
776 948
@@ -782,6 +954,12 @@ static dma_addr_t __map_single(struct device *dev,
782 } 954 }
783 address += offset; 955 address += offset;
784 956
957 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
958 iommu_flush_tlb(iommu, dma_dom->domain.id);
959 dma_dom->need_flush = false;
960 } else if (unlikely(iommu_has_npcache(iommu)))
961 iommu_flush_pages(iommu, dma_dom->domain.id, address, size);
962
785out: 963out:
786 return address; 964 return address;
787} 965}
@@ -812,6 +990,9 @@ static void __unmap_single(struct amd_iommu *iommu,
812 } 990 }
813 991
814 dma_ops_free_addresses(dma_dom, dma_addr, pages); 992 dma_ops_free_addresses(dma_dom, dma_addr, pages);
993
994 if (amd_iommu_unmap_flush)
995 iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size);
815} 996}
816 997
817/* 998/*
@@ -825,6 +1006,12 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
825 struct protection_domain *domain; 1006 struct protection_domain *domain;
826 u16 devid; 1007 u16 devid;
827 dma_addr_t addr; 1008 dma_addr_t addr;
1009 u64 dma_mask;
1010
1011 if (!check_device(dev))
1012 return bad_dma_address;
1013
1014 dma_mask = *dev->dma_mask;
828 1015
829 get_device_resources(dev, &iommu, &domain, &devid); 1016 get_device_resources(dev, &iommu, &domain, &devid);
830 1017
@@ -833,14 +1020,12 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
833 return (dma_addr_t)paddr; 1020 return (dma_addr_t)paddr;
834 1021
835 spin_lock_irqsave(&domain->lock, flags); 1022 spin_lock_irqsave(&domain->lock, flags);
836 addr = __map_single(dev, iommu, domain->priv, paddr, size, dir); 1023 addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false,
1024 dma_mask);
837 if (addr == bad_dma_address) 1025 if (addr == bad_dma_address)
838 goto out; 1026 goto out;
839 1027
840 if (iommu_has_npcache(iommu)) 1028 if (unlikely(iommu->need_sync))
841 iommu_flush_pages(iommu, domain->id, addr, size);
842
843 if (iommu->need_sync)
844 iommu_completion_wait(iommu); 1029 iommu_completion_wait(iommu);
845 1030
846out: 1031out:
@@ -860,7 +1045,8 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
860 struct protection_domain *domain; 1045 struct protection_domain *domain;
861 u16 devid; 1046 u16 devid;
862 1047
863 if (!get_device_resources(dev, &iommu, &domain, &devid)) 1048 if (!check_device(dev) ||
1049 !get_device_resources(dev, &iommu, &domain, &devid))
864 /* device not handled by any AMD IOMMU */ 1050 /* device not handled by any AMD IOMMU */
865 return; 1051 return;
866 1052
@@ -868,9 +1054,7 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
868 1054
869 __unmap_single(iommu, domain->priv, dma_addr, size, dir); 1055 __unmap_single(iommu, domain->priv, dma_addr, size, dir);
870 1056
871 iommu_flush_pages(iommu, domain->id, dma_addr, size); 1057 if (unlikely(iommu->need_sync))
872
873 if (iommu->need_sync)
874 iommu_completion_wait(iommu); 1058 iommu_completion_wait(iommu);
875 1059
876 spin_unlock_irqrestore(&domain->lock, flags); 1060 spin_unlock_irqrestore(&domain->lock, flags);
@@ -909,6 +1093,12 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
909 struct scatterlist *s; 1093 struct scatterlist *s;
910 phys_addr_t paddr; 1094 phys_addr_t paddr;
911 int mapped_elems = 0; 1095 int mapped_elems = 0;
1096 u64 dma_mask;
1097
1098 if (!check_device(dev))
1099 return 0;
1100
1101 dma_mask = *dev->dma_mask;
912 1102
913 get_device_resources(dev, &iommu, &domain, &devid); 1103 get_device_resources(dev, &iommu, &domain, &devid);
914 1104
@@ -921,19 +1111,17 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
921 paddr = sg_phys(s); 1111 paddr = sg_phys(s);
922 1112
923 s->dma_address = __map_single(dev, iommu, domain->priv, 1113 s->dma_address = __map_single(dev, iommu, domain->priv,
924 paddr, s->length, dir); 1114 paddr, s->length, dir, false,
1115 dma_mask);
925 1116
926 if (s->dma_address) { 1117 if (s->dma_address) {
927 s->dma_length = s->length; 1118 s->dma_length = s->length;
928 mapped_elems++; 1119 mapped_elems++;
929 } else 1120 } else
930 goto unmap; 1121 goto unmap;
931 if (iommu_has_npcache(iommu))
932 iommu_flush_pages(iommu, domain->id, s->dma_address,
933 s->dma_length);
934 } 1122 }
935 1123
936 if (iommu->need_sync) 1124 if (unlikely(iommu->need_sync))
937 iommu_completion_wait(iommu); 1125 iommu_completion_wait(iommu);
938 1126
939out: 1127out:
@@ -967,7 +1155,8 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
967 u16 devid; 1155 u16 devid;
968 int i; 1156 int i;
969 1157
970 if (!get_device_resources(dev, &iommu, &domain, &devid)) 1158 if (!check_device(dev) ||
1159 !get_device_resources(dev, &iommu, &domain, &devid))
971 return; 1160 return;
972 1161
973 spin_lock_irqsave(&domain->lock, flags); 1162 spin_lock_irqsave(&domain->lock, flags);
@@ -975,12 +1164,10 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
975 for_each_sg(sglist, s, nelems, i) { 1164 for_each_sg(sglist, s, nelems, i) {
976 __unmap_single(iommu, domain->priv, s->dma_address, 1165 __unmap_single(iommu, domain->priv, s->dma_address,
977 s->dma_length, dir); 1166 s->dma_length, dir);
978 iommu_flush_pages(iommu, domain->id, s->dma_address,
979 s->dma_length);
980 s->dma_address = s->dma_length = 0; 1167 s->dma_address = s->dma_length = 0;
981 } 1168 }
982 1169
983 if (iommu->need_sync) 1170 if (unlikely(iommu->need_sync))
984 iommu_completion_wait(iommu); 1171 iommu_completion_wait(iommu);
985 1172
986 spin_unlock_irqrestore(&domain->lock, flags); 1173 spin_unlock_irqrestore(&domain->lock, flags);
@@ -998,25 +1185,33 @@ static void *alloc_coherent(struct device *dev, size_t size,
998 struct protection_domain *domain; 1185 struct protection_domain *domain;
999 u16 devid; 1186 u16 devid;
1000 phys_addr_t paddr; 1187 phys_addr_t paddr;
1188 u64 dma_mask = dev->coherent_dma_mask;
1189
1190 if (!check_device(dev))
1191 return NULL;
1001 1192
1193 if (!get_device_resources(dev, &iommu, &domain, &devid))
1194 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
1195
1196 flag |= __GFP_ZERO;
1002 virt_addr = (void *)__get_free_pages(flag, get_order(size)); 1197 virt_addr = (void *)__get_free_pages(flag, get_order(size));
1003 if (!virt_addr) 1198 if (!virt_addr)
1004 return 0; 1199 return 0;
1005 1200
1006 memset(virt_addr, 0, size);
1007 paddr = virt_to_phys(virt_addr); 1201 paddr = virt_to_phys(virt_addr);
1008 1202
1009 get_device_resources(dev, &iommu, &domain, &devid);
1010
1011 if (!iommu || !domain) { 1203 if (!iommu || !domain) {
1012 *dma_addr = (dma_addr_t)paddr; 1204 *dma_addr = (dma_addr_t)paddr;
1013 return virt_addr; 1205 return virt_addr;
1014 } 1206 }
1015 1207
1208 if (!dma_mask)
1209 dma_mask = *dev->dma_mask;
1210
1016 spin_lock_irqsave(&domain->lock, flags); 1211 spin_lock_irqsave(&domain->lock, flags);
1017 1212
1018 *dma_addr = __map_single(dev, iommu, domain->priv, paddr, 1213 *dma_addr = __map_single(dev, iommu, domain->priv, paddr,
1019 size, DMA_BIDIRECTIONAL); 1214 size, DMA_BIDIRECTIONAL, true, dma_mask);
1020 1215
1021 if (*dma_addr == bad_dma_address) { 1216 if (*dma_addr == bad_dma_address) {
1022 free_pages((unsigned long)virt_addr, get_order(size)); 1217 free_pages((unsigned long)virt_addr, get_order(size));
@@ -1024,10 +1219,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
1024 goto out; 1219 goto out;
1025 } 1220 }
1026 1221
1027 if (iommu_has_npcache(iommu)) 1222 if (unlikely(iommu->need_sync))
1028 iommu_flush_pages(iommu, domain->id, *dma_addr, size);
1029
1030 if (iommu->need_sync)
1031 iommu_completion_wait(iommu); 1223 iommu_completion_wait(iommu);
1032 1224
1033out: 1225out:
@@ -1038,8 +1230,6 @@ out:
1038 1230
1039/* 1231/*
1040 * The exported free_coherent function for dma_ops. 1232 * The exported free_coherent function for dma_ops.
1041 * FIXME: fix the generic x86 DMA layer so that it actually calls that
1042 * function.
1043 */ 1233 */
1044static void free_coherent(struct device *dev, size_t size, 1234static void free_coherent(struct device *dev, size_t size,
1045 void *virt_addr, dma_addr_t dma_addr) 1235 void *virt_addr, dma_addr_t dma_addr)
@@ -1049,6 +1239,9 @@ static void free_coherent(struct device *dev, size_t size,
1049 struct protection_domain *domain; 1239 struct protection_domain *domain;
1050 u16 devid; 1240 u16 devid;
1051 1241
1242 if (!check_device(dev))
1243 return;
1244
1052 get_device_resources(dev, &iommu, &domain, &devid); 1245 get_device_resources(dev, &iommu, &domain, &devid);
1053 1246
1054 if (!iommu || !domain) 1247 if (!iommu || !domain)
@@ -1057,9 +1250,8 @@ static void free_coherent(struct device *dev, size_t size,
1057 spin_lock_irqsave(&domain->lock, flags); 1250 spin_lock_irqsave(&domain->lock, flags);
1058 1251
1059 __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); 1252 __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
1060 iommu_flush_pages(iommu, domain->id, dma_addr, size);
1061 1253
1062 if (iommu->need_sync) 1254 if (unlikely(iommu->need_sync))
1063 iommu_completion_wait(iommu); 1255 iommu_completion_wait(iommu);
1064 1256
1065 spin_unlock_irqrestore(&domain->lock, flags); 1257 spin_unlock_irqrestore(&domain->lock, flags);
@@ -1069,6 +1261,30 @@ free_mem:
1069} 1261}
1070 1262
1071/* 1263/*
1264 * This function is called by the DMA layer to find out if we can handle a
1265 * particular device. It is part of the dma_ops.
1266 */
1267static int amd_iommu_dma_supported(struct device *dev, u64 mask)
1268{
1269 u16 bdf;
1270 struct pci_dev *pcidev;
1271
1272 /* No device or no PCI device */
1273 if (!dev || dev->bus != &pci_bus_type)
1274 return 0;
1275
1276 pcidev = to_pci_dev(dev);
1277
1278 bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
1279
1280 /* Out of our scope? */
1281 if (bdf > amd_iommu_last_bdf)
1282 return 0;
1283
1284 return 1;
1285}
1286
1287/*
1072 * The function for pre-allocating protection domains. 1288 * The function for pre-allocating protection domains.
1073 * 1289 *
1074 * If the driver core informs the DMA layer if a driver grabs a device 1290 * If the driver core informs the DMA layer if a driver grabs a device
@@ -1097,10 +1313,9 @@ void prealloc_protection_domains(void)
1097 if (!dma_dom) 1313 if (!dma_dom)
1098 continue; 1314 continue;
1099 init_unity_mappings_for_device(dma_dom, devid); 1315 init_unity_mappings_for_device(dma_dom, devid);
1100 set_device_domain(iommu, &dma_dom->domain, devid); 1316 dma_dom->target_dev = devid;
1101 printk(KERN_INFO "AMD IOMMU: Allocated domain %d for device ", 1317
1102 dma_dom->domain.id); 1318 list_add_tail(&dma_dom->list, &iommu_pd_list);
1103 print_devid(devid, 1);
1104 } 1319 }
1105} 1320}
1106 1321
@@ -1111,6 +1326,7 @@ static struct dma_mapping_ops amd_iommu_dma_ops = {
1111 .unmap_single = unmap_single, 1326 .unmap_single = unmap_single,
1112 .map_sg = map_sg, 1327 .map_sg = map_sg,
1113 .unmap_sg = unmap_sg, 1328 .unmap_sg = unmap_sg,
1329 .dma_supported = amd_iommu_dma_supported,
1114}; 1330};
1115 1331
1116/* 1332/*
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index a69cc0f52042..148fcfe22f17 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -22,6 +22,8 @@
22#include <linux/gfp.h> 22#include <linux/gfp.h>
23#include <linux/list.h> 23#include <linux/list.h>
24#include <linux/sysdev.h> 24#include <linux/sysdev.h>
25#include <linux/interrupt.h>
26#include <linux/msi.h>
25#include <asm/pci-direct.h> 27#include <asm/pci-direct.h>
26#include <asm/amd_iommu_types.h> 28#include <asm/amd_iommu_types.h>
27#include <asm/amd_iommu.h> 29#include <asm/amd_iommu.h>
@@ -30,7 +32,6 @@
30/* 32/*
31 * definitions for the ACPI scanning code 33 * definitions for the ACPI scanning code
32 */ 34 */
33#define PCI_BUS(x) (((x) >> 8) & 0xff)
34#define IVRS_HEADER_LENGTH 48 35#define IVRS_HEADER_LENGTH 48
35 36
36#define ACPI_IVHD_TYPE 0x10 37#define ACPI_IVHD_TYPE 0x10
@@ -121,6 +122,7 @@ LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
121 we find in ACPI */ 122 we find in ACPI */
122unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ 123unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
123int amd_iommu_isolate; /* if 1, device isolation is enabled */ 124int amd_iommu_isolate; /* if 1, device isolation is enabled */
125bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
124 126
125LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the 127LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
126 system */ 128 system */
@@ -234,7 +236,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
234{ 236{
235 u32 ctrl; 237 u32 ctrl;
236 238
237 ctrl = (u64)readl(iommu->mmio_base + MMIO_CONTROL_OFFSET); 239 ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
238 ctrl &= ~(1 << bit); 240 ctrl &= ~(1 << bit);
239 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); 241 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
240} 242}
@@ -242,13 +244,23 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
242/* Function to enable the hardware */ 244/* Function to enable the hardware */
243void __init iommu_enable(struct amd_iommu *iommu) 245void __init iommu_enable(struct amd_iommu *iommu)
244{ 246{
245 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at "); 247 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU "
246 print_devid(iommu->devid, 0); 248 "at %02x:%02x.%x cap 0x%hx\n",
247 printk(" cap 0x%hx\n", iommu->cap_ptr); 249 iommu->dev->bus->number,
250 PCI_SLOT(iommu->dev->devfn),
251 PCI_FUNC(iommu->dev->devfn),
252 iommu->cap_ptr);
248 253
249 iommu_feature_enable(iommu, CONTROL_IOMMU_EN); 254 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
250} 255}
251 256
257/* Function to enable IOMMU event logging and event interrupts */
258void __init iommu_enable_event_logging(struct amd_iommu *iommu)
259{
260 iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
261 iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
262}
263
252/* 264/*
253 * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in 265 * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
254 * the system has one. 266 * the system has one.
@@ -286,6 +298,14 @@ static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
286 ****************************************************************************/ 298 ****************************************************************************/
287 299
288/* 300/*
301 * This function calculates the length of a given IVHD entry
302 */
303static inline int ivhd_entry_length(u8 *ivhd)
304{
305 return 0x04 << (*ivhd >> 6);
306}
307
308/*
289 * This function reads the last device id the IOMMU has to handle from the PCI 309 * This function reads the last device id the IOMMU has to handle from the PCI
290 * capability header for this IOMMU 310 * capability header for this IOMMU
291 */ 311 */
@@ -329,7 +349,7 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
329 default: 349 default:
330 break; 350 break;
331 } 351 }
332 p += 0x04 << (*p >> 6); 352 p += ivhd_entry_length(p);
333 } 353 }
334 354
335 WARN_ON(p != end); 355 WARN_ON(p != end);
@@ -414,7 +434,32 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
414 434
415static void __init free_command_buffer(struct amd_iommu *iommu) 435static void __init free_command_buffer(struct amd_iommu *iommu)
416{ 436{
417 free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE)); 437 free_pages((unsigned long)iommu->cmd_buf,
438 get_order(iommu->cmd_buf_size));
439}
440
441/* allocates the memory where the IOMMU will log its events to */
442static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
443{
444 u64 entry;
445 iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
446 get_order(EVT_BUFFER_SIZE));
447
448 if (iommu->evt_buf == NULL)
449 return NULL;
450
451 entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
452 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
453 &entry, sizeof(entry));
454
455 iommu->evt_buf_size = EVT_BUFFER_SIZE;
456
457 return iommu->evt_buf;
458}
459
460static void __init free_event_buffer(struct amd_iommu *iommu)
461{
462 free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE));
418} 463}
419 464
420/* sets a specific bit in the device table entry. */ 465/* sets a specific bit in the device table entry. */
@@ -487,19 +532,21 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
487 */ 532 */
488static void __init init_iommu_from_pci(struct amd_iommu *iommu) 533static void __init init_iommu_from_pci(struct amd_iommu *iommu)
489{ 534{
490 int bus = PCI_BUS(iommu->devid);
491 int dev = PCI_SLOT(iommu->devid);
492 int fn = PCI_FUNC(iommu->devid);
493 int cap_ptr = iommu->cap_ptr; 535 int cap_ptr = iommu->cap_ptr;
494 u32 range; 536 u32 range, misc;
495 537
496 iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET); 538 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
539 &iommu->cap);
540 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET,
541 &range);
542 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET,
543 &misc);
497 544
498 range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
499 iommu->first_device = calc_devid(MMIO_GET_BUS(range), 545 iommu->first_device = calc_devid(MMIO_GET_BUS(range),
500 MMIO_GET_FD(range)); 546 MMIO_GET_FD(range));
501 iommu->last_device = calc_devid(MMIO_GET_BUS(range), 547 iommu->last_device = calc_devid(MMIO_GET_BUS(range),
502 MMIO_GET_LD(range)); 548 MMIO_GET_LD(range));
549 iommu->evt_msi_num = MMIO_MSI_NUM(misc);
503} 550}
504 551
505/* 552/*
@@ -604,7 +651,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
604 break; 651 break;
605 } 652 }
606 653
607 p += 0x04 << (e->type >> 6); 654 p += ivhd_entry_length(p);
608 } 655 }
609} 656}
610 657
@@ -622,6 +669,7 @@ static int __init init_iommu_devices(struct amd_iommu *iommu)
622static void __init free_iommu_one(struct amd_iommu *iommu) 669static void __init free_iommu_one(struct amd_iommu *iommu)
623{ 670{
624 free_command_buffer(iommu); 671 free_command_buffer(iommu);
672 free_event_buffer(iommu);
625 iommu_unmap_mmio_space(iommu); 673 iommu_unmap_mmio_space(iommu);
626} 674}
627 675
@@ -649,8 +697,12 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
649 /* 697 /*
650 * Copy data from ACPI table entry to the iommu struct 698 * Copy data from ACPI table entry to the iommu struct
651 */ 699 */
652 iommu->devid = h->devid; 700 iommu->dev = pci_get_bus_and_slot(PCI_BUS(h->devid), h->devid & 0xff);
701 if (!iommu->dev)
702 return 1;
703
653 iommu->cap_ptr = h->cap_ptr; 704 iommu->cap_ptr = h->cap_ptr;
705 iommu->pci_seg = h->pci_seg;
654 iommu->mmio_phys = h->mmio_phys; 706 iommu->mmio_phys = h->mmio_phys;
655 iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys); 707 iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
656 if (!iommu->mmio_base) 708 if (!iommu->mmio_base)
@@ -661,10 +713,18 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
661 if (!iommu->cmd_buf) 713 if (!iommu->cmd_buf)
662 return -ENOMEM; 714 return -ENOMEM;
663 715
716 iommu->evt_buf = alloc_event_buffer(iommu);
717 if (!iommu->evt_buf)
718 return -ENOMEM;
719
720 iommu->int_enabled = false;
721
664 init_iommu_from_pci(iommu); 722 init_iommu_from_pci(iommu);
665 init_iommu_from_acpi(iommu, h); 723 init_iommu_from_acpi(iommu, h);
666 init_iommu_devices(iommu); 724 init_iommu_devices(iommu);
667 725
726 pci_enable_device(iommu->dev);
727
668 return 0; 728 return 0;
669} 729}
670 730
@@ -706,6 +766,95 @@ static int __init init_iommu_all(struct acpi_table_header *table)
706 766
707/**************************************************************************** 767/****************************************************************************
708 * 768 *
769 * The following functions initialize the MSI interrupts for all IOMMUs
770 * in the system. Its a bit challenging because there could be multiple
771 * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per
772 * pci_dev.
773 *
774 ****************************************************************************/
775
776static int __init iommu_setup_msix(struct amd_iommu *iommu)
777{
778 struct amd_iommu *curr;
779 struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */
780 int nvec = 0, i;
781
782 list_for_each_entry(curr, &amd_iommu_list, list) {
783 if (curr->dev == iommu->dev) {
784 entries[nvec].entry = curr->evt_msi_num;
785 entries[nvec].vector = 0;
786 curr->int_enabled = true;
787 nvec++;
788 }
789 }
790
791 if (pci_enable_msix(iommu->dev, entries, nvec)) {
792 pci_disable_msix(iommu->dev);
793 return 1;
794 }
795
796 for (i = 0; i < nvec; ++i) {
797 int r = request_irq(entries->vector, amd_iommu_int_handler,
798 IRQF_SAMPLE_RANDOM,
799 "AMD IOMMU",
800 NULL);
801 if (r)
802 goto out_free;
803 }
804
805 return 0;
806
807out_free:
808 for (i -= 1; i >= 0; --i)
809 free_irq(entries->vector, NULL);
810
811 pci_disable_msix(iommu->dev);
812
813 return 1;
814}
815
816static int __init iommu_setup_msi(struct amd_iommu *iommu)
817{
818 int r;
819 struct amd_iommu *curr;
820
821 list_for_each_entry(curr, &amd_iommu_list, list) {
822 if (curr->dev == iommu->dev)
823 curr->int_enabled = true;
824 }
825
826
827 if (pci_enable_msi(iommu->dev))
828 return 1;
829
830 r = request_irq(iommu->dev->irq, amd_iommu_int_handler,
831 IRQF_SAMPLE_RANDOM,
832 "AMD IOMMU",
833 NULL);
834
835 if (r) {
836 pci_disable_msi(iommu->dev);
837 return 1;
838 }
839
840 return 0;
841}
842
843static int __init iommu_init_msi(struct amd_iommu *iommu)
844{
845 if (iommu->int_enabled)
846 return 0;
847
848 if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX))
849 return iommu_setup_msix(iommu);
850 else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
851 return iommu_setup_msi(iommu);
852
853 return 1;
854}
855
856/****************************************************************************
857 *
709 * The next functions belong to the third pass of parsing the ACPI 858 * The next functions belong to the third pass of parsing the ACPI
710 * table. In this last pass the memory mapping requirements are 859 * table. In this last pass the memory mapping requirements are
711 * gathered (like exclusion and unity mapping reanges). 860 * gathered (like exclusion and unity mapping reanges).
@@ -811,7 +960,6 @@ static void init_device_table(void)
811 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) { 960 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
812 set_dev_entry_bit(devid, DEV_ENTRY_VALID); 961 set_dev_entry_bit(devid, DEV_ENTRY_VALID);
813 set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION); 962 set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION);
814 set_dev_entry_bit(devid, DEV_ENTRY_NO_PAGE_FAULT);
815 } 963 }
816} 964}
817 965
@@ -825,6 +973,8 @@ static void __init enable_iommus(void)
825 973
826 list_for_each_entry(iommu, &amd_iommu_list, list) { 974 list_for_each_entry(iommu, &amd_iommu_list, list) {
827 iommu_set_exclusion_range(iommu); 975 iommu_set_exclusion_range(iommu);
976 iommu_init_msi(iommu);
977 iommu_enable_event_logging(iommu);
828 iommu_enable(iommu); 978 iommu_enable(iommu);
829 } 979 }
830} 980}
@@ -995,11 +1145,17 @@ int __init amd_iommu_init(void)
995 else 1145 else
996 printk("disabled\n"); 1146 printk("disabled\n");
997 1147
1148 if (amd_iommu_unmap_flush)
1149 printk(KERN_INFO "AMD IOMMU: IO/TLB flush on unmap enabled\n");
1150 else
1151 printk(KERN_INFO "AMD IOMMU: Lazy IO/TLB flushing enabled\n");
1152
998out: 1153out:
999 return ret; 1154 return ret;
1000 1155
1001free: 1156free:
1002 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1); 1157 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
1158 get_order(MAX_DOMAIN_ID/8));
1003 1159
1004 free_pages((unsigned long)amd_iommu_pd_table, 1160 free_pages((unsigned long)amd_iommu_pd_table,
1005 get_order(rlookup_table_size)); 1161 get_order(rlookup_table_size));
@@ -1057,8 +1213,10 @@ void __init amd_iommu_detect(void)
1057static int __init parse_amd_iommu_options(char *str) 1213static int __init parse_amd_iommu_options(char *str)
1058{ 1214{
1059 for (; *str; ++str) { 1215 for (; *str; ++str) {
1060 if (strcmp(str, "isolate") == 0) 1216 if (strncmp(str, "isolate", 7) == 0)
1061 amd_iommu_isolate = 1; 1217 amd_iommu_isolate = 1;
1218 if (strncmp(str, "fullflush", 11) == 0)
1219 amd_iommu_unmap_flush = true;
1062 } 1220 }
1063 1221
1064 return 1; 1222 return 1;
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 44e21826db11..9a32b37ee2ee 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -455,11 +455,11 @@ out:
455 force_iommu || 455 force_iommu ||
456 valid_agp || 456 valid_agp ||
457 fallback_aper_force) { 457 fallback_aper_force) {
458 printk(KERN_ERR 458 printk(KERN_INFO
459 "Your BIOS doesn't leave a aperture memory hole\n"); 459 "Your BIOS doesn't leave a aperture memory hole\n");
460 printk(KERN_ERR 460 printk(KERN_INFO
461 "Please enable the IOMMU option in the BIOS setup\n"); 461 "Please enable the IOMMU option in the BIOS setup\n");
462 printk(KERN_ERR 462 printk(KERN_INFO
463 "This costs you %d MB of RAM\n", 463 "This costs you %d MB of RAM\n",
464 32 << fallback_aper_order); 464 32 << fallback_aper_order);
465 465
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index 44cae65e32ef..a91c57cb666a 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -60,10 +60,8 @@ unsigned long mp_lapic_addr;
60static int force_enable_local_apic; 60static int force_enable_local_apic;
61int disable_apic; 61int disable_apic;
62 62
63/* Local APIC timer verification ok */
64static int local_apic_timer_verify_ok;
65/* Disable local APIC timer from the kernel commandline or via dmi quirk */ 63/* Disable local APIC timer from the kernel commandline or via dmi quirk */
66static int local_apic_timer_disabled; 64static int disable_apic_timer __cpuinitdata;
67/* Local APIC timer works in C2 */ 65/* Local APIC timer works in C2 */
68int local_apic_timer_c2_ok; 66int local_apic_timer_c2_ok;
69EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); 67EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
@@ -130,7 +128,11 @@ static inline int lapic_get_version(void)
130 */ 128 */
131static inline int lapic_is_integrated(void) 129static inline int lapic_is_integrated(void)
132{ 130{
131#ifdef CONFIG_X86_64
132 return 1;
133#else
133 return APIC_INTEGRATED(lapic_get_version()); 134 return APIC_INTEGRATED(lapic_get_version());
135#endif
134} 136}
135 137
136/* 138/*
@@ -205,11 +207,15 @@ EXPORT_SYMBOL_GPL(apic_ops);
205 */ 207 */
206void __cpuinit enable_NMI_through_LVT0(void) 208void __cpuinit enable_NMI_through_LVT0(void)
207{ 209{
208 unsigned int v = APIC_DM_NMI; 210 unsigned int v;
209 211
210 /* Level triggered for 82489DX */ 212 /* unmask and set to NMI */
213 v = APIC_DM_NMI;
214
215 /* Level triggered for 82489DX (32bit mode) */
211 if (!lapic_is_integrated()) 216 if (!lapic_is_integrated())
212 v |= APIC_LVT_LEVEL_TRIGGER; 217 v |= APIC_LVT_LEVEL_TRIGGER;
218
213 apic_write(APIC_LVT0, v); 219 apic_write(APIC_LVT0, v);
214} 220}
215 221
@@ -226,9 +232,13 @@ int get_physical_broadcast(void)
226 */ 232 */
227int lapic_get_maxlvt(void) 233int lapic_get_maxlvt(void)
228{ 234{
229 unsigned int v = apic_read(APIC_LVR); 235 unsigned int v;
230 236
231 /* 82489DXs do not report # of LVT entries. */ 237 v = apic_read(APIC_LVR);
238 /*
239 * - we always have APIC integrated on 64bit mode
240 * - 82489DXs do not report # of LVT entries
241 */
232 return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; 242 return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
233} 243}
234 244
@@ -236,8 +246,12 @@ int lapic_get_maxlvt(void)
236 * Local APIC timer 246 * Local APIC timer
237 */ 247 */
238 248
239/* Clock divisor is set to 16 */ 249/* Clock divisor */
250#ifdef CONFG_X86_64
251#define APIC_DIVISOR 1
252#else
240#define APIC_DIVISOR 16 253#define APIC_DIVISOR 16
254#endif
241 255
242/* 256/*
243 * This function sets up the local APIC timer, with a timeout of 257 * This function sets up the local APIC timer, with a timeout of
@@ -245,6 +259,9 @@ int lapic_get_maxlvt(void)
245 * this function twice on the boot CPU, once with a bogus timeout 259 * this function twice on the boot CPU, once with a bogus timeout
246 * value, second time for real. The other (noncalibrating) CPUs 260 * value, second time for real. The other (noncalibrating) CPUs
247 * call this function only once, with the real, calibrated value. 261 * call this function only once, with the real, calibrated value.
262 *
263 * We do reads before writes even if unnecessary, to get around the
264 * P5 APIC double write bug.
248 */ 265 */
249static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) 266static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
250{ 267{
@@ -266,14 +283,44 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
266 */ 283 */
267 tmp_value = apic_read(APIC_TDCR); 284 tmp_value = apic_read(APIC_TDCR);
268 apic_write(APIC_TDCR, 285 apic_write(APIC_TDCR,
269 (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | 286 (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
270 APIC_TDR_DIV_16); 287 APIC_TDR_DIV_16);
271 288
272 if (!oneshot) 289 if (!oneshot)
273 apic_write(APIC_TMICT, clocks / APIC_DIVISOR); 290 apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
274} 291}
275 292
276/* 293/*
294 * Setup extended LVT, AMD specific (K8, family 10h)
295 *
296 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
297 * MCE interrupts are supported. Thus MCE offset must be set to 0.
298 */
299
300#define APIC_EILVT_LVTOFF_MCE 0
301#define APIC_EILVT_LVTOFF_IBS 1
302
303static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
304{
305 unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
306 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
307
308 apic_write(reg, v);
309}
310
311u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
312{
313 setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
314 return APIC_EILVT_LVTOFF_MCE;
315}
316
317u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
318{
319 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
320 return APIC_EILVT_LVTOFF_IBS;
321}
322
323/*
277 * Program the next event, relative to now 324 * Program the next event, relative to now
278 */ 325 */
279static int lapic_next_event(unsigned long delta, 326static int lapic_next_event(unsigned long delta,
@@ -292,8 +339,8 @@ static void lapic_timer_setup(enum clock_event_mode mode,
292 unsigned long flags; 339 unsigned long flags;
293 unsigned int v; 340 unsigned int v;
294 341
295 /* Lapic used for broadcast ? */ 342 /* Lapic used as dummy for broadcast ? */
296 if (!local_apic_timer_verify_ok) 343 if (evt->features & CLOCK_EVT_FEAT_DUMMY)
297 return; 344 return;
298 345
299 local_irq_save(flags); 346 local_irq_save(flags);
@@ -506,7 +553,7 @@ static int __init calibrate_APIC_clock(void)
506 return -1; 553 return -1;
507 } 554 }
508 555
509 local_apic_timer_verify_ok = 1; 556 levt->features &= ~CLOCK_EVT_FEAT_DUMMY;
510 557
511 /* We trust the pm timer based calibration */ 558 /* We trust the pm timer based calibration */
512 if (!pm_referenced) { 559 if (!pm_referenced) {
@@ -540,11 +587,11 @@ static int __init calibrate_APIC_clock(void)
540 if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) 587 if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2)
541 apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); 588 apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
542 else 589 else
543 local_apic_timer_verify_ok = 0; 590 levt->features |= CLOCK_EVT_FEAT_DUMMY;
544 } else 591 } else
545 local_irq_enable(); 592 local_irq_enable();
546 593
547 if (!local_apic_timer_verify_ok) { 594 if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
548 printk(KERN_WARNING 595 printk(KERN_WARNING
549 "APIC timer disabled due to verification failure.\n"); 596 "APIC timer disabled due to verification failure.\n");
550 return -1; 597 return -1;
@@ -566,7 +613,8 @@ void __init setup_boot_APIC_clock(void)
566 * timer as a dummy clock event source on SMP systems, so the 613 * timer as a dummy clock event source on SMP systems, so the
567 * broadcast mechanism is used. On UP systems simply ignore it. 614 * broadcast mechanism is used. On UP systems simply ignore it.
568 */ 615 */
569 if (local_apic_timer_disabled) { 616 if (disable_apic_timer) {
617 printk(KERN_INFO "Disabling APIC timer\n");
570 /* No broadcast on UP ! */ 618 /* No broadcast on UP ! */
571 if (num_possible_cpus() > 1) { 619 if (num_possible_cpus() > 1) {
572 lapic_clockevent.mult = 1; 620 lapic_clockevent.mult = 1;
@@ -635,7 +683,11 @@ static void local_apic_timer_interrupt(void)
635 /* 683 /*
636 * the NMI deadlock-detector uses this. 684 * the NMI deadlock-detector uses this.
637 */ 685 */
686#ifdef CONFIG_X86_64
687 add_pda(apic_timer_irqs, 1);
688#else
638 per_cpu(irq_stat, cpu).apic_timer_irqs++; 689 per_cpu(irq_stat, cpu).apic_timer_irqs++;
690#endif
639 691
640 evt->event_handler(evt); 692 evt->event_handler(evt);
641} 693}
@@ -675,35 +727,6 @@ int setup_profiling_timer(unsigned int multiplier)
675} 727}
676 728
677/* 729/*
678 * Setup extended LVT, AMD specific (K8, family 10h)
679 *
680 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
681 * MCE interrupts are supported. Thus MCE offset must be set to 0.
682 */
683
684#define APIC_EILVT_LVTOFF_MCE 0
685#define APIC_EILVT_LVTOFF_IBS 1
686
687static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
688{
689 unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
690 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
691 apic_write(reg, v);
692}
693
694u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
695{
696 setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
697 return APIC_EILVT_LVTOFF_MCE;
698}
699
700u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
701{
702 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
703 return APIC_EILVT_LVTOFF_IBS;
704}
705
706/*
707 * Local APIC start and shutdown 730 * Local APIC start and shutdown
708 */ 731 */
709 732
@@ -748,7 +771,7 @@ void clear_local_APIC(void)
748 } 771 }
749 772
750 /* lets not touch this if we didn't frob it */ 773 /* lets not touch this if we didn't frob it */
751#ifdef CONFIG_X86_MCE_P4THERMAL 774#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL)
752 if (maxlvt >= 5) { 775 if (maxlvt >= 5) {
753 v = apic_read(APIC_LVTTHMR); 776 v = apic_read(APIC_LVTTHMR);
754 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); 777 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
@@ -765,10 +788,6 @@ void clear_local_APIC(void)
765 if (maxlvt >= 4) 788 if (maxlvt >= 4)
766 apic_write(APIC_LVTPC, APIC_LVT_MASKED); 789 apic_write(APIC_LVTPC, APIC_LVT_MASKED);
767 790
768#ifdef CONFIG_X86_MCE_P4THERMAL
769 if (maxlvt >= 5)
770 apic_write(APIC_LVTTHMR, APIC_LVT_MASKED);
771#endif
772 /* Integrated APIC (!82489DX) ? */ 791 /* Integrated APIC (!82489DX) ? */
773 if (lapic_is_integrated()) { 792 if (lapic_is_integrated()) {
774 if (maxlvt > 3) 793 if (maxlvt > 3)
@@ -783,7 +802,7 @@ void clear_local_APIC(void)
783 */ 802 */
784void disable_local_APIC(void) 803void disable_local_APIC(void)
785{ 804{
786 unsigned long value; 805 unsigned int value;
787 806
788 clear_local_APIC(); 807 clear_local_APIC();
789 808
@@ -795,6 +814,7 @@ void disable_local_APIC(void)
795 value &= ~APIC_SPIV_APIC_ENABLED; 814 value &= ~APIC_SPIV_APIC_ENABLED;
796 apic_write(APIC_SPIV, value); 815 apic_write(APIC_SPIV, value);
797 816
817#ifdef CONFIG_X86_32
798 /* 818 /*
799 * When LAPIC was disabled by the BIOS and enabled by the kernel, 819 * When LAPIC was disabled by the BIOS and enabled by the kernel,
800 * restore the disabled state. 820 * restore the disabled state.
@@ -806,6 +826,7 @@ void disable_local_APIC(void)
806 l &= ~MSR_IA32_APICBASE_ENABLE; 826 l &= ~MSR_IA32_APICBASE_ENABLE;
807 wrmsr(MSR_IA32_APICBASE, l, h); 827 wrmsr(MSR_IA32_APICBASE, l, h);
808 } 828 }
829#endif
809} 830}
810 831
811/* 832/*
@@ -822,11 +843,15 @@ void lapic_shutdown(void)
822 return; 843 return;
823 844
824 local_irq_save(flags); 845 local_irq_save(flags);
825 clear_local_APIC();
826 846
827 if (enabled_via_apicbase) 847#ifdef CONFIG_X86_32
848 if (!enabled_via_apicbase)
849 clear_local_APIC();
850 else
851#endif
828 disable_local_APIC(); 852 disable_local_APIC();
829 853
854
830 local_irq_restore(flags); 855 local_irq_restore(flags);
831} 856}
832 857
@@ -871,6 +896,12 @@ int __init verify_local_APIC(void)
871 */ 896 */
872 reg0 = apic_read(APIC_ID); 897 reg0 = apic_read(APIC_ID);
873 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); 898 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
899 apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
900 reg1 = apic_read(APIC_ID);
901 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
902 apic_write(APIC_ID, reg0);
903 if (reg1 != (reg0 ^ APIC_ID_MASK))
904 return 0;
874 905
875 /* 906 /*
876 * The next two are just to see if we have sane values. 907 * The next two are just to see if we have sane values.
@@ -896,14 +927,15 @@ void __init sync_Arb_IDs(void)
896 */ 927 */
897 if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD) 928 if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
898 return; 929 return;
930
899 /* 931 /*
900 * Wait for idle. 932 * Wait for idle.
901 */ 933 */
902 apic_wait_icr_idle(); 934 apic_wait_icr_idle();
903 935
904 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); 936 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
905 apic_write(APIC_ICR, 937 apic_write(APIC_ICR, APIC_DEST_ALLINC |
906 APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT); 938 APIC_INT_LEVELTRIG | APIC_DM_INIT);
907} 939}
908 940
909/* 941/*
@@ -911,7 +943,7 @@ void __init sync_Arb_IDs(void)
911 */ 943 */
912void __init init_bsp_APIC(void) 944void __init init_bsp_APIC(void)
913{ 945{
914 unsigned long value; 946 unsigned int value;
915 947
916 /* 948 /*
917 * Don't do the setup now if we have a SMP BIOS as the 949 * Don't do the setup now if we have a SMP BIOS as the
@@ -932,11 +964,13 @@ void __init init_bsp_APIC(void)
932 value &= ~APIC_VECTOR_MASK; 964 value &= ~APIC_VECTOR_MASK;
933 value |= APIC_SPIV_APIC_ENABLED; 965 value |= APIC_SPIV_APIC_ENABLED;
934 966
967#ifdef CONFIG_X86_32
935 /* This bit is reserved on P4/Xeon and should be cleared */ 968 /* This bit is reserved on P4/Xeon and should be cleared */
936 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && 969 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
937 (boot_cpu_data.x86 == 15)) 970 (boot_cpu_data.x86 == 15))
938 value &= ~APIC_SPIV_FOCUS_DISABLED; 971 value &= ~APIC_SPIV_FOCUS_DISABLED;
939 else 972 else
973#endif
940 value |= APIC_SPIV_FOCUS_DISABLED; 974 value |= APIC_SPIV_FOCUS_DISABLED;
941 value |= SPURIOUS_APIC_VECTOR; 975 value |= SPURIOUS_APIC_VECTOR;
942 apic_write(APIC_SPIV, value); 976 apic_write(APIC_SPIV, value);
@@ -955,6 +989,16 @@ static void __cpuinit lapic_setup_esr(void)
955{ 989{
956 unsigned long oldvalue, value, maxlvt; 990 unsigned long oldvalue, value, maxlvt;
957 if (lapic_is_integrated() && !esr_disable) { 991 if (lapic_is_integrated() && !esr_disable) {
992 if (esr_disable) {
993 /*
994 * Something untraceable is creating bad interrupts on
995 * secondary quads ... for the moment, just leave the
996 * ESR disabled - we can't do anything useful with the
997 * errors anyway - mbligh
998 */
999 printk(KERN_INFO "Leaving ESR disabled.\n");
1000 return;
1001 }
958 /* !82489DX */ 1002 /* !82489DX */
959 maxlvt = lapic_get_maxlvt(); 1003 maxlvt = lapic_get_maxlvt();
960 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ 1004 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
@@ -975,16 +1019,7 @@ static void __cpuinit lapic_setup_esr(void)
975 "vector: 0x%08lx after: 0x%08lx\n", 1019 "vector: 0x%08lx after: 0x%08lx\n",
976 oldvalue, value); 1020 oldvalue, value);
977 } else { 1021 } else {
978 if (esr_disable) 1022 printk(KERN_INFO "No ESR for 82489DX.\n");
979 /*
980 * Something untraceable is creating bad interrupts on
981 * secondary quads ... for the moment, just leave the
982 * ESR disabled - we can't do anything useful with the
983 * errors anyway - mbligh
984 */
985 printk(KERN_INFO "Leaving ESR disabled.\n");
986 else
987 printk(KERN_INFO "No ESR for 82489DX.\n");
988 } 1023 }
989} 1024}
990 1025
@@ -1122,13 +1157,17 @@ void __cpuinit setup_local_APIC(void)
1122 1157
1123void __cpuinit end_local_APIC_setup(void) 1158void __cpuinit end_local_APIC_setup(void)
1124{ 1159{
1125 unsigned long value;
1126
1127 lapic_setup_esr(); 1160 lapic_setup_esr();
1128 /* Disable the local apic timer */ 1161
1129 value = apic_read(APIC_LVTT); 1162#ifdef CONFIG_X86_32
1130 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 1163 {
1131 apic_write(APIC_LVTT, value); 1164 unsigned int value;
1165 /* Disable the local apic timer */
1166 value = apic_read(APIC_LVTT);
1167 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
1168 apic_write(APIC_LVTT, value);
1169 }
1170#endif
1132 1171
1133 setup_apic_nmi_watchdog(NULL); 1172 setup_apic_nmi_watchdog(NULL);
1134 apic_pm_activate(); 1173 apic_pm_activate();
@@ -1354,59 +1393,12 @@ void smp_error_interrupt(struct pt_regs *regs)
1354 irq_exit(); 1393 irq_exit();
1355} 1394}
1356 1395
1357#ifdef CONFIG_SMP
1358void __init smp_intr_init(void)
1359{
1360 /*
1361 * IRQ0 must be given a fixed assignment and initialized,
1362 * because it's used before the IO-APIC is set up.
1363 */
1364 set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
1365
1366 /*
1367 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
1368 * IPI, driven by wakeup.
1369 */
1370 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
1371
1372 /* IPI for invalidation */
1373 alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
1374
1375 /* IPI for generic function call */
1376 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
1377
1378 /* IPI for single call function */
1379 set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
1380 call_function_single_interrupt);
1381}
1382#endif
1383
1384/*
1385 * Initialize APIC interrupts
1386 */
1387void __init apic_intr_init(void)
1388{
1389#ifdef CONFIG_SMP
1390 smp_intr_init();
1391#endif
1392 /* self generated IPI for local APIC timer */
1393 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
1394
1395 /* IPI vectors for APIC spurious and error interrupts */
1396 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
1397 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
1398
1399 /* thermal monitor LVT interrupt */
1400#ifdef CONFIG_X86_MCE_P4THERMAL
1401 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
1402#endif
1403}
1404
1405/** 1396/**
1406 * connect_bsp_APIC - attach the APIC to the interrupt system 1397 * connect_bsp_APIC - attach the APIC to the interrupt system
1407 */ 1398 */
1408void __init connect_bsp_APIC(void) 1399void __init connect_bsp_APIC(void)
1409{ 1400{
1401#ifdef CONFIG_X86_32
1410 if (pic_mode) { 1402 if (pic_mode) {
1411 /* 1403 /*
1412 * Do not trust the local APIC being empty at bootup. 1404 * Do not trust the local APIC being empty at bootup.
@@ -1421,6 +1413,7 @@ void __init connect_bsp_APIC(void)
1421 outb(0x70, 0x22); 1413 outb(0x70, 0x22);
1422 outb(0x01, 0x23); 1414 outb(0x01, 0x23);
1423 } 1415 }
1416#endif
1424 enable_apic_mode(); 1417 enable_apic_mode();
1425} 1418}
1426 1419
@@ -1433,6 +1426,9 @@ void __init connect_bsp_APIC(void)
1433 */ 1426 */
1434void disconnect_bsp_APIC(int virt_wire_setup) 1427void disconnect_bsp_APIC(int virt_wire_setup)
1435{ 1428{
1429 unsigned int value;
1430
1431#ifdef CONFIG_X86_32
1436 if (pic_mode) { 1432 if (pic_mode) {
1437 /* 1433 /*
1438 * Put the board back into PIC mode (has an effect only on 1434 * Put the board back into PIC mode (has an effect only on
@@ -1444,54 +1440,53 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1444 "entering PIC mode.\n"); 1440 "entering PIC mode.\n");
1445 outb(0x70, 0x22); 1441 outb(0x70, 0x22);
1446 outb(0x00, 0x23); 1442 outb(0x00, 0x23);
1447 } else { 1443 return;
1448 /* Go back to Virtual Wire compatibility mode */ 1444 }
1449 unsigned long value; 1445#endif
1450 1446
1451 /* For the spurious interrupt use vector F, and enable it */ 1447 /* Go back to Virtual Wire compatibility mode */
1452 value = apic_read(APIC_SPIV);
1453 value &= ~APIC_VECTOR_MASK;
1454 value |= APIC_SPIV_APIC_ENABLED;
1455 value |= 0xf;
1456 apic_write(APIC_SPIV, value);
1457 1448
1458 if (!virt_wire_setup) { 1449 /* For the spurious interrupt use vector F, and enable it */
1459 /* 1450 value = apic_read(APIC_SPIV);
1460 * For LVT0 make it edge triggered, active high, 1451 value &= ~APIC_VECTOR_MASK;
1461 * external and enabled 1452 value |= APIC_SPIV_APIC_ENABLED;
1462 */ 1453 value |= 0xf;
1463 value = apic_read(APIC_LVT0); 1454 apic_write(APIC_SPIV, value);
1464 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1465 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1466 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1467 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1468 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1469 apic_write(APIC_LVT0, value);
1470 } else {
1471 /* Disable LVT0 */
1472 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1473 }
1474 1455
1456 if (!virt_wire_setup) {
1475 /* 1457 /*
1476 * For LVT1 make it edge triggered, active high, nmi and 1458 * For LVT0 make it edge triggered, active high,
1477 * enabled 1459 * external and enabled
1478 */ 1460 */
1479 value = apic_read(APIC_LVT1); 1461 value = apic_read(APIC_LVT0);
1480 value &= ~( 1462 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1481 APIC_MODE_MASK | APIC_SEND_PENDING |
1482 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | 1463 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1483 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); 1464 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1484 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; 1465 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1485 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); 1466 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1486 apic_write(APIC_LVT1, value); 1467 apic_write(APIC_LVT0, value);
1468 } else {
1469 /* Disable LVT0 */
1470 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1487 } 1471 }
1472
1473 /*
1474 * For LVT1 make it edge triggered, active high,
1475 * nmi and enabled
1476 */
1477 value = apic_read(APIC_LVT1);
1478 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1479 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1480 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1481 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1482 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
1483 apic_write(APIC_LVT1, value);
1488} 1484}
1489 1485
1490void __cpuinit generic_processor_info(int apicid, int version) 1486void __cpuinit generic_processor_info(int apicid, int version)
1491{ 1487{
1492 int cpu; 1488 int cpu;
1493 cpumask_t tmp_map; 1489 cpumask_t tmp_map;
1494 physid_mask_t phys_cpu;
1495 1490
1496 /* 1491 /*
1497 * Validate version 1492 * Validate version
@@ -1504,9 +1499,6 @@ void __cpuinit generic_processor_info(int apicid, int version)
1504 } 1499 }
1505 apic_version[apicid] = version; 1500 apic_version[apicid] = version;
1506 1501
1507 phys_cpu = apicid_to_cpu_present(apicid);
1508 physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
1509
1510 if (num_processors >= NR_CPUS) { 1502 if (num_processors >= NR_CPUS) {
1511 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." 1503 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
1512 " Processor ignored.\n", NR_CPUS); 1504 " Processor ignored.\n", NR_CPUS);
@@ -1517,17 +1509,19 @@ void __cpuinit generic_processor_info(int apicid, int version)
1517 cpus_complement(tmp_map, cpu_present_map); 1509 cpus_complement(tmp_map, cpu_present_map);
1518 cpu = first_cpu(tmp_map); 1510 cpu = first_cpu(tmp_map);
1519 1511
1520 if (apicid == boot_cpu_physical_apicid) 1512 physid_set(apicid, phys_cpu_present_map);
1513 if (apicid == boot_cpu_physical_apicid) {
1521 /* 1514 /*
1522 * x86_bios_cpu_apicid is required to have processors listed 1515 * x86_bios_cpu_apicid is required to have processors listed
1523 * in same order as logical cpu numbers. Hence the first 1516 * in same order as logical cpu numbers. Hence the first
1524 * entry is BSP, and so on. 1517 * entry is BSP, and so on.
1525 */ 1518 */
1526 cpu = 0; 1519 cpu = 0;
1527 1520 }
1528 if (apicid > max_physical_apicid) 1521 if (apicid > max_physical_apicid)
1529 max_physical_apicid = apicid; 1522 max_physical_apicid = apicid;
1530 1523
1524#ifdef CONFIG_X86_32
1531 /* 1525 /*
1532 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y 1526 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
1533 * but we need to work other dependencies like SMP_SUSPEND etc 1527 * but we need to work other dependencies like SMP_SUSPEND etc
@@ -1547,7 +1541,9 @@ void __cpuinit generic_processor_info(int apicid, int version)
1547 def_to_bigsmp = 1; 1541 def_to_bigsmp = 1;
1548 } 1542 }
1549 } 1543 }
1550#ifdef CONFIG_SMP 1544#endif
1545
1546#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
1551 /* are we being called early in kernel startup? */ 1547 /* are we being called early in kernel startup? */
1552 if (early_per_cpu_ptr(x86_cpu_to_apicid)) { 1548 if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
1553 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); 1549 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
@@ -1560,6 +1556,7 @@ void __cpuinit generic_processor_info(int apicid, int version)
1560 per_cpu(x86_bios_cpu_apicid, cpu) = apicid; 1556 per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1561 } 1557 }
1562#endif 1558#endif
1559
1563 cpu_set(cpu, cpu_possible_map); 1560 cpu_set(cpu, cpu_possible_map);
1564 cpu_set(cpu, cpu_present_map); 1561 cpu_set(cpu, cpu_present_map);
1565} 1562}
@@ -1570,6 +1567,11 @@ void __cpuinit generic_processor_info(int apicid, int version)
1570#ifdef CONFIG_PM 1567#ifdef CONFIG_PM
1571 1568
1572static struct { 1569static struct {
1570 /*
1571 * 'active' is true if the local APIC was enabled by us and
1572 * not the BIOS; this signifies that we are also responsible
1573 * for disabling it before entering apm/acpi suspend
1574 */
1573 int active; 1575 int active;
1574 /* r/w apic fields */ 1576 /* r/w apic fields */
1575 unsigned int apic_id; 1577 unsigned int apic_id;
@@ -1610,7 +1612,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1610 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); 1612 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
1611 apic_pm_state.apic_tmict = apic_read(APIC_TMICT); 1613 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
1612 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); 1614 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
1613#ifdef CONFIG_X86_MCE_P4THERMAL 1615#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1614 if (maxlvt >= 5) 1616 if (maxlvt >= 5)
1615 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); 1617 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
1616#endif 1618#endif
@@ -1634,16 +1636,23 @@ static int lapic_resume(struct sys_device *dev)
1634 1636
1635 local_irq_save(flags); 1637 local_irq_save(flags);
1636 1638
1637 /* 1639#ifdef CONFIG_X86_64
1638 * Make sure the APICBASE points to the right address 1640 if (x2apic)
1639 * 1641 enable_x2apic();
1640 * FIXME! This will be wrong if we ever support suspend on 1642 else
1641 * SMP! We'll need to do this as part of the CPU restore! 1643#endif
1642 */ 1644 {
1643 rdmsr(MSR_IA32_APICBASE, l, h); 1645 /*
1644 l &= ~MSR_IA32_APICBASE_BASE; 1646 * Make sure the APICBASE points to the right address
1645 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; 1647 *
1646 wrmsr(MSR_IA32_APICBASE, l, h); 1648 * FIXME! This will be wrong if we ever support suspend on
1649 * SMP! We'll need to do this as part of the CPU restore!
1650 */
1651 rdmsr(MSR_IA32_APICBASE, l, h);
1652 l &= ~MSR_IA32_APICBASE_BASE;
1653 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
1654 wrmsr(MSR_IA32_APICBASE, l, h);
1655 }
1647 1656
1648 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); 1657 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
1649 apic_write(APIC_ID, apic_pm_state.apic_id); 1658 apic_write(APIC_ID, apic_pm_state.apic_id);
@@ -1653,7 +1662,7 @@ static int lapic_resume(struct sys_device *dev)
1653 apic_write(APIC_SPIV, apic_pm_state.apic_spiv); 1662 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
1654 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); 1663 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
1655 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); 1664 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
1656#ifdef CONFIG_X86_MCE_P4THERMAL 1665#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1657 if (maxlvt >= 5) 1666 if (maxlvt >= 5)
1658 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); 1667 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
1659#endif 1668#endif
@@ -1667,7 +1676,9 @@ static int lapic_resume(struct sys_device *dev)
1667 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); 1676 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
1668 apic_write(APIC_ESR, 0); 1677 apic_write(APIC_ESR, 0);
1669 apic_read(APIC_ESR); 1678 apic_read(APIC_ESR);
1679
1670 local_irq_restore(flags); 1680 local_irq_restore(flags);
1681
1671 return 0; 1682 return 0;
1672} 1683}
1673 1684
@@ -1723,20 +1734,20 @@ static int __init parse_lapic(char *arg)
1723} 1734}
1724early_param("lapic", parse_lapic); 1735early_param("lapic", parse_lapic);
1725 1736
1726static int __init parse_nolapic(char *arg) 1737static int __init setup_disableapic(char *arg)
1727{ 1738{
1728 disable_apic = 1; 1739 disable_apic = 1;
1729 setup_clear_cpu_cap(X86_FEATURE_APIC); 1740 setup_clear_cpu_cap(X86_FEATURE_APIC);
1730 return 0; 1741 return 0;
1731} 1742}
1732early_param("nolapic", parse_nolapic); 1743early_param("disableapic", setup_disableapic);
1733 1744
1734static int __init parse_disable_lapic_timer(char *arg) 1745/* same as disableapic, for compatibility */
1746static int __init setup_nolapic(char *arg)
1735{ 1747{
1736 local_apic_timer_disabled = 1; 1748 return setup_disableapic(arg);
1737 return 0;
1738} 1749}
1739early_param("nolapic_timer", parse_disable_lapic_timer); 1750early_param("nolapic", setup_nolapic);
1740 1751
1741static int __init parse_lapic_timer_c2_ok(char *arg) 1752static int __init parse_lapic_timer_c2_ok(char *arg)
1742{ 1753{
@@ -1745,15 +1756,40 @@ static int __init parse_lapic_timer_c2_ok(char *arg)
1745} 1756}
1746early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); 1757early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1747 1758
1759static int __init parse_disable_apic_timer(char *arg)
1760{
1761 disable_apic_timer = 1;
1762 return 0;
1763}
1764early_param("noapictimer", parse_disable_apic_timer);
1765
1766static int __init parse_nolapic_timer(char *arg)
1767{
1768 disable_apic_timer = 1;
1769 return 0;
1770}
1771early_param("nolapic_timer", parse_nolapic_timer);
1772
1748static int __init apic_set_verbosity(char *arg) 1773static int __init apic_set_verbosity(char *arg)
1749{ 1774{
1750 if (!arg) 1775 if (!arg) {
1776#ifdef CONFIG_X86_64
1777 skip_ioapic_setup = 0;
1778 ioapic_force = 1;
1779 return 0;
1780#endif
1751 return -EINVAL; 1781 return -EINVAL;
1782 }
1752 1783
1753 if (strcmp(arg, "debug") == 0) 1784 if (strcmp("debug", arg) == 0)
1754 apic_verbosity = APIC_DEBUG; 1785 apic_verbosity = APIC_DEBUG;
1755 else if (strcmp(arg, "verbose") == 0) 1786 else if (strcmp("verbose", arg) == 0)
1756 apic_verbosity = APIC_VERBOSE; 1787 apic_verbosity = APIC_VERBOSE;
1788 else {
1789 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
1790 " use apic=verbose or apic=debug\n", arg);
1791 return -EINVAL;
1792 }
1757 1793
1758 return 0; 1794 return 0;
1759} 1795}
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index b6256587f99e..53898b65a6ae 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -45,6 +45,7 @@
45#include <mach_ipi.h> 45#include <mach_ipi.h>
46#include <mach_apic.h> 46#include <mach_apic.h>
47 47
48/* Disable local APIC timer from the kernel commandline or via dmi quirk */
48static int disable_apic_timer __cpuinitdata; 49static int disable_apic_timer __cpuinitdata;
49static int apic_calibrate_pmtmr __initdata; 50static int apic_calibrate_pmtmr __initdata;
50int disable_apic; 51int disable_apic;
@@ -80,6 +81,9 @@ static void lapic_timer_setup(enum clock_event_mode mode,
80static void lapic_timer_broadcast(cpumask_t mask); 81static void lapic_timer_broadcast(cpumask_t mask);
81static void apic_pm_activate(void); 82static void apic_pm_activate(void);
82 83
84/*
85 * The local apic timer can be used for any function which is CPU local.
86 */
83static struct clock_event_device lapic_clockevent = { 87static struct clock_event_device lapic_clockevent = {
84 .name = "lapic", 88 .name = "lapic",
85 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT 89 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
@@ -106,11 +110,15 @@ static inline int lapic_get_version(void)
106} 110}
107 111
108/* 112/*
109 * Check, if the APIC is integrated or a seperate chip 113 * Check, if the APIC is integrated or a separate chip
110 */ 114 */
111static inline int lapic_is_integrated(void) 115static inline int lapic_is_integrated(void)
112{ 116{
117#ifdef CONFIG_X86_64
113 return 1; 118 return 1;
119#else
120 return APIC_INTEGRATED(lapic_get_version());
121#endif
114} 122}
115 123
116/* 124/*
@@ -125,6 +133,11 @@ static int modern_apic(void)
125 return lapic_get_version() >= 0x14; 133 return lapic_get_version() >= 0x14;
126} 134}
127 135
136/*
137 * Paravirt kernels also might be using these below ops. So we still
138 * use generic apic_read()/apic_write(), which might be pointing to different
139 * ops in PARAVIRT case.
140 */
128void xapic_wait_icr_idle(void) 141void xapic_wait_icr_idle(void)
129{ 142{
130 while (apic_read(APIC_ICR) & APIC_ICR_BUSY) 143 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
@@ -149,7 +162,7 @@ u32 safe_xapic_wait_icr_idle(void)
149 162
150void xapic_icr_write(u32 low, u32 id) 163void xapic_icr_write(u32 low, u32 id)
151{ 164{
152 apic_write(APIC_ICR2, id << 24); 165 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
153 apic_write(APIC_ICR, low); 166 apic_write(APIC_ICR, low);
154} 167}
155 168
@@ -160,7 +173,7 @@ u64 xapic_icr_read(void)
160 icr2 = apic_read(APIC_ICR2); 173 icr2 = apic_read(APIC_ICR2);
161 icr1 = apic_read(APIC_ICR); 174 icr1 = apic_read(APIC_ICR);
162 175
163 return (icr1 | ((u64)icr2 << 32)); 176 return icr1 | ((u64)icr2 << 32);
164} 177}
165 178
166static struct apic_ops xapic_ops = { 179static struct apic_ops xapic_ops = {
@@ -173,7 +186,6 @@ static struct apic_ops xapic_ops = {
173}; 186};
174 187
175struct apic_ops __read_mostly *apic_ops = &xapic_ops; 188struct apic_ops __read_mostly *apic_ops = &xapic_ops;
176
177EXPORT_SYMBOL_GPL(apic_ops); 189EXPORT_SYMBOL_GPL(apic_ops);
178 190
179static void x2apic_wait_icr_idle(void) 191static void x2apic_wait_icr_idle(void)
@@ -219,6 +231,11 @@ void __cpuinit enable_NMI_through_LVT0(void)
219 231
220 /* unmask and set to NMI */ 232 /* unmask and set to NMI */
221 v = APIC_DM_NMI; 233 v = APIC_DM_NMI;
234
235 /* Level triggered for 82489DX (32bit mode) */
236 if (!lapic_is_integrated())
237 v |= APIC_LVT_LEVEL_TRIGGER;
238
222 apic_write(APIC_LVT0, v); 239 apic_write(APIC_LVT0, v);
223} 240}
224 241
@@ -227,14 +244,28 @@ void __cpuinit enable_NMI_through_LVT0(void)
227 */ 244 */
228int lapic_get_maxlvt(void) 245int lapic_get_maxlvt(void)
229{ 246{
230 unsigned int v, maxlvt; 247 unsigned int v;
231 248
232 v = apic_read(APIC_LVR); 249 v = apic_read(APIC_LVR);
233 maxlvt = GET_APIC_MAXLVT(v); 250 /*
234 return maxlvt; 251 * - we always have APIC integrated on 64bit mode
252 * - 82489DXs do not report # of LVT entries
253 */
254 return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
235} 255}
236 256
237/* 257/*
258 * Local APIC timer
259 */
260
261/* Clock divisor */
262#ifdef CONFG_X86_64
263#define APIC_DIVISOR 1
264#else
265#define APIC_DIVISOR 16
266#endif
267
268/*
238 * This function sets up the local APIC timer, with a timeout of 269 * This function sets up the local APIC timer, with a timeout of
239 * 'clocks' APIC bus clock. During calibration we actually call 270 * 'clocks' APIC bus clock. During calibration we actually call
240 * this function twice on the boot CPU, once with a bogus timeout 271 * this function twice on the boot CPU, once with a bogus timeout
@@ -244,7 +275,6 @@ int lapic_get_maxlvt(void)
244 * We do reads before writes even if unnecessary, to get around the 275 * We do reads before writes even if unnecessary, to get around the
245 * P5 APIC double write bug. 276 * P5 APIC double write bug.
246 */ 277 */
247
248static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) 278static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
249{ 279{
250 unsigned int lvtt_value, tmp_value; 280 unsigned int lvtt_value, tmp_value;
@@ -252,6 +282,9 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
252 lvtt_value = LOCAL_TIMER_VECTOR; 282 lvtt_value = LOCAL_TIMER_VECTOR;
253 if (!oneshot) 283 if (!oneshot)
254 lvtt_value |= APIC_LVT_TIMER_PERIODIC; 284 lvtt_value |= APIC_LVT_TIMER_PERIODIC;
285 if (!lapic_is_integrated())
286 lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
287
255 if (!irqen) 288 if (!irqen)
256 lvtt_value |= APIC_LVT_MASKED; 289 lvtt_value |= APIC_LVT_MASKED;
257 290
@@ -261,12 +294,12 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
261 * Divide PICLK by 16 294 * Divide PICLK by 16
262 */ 295 */
263 tmp_value = apic_read(APIC_TDCR); 296 tmp_value = apic_read(APIC_TDCR);
264 apic_write(APIC_TDCR, (tmp_value 297 apic_write(APIC_TDCR,
265 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) 298 (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
266 | APIC_TDR_DIV_16); 299 APIC_TDR_DIV_16);
267 300
268 if (!oneshot) 301 if (!oneshot)
269 apic_write(APIC_TMICT, clocks); 302 apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
270} 303}
271 304
272/* 305/*
@@ -436,7 +469,7 @@ static int __init calibrate_APIC_clock(void)
436 lapic_clockevent.min_delta_ns = 469 lapic_clockevent.min_delta_ns =
437 clockevent_delta2ns(0xF, &lapic_clockevent); 470 clockevent_delta2ns(0xF, &lapic_clockevent);
438 471
439 calibration_result = result / HZ; 472 calibration_result = (result * APIC_DIVISOR) / HZ;
440 473
441 /* 474 /*
442 * Do a sanity check on the APIC calibration result 475 * Do a sanity check on the APIC calibration result
@@ -458,10 +491,10 @@ static int __init calibrate_APIC_clock(void)
458void __init setup_boot_APIC_clock(void) 491void __init setup_boot_APIC_clock(void)
459{ 492{
460 /* 493 /*
461 * The local apic timer can be disabled via the kernel commandline. 494 * The local apic timer can be disabled via the kernel
462 * Register the lapic timer as a dummy clock event source on SMP 495 * commandline or from the CPU detection code. Register the lapic
463 * systems, so the broadcast mechanism is used. On UP systems simply 496 * timer as a dummy clock event source on SMP systems, so the
464 * ignore it. 497 * broadcast mechanism is used. On UP systems simply ignore it.
465 */ 498 */
466 if (disable_apic_timer) { 499 if (disable_apic_timer) {
467 printk(KERN_INFO "Disabling APIC timer\n"); 500 printk(KERN_INFO "Disabling APIC timer\n");
@@ -473,7 +506,9 @@ void __init setup_boot_APIC_clock(void)
473 return; 506 return;
474 } 507 }
475 508
476 printk(KERN_INFO "Using local APIC timer interrupts.\n"); 509 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
510 "calibrating APIC timer ...\n");
511
477 if (calibrate_APIC_clock()) { 512 if (calibrate_APIC_clock()) {
478 /* No broadcast on UP ! */ 513 /* No broadcast on UP ! */
479 if (num_possible_cpus() > 1) 514 if (num_possible_cpus() > 1)
@@ -492,6 +527,7 @@ void __init setup_boot_APIC_clock(void)
492 printk(KERN_WARNING "APIC timer registered as dummy," 527 printk(KERN_WARNING "APIC timer registered as dummy,"
493 " due to nmi_watchdog=%d!\n", nmi_watchdog); 528 " due to nmi_watchdog=%d!\n", nmi_watchdog);
494 529
530 /* Setup the lapic or request the broadcast */
495 setup_APIC_timer(); 531 setup_APIC_timer();
496} 532}
497 533
@@ -530,7 +566,11 @@ static void local_apic_timer_interrupt(void)
530 /* 566 /*
531 * the NMI deadlock-detector uses this. 567 * the NMI deadlock-detector uses this.
532 */ 568 */
569#ifdef CONFIG_X86_64
533 add_pda(apic_timer_irqs, 1); 570 add_pda(apic_timer_irqs, 1);
571#else
572 per_cpu(irq_stat, cpu).apic_timer_irqs++;
573#endif
534 574
535 evt->event_handler(evt); 575 evt->event_handler(evt);
536} 576}
@@ -561,6 +601,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs)
561 irq_enter(); 601 irq_enter();
562 local_apic_timer_interrupt(); 602 local_apic_timer_interrupt();
563 irq_exit(); 603 irq_exit();
604
564 set_irq_regs(old_regs); 605 set_irq_regs(old_regs);
565} 606}
566 607
@@ -614,6 +655,13 @@ void clear_local_APIC(void)
614 apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); 655 apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
615 } 656 }
616 657
658 /* lets not touch this if we didn't frob it */
659#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL)
660 if (maxlvt >= 5) {
661 v = apic_read(APIC_LVTTHMR);
662 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
663 }
664#endif
617 /* 665 /*
618 * Clean APIC state for other OSs: 666 * Clean APIC state for other OSs:
619 */ 667 */
@@ -624,8 +672,14 @@ void clear_local_APIC(void)
624 apic_write(APIC_LVTERR, APIC_LVT_MASKED); 672 apic_write(APIC_LVTERR, APIC_LVT_MASKED);
625 if (maxlvt >= 4) 673 if (maxlvt >= 4)
626 apic_write(APIC_LVTPC, APIC_LVT_MASKED); 674 apic_write(APIC_LVTPC, APIC_LVT_MASKED);
627 apic_write(APIC_ESR, 0); 675
628 apic_read(APIC_ESR); 676 /* Integrated APIC (!82489DX) ? */
677 if (lapic_is_integrated()) {
678 if (maxlvt > 3)
679 /* Clear ESR due to Pentium errata 3AP and 11AP */
680 apic_write(APIC_ESR, 0);
681 apic_read(APIC_ESR);
682 }
629} 683}
630 684
631/** 685/**
@@ -644,8 +698,28 @@ void disable_local_APIC(void)
644 value = apic_read(APIC_SPIV); 698 value = apic_read(APIC_SPIV);
645 value &= ~APIC_SPIV_APIC_ENABLED; 699 value &= ~APIC_SPIV_APIC_ENABLED;
646 apic_write(APIC_SPIV, value); 700 apic_write(APIC_SPIV, value);
701
702#ifdef CONFIG_X86_32
703 /*
704 * When LAPIC was disabled by the BIOS and enabled by the kernel,
705 * restore the disabled state.
706 */
707 if (enabled_via_apicbase) {
708 unsigned int l, h;
709
710 rdmsr(MSR_IA32_APICBASE, l, h);
711 l &= ~MSR_IA32_APICBASE_ENABLE;
712 wrmsr(MSR_IA32_APICBASE, l, h);
713 }
714#endif
647} 715}
648 716
717/*
718 * If Linux enabled the LAPIC against the BIOS default disable it down before
719 * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and
720 * not power-off. Additionally clear all LVT entries before disable_local_APIC
721 * for the case where Linux didn't enable the LAPIC.
722 */
649void lapic_shutdown(void) 723void lapic_shutdown(void)
650{ 724{
651 unsigned long flags; 725 unsigned long flags;
@@ -655,7 +729,13 @@ void lapic_shutdown(void)
655 729
656 local_irq_save(flags); 730 local_irq_save(flags);
657 731
658 disable_local_APIC(); 732#ifdef CONFIG_X86_32
733 if (!enabled_via_apicbase)
734 clear_local_APIC();
735 else
736#endif
737 disable_local_APIC();
738
659 739
660 local_irq_restore(flags); 740 local_irq_restore(flags);
661} 741}
@@ -726,8 +806,11 @@ int __init verify_local_APIC(void)
726 */ 806 */
727void __init sync_Arb_IDs(void) 807void __init sync_Arb_IDs(void)
728{ 808{
729 /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */ 809 /*
730 if (modern_apic()) 810 * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not
811 * needed on AMD.
812 */
813 if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
731 return; 814 return;
732 815
733 /* 816 /*
@@ -736,8 +819,8 @@ void __init sync_Arb_IDs(void)
736 apic_wait_icr_idle(); 819 apic_wait_icr_idle();
737 820
738 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); 821 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
739 apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG 822 apic_write(APIC_ICR, APIC_DEST_ALLINC |
740 | APIC_DM_INIT); 823 APIC_INT_LEVELTRIG | APIC_DM_INIT);
741} 824}
742 825
743/* 826/*
@@ -754,8 +837,6 @@ void __init init_bsp_APIC(void)
754 if (smp_found_config || !cpu_has_apic) 837 if (smp_found_config || !cpu_has_apic)
755 return; 838 return;
756 839
757 value = apic_read(APIC_LVR);
758
759 /* 840 /*
760 * Do not trust the local APIC being empty at bootup. 841 * Do not trust the local APIC being empty at bootup.
761 */ 842 */
@@ -767,7 +848,15 @@ void __init init_bsp_APIC(void)
767 value = apic_read(APIC_SPIV); 848 value = apic_read(APIC_SPIV);
768 value &= ~APIC_VECTOR_MASK; 849 value &= ~APIC_VECTOR_MASK;
769 value |= APIC_SPIV_APIC_ENABLED; 850 value |= APIC_SPIV_APIC_ENABLED;
770 value |= APIC_SPIV_FOCUS_DISABLED; 851
852#ifdef CONFIG_X86_32
853 /* This bit is reserved on P4/Xeon and should be cleared */
854 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
855 (boot_cpu_data.x86 == 15))
856 value &= ~APIC_SPIV_FOCUS_DISABLED;
857 else
858#endif
859 value |= APIC_SPIV_FOCUS_DISABLED;
771 value |= SPURIOUS_APIC_VECTOR; 860 value |= SPURIOUS_APIC_VECTOR;
772 apic_write(APIC_SPIV, value); 861 apic_write(APIC_SPIV, value);
773 862
@@ -776,9 +865,50 @@ void __init init_bsp_APIC(void)
776 */ 865 */
777 apic_write(APIC_LVT0, APIC_DM_EXTINT); 866 apic_write(APIC_LVT0, APIC_DM_EXTINT);
778 value = APIC_DM_NMI; 867 value = APIC_DM_NMI;
868 if (!lapic_is_integrated()) /* 82489DX */
869 value |= APIC_LVT_LEVEL_TRIGGER;
779 apic_write(APIC_LVT1, value); 870 apic_write(APIC_LVT1, value);
780} 871}
781 872
873static void __cpuinit lapic_setup_esr(void)
874{
875 unsigned long oldvalue, value, maxlvt;
876 if (lapic_is_integrated() && !esr_disable) {
877 if (esr_disable) {
878 /*
879 * Something untraceable is creating bad interrupts on
880 * secondary quads ... for the moment, just leave the
881 * ESR disabled - we can't do anything useful with the
882 * errors anyway - mbligh
883 */
884 printk(KERN_INFO "Leaving ESR disabled.\n");
885 return;
886 }
887 /* !82489DX */
888 maxlvt = lapic_get_maxlvt();
889 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
890 apic_write(APIC_ESR, 0);
891 oldvalue = apic_read(APIC_ESR);
892
893 /* enables sending errors */
894 value = ERROR_APIC_VECTOR;
895 apic_write(APIC_LVTERR, value);
896 /*
897 * spec says clear errors after enabling vector.
898 */
899 if (maxlvt > 3)
900 apic_write(APIC_ESR, 0);
901 value = apic_read(APIC_ESR);
902 if (value != oldvalue)
903 apic_printk(APIC_VERBOSE, "ESR value before enabling "
904 "vector: 0x%08lx after: 0x%08lx\n",
905 oldvalue, value);
906 } else {
907 printk(KERN_INFO "No ESR for 82489DX.\n");
908 }
909}
910
911
782/** 912/**
783 * setup_local_APIC - setup the local APIC 913 * setup_local_APIC - setup the local APIC
784 */ 914 */
@@ -884,21 +1014,20 @@ void __cpuinit setup_local_APIC(void)
884 preempt_enable(); 1014 preempt_enable();
885} 1015}
886 1016
887static void __cpuinit lapic_setup_esr(void)
888{
889 unsigned maxlvt = lapic_get_maxlvt();
890
891 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR);
892 /*
893 * spec says clear errors after enabling vector.
894 */
895 if (maxlvt > 3)
896 apic_write(APIC_ESR, 0);
897}
898
899void __cpuinit end_local_APIC_setup(void) 1017void __cpuinit end_local_APIC_setup(void)
900{ 1018{
901 lapic_setup_esr(); 1019 lapic_setup_esr();
1020
1021#ifdef CONFIG_X86_32
1022 {
1023 unsigned int value;
1024 /* Disable the local apic timer */
1025 value = apic_read(APIC_LVTT);
1026 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
1027 apic_write(APIC_LVTT, value);
1028 }
1029#endif
1030
902 setup_apic_nmi_watchdog(NULL); 1031 setup_apic_nmi_watchdog(NULL);
903 apic_pm_activate(); 1032 apic_pm_activate();
904} 1033}
@@ -1100,6 +1229,8 @@ void __init init_apic_mappings(void)
1100 * This initializes the IO-APIC and APIC hardware if this is 1229 * This initializes the IO-APIC and APIC hardware if this is
1101 * a UP kernel. 1230 * a UP kernel.
1102 */ 1231 */
1232int apic_version[MAX_APICS];
1233
1103int __init APIC_init_uniprocessor(void) 1234int __init APIC_init_uniprocessor(void)
1104{ 1235{
1105 if (disable_apic) { 1236 if (disable_apic) {
@@ -1201,17 +1332,57 @@ asmlinkage void smp_error_interrupt(void)
1201} 1332}
1202 1333
1203/** 1334/**
1204 * * connect_bsp_APIC - attach the APIC to the interrupt system 1335 * connect_bsp_APIC - attach the APIC to the interrupt system
1205 * */ 1336 */
1206void __init connect_bsp_APIC(void) 1337void __init connect_bsp_APIC(void)
1207{ 1338{
1339#ifdef CONFIG_X86_32
1340 if (pic_mode) {
1341 /*
1342 * Do not trust the local APIC being empty at bootup.
1343 */
1344 clear_local_APIC();
1345 /*
1346 * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's
1347 * local APIC to INT and NMI lines.
1348 */
1349 apic_printk(APIC_VERBOSE, "leaving PIC mode, "
1350 "enabling APIC mode.\n");
1351 outb(0x70, 0x22);
1352 outb(0x01, 0x23);
1353 }
1354#endif
1208 enable_apic_mode(); 1355 enable_apic_mode();
1209} 1356}
1210 1357
1358/**
1359 * disconnect_bsp_APIC - detach the APIC from the interrupt system
1360 * @virt_wire_setup: indicates, whether virtual wire mode is selected
1361 *
1362 * Virtual wire mode is necessary to deliver legacy interrupts even when the
1363 * APIC is disabled.
1364 */
1211void disconnect_bsp_APIC(int virt_wire_setup) 1365void disconnect_bsp_APIC(int virt_wire_setup)
1212{ 1366{
1367 unsigned int value;
1368
1369#ifdef CONFIG_X86_32
1370 if (pic_mode) {
1371 /*
1372 * Put the board back into PIC mode (has an effect only on
1373 * certain older boards). Note that APIC interrupts, including
1374 * IPIs, won't work beyond this point! The only exception are
1375 * INIT IPIs.
1376 */
1377 apic_printk(APIC_VERBOSE, "disabling APIC mode, "
1378 "entering PIC mode.\n");
1379 outb(0x70, 0x22);
1380 outb(0x00, 0x23);
1381 return;
1382 }
1383#endif
1384
1213 /* Go back to Virtual Wire compatibility mode */ 1385 /* Go back to Virtual Wire compatibility mode */
1214 unsigned long value;
1215 1386
1216 /* For the spurious interrupt use vector F, and enable it */ 1387 /* For the spurious interrupt use vector F, and enable it */
1217 value = apic_read(APIC_SPIV); 1388 value = apic_read(APIC_SPIV);
@@ -1237,7 +1408,10 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1237 apic_write(APIC_LVT0, APIC_LVT_MASKED); 1408 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1238 } 1409 }
1239 1410
1240 /* For LVT1 make it edge triggered, active high, nmi and enabled */ 1411 /*
1412 * For LVT1 make it edge triggered, active high,
1413 * nmi and enabled
1414 */
1241 value = apic_read(APIC_LVT1); 1415 value = apic_read(APIC_LVT1);
1242 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | 1416 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1243 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | 1417 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
@@ -1252,9 +1426,20 @@ void __cpuinit generic_processor_info(int apicid, int version)
1252 int cpu; 1426 int cpu;
1253 cpumask_t tmp_map; 1427 cpumask_t tmp_map;
1254 1428
1429 /*
1430 * Validate version
1431 */
1432 if (version == 0x0) {
1433 printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
1434 "fixing up to 0x10. (tell your hw vendor)\n",
1435 version);
1436 version = 0x10;
1437 }
1438 apic_version[apicid] = version;
1439
1255 if (num_processors >= NR_CPUS) { 1440 if (num_processors >= NR_CPUS) {
1256 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." 1441 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
1257 " Processor ignored.\n", NR_CPUS); 1442 " Processor ignored.\n", NR_CPUS);
1258 return; 1443 return;
1259 } 1444 }
1260 1445
@@ -1274,6 +1459,29 @@ void __cpuinit generic_processor_info(int apicid, int version)
1274 if (apicid > max_physical_apicid) 1459 if (apicid > max_physical_apicid)
1275 max_physical_apicid = apicid; 1460 max_physical_apicid = apicid;
1276 1461
1462#ifdef CONFIG_X86_32
1463 /*
1464 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
1465 * but we need to work other dependencies like SMP_SUSPEND etc
1466 * before this can be done without some confusion.
1467 * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
1468 * - Ashok Raj <ashok.raj@intel.com>
1469 */
1470 if (max_physical_apicid >= 8) {
1471 switch (boot_cpu_data.x86_vendor) {
1472 case X86_VENDOR_INTEL:
1473 if (!APIC_XAPIC(version)) {
1474 def_to_bigsmp = 0;
1475 break;
1476 }
1477 /* If P4 and above fall through */
1478 case X86_VENDOR_AMD:
1479 def_to_bigsmp = 1;
1480 }
1481 }
1482#endif
1483
1484#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
1277 /* are we being called early in kernel startup? */ 1485 /* are we being called early in kernel startup? */
1278 if (early_per_cpu_ptr(x86_cpu_to_apicid)) { 1486 if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
1279 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); 1487 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
@@ -1285,6 +1493,7 @@ void __cpuinit generic_processor_info(int apicid, int version)
1285 per_cpu(x86_cpu_to_apicid, cpu) = apicid; 1493 per_cpu(x86_cpu_to_apicid, cpu) = apicid;
1286 per_cpu(x86_bios_cpu_apicid, cpu) = apicid; 1494 per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1287 } 1495 }
1496#endif
1288 1497
1289 cpu_set(cpu, cpu_possible_map); 1498 cpu_set(cpu, cpu_possible_map);
1290 cpu_set(cpu, cpu_present_map); 1499 cpu_set(cpu, cpu_present_map);
@@ -1301,9 +1510,11 @@ int hard_smp_processor_id(void)
1301#ifdef CONFIG_PM 1510#ifdef CONFIG_PM
1302 1511
1303static struct { 1512static struct {
1304 /* 'active' is true if the local APIC was enabled by us and 1513 /*
1305 not the BIOS; this signifies that we are also responsible 1514 * 'active' is true if the local APIC was enabled by us and
1306 for disabling it before entering apm/acpi suspend */ 1515 * not the BIOS; this signifies that we are also responsible
1516 * for disabling it before entering apm/acpi suspend
1517 */
1307 int active; 1518 int active;
1308 /* r/w apic fields */ 1519 /* r/w apic fields */
1309 unsigned int apic_id; 1520 unsigned int apic_id;
@@ -1344,10 +1555,11 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1344 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); 1555 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
1345 apic_pm_state.apic_tmict = apic_read(APIC_TMICT); 1556 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
1346 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); 1557 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
1347#ifdef CONFIG_X86_MCE_INTEL 1558#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1348 if (maxlvt >= 5) 1559 if (maxlvt >= 5)
1349 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); 1560 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
1350#endif 1561#endif
1562
1351 local_irq_save(flags); 1563 local_irq_save(flags);
1352 disable_local_APIC(); 1564 disable_local_APIC();
1353 local_irq_restore(flags); 1565 local_irq_restore(flags);
@@ -1366,13 +1578,24 @@ static int lapic_resume(struct sys_device *dev)
1366 maxlvt = lapic_get_maxlvt(); 1578 maxlvt = lapic_get_maxlvt();
1367 1579
1368 local_irq_save(flags); 1580 local_irq_save(flags);
1369 if (!x2apic) { 1581
1582#ifdef CONFIG_X86_64
1583 if (x2apic)
1584 enable_x2apic();
1585 else
1586#endif
1587 {
1588 /*
1589 * Make sure the APICBASE points to the right address
1590 *
1591 * FIXME! This will be wrong if we ever support suspend on
1592 * SMP! We'll need to do this as part of the CPU restore!
1593 */
1370 rdmsr(MSR_IA32_APICBASE, l, h); 1594 rdmsr(MSR_IA32_APICBASE, l, h);
1371 l &= ~MSR_IA32_APICBASE_BASE; 1595 l &= ~MSR_IA32_APICBASE_BASE;
1372 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; 1596 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
1373 wrmsr(MSR_IA32_APICBASE, l, h); 1597 wrmsr(MSR_IA32_APICBASE, l, h);
1374 } else 1598 }
1375 enable_x2apic();
1376 1599
1377 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); 1600 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
1378 apic_write(APIC_ID, apic_pm_state.apic_id); 1601 apic_write(APIC_ID, apic_pm_state.apic_id);
@@ -1382,7 +1605,7 @@ static int lapic_resume(struct sys_device *dev)
1382 apic_write(APIC_SPIV, apic_pm_state.apic_spiv); 1605 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
1383 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); 1606 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
1384 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); 1607 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
1385#ifdef CONFIG_X86_MCE_INTEL 1608#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1386 if (maxlvt >= 5) 1609 if (maxlvt >= 5)
1387 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); 1610 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
1388#endif 1611#endif
@@ -1396,10 +1619,17 @@ static int lapic_resume(struct sys_device *dev)
1396 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); 1619 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
1397 apic_write(APIC_ESR, 0); 1620 apic_write(APIC_ESR, 0);
1398 apic_read(APIC_ESR); 1621 apic_read(APIC_ESR);
1622
1399 local_irq_restore(flags); 1623 local_irq_restore(flags);
1624
1400 return 0; 1625 return 0;
1401} 1626}
1402 1627
1628/*
1629 * This device has no shutdown method - fully functioning local APICs
1630 * are needed on every CPU up until machine_halt/restart/poweroff.
1631 */
1632
1403static struct sysdev_class lapic_sysclass = { 1633static struct sysdev_class lapic_sysclass = {
1404 .name = "lapic", 1634 .name = "lapic",
1405 .resume = lapic_resume, 1635 .resume = lapic_resume,
@@ -1525,28 +1755,7 @@ early_param("nox2apic", setup_nox2apic);
1525/* 1755/*
1526 * APIC command line parameters 1756 * APIC command line parameters
1527 */ 1757 */
1528static int __init apic_set_verbosity(char *str) 1758static int __init setup_disableapic(char *arg)
1529{
1530 if (str == NULL) {
1531 skip_ioapic_setup = 0;
1532 ioapic_force = 1;
1533 return 0;
1534 }
1535 if (strcmp("debug", str) == 0)
1536 apic_verbosity = APIC_DEBUG;
1537 else if (strcmp("verbose", str) == 0)
1538 apic_verbosity = APIC_VERBOSE;
1539 else {
1540 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
1541 " use apic=verbose or apic=debug\n", str);
1542 return -EINVAL;
1543 }
1544
1545 return 0;
1546}
1547early_param("apic", apic_set_verbosity);
1548
1549static __init int setup_disableapic(char *str)
1550{ 1759{
1551 disable_apic = 1; 1760 disable_apic = 1;
1552 setup_clear_cpu_cap(X86_FEATURE_APIC); 1761 setup_clear_cpu_cap(X86_FEATURE_APIC);
@@ -1555,9 +1764,9 @@ static __init int setup_disableapic(char *str)
1555early_param("disableapic", setup_disableapic); 1764early_param("disableapic", setup_disableapic);
1556 1765
1557/* same as disableapic, for compatibility */ 1766/* same as disableapic, for compatibility */
1558static __init int setup_nolapic(char *str) 1767static int __init setup_nolapic(char *arg)
1559{ 1768{
1560 return setup_disableapic(str); 1769 return setup_disableapic(arg);
1561} 1770}
1562early_param("nolapic", setup_nolapic); 1771early_param("nolapic", setup_nolapic);
1563 1772
@@ -1568,14 +1777,19 @@ static int __init parse_lapic_timer_c2_ok(char *arg)
1568} 1777}
1569early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); 1778early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1570 1779
1571static __init int setup_noapictimer(char *str) 1780static int __init parse_disable_apic_timer(char *arg)
1572{ 1781{
1573 if (str[0] != ' ' && str[0] != 0)
1574 return 0;
1575 disable_apic_timer = 1; 1782 disable_apic_timer = 1;
1576 return 1; 1783 return 0;
1577} 1784}
1578__setup("noapictimer", setup_noapictimer); 1785early_param("noapictimer", parse_disable_apic_timer);
1786
1787static int __init parse_nolapic_timer(char *arg)
1788{
1789 disable_apic_timer = 1;
1790 return 0;
1791}
1792early_param("nolapic_timer", parse_nolapic_timer);
1579 1793
1580static __init int setup_apicpmtimer(char *s) 1794static __init int setup_apicpmtimer(char *s)
1581{ 1795{
@@ -1585,6 +1799,31 @@ static __init int setup_apicpmtimer(char *s)
1585} 1799}
1586__setup("apicpmtimer", setup_apicpmtimer); 1800__setup("apicpmtimer", setup_apicpmtimer);
1587 1801
1802static int __init apic_set_verbosity(char *arg)
1803{
1804 if (!arg) {
1805#ifdef CONFIG_X86_64
1806 skip_ioapic_setup = 0;
1807 ioapic_force = 1;
1808 return 0;
1809#endif
1810 return -EINVAL;
1811 }
1812
1813 if (strcmp("debug", arg) == 0)
1814 apic_verbosity = APIC_DEBUG;
1815 else if (strcmp("verbose", arg) == 0)
1816 apic_verbosity = APIC_VERBOSE;
1817 else {
1818 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
1819 " use apic=verbose or apic=debug\n", arg);
1820 return -EINVAL;
1821 }
1822
1823 return 0;
1824}
1825early_param("apic", apic_set_verbosity);
1826
1588static int __init lapic_insert_resource(void) 1827static int __init lapic_insert_resource(void)
1589{ 1828{
1590 if (!apic_phys) 1829 if (!apic_phys)
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index b93d069aea72..5145a6e72bbb 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -233,6 +233,7 @@
233#include <asm/uaccess.h> 233#include <asm/uaccess.h>
234#include <asm/desc.h> 234#include <asm/desc.h>
235#include <asm/i8253.h> 235#include <asm/i8253.h>
236#include <asm/olpc.h>
236#include <asm/paravirt.h> 237#include <asm/paravirt.h>
237#include <asm/reboot.h> 238#include <asm/reboot.h>
238 239
@@ -2216,7 +2217,7 @@ static int __init apm_init(void)
2216 2217
2217 dmi_check_system(apm_dmi_table); 2218 dmi_check_system(apm_dmi_table);
2218 2219
2219 if (apm_info.bios.version == 0 || paravirt_enabled()) { 2220 if (apm_info.bios.version == 0 || paravirt_enabled() || machine_is_olpc()) {
2220 printk(KERN_INFO "apm: BIOS not found.\n"); 2221 printk(KERN_INFO "apm: BIOS not found.\n");
2221 return -ENODEV; 2222 return -ENODEV;
2222 } 2223 }
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 3ede19a4e0b2..7f0b45a5d788 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -3,19 +3,17 @@
3# 3#
4 4
5obj-y := intel_cacheinfo.o addon_cpuid_features.o 5obj-y := intel_cacheinfo.o addon_cpuid_features.o
6obj-y += proc.o capflags.o powerflags.o 6obj-y += proc.o capflags.o powerflags.o common.o
7 7
8obj-$(CONFIG_X86_32) += common.o bugs.o cmpxchg.o 8obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
9obj-$(CONFIG_X86_64) += common_64.o bugs_64.o 9obj-$(CONFIG_X86_64) += bugs_64.o
10 10
11obj-$(CONFIG_CPU_SUP_AMD_32) += amd.o 11obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
12obj-$(CONFIG_CPU_SUP_AMD_64) += amd_64.o 12obj-$(CONFIG_CPU_SUP_AMD) += amd.o
13obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o 13obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
14obj-$(CONFIG_CPU_SUP_CENTAUR_32) += centaur.o 14obj-$(CONFIG_CPU_SUP_CENTAUR_32) += centaur.o
15obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o 15obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o
16obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o 16obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
17obj-$(CONFIG_CPU_SUP_INTEL_32) += intel.o
18obj-$(CONFIG_CPU_SUP_INTEL_64) += intel_64.o
19obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o 17obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
20 18
21obj-$(CONFIG_X86_MCE) += mcheck/ 19obj-$(CONFIG_X86_MCE) += mcheck/
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index a6ef672adbba..0d9c993aa93e 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -7,6 +7,8 @@
7#include <asm/pat.h> 7#include <asm/pat.h>
8#include <asm/processor.h> 8#include <asm/processor.h>
9 9
10#include <mach_apic.h>
11
10struct cpuid_bit { 12struct cpuid_bit {
11 u16 feature; 13 u16 feature;
12 u8 reg; 14 u8 reg;
@@ -48,6 +50,92 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
48 } 50 }
49} 51}
50 52
53/* leaf 0xb SMT level */
54#define SMT_LEVEL 0
55
56/* leaf 0xb sub-leaf types */
57#define INVALID_TYPE 0
58#define SMT_TYPE 1
59#define CORE_TYPE 2
60
61#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff)
62#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f)
63#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff)
64
65/*
66 * Check for extended topology enumeration cpuid leaf 0xb and if it
67 * exists, use it for populating initial_apicid and cpu topology
68 * detection.
69 */
70void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
71{
72#ifdef CONFIG_SMP
73 unsigned int eax, ebx, ecx, edx, sub_index;
74 unsigned int ht_mask_width, core_plus_mask_width;
75 unsigned int core_select_mask, core_level_siblings;
76
77 if (c->cpuid_level < 0xb)
78 return;
79
80 cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
81
82 /*
83 * check if the cpuid leaf 0xb is actually implemented.
84 */
85 if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE))
86 return;
87
88 set_cpu_cap(c, X86_FEATURE_XTOPOLOGY);
89
90 /*
91 * initial apic id, which also represents 32-bit extended x2apic id.
92 */
93 c->initial_apicid = edx;
94
95 /*
96 * Populate HT related information from sub-leaf level 0.
97 */
98 core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
99 core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
100
101 sub_index = 1;
102 do {
103 cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx);
104
105 /*
106 * Check for the Core type in the implemented sub leaves.
107 */
108 if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) {
109 core_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
110 core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
111 break;
112 }
113
114 sub_index++;
115 } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE);
116
117 core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width;
118
119#ifdef CONFIG_X86_32
120 c->cpu_core_id = phys_pkg_id(c->initial_apicid, ht_mask_width)
121 & core_select_mask;
122 c->phys_proc_id = phys_pkg_id(c->initial_apicid, core_plus_mask_width);
123#else
124 c->cpu_core_id = phys_pkg_id(ht_mask_width) & core_select_mask;
125 c->phys_proc_id = phys_pkg_id(core_plus_mask_width);
126#endif
127 c->x86_max_cores = (core_level_siblings / smp_num_siblings);
128
129
130 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
131 c->phys_proc_id);
132 if (c->x86_max_cores > 1)
133 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
134 c->cpu_core_id);
135 return;
136#endif
137}
138
51#ifdef CONFIG_X86_PAT 139#ifdef CONFIG_X86_PAT
52void __cpuinit validate_pat_support(struct cpuinfo_x86 *c) 140void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
53{ 141{
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 18514ed26104..32e73520adf7 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1,13 +1,22 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/bitops.h> 2#include <linux/bitops.h>
3#include <linux/mm.h> 3#include <linux/mm.h>
4
4#include <asm/io.h> 5#include <asm/io.h>
5#include <asm/processor.h> 6#include <asm/processor.h>
6#include <asm/apic.h> 7#include <asm/apic.h>
7 8
9#ifdef CONFIG_X86_64
10# include <asm/numa_64.h>
11# include <asm/mmconfig.h>
12# include <asm/cacheflush.h>
13#endif
14
8#include <mach_apic.h> 15#include <mach_apic.h>
16
9#include "cpu.h" 17#include "cpu.h"
10 18
19#ifdef CONFIG_X86_32
11/* 20/*
12 * B step AMD K6 before B 9730xxxx have hardware bugs that can cause 21 * B step AMD K6 before B 9730xxxx have hardware bugs that can cause
13 * misexecution of code under Linux. Owners of such processors should 22 * misexecution of code under Linux. Owners of such processors should
@@ -24,26 +33,273 @@
24extern void vide(void); 33extern void vide(void);
25__asm__(".align 4\nvide: ret"); 34__asm__(".align 4\nvide: ret");
26 35
27static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) 36static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c)
28{ 37{
29 if (cpuid_eax(0x80000000) >= 0x80000007) { 38/*
30 c->x86_power = cpuid_edx(0x80000007); 39 * General Systems BIOSen alias the cpu frequency registers
31 if (c->x86_power & (1<<8)) 40 * of the Elan at 0x000df000. Unfortuantly, one of the Linux
32 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 41 * drivers subsequently pokes it, and changes the CPU speed.
42 * Workaround : Remove the unneeded alias.
43 */
44#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */
45#define CBAR_ENB (0x80000000)
46#define CBAR_KEY (0X000000CB)
47 if (c->x86_model == 9 || c->x86_model == 10) {
48 if (inl (CBAR) & CBAR_ENB)
49 outl (0 | CBAR_KEY, CBAR);
33 } 50 }
34
35 /* Set MTRR capability flag if appropriate */
36 if (c->x86_model == 13 || c->x86_model == 9 ||
37 (c->x86_model == 8 && c->x86_mask >= 8))
38 set_cpu_cap(c, X86_FEATURE_K6_MTRR);
39} 51}
40 52
41static void __cpuinit init_amd(struct cpuinfo_x86 *c) 53
54static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
42{ 55{
43 u32 l, h; 56 u32 l, h;
44 int mbytes = num_physpages >> (20-PAGE_SHIFT); 57 int mbytes = num_physpages >> (20-PAGE_SHIFT);
45 int r;
46 58
59 if (c->x86_model < 6) {
60 /* Based on AMD doc 20734R - June 2000 */
61 if (c->x86_model == 0) {
62 clear_cpu_cap(c, X86_FEATURE_APIC);
63 set_cpu_cap(c, X86_FEATURE_PGE);
64 }
65 return;
66 }
67
68 if (c->x86_model == 6 && c->x86_mask == 1) {
69 const int K6_BUG_LOOP = 1000000;
70 int n;
71 void (*f_vide)(void);
72 unsigned long d, d2;
73
74 printk(KERN_INFO "AMD K6 stepping B detected - ");
75
76 /*
77 * It looks like AMD fixed the 2.6.2 bug and improved indirect
78 * calls at the same time.
79 */
80
81 n = K6_BUG_LOOP;
82 f_vide = vide;
83 rdtscl(d);
84 while (n--)
85 f_vide();
86 rdtscl(d2);
87 d = d2-d;
88
89 if (d > 20*K6_BUG_LOOP)
90 printk("system stability may be impaired when more than 32 MB are used.\n");
91 else
92 printk("probably OK (after B9730xxxx).\n");
93 printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
94 }
95
96 /* K6 with old style WHCR */
97 if (c->x86_model < 8 ||
98 (c->x86_model == 8 && c->x86_mask < 8)) {
99 /* We can only write allocate on the low 508Mb */
100 if (mbytes > 508)
101 mbytes = 508;
102
103 rdmsr(MSR_K6_WHCR, l, h);
104 if ((l&0x0000FFFF) == 0) {
105 unsigned long flags;
106 l = (1<<0)|((mbytes/4)<<1);
107 local_irq_save(flags);
108 wbinvd();
109 wrmsr(MSR_K6_WHCR, l, h);
110 local_irq_restore(flags);
111 printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
112 mbytes);
113 }
114 return;
115 }
116
117 if ((c->x86_model == 8 && c->x86_mask > 7) ||
118 c->x86_model == 9 || c->x86_model == 13) {
119 /* The more serious chips .. */
120
121 if (mbytes > 4092)
122 mbytes = 4092;
123
124 rdmsr(MSR_K6_WHCR, l, h);
125 if ((l&0xFFFF0000) == 0) {
126 unsigned long flags;
127 l = ((mbytes>>2)<<22)|(1<<16);
128 local_irq_save(flags);
129 wbinvd();
130 wrmsr(MSR_K6_WHCR, l, h);
131 local_irq_restore(flags);
132 printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
133 mbytes);
134 }
135
136 return;
137 }
138
139 if (c->x86_model == 10) {
140 /* AMD Geode LX is model 10 */
141 /* placeholder for any needed mods */
142 return;
143 }
144}
145
146static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
147{
148 u32 l, h;
149
150 /*
151 * Bit 15 of Athlon specific MSR 15, needs to be 0
152 * to enable SSE on Palomino/Morgan/Barton CPU's.
153 * If the BIOS didn't enable it already, enable it here.
154 */
155 if (c->x86_model >= 6 && c->x86_model <= 10) {
156 if (!cpu_has(c, X86_FEATURE_XMM)) {
157 printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
158 rdmsr(MSR_K7_HWCR, l, h);
159 l &= ~0x00008000;
160 wrmsr(MSR_K7_HWCR, l, h);
161 set_cpu_cap(c, X86_FEATURE_XMM);
162 }
163 }
164
165 /*
166 * It's been determined by AMD that Athlons since model 8 stepping 1
167 * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
168 * As per AMD technical note 27212 0.2
169 */
170 if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
171 rdmsr(MSR_K7_CLK_CTL, l, h);
172 if ((l & 0xfff00000) != 0x20000000) {
173 printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l,
174 ((l & 0x000fffff)|0x20000000));
175 wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
176 }
177 }
178
179 set_cpu_cap(c, X86_FEATURE_K7);
180}
181#endif
182
183#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
184static int __cpuinit nearby_node(int apicid)
185{
186 int i, node;
187
188 for (i = apicid - 1; i >= 0; i--) {
189 node = apicid_to_node[i];
190 if (node != NUMA_NO_NODE && node_online(node))
191 return node;
192 }
193 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
194 node = apicid_to_node[i];
195 if (node != NUMA_NO_NODE && node_online(node))
196 return node;
197 }
198 return first_node(node_online_map); /* Shouldn't happen */
199}
200#endif
201
202/*
203 * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
204 * Assumes number of cores is a power of two.
205 */
206static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
207{
208#ifdef CONFIG_X86_HT
209 unsigned bits;
210
211 bits = c->x86_coreid_bits;
212
213 /* Low order bits define the core id (index of core in socket) */
214 c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
215 /* Convert the initial APIC ID into the socket ID */
216 c->phys_proc_id = c->initial_apicid >> bits;
217#endif
218}
219
220static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
221{
222#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
223 int cpu = smp_processor_id();
224 int node;
225 unsigned apicid = hard_smp_processor_id();
226
227 node = c->phys_proc_id;
228 if (apicid_to_node[apicid] != NUMA_NO_NODE)
229 node = apicid_to_node[apicid];
230 if (!node_online(node)) {
231 /* Two possibilities here:
232 - The CPU is missing memory and no node was created.
233 In that case try picking one from a nearby CPU
234 - The APIC IDs differ from the HyperTransport node IDs
235 which the K8 northbridge parsing fills in.
236 Assume they are all increased by a constant offset,
237 but in the same order as the HT nodeids.
238 If that doesn't result in a usable node fall back to the
239 path for the previous case. */
240
241 int ht_nodeid = c->initial_apicid;
242
243 if (ht_nodeid >= 0 &&
244 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
245 node = apicid_to_node[ht_nodeid];
246 /* Pick a nearby node */
247 if (!node_online(node))
248 node = nearby_node(apicid);
249 }
250 numa_set_node(cpu, node);
251
252 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
253#endif
254}
255
256static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
257{
258#ifdef CONFIG_X86_HT
259 unsigned bits, ecx;
260
261 /* Multi core CPU? */
262 if (c->extended_cpuid_level < 0x80000008)
263 return;
264
265 ecx = cpuid_ecx(0x80000008);
266
267 c->x86_max_cores = (ecx & 0xff) + 1;
268
269 /* CPU telling us the core id bits shift? */
270 bits = (ecx >> 12) & 0xF;
271
272 /* Otherwise recompute */
273 if (bits == 0) {
274 while ((1 << bits) < c->x86_max_cores)
275 bits++;
276 }
277
278 c->x86_coreid_bits = bits;
279#endif
280}
281
282static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
283{
284 early_init_amd_mc(c);
285
286 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
287 if (c->x86_power & (1<<8))
288 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
289
290#ifdef CONFIG_X86_64
291 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
292#else
293 /* Set MTRR capability flag if appropriate */
294 if (c->x86 == 5)
295 if (c->x86_model == 13 || c->x86_model == 9 ||
296 (c->x86_model == 8 && c->x86_mask >= 8))
297 set_cpu_cap(c, X86_FEATURE_K6_MTRR);
298#endif
299}
300
301static void __cpuinit init_amd(struct cpuinfo_x86 *c)
302{
47#ifdef CONFIG_SMP 303#ifdef CONFIG_SMP
48 unsigned long long value; 304 unsigned long long value;
49 305
@@ -54,7 +310,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
54 * Errata 63 for SH-B3 steppings 310 * Errata 63 for SH-B3 steppings
55 * Errata 122 for all steppings (F+ have it disabled by default) 311 * Errata 122 for all steppings (F+ have it disabled by default)
56 */ 312 */
57 if (c->x86 == 15) { 313 if (c->x86 == 0xf) {
58 rdmsrl(MSR_K7_HWCR, value); 314 rdmsrl(MSR_K7_HWCR, value);
59 value |= 1 << 6; 315 value |= 1 << 6;
60 wrmsrl(MSR_K7_HWCR, value); 316 wrmsrl(MSR_K7_HWCR, value);
@@ -64,209 +320,119 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
64 early_init_amd(c); 320 early_init_amd(c);
65 321
66 /* 322 /*
67 * FIXME: We should handle the K5 here. Set up the write
68 * range and also turn on MSR 83 bits 4 and 31 (write alloc,
69 * no bus pipeline)
70 */
71
72 /*
73 * Bit 31 in normal CPUID used for nonstandard 3DNow ID; 323 * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
74 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway 324 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
75 */ 325 */
76 clear_cpu_cap(c, 0*32+31); 326 clear_cpu_cap(c, 0*32+31);
77 327
78 r = get_model_name(c); 328#ifdef CONFIG_X86_64
329 /* On C+ stepping K8 rep microcode works well for copy/memset */
330 if (c->x86 == 0xf) {
331 u32 level;
79 332
80 switch (c->x86) { 333 level = cpuid_eax(1);
81 case 4: 334 if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
82 /* 335 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
83 * General Systems BIOSen alias the cpu frequency registers
84 * of the Elan at 0x000df000. Unfortuantly, one of the Linux
85 * drivers subsequently pokes it, and changes the CPU speed.
86 * Workaround : Remove the unneeded alias.
87 */
88#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */
89#define CBAR_ENB (0x80000000)
90#define CBAR_KEY (0X000000CB)
91 if (c->x86_model == 9 || c->x86_model == 10) {
92 if (inl (CBAR) & CBAR_ENB)
93 outl (0 | CBAR_KEY, CBAR);
94 }
95 break;
96 case 5:
97 if (c->x86_model < 6) {
98 /* Based on AMD doc 20734R - June 2000 */
99 if (c->x86_model == 0) {
100 clear_cpu_cap(c, X86_FEATURE_APIC);
101 set_cpu_cap(c, X86_FEATURE_PGE);
102 }
103 break;
104 }
105
106 if (c->x86_model == 6 && c->x86_mask == 1) {
107 const int K6_BUG_LOOP = 1000000;
108 int n;
109 void (*f_vide)(void);
110 unsigned long d, d2;
111
112 printk(KERN_INFO "AMD K6 stepping B detected - ");
113
114 /*
115 * It looks like AMD fixed the 2.6.2 bug and improved indirect
116 * calls at the same time.
117 */
118
119 n = K6_BUG_LOOP;
120 f_vide = vide;
121 rdtscl(d);
122 while (n--)
123 f_vide();
124 rdtscl(d2);
125 d = d2-d;
126
127 if (d > 20*K6_BUG_LOOP)
128 printk("system stability may be impaired when more than 32 MB are used.\n");
129 else
130 printk("probably OK (after B9730xxxx).\n");
131 printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
132 }
133
134 /* K6 with old style WHCR */
135 if (c->x86_model < 8 ||
136 (c->x86_model == 8 && c->x86_mask < 8)) {
137 /* We can only write allocate on the low 508Mb */
138 if (mbytes > 508)
139 mbytes = 508;
140
141 rdmsr(MSR_K6_WHCR, l, h);
142 if ((l&0x0000FFFF) == 0) {
143 unsigned long flags;
144 l = (1<<0)|((mbytes/4)<<1);
145 local_irq_save(flags);
146 wbinvd();
147 wrmsr(MSR_K6_WHCR, l, h);
148 local_irq_restore(flags);
149 printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
150 mbytes);
151 }
152 break;
153 }
154
155 if ((c->x86_model == 8 && c->x86_mask > 7) ||
156 c->x86_model == 9 || c->x86_model == 13) {
157 /* The more serious chips .. */
158
159 if (mbytes > 4092)
160 mbytes = 4092;
161
162 rdmsr(MSR_K6_WHCR, l, h);
163 if ((l&0xFFFF0000) == 0) {
164 unsigned long flags;
165 l = ((mbytes>>2)<<22)|(1<<16);
166 local_irq_save(flags);
167 wbinvd();
168 wrmsr(MSR_K6_WHCR, l, h);
169 local_irq_restore(flags);
170 printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
171 mbytes);
172 }
173
174 break;
175 }
176
177 if (c->x86_model == 10) {
178 /* AMD Geode LX is model 10 */
179 /* placeholder for any needed mods */
180 break;
181 }
182 break;
183 case 6: /* An Athlon/Duron */
184
185 /*
186 * Bit 15 of Athlon specific MSR 15, needs to be 0
187 * to enable SSE on Palomino/Morgan/Barton CPU's.
188 * If the BIOS didn't enable it already, enable it here.
189 */
190 if (c->x86_model >= 6 && c->x86_model <= 10) {
191 if (!cpu_has(c, X86_FEATURE_XMM)) {
192 printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
193 rdmsr(MSR_K7_HWCR, l, h);
194 l &= ~0x00008000;
195 wrmsr(MSR_K7_HWCR, l, h);
196 set_cpu_cap(c, X86_FEATURE_XMM);
197 }
198 }
199
200 /*
201 * It's been determined by AMD that Athlons since model 8 stepping 1
202 * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
203 * As per AMD technical note 27212 0.2
204 */
205 if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
206 rdmsr(MSR_K7_CLK_CTL, l, h);
207 if ((l & 0xfff00000) != 0x20000000) {
208 printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l,
209 ((l & 0x000fffff)|0x20000000));
210 wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
211 }
212 }
213 break;
214 } 336 }
337 if (c->x86 == 0x10 || c->x86 == 0x11)
338 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
339#else
340
341 /*
342 * FIXME: We should handle the K5 here. Set up the write
343 * range and also turn on MSR 83 bits 4 and 31 (write alloc,
344 * no bus pipeline)
345 */
215 346
216 switch (c->x86) { 347 switch (c->x86) {
217 case 15: 348 case 4:
218 /* Use K8 tuning for Fam10h and Fam11h */ 349 init_amd_k5(c);
219 case 0x10:
220 case 0x11:
221 set_cpu_cap(c, X86_FEATURE_K8);
222 break; 350 break;
223 case 6: 351 case 5:
224 set_cpu_cap(c, X86_FEATURE_K7); 352 init_amd_k6(c);
353 break;
354 case 6: /* An Athlon/Duron */
355 init_amd_k7(c);
225 break; 356 break;
226 } 357 }
358
359 /* K6s reports MCEs but don't actually have all the MSRs */
360 if (c->x86 < 6)
361 clear_cpu_cap(c, X86_FEATURE_MCE);
362#endif
363
364 /* Enable workaround for FXSAVE leak */
227 if (c->x86 >= 6) 365 if (c->x86 >= 6)
228 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); 366 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
229 367
230 display_cacheinfo(c); 368 if (!c->x86_model_id[0]) {
231 369 switch (c->x86) {
232 if (cpuid_eax(0x80000000) >= 0x80000008) 370 case 0xf:
233 c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; 371 /* Should distinguish Models here, but this is only
372 a fallback anyways. */
373 strcpy(c->x86_model_id, "Hammer");
374 break;
375 }
376 }
234 377
235#ifdef CONFIG_X86_HT 378 display_cacheinfo(c);
236 /*
237 * On a AMD multi core setup the lower bits of the APIC id
238 * distinguish the cores.
239 */
240 if (c->x86_max_cores > 1) {
241 int cpu = smp_processor_id();
242 unsigned bits = (cpuid_ecx(0x80000008) >> 12) & 0xf;
243 379
244 if (bits == 0) { 380 /* Multi core CPU? */
245 while ((1 << bits) < c->x86_max_cores) 381 if (c->extended_cpuid_level >= 0x80000008) {
246 bits++; 382 amd_detect_cmp(c);
247 } 383 srat_detect_node(c);
248 c->cpu_core_id = c->phys_proc_id & ((1<<bits)-1);
249 c->phys_proc_id >>= bits;
250 printk(KERN_INFO "CPU %d(%d) -> Core %d\n",
251 cpu, c->x86_max_cores, c->cpu_core_id);
252 } 384 }
385
386#ifdef CONFIG_X86_32
387 detect_ht(c);
253#endif 388#endif
254 389
255 if (cpuid_eax(0x80000000) >= 0x80000006) { 390 if (c->extended_cpuid_level >= 0x80000006) {
256 if ((c->x86 == 0x10) && (cpuid_edx(0x80000006) & 0xf000)) 391 if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000))
257 num_cache_leaves = 4; 392 num_cache_leaves = 4;
258 else 393 else
259 num_cache_leaves = 3; 394 num_cache_leaves = 3;
260 } 395 }
261 396
262 /* K6s reports MCEs but don't actually have all the MSRs */ 397 if (c->x86 >= 0xf && c->x86 <= 0x11)
263 if (c->x86 < 6) 398 set_cpu_cap(c, X86_FEATURE_K8);
264 clear_cpu_cap(c, X86_FEATURE_MCE);
265 399
266 if (cpu_has_xmm2) 400 if (cpu_has_xmm2) {
401 /* MFENCE stops RDTSC speculation */
267 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); 402 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
403 }
404
405#ifdef CONFIG_X86_64
406 if (c->x86 == 0x10) {
407 /* do this for boot cpu */
408 if (c == &boot_cpu_data)
409 check_enable_amd_mmconf_dmi();
410
411 fam10h_check_enable_mmcfg();
412 }
413
414 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
415 unsigned long long tseg;
416
417 /*
418 * Split up direct mapping around the TSEG SMM area.
419 * Don't do it for gbpages because there seems very little
420 * benefit in doing so.
421 */
422 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
423 printk(KERN_DEBUG "tseg: %010llx\n", tseg);
424 if ((tseg>>PMD_SHIFT) <
425 (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
426 ((tseg>>PMD_SHIFT) <
427 (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
428 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
429 set_memory_4k((unsigned long)__va(tseg), 1);
430 }
431 }
432#endif
268} 433}
269 434
435#ifdef CONFIG_X86_32
270static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) 436static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
271{ 437{
272 /* AMD errata T13 (order #21922) */ 438 /* AMD errata T13 (order #21922) */
@@ -279,10 +445,12 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int
279 } 445 }
280 return size; 446 return size;
281} 447}
448#endif
282 449
283static struct cpu_dev amd_cpu_dev __cpuinitdata = { 450static struct cpu_dev amd_cpu_dev __cpuinitdata = {
284 .c_vendor = "AMD", 451 .c_vendor = "AMD",
285 .c_ident = { "AuthenticAMD" }, 452 .c_ident = { "AuthenticAMD" },
453#ifdef CONFIG_X86_32
286 .c_models = { 454 .c_models = {
287 { .vendor = X86_VENDOR_AMD, .family = 4, .model_names = 455 { .vendor = X86_VENDOR_AMD, .family = 4, .model_names =
288 { 456 {
@@ -295,9 +463,11 @@ static struct cpu_dev amd_cpu_dev __cpuinitdata = {
295 } 463 }
296 }, 464 },
297 }, 465 },
466 .c_size_cache = amd_size_cache,
467#endif
298 .c_early_init = early_init_amd, 468 .c_early_init = early_init_amd,
299 .c_init = init_amd, 469 .c_init = init_amd,
300 .c_size_cache = amd_size_cache, 470 .c_x86_vendor = X86_VENDOR_AMD,
301}; 471};
302 472
303cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev); 473cpu_dev_register(amd_cpu_dev);
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
deleted file mode 100644
index d1692b2a41ff..000000000000
--- a/arch/x86/kernel/cpu/amd_64.c
+++ /dev/null
@@ -1,224 +0,0 @@
1#include <linux/init.h>
2#include <linux/mm.h>
3
4#include <asm/numa_64.h>
5#include <asm/mmconfig.h>
6#include <asm/cacheflush.h>
7
8#include <mach_apic.h>
9
10#include "cpu.h"
11
12int force_mwait __cpuinitdata;
13
14#ifdef CONFIG_NUMA
15static int __cpuinit nearby_node(int apicid)
16{
17 int i, node;
18
19 for (i = apicid - 1; i >= 0; i--) {
20 node = apicid_to_node[i];
21 if (node != NUMA_NO_NODE && node_online(node))
22 return node;
23 }
24 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
25 node = apicid_to_node[i];
26 if (node != NUMA_NO_NODE && node_online(node))
27 return node;
28 }
29 return first_node(node_online_map); /* Shouldn't happen */
30}
31#endif
32
33/*
34 * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
35 * Assumes number of cores is a power of two.
36 */
37static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
38{
39#ifdef CONFIG_SMP
40 unsigned bits;
41#ifdef CONFIG_NUMA
42 int cpu = smp_processor_id();
43 int node = 0;
44 unsigned apicid = hard_smp_processor_id();
45#endif
46 bits = c->x86_coreid_bits;
47
48 /* Low order bits define the core id (index of core in socket) */
49 c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
50 /* Convert the initial APIC ID into the socket ID */
51 c->phys_proc_id = c->initial_apicid >> bits;
52
53#ifdef CONFIG_NUMA
54 node = c->phys_proc_id;
55 if (apicid_to_node[apicid] != NUMA_NO_NODE)
56 node = apicid_to_node[apicid];
57 if (!node_online(node)) {
58 /* Two possibilities here:
59 - The CPU is missing memory and no node was created.
60 In that case try picking one from a nearby CPU
61 - The APIC IDs differ from the HyperTransport node IDs
62 which the K8 northbridge parsing fills in.
63 Assume they are all increased by a constant offset,
64 but in the same order as the HT nodeids.
65 If that doesn't result in a usable node fall back to the
66 path for the previous case. */
67
68 int ht_nodeid = c->initial_apicid;
69
70 if (ht_nodeid >= 0 &&
71 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
72 node = apicid_to_node[ht_nodeid];
73 /* Pick a nearby node */
74 if (!node_online(node))
75 node = nearby_node(apicid);
76 }
77 numa_set_node(cpu, node);
78
79 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
80#endif
81#endif
82}
83
84static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
85{
86#ifdef CONFIG_SMP
87 unsigned bits, ecx;
88
89 /* Multi core CPU? */
90 if (c->extended_cpuid_level < 0x80000008)
91 return;
92
93 ecx = cpuid_ecx(0x80000008);
94
95 c->x86_max_cores = (ecx & 0xff) + 1;
96
97 /* CPU telling us the core id bits shift? */
98 bits = (ecx >> 12) & 0xF;
99
100 /* Otherwise recompute */
101 if (bits == 0) {
102 while ((1 << bits) < c->x86_max_cores)
103 bits++;
104 }
105
106 c->x86_coreid_bits = bits;
107
108#endif
109}
110
111static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
112{
113 early_init_amd_mc(c);
114
115 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
116 if (c->x86_power & (1<<8))
117 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
118
119 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
120}
121
122static void __cpuinit init_amd(struct cpuinfo_x86 *c)
123{
124 unsigned level;
125
126#ifdef CONFIG_SMP
127 unsigned long value;
128
129 /*
130 * Disable TLB flush filter by setting HWCR.FFDIS on K8
131 * bit 6 of msr C001_0015
132 *
133 * Errata 63 for SH-B3 steppings
134 * Errata 122 for all steppings (F+ have it disabled by default)
135 */
136 if (c->x86 == 0xf) {
137 rdmsrl(MSR_K8_HWCR, value);
138 value |= 1 << 6;
139 wrmsrl(MSR_K8_HWCR, value);
140 }
141#endif
142
143 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
144 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
145 clear_cpu_cap(c, 0*32+31);
146
147 /* On C+ stepping K8 rep microcode works well for copy/memset */
148 if (c->x86 == 0xf) {
149 level = cpuid_eax(1);
150 if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
151 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
152 }
153 if (c->x86 == 0x10 || c->x86 == 0x11)
154 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
155
156 /* Enable workaround for FXSAVE leak */
157 if (c->x86 >= 6)
158 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
159
160 level = get_model_name(c);
161 if (!level) {
162 switch (c->x86) {
163 case 0xf:
164 /* Should distinguish Models here, but this is only
165 a fallback anyways. */
166 strcpy(c->x86_model_id, "Hammer");
167 break;
168 }
169 }
170 display_cacheinfo(c);
171
172 /* Multi core CPU? */
173 if (c->extended_cpuid_level >= 0x80000008)
174 amd_detect_cmp(c);
175
176 if (c->extended_cpuid_level >= 0x80000006 &&
177 (cpuid_edx(0x80000006) & 0xf000))
178 num_cache_leaves = 4;
179 else
180 num_cache_leaves = 3;
181
182 if (c->x86 >= 0xf && c->x86 <= 0x11)
183 set_cpu_cap(c, X86_FEATURE_K8);
184
185 /* MFENCE stops RDTSC speculation */
186 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
187
188 if (c->x86 == 0x10) {
189 /* do this for boot cpu */
190 if (c == &boot_cpu_data)
191 check_enable_amd_mmconf_dmi();
192
193 fam10h_check_enable_mmcfg();
194 }
195
196 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
197 unsigned long long tseg;
198
199 /*
200 * Split up direct mapping around the TSEG SMM area.
201 * Don't do it for gbpages because there seems very little
202 * benefit in doing so.
203 */
204 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
205 printk(KERN_DEBUG "tseg: %010llx\n", tseg);
206 if ((tseg>>PMD_SHIFT) <
207 (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
208 ((tseg>>PMD_SHIFT) <
209 (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
210 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
211 set_memory_4k((unsigned long)__va(tseg), 1);
212 }
213 }
214}
215
216static struct cpu_dev amd_cpu_dev __cpuinitdata = {
217 .c_vendor = "AMD",
218 .c_ident = { "AuthenticAMD" },
219 .c_early_init = early_init_amd,
220 .c_init = init_amd,
221};
222
223cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev);
224
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index a0534c04d38a..89bfdd9cacc6 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -289,7 +289,6 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
289 if (c->x86_model >= 6 && c->x86_model < 9) 289 if (c->x86_model >= 6 && c->x86_model < 9)
290 set_cpu_cap(c, X86_FEATURE_3DNOW); 290 set_cpu_cap(c, X86_FEATURE_3DNOW);
291 291
292 get_model_name(c);
293 display_cacheinfo(c); 292 display_cacheinfo(c);
294} 293}
295 294
@@ -475,6 +474,7 @@ static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
475 .c_early_init = early_init_centaur, 474 .c_early_init = early_init_centaur,
476 .c_init = init_centaur, 475 .c_init = init_centaur,
477 .c_size_cache = centaur_size_cache, 476 .c_size_cache = centaur_size_cache,
477 .c_x86_vendor = X86_VENDOR_CENTAUR,
478}; 478};
479 479
480cpu_vendor_dev_register(X86_VENDOR_CENTAUR, &centaur_cpu_dev); 480cpu_dev_register(centaur_cpu_dev);
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
index 1d181c40e2e1..a1625f5a1e78 100644
--- a/arch/x86/kernel/cpu/centaur_64.c
+++ b/arch/x86/kernel/cpu/centaur_64.c
@@ -16,9 +16,10 @@ static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
16 16
17static void __cpuinit init_centaur(struct cpuinfo_x86 *c) 17static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
18{ 18{
19 early_init_centaur(c);
20
19 if (c->x86 == 0x6 && c->x86_model >= 0xf) { 21 if (c->x86 == 0x6 && c->x86_model >= 0xf) {
20 c->x86_cache_alignment = c->x86_clflush_size * 2; 22 c->x86_cache_alignment = c->x86_clflush_size * 2;
21 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
22 set_cpu_cap(c, X86_FEATURE_REP_GOOD); 23 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
23 } 24 }
24 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); 25 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
@@ -29,7 +30,8 @@ static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
29 .c_ident = { "CentaurHauls" }, 30 .c_ident = { "CentaurHauls" },
30 .c_early_init = early_init_centaur, 31 .c_early_init = early_init_centaur,
31 .c_init = init_centaur, 32 .c_init = init_centaur,
33 .c_x86_vendor = X86_VENDOR_CENTAUR,
32}; 34};
33 35
34cpu_vendor_dev_register(X86_VENDOR_CENTAUR, &centaur_cpu_dev); 36cpu_dev_register(centaur_cpu_dev);
35 37
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 8260d930eabc..7581b62df184 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1,28 +1,62 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/kernel.h>
3#include <linux/sched.h>
2#include <linux/string.h> 4#include <linux/string.h>
5#include <linux/bootmem.h>
6#include <linux/bitops.h>
7#include <linux/module.h>
8#include <linux/kgdb.h>
9#include <linux/topology.h>
3#include <linux/delay.h> 10#include <linux/delay.h>
4#include <linux/smp.h> 11#include <linux/smp.h>
5#include <linux/module.h>
6#include <linux/percpu.h> 12#include <linux/percpu.h>
7#include <linux/bootmem.h>
8#include <asm/processor.h>
9#include <asm/i387.h> 13#include <asm/i387.h>
10#include <asm/msr.h> 14#include <asm/msr.h>
11#include <asm/io.h> 15#include <asm/io.h>
16#include <asm/linkage.h>
12#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
13#include <asm/mtrr.h> 18#include <asm/mtrr.h>
14#include <asm/mce.h> 19#include <asm/mce.h>
15#include <asm/pat.h> 20#include <asm/pat.h>
16#include <asm/asm.h> 21#include <asm/asm.h>
22#include <asm/numa.h>
17#ifdef CONFIG_X86_LOCAL_APIC 23#ifdef CONFIG_X86_LOCAL_APIC
18#include <asm/mpspec.h> 24#include <asm/mpspec.h>
19#include <asm/apic.h> 25#include <asm/apic.h>
20#include <mach_apic.h> 26#include <mach_apic.h>
27#include <asm/genapic.h>
21#endif 28#endif
22 29
30#include <asm/pda.h>
31#include <asm/pgtable.h>
32#include <asm/processor.h>
33#include <asm/desc.h>
34#include <asm/atomic.h>
35#include <asm/proto.h>
36#include <asm/sections.h>
37#include <asm/setup.h>
38
23#include "cpu.h" 39#include "cpu.h"
24 40
41static struct cpu_dev *this_cpu __cpuinitdata;
42
43#ifdef CONFIG_X86_64
44/* We need valid kernel segments for data and code in long mode too
45 * IRET will check the segment types kkeil 2000/10/28
46 * Also sysret mandates a special GDT layout
47 */
48/* The TLS descriptors are currently at a different place compared to i386.
49 Hopefully nobody expects them at a fixed place (Wine?) */
25DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { 50DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
51 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
52 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
53 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
54 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
55 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
56 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
57} };
58#else
59DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
26 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, 60 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
27 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, 61 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
28 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, 62 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
@@ -56,17 +90,150 @@ DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
56 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, 90 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
57 [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } }, 91 [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
58} }; 92} };
93#endif
59EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); 94EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
60 95
61__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; 96#ifdef CONFIG_X86_32
62
63static int cachesize_override __cpuinitdata = -1; 97static int cachesize_override __cpuinitdata = -1;
64static int disable_x86_serial_nr __cpuinitdata = 1; 98static int disable_x86_serial_nr __cpuinitdata = 1;
65 99
66struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; 100static int __init cachesize_setup(char *str)
101{
102 get_option(&str, &cachesize_override);
103 return 1;
104}
105__setup("cachesize=", cachesize_setup);
106
107static int __init x86_fxsr_setup(char *s)
108{
109 setup_clear_cpu_cap(X86_FEATURE_FXSR);
110 setup_clear_cpu_cap(X86_FEATURE_XMM);
111 return 1;
112}
113__setup("nofxsr", x86_fxsr_setup);
114
115static int __init x86_sep_setup(char *s)
116{
117 setup_clear_cpu_cap(X86_FEATURE_SEP);
118 return 1;
119}
120__setup("nosep", x86_sep_setup);
121
122/* Standard macro to see if a specific flag is changeable */
123static inline int flag_is_changeable_p(u32 flag)
124{
125 u32 f1, f2;
126
127 asm("pushfl\n\t"
128 "pushfl\n\t"
129 "popl %0\n\t"
130 "movl %0,%1\n\t"
131 "xorl %2,%0\n\t"
132 "pushl %0\n\t"
133 "popfl\n\t"
134 "pushfl\n\t"
135 "popl %0\n\t"
136 "popfl\n\t"
137 : "=&r" (f1), "=&r" (f2)
138 : "ir" (flag));
139
140 return ((f1^f2) & flag) != 0;
141}
142
143/* Probe for the CPUID instruction */
144static int __cpuinit have_cpuid_p(void)
145{
146 return flag_is_changeable_p(X86_EFLAGS_ID);
147}
148
149static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
150{
151 if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
152 /* Disable processor serial number */
153 unsigned long lo, hi;
154 rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
155 lo |= 0x200000;
156 wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
157 printk(KERN_NOTICE "CPU serial number disabled.\n");
158 clear_cpu_cap(c, X86_FEATURE_PN);
159
160 /* Disabling the serial number may affect the cpuid level */
161 c->cpuid_level = cpuid_eax(0);
162 }
163}
164
165static int __init x86_serial_nr_setup(char *s)
166{
167 disable_x86_serial_nr = 0;
168 return 1;
169}
170__setup("serialnumber", x86_serial_nr_setup);
171#else
172static inline int flag_is_changeable_p(u32 flag)
173{
174 return 1;
175}
176/* Probe for the CPUID instruction */
177static inline int have_cpuid_p(void)
178{
179 return 1;
180}
181static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
182{
183}
184#endif
185
186/*
187 * Naming convention should be: <Name> [(<Codename>)]
188 * This table only is used unless init_<vendor>() below doesn't set it;
189 * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
190 *
191 */
192
193/* Look up CPU names by table lookup. */
194static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
195{
196 struct cpu_model_info *info;
197
198 if (c->x86_model >= 16)
199 return NULL; /* Range check */
200
201 if (!this_cpu)
202 return NULL;
203
204 info = this_cpu->c_models;
205
206 while (info && info->family) {
207 if (info->family == c->x86)
208 return info->model_names[c->x86_model];
209 info++;
210 }
211 return NULL; /* Not found */
212}
213
214__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
215
216/* Current gdt points %fs at the "master" per-cpu area: after this,
217 * it's on the real one. */
218void switch_to_new_gdt(void)
219{
220 struct desc_ptr gdt_descr;
221
222 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
223 gdt_descr.size = GDT_SIZE - 1;
224 load_gdt(&gdt_descr);
225#ifdef CONFIG_X86_32
226 asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
227#endif
228}
229
230static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
67 231
68static void __cpuinit default_init(struct cpuinfo_x86 *c) 232static void __cpuinit default_init(struct cpuinfo_x86 *c)
69{ 233{
234#ifdef CONFIG_X86_64
235 display_cacheinfo(c);
236#else
70 /* Not much we can do here... */ 237 /* Not much we can do here... */
71 /* Check if at least it has cpuid */ 238 /* Check if at least it has cpuid */
72 if (c->cpuid_level == -1) { 239 if (c->cpuid_level == -1) {
@@ -76,28 +243,22 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c)
76 else if (c->x86 == 3) 243 else if (c->x86 == 3)
77 strcpy(c->x86_model_id, "386"); 244 strcpy(c->x86_model_id, "386");
78 } 245 }
246#endif
79} 247}
80 248
81static struct cpu_dev __cpuinitdata default_cpu = { 249static struct cpu_dev __cpuinitdata default_cpu = {
82 .c_init = default_init, 250 .c_init = default_init,
83 .c_vendor = "Unknown", 251 .c_vendor = "Unknown",
252 .c_x86_vendor = X86_VENDOR_UNKNOWN,
84}; 253};
85static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
86 254
87static int __init cachesize_setup(char *str) 255static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
88{
89 get_option(&str, &cachesize_override);
90 return 1;
91}
92__setup("cachesize=", cachesize_setup);
93
94int __cpuinit get_model_name(struct cpuinfo_x86 *c)
95{ 256{
96 unsigned int *v; 257 unsigned int *v;
97 char *p, *q; 258 char *p, *q;
98 259
99 if (cpuid_eax(0x80000000) < 0x80000004) 260 if (c->extended_cpuid_level < 0x80000004)
100 return 0; 261 return;
101 262
102 v = (unsigned int *) c->x86_model_id; 263 v = (unsigned int *) c->x86_model_id;
103 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); 264 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
@@ -116,30 +277,34 @@ int __cpuinit get_model_name(struct cpuinfo_x86 *c)
116 while (q <= &c->x86_model_id[48]) 277 while (q <= &c->x86_model_id[48])
117 *q++ = '\0'; /* Zero-pad the rest */ 278 *q++ = '\0'; /* Zero-pad the rest */
118 } 279 }
119
120 return 1;
121} 280}
122 281
123
124void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) 282void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
125{ 283{
126 unsigned int n, dummy, ecx, edx, l2size; 284 unsigned int n, dummy, ebx, ecx, edx, l2size;
127 285
128 n = cpuid_eax(0x80000000); 286 n = c->extended_cpuid_level;
129 287
130 if (n >= 0x80000005) { 288 if (n >= 0x80000005) {
131 cpuid(0x80000005, &dummy, &dummy, &ecx, &edx); 289 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
132 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", 290 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
133 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); 291 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
134 c->x86_cache_size = (ecx>>24)+(edx>>24); 292 c->x86_cache_size = (ecx>>24) + (edx>>24);
293#ifdef CONFIG_X86_64
294 /* On K8 L1 TLB is inclusive, so don't count it */
295 c->x86_tlbsize = 0;
296#endif
135 } 297 }
136 298
137 if (n < 0x80000006) /* Some chips just has a large L1. */ 299 if (n < 0x80000006) /* Some chips just has a large L1. */
138 return; 300 return;
139 301
140 ecx = cpuid_ecx(0x80000006); 302 cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
141 l2size = ecx >> 16; 303 l2size = ecx >> 16;
142 304
305#ifdef CONFIG_X86_64
306 c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
307#else
143 /* do processor-specific cache resizing */ 308 /* do processor-specific cache resizing */
144 if (this_cpu->c_size_cache) 309 if (this_cpu->c_size_cache)
145 l2size = this_cpu->c_size_cache(c, l2size); 310 l2size = this_cpu->c_size_cache(c, l2size);
@@ -150,116 +315,106 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
150 315
151 if (l2size == 0) 316 if (l2size == 0)
152 return; /* Again, no L2 cache is possible */ 317 return; /* Again, no L2 cache is possible */
318#endif
153 319
154 c->x86_cache_size = l2size; 320 c->x86_cache_size = l2size;
155 321
156 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", 322 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
157 l2size, ecx & 0xFF); 323 l2size, ecx & 0xFF);
158} 324}
159 325
160/* 326void __cpuinit detect_ht(struct cpuinfo_x86 *c)
161 * Naming convention should be: <Name> [(<Codename>)]
162 * This table only is used unless init_<vendor>() below doesn't set it;
163 * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
164 *
165 */
166
167/* Look up CPU names by table lookup. */
168static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
169{ 327{
170 struct cpu_model_info *info; 328#ifdef CONFIG_X86_HT
329 u32 eax, ebx, ecx, edx;
330 int index_msb, core_bits;
171 331
172 if (c->x86_model >= 16) 332 if (!cpu_has(c, X86_FEATURE_HT))
173 return NULL; /* Range check */ 333 return;
174 334
175 if (!this_cpu) 335 if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
176 return NULL; 336 goto out;
177 337
178 info = this_cpu->c_models; 338 if (cpu_has(c, X86_FEATURE_XTOPOLOGY))
339 return;
179 340
180 while (info && info->family) { 341 cpuid(1, &eax, &ebx, &ecx, &edx);
181 if (info->family == c->x86) 342
182 return info->model_names[c->x86_model]; 343 smp_num_siblings = (ebx & 0xff0000) >> 16;
183 info++; 344
345 if (smp_num_siblings == 1) {
346 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
347 } else if (smp_num_siblings > 1) {
348
349 if (smp_num_siblings > NR_CPUS) {
350 printk(KERN_WARNING "CPU: Unsupported number of siblings %d",
351 smp_num_siblings);
352 smp_num_siblings = 1;
353 return;
354 }
355
356 index_msb = get_count_order(smp_num_siblings);
357#ifdef CONFIG_X86_64
358 c->phys_proc_id = phys_pkg_id(index_msb);
359#else
360 c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb);
361#endif
362
363 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
364
365 index_msb = get_count_order(smp_num_siblings);
366
367 core_bits = get_count_order(c->x86_max_cores);
368
369#ifdef CONFIG_X86_64
370 c->cpu_core_id = phys_pkg_id(index_msb) &
371 ((1 << core_bits) - 1);
372#else
373 c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) &
374 ((1 << core_bits) - 1);
375#endif
184 } 376 }
185 return NULL; /* Not found */
186}
187 377
378out:
379 if ((c->x86_max_cores * smp_num_siblings) > 1) {
380 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
381 c->phys_proc_id);
382 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
383 c->cpu_core_id);
384 }
385#endif
386}
188 387
189static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early) 388static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
190{ 389{
191 char *v = c->x86_vendor_id; 390 char *v = c->x86_vendor_id;
192 int i; 391 int i;
193 static int printed; 392 static int printed;
194 393
195 for (i = 0; i < X86_VENDOR_NUM; i++) { 394 for (i = 0; i < X86_VENDOR_NUM; i++) {
196 if (cpu_devs[i]) { 395 if (!cpu_devs[i])
197 if (!strcmp(v, cpu_devs[i]->c_ident[0]) || 396 break;
198 (cpu_devs[i]->c_ident[1] && 397
199 !strcmp(v, cpu_devs[i]->c_ident[1]))) { 398 if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
200 c->x86_vendor = i; 399 (cpu_devs[i]->c_ident[1] &&
201 if (!early) 400 !strcmp(v, cpu_devs[i]->c_ident[1]))) {
202 this_cpu = cpu_devs[i]; 401 this_cpu = cpu_devs[i];
203 return; 402 c->x86_vendor = this_cpu->c_x86_vendor;
204 } 403 return;
205 } 404 }
206 } 405 }
406
207 if (!printed) { 407 if (!printed) {
208 printed++; 408 printed++;
209 printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); 409 printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v);
210 printk(KERN_ERR "CPU: Your system may be unstable.\n"); 410 printk(KERN_ERR "CPU: Your system may be unstable.\n");
211 } 411 }
412
212 c->x86_vendor = X86_VENDOR_UNKNOWN; 413 c->x86_vendor = X86_VENDOR_UNKNOWN;
213 this_cpu = &default_cpu; 414 this_cpu = &default_cpu;
214} 415}
215 416
216 417void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
217static int __init x86_fxsr_setup(char *s)
218{
219 setup_clear_cpu_cap(X86_FEATURE_FXSR);
220 setup_clear_cpu_cap(X86_FEATURE_XMM);
221 return 1;
222}
223__setup("nofxsr", x86_fxsr_setup);
224
225
226static int __init x86_sep_setup(char *s)
227{
228 setup_clear_cpu_cap(X86_FEATURE_SEP);
229 return 1;
230}
231__setup("nosep", x86_sep_setup);
232
233
234/* Standard macro to see if a specific flag is changeable */
235static inline int flag_is_changeable_p(u32 flag)
236{
237 u32 f1, f2;
238
239 asm("pushfl\n\t"
240 "pushfl\n\t"
241 "popl %0\n\t"
242 "movl %0,%1\n\t"
243 "xorl %2,%0\n\t"
244 "pushl %0\n\t"
245 "popfl\n\t"
246 "pushfl\n\t"
247 "popl %0\n\t"
248 "popfl\n\t"
249 : "=&r" (f1), "=&r" (f2)
250 : "ir" (flag));
251
252 return ((f1^f2) & flag) != 0;
253}
254
255
256/* Probe for the CPUID instruction */
257static int __cpuinit have_cpuid_p(void)
258{
259 return flag_is_changeable_p(X86_EFLAGS_ID);
260}
261
262void __init cpu_detect(struct cpuinfo_x86 *c)
263{ 418{
264 /* Get vendor name */ 419 /* Get vendor name */
265 cpuid(0x00000000, (unsigned int *)&c->cpuid_level, 420 cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
@@ -268,48 +423,85 @@ void __init cpu_detect(struct cpuinfo_x86 *c)
268 (unsigned int *)&c->x86_vendor_id[4]); 423 (unsigned int *)&c->x86_vendor_id[4]);
269 424
270 c->x86 = 4; 425 c->x86 = 4;
426 /* Intel-defined flags: level 0x00000001 */
271 if (c->cpuid_level >= 0x00000001) { 427 if (c->cpuid_level >= 0x00000001) {
272 u32 junk, tfms, cap0, misc; 428 u32 junk, tfms, cap0, misc;
273 cpuid(0x00000001, &tfms, &misc, &junk, &cap0); 429 cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
274 c->x86 = (tfms >> 8) & 15; 430 c->x86 = (tfms >> 8) & 0xf;
275 c->x86_model = (tfms >> 4) & 15; 431 c->x86_model = (tfms >> 4) & 0xf;
432 c->x86_mask = tfms & 0xf;
276 if (c->x86 == 0xf) 433 if (c->x86 == 0xf)
277 c->x86 += (tfms >> 20) & 0xff; 434 c->x86 += (tfms >> 20) & 0xff;
278 if (c->x86 >= 0x6) 435 if (c->x86 >= 0x6)
279 c->x86_model += ((tfms >> 16) & 0xF) << 4; 436 c->x86_model += ((tfms >> 16) & 0xf) << 4;
280 c->x86_mask = tfms & 15;
281 if (cap0 & (1<<19)) { 437 if (cap0 & (1<<19)) {
282 c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
283 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; 438 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
439 c->x86_cache_alignment = c->x86_clflush_size;
284 } 440 }
285 } 441 }
286} 442}
287static void __cpuinit early_get_cap(struct cpuinfo_x86 *c) 443
444static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
288{ 445{
289 u32 tfms, xlvl; 446 u32 tfms, xlvl;
290 unsigned int ebx; 447 u32 ebx;
291 448
292 memset(&c->x86_capability, 0, sizeof c->x86_capability); 449 /* Intel-defined flags: level 0x00000001 */
293 if (have_cpuid_p()) { 450 if (c->cpuid_level >= 0x00000001) {
294 /* Intel-defined flags: level 0x00000001 */ 451 u32 capability, excap;
295 if (c->cpuid_level >= 0x00000001) { 452 cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
296 u32 capability, excap; 453 c->x86_capability[0] = capability;
297 cpuid(0x00000001, &tfms, &ebx, &excap, &capability); 454 c->x86_capability[4] = excap;
298 c->x86_capability[0] = capability; 455 }
299 c->x86_capability[4] = excap;
300 }
301 456
302 /* AMD-defined flags: level 0x80000001 */ 457 /* AMD-defined flags: level 0x80000001 */
303 xlvl = cpuid_eax(0x80000000); 458 xlvl = cpuid_eax(0x80000000);
304 if ((xlvl & 0xffff0000) == 0x80000000) { 459 c->extended_cpuid_level = xlvl;
305 if (xlvl >= 0x80000001) { 460 if ((xlvl & 0xffff0000) == 0x80000000) {
306 c->x86_capability[1] = cpuid_edx(0x80000001); 461 if (xlvl >= 0x80000001) {
307 c->x86_capability[6] = cpuid_ecx(0x80000001); 462 c->x86_capability[1] = cpuid_edx(0x80000001);
308 } 463 c->x86_capability[6] = cpuid_ecx(0x80000001);
309 } 464 }
465 }
310 466
467#ifdef CONFIG_X86_64
468 if (c->extended_cpuid_level >= 0x80000008) {
469 u32 eax = cpuid_eax(0x80000008);
470
471 c->x86_virt_bits = (eax >> 8) & 0xff;
472 c->x86_phys_bits = eax & 0xff;
311 } 473 }
474#endif
475
476 if (c->extended_cpuid_level >= 0x80000007)
477 c->x86_power = cpuid_edx(0x80000007);
478
479}
312 480
481static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
482{
483#ifdef CONFIG_X86_32
484 int i;
485
486 /*
487 * First of all, decide if this is a 486 or higher
488 * It's a 486 if we can modify the AC flag
489 */
490 if (flag_is_changeable_p(X86_EFLAGS_AC))
491 c->x86 = 4;
492 else
493 c->x86 = 3;
494
495 for (i = 0; i < X86_VENDOR_NUM; i++)
496 if (cpu_devs[i] && cpu_devs[i]->c_identify) {
497 c->x86_vendor_id[0] = 0;
498 cpu_devs[i]->c_identify(c);
499 if (c->x86_vendor_id[0]) {
500 get_cpu_vendor(c);
501 break;
502 }
503 }
504#endif
313} 505}
314 506
315/* 507/*
@@ -321,25 +513,61 @@ static void __cpuinit early_get_cap(struct cpuinfo_x86 *c)
321 * WARNING: this function is only called on the BP. Don't add code here 513 * WARNING: this function is only called on the BP. Don't add code here
322 * that is supposed to run on all CPUs. 514 * that is supposed to run on all CPUs.
323 */ 515 */
324static void __init early_cpu_detect(void) 516static void __init early_identify_cpu(struct cpuinfo_x86 *c)
325{ 517{
326 struct cpuinfo_x86 *c = &boot_cpu_data; 518#ifdef CONFIG_X86_64
327 519 c->x86_clflush_size = 64;
328 c->x86_cache_alignment = 32; 520#else
329 c->x86_clflush_size = 32; 521 c->x86_clflush_size = 32;
522#endif
523 c->x86_cache_alignment = c->x86_clflush_size;
524
525 memset(&c->x86_capability, 0, sizeof c->x86_capability);
526 c->extended_cpuid_level = 0;
527
528 if (!have_cpuid_p())
529 identify_cpu_without_cpuid(c);
330 530
531 /* cyrix could have cpuid enabled via c_identify()*/
331 if (!have_cpuid_p()) 532 if (!have_cpuid_p())
332 return; 533 return;
333 534
334 cpu_detect(c); 535 cpu_detect(c);
335 536
336 get_cpu_vendor(c, 1); 537 get_cpu_vendor(c);
337 538
338 early_get_cap(c); 539 get_cpu_cap(c);
339 540
340 if (c->x86_vendor != X86_VENDOR_UNKNOWN && 541 if (this_cpu->c_early_init)
341 cpu_devs[c->x86_vendor]->c_early_init) 542 this_cpu->c_early_init(c);
342 cpu_devs[c->x86_vendor]->c_early_init(c); 543
544 validate_pat_support(c);
545}
546
547void __init early_cpu_init(void)
548{
549 struct cpu_dev **cdev;
550 int count = 0;
551
552 printk("KERNEL supported cpus:\n");
553 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
554 struct cpu_dev *cpudev = *cdev;
555 unsigned int j;
556
557 if (count >= X86_VENDOR_NUM)
558 break;
559 cpu_devs[count] = cpudev;
560 count++;
561
562 for (j = 0; j < 2; j++) {
563 if (!cpudev->c_ident[j])
564 continue;
565 printk(" %s %s\n", cpudev->c_vendor,
566 cpudev->c_ident[j]);
567 }
568 }
569
570 early_identify_cpu(&boot_cpu_data);
343} 571}
344 572
345/* 573/*
@@ -355,117 +583,43 @@ static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
355 clear_cpu_cap(c, X86_FEATURE_NOPL); 583 clear_cpu_cap(c, X86_FEATURE_NOPL);
356} 584}
357 585
358/* 586static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
359 * The NOPL instruction is supposed to exist on all CPUs with
360 * family >= 6, unfortunately, that's not true in practice because
361 * of early VIA chips and (more importantly) broken virtualizers that
362 * are not easy to detect. Hence, probe for it based on first
363 * principles.
364 */
365static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
366{ 587{
367 const u32 nopl_signature = 0x888c53b1; /* Random number */ 588 c->extended_cpuid_level = 0;
368 u32 has_nopl = nopl_signature;
369 589
370 clear_cpu_cap(c, X86_FEATURE_NOPL); 590 if (!have_cpuid_p())
371 if (c->x86 >= 6) { 591 identify_cpu_without_cpuid(c);
372 asm volatile("\n"
373 "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */
374 "2:\n"
375 " .section .fixup,\"ax\"\n"
376 "3: xor %0,%0\n"
377 " jmp 2b\n"
378 " .previous\n"
379 _ASM_EXTABLE(1b,3b)
380 : "+a" (has_nopl));
381
382 if (has_nopl == nopl_signature)
383 set_cpu_cap(c, X86_FEATURE_NOPL);
384 }
385}
386 592
387static void __cpuinit generic_identify(struct cpuinfo_x86 *c) 593 /* cyrix could have cpuid enabled via c_identify()*/
388{ 594 if (!have_cpuid_p())
389 u32 tfms, xlvl; 595 return;
390 unsigned int ebx;
391
392 if (have_cpuid_p()) {
393 /* Get vendor name */
394 cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
395 (unsigned int *)&c->x86_vendor_id[0],
396 (unsigned int *)&c->x86_vendor_id[8],
397 (unsigned int *)&c->x86_vendor_id[4]);
398
399 get_cpu_vendor(c, 0);
400 /* Initialize the standard set of capabilities */
401 /* Note that the vendor-specific code below might override */
402 /* Intel-defined flags: level 0x00000001 */
403 if (c->cpuid_level >= 0x00000001) {
404 u32 capability, excap;
405 cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
406 c->x86_capability[0] = capability;
407 c->x86_capability[4] = excap;
408 c->x86 = (tfms >> 8) & 15;
409 c->x86_model = (tfms >> 4) & 15;
410 if (c->x86 == 0xf)
411 c->x86 += (tfms >> 20) & 0xff;
412 if (c->x86 >= 0x6)
413 c->x86_model += ((tfms >> 16) & 0xF) << 4;
414 c->x86_mask = tfms & 15;
415 c->initial_apicid = (ebx >> 24) & 0xFF;
416#ifdef CONFIG_X86_HT
417 c->apicid = phys_pkg_id(c->initial_apicid, 0);
418 c->phys_proc_id = c->initial_apicid;
419#else
420 c->apicid = c->initial_apicid;
421#endif
422 if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
423 c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8;
424 } else {
425 /* Have CPUID level 0 only - unheard of */
426 c->x86 = 4;
427 }
428 596
429 /* AMD-defined flags: level 0x80000001 */ 597 cpu_detect(c);
430 xlvl = cpuid_eax(0x80000000);
431 if ((xlvl & 0xffff0000) == 0x80000000) {
432 if (xlvl >= 0x80000001) {
433 c->x86_capability[1] = cpuid_edx(0x80000001);
434 c->x86_capability[6] = cpuid_ecx(0x80000001);
435 }
436 if (xlvl >= 0x80000004)
437 get_model_name(c); /* Default name */
438 }
439 598
440 init_scattered_cpuid_features(c); 599 get_cpu_vendor(c);
441 detect_nopl(c);
442 }
443}
444 600
445static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) 601 get_cpu_cap(c);
446{
447 if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
448 /* Disable processor serial number */
449 unsigned long lo, hi;
450 rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
451 lo |= 0x200000;
452 wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
453 printk(KERN_NOTICE "CPU serial number disabled.\n");
454 clear_cpu_cap(c, X86_FEATURE_PN);
455 602
456 /* Disabling the serial number may affect the cpuid level */ 603 if (c->cpuid_level >= 0x00000001) {
457 c->cpuid_level = cpuid_eax(0); 604 c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
458 } 605#ifdef CONFIG_X86_32
459} 606# ifdef CONFIG_X86_HT
607 c->apicid = phys_pkg_id(c->initial_apicid, 0);
608# else
609 c->apicid = c->initial_apicid;
610# endif
611#endif
460 612
461static int __init x86_serial_nr_setup(char *s) 613#ifdef CONFIG_X86_HT
462{ 614 c->phys_proc_id = c->initial_apicid;
463 disable_x86_serial_nr = 0; 615#endif
464 return 1; 616 }
465}
466__setup("serialnumber", x86_serial_nr_setup);
467 617
618 get_model_name(c); /* Default name */
468 619
620 init_scattered_cpuid_features(c);
621 detect_nopl(c);
622}
469 623
470/* 624/*
471 * This does the hard work of actually picking apart the CPU stuff... 625 * This does the hard work of actually picking apart the CPU stuff...
@@ -477,30 +631,29 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
477 c->loops_per_jiffy = loops_per_jiffy; 631 c->loops_per_jiffy = loops_per_jiffy;
478 c->x86_cache_size = -1; 632 c->x86_cache_size = -1;
479 c->x86_vendor = X86_VENDOR_UNKNOWN; 633 c->x86_vendor = X86_VENDOR_UNKNOWN;
480 c->cpuid_level = -1; /* CPUID not detected */
481 c->x86_model = c->x86_mask = 0; /* So far unknown... */ 634 c->x86_model = c->x86_mask = 0; /* So far unknown... */
482 c->x86_vendor_id[0] = '\0'; /* Unset */ 635 c->x86_vendor_id[0] = '\0'; /* Unset */
483 c->x86_model_id[0] = '\0'; /* Unset */ 636 c->x86_model_id[0] = '\0'; /* Unset */
484 c->x86_max_cores = 1; 637 c->x86_max_cores = 1;
638 c->x86_coreid_bits = 0;
639#ifdef CONFIG_X86_64
640 c->x86_clflush_size = 64;
641#else
642 c->cpuid_level = -1; /* CPUID not detected */
485 c->x86_clflush_size = 32; 643 c->x86_clflush_size = 32;
644#endif
645 c->x86_cache_alignment = c->x86_clflush_size;
486 memset(&c->x86_capability, 0, sizeof c->x86_capability); 646 memset(&c->x86_capability, 0, sizeof c->x86_capability);
487 647
488 if (!have_cpuid_p()) {
489 /*
490 * First of all, decide if this is a 486 or higher
491 * It's a 486 if we can modify the AC flag
492 */
493 if (flag_is_changeable_p(X86_EFLAGS_AC))
494 c->x86 = 4;
495 else
496 c->x86 = 3;
497 }
498
499 generic_identify(c); 648 generic_identify(c);
500 649
501 if (this_cpu->c_identify) 650 if (this_cpu->c_identify)
502 this_cpu->c_identify(c); 651 this_cpu->c_identify(c);
503 652
653#ifdef CONFIG_X86_64
654 c->apicid = phys_pkg_id(0);
655#endif
656
504 /* 657 /*
505 * Vendor-specific initialization. In this section we 658 * Vendor-specific initialization. In this section we
506 * canonicalize the feature flags, meaning if there are 659 * canonicalize the feature flags, meaning if there are
@@ -534,6 +687,10 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
534 c->x86, c->x86_model); 687 c->x86, c->x86_model);
535 } 688 }
536 689
690#ifdef CONFIG_X86_64
691 detect_ht(c);
692#endif
693
537 /* 694 /*
538 * On SMP, boot_cpu_data holds the common feature set between 695 * On SMP, boot_cpu_data holds the common feature set between
539 * all CPUs; so make sure that we indicate which features are 696 * all CPUs; so make sure that we indicate which features are
@@ -542,7 +699,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
542 */ 699 */
543 if (c != &boot_cpu_data) { 700 if (c != &boot_cpu_data) {
544 /* AND the already accumulated flags with these */ 701 /* AND the already accumulated flags with these */
545 for (i = 0 ; i < NCAPINTS ; i++) 702 for (i = 0; i < NCAPINTS; i++)
546 boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; 703 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
547 } 704 }
548 705
@@ -550,72 +707,79 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
550 for (i = 0; i < NCAPINTS; i++) 707 for (i = 0; i < NCAPINTS; i++)
551 c->x86_capability[i] &= ~cleared_cpu_caps[i]; 708 c->x86_capability[i] &= ~cleared_cpu_caps[i];
552 709
710#ifdef CONFIG_X86_MCE
553 /* Init Machine Check Exception if available. */ 711 /* Init Machine Check Exception if available. */
554 mcheck_init(c); 712 mcheck_init(c);
713#endif
555 714
556 select_idle_routine(c); 715 select_idle_routine(c);
716
717#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
718 numa_add_cpu(smp_processor_id());
719#endif
557} 720}
558 721
559void __init identify_boot_cpu(void) 722void __init identify_boot_cpu(void)
560{ 723{
561 identify_cpu(&boot_cpu_data); 724 identify_cpu(&boot_cpu_data);
725#ifdef CONFIG_X86_32
562 sysenter_setup(); 726 sysenter_setup();
563 enable_sep_cpu(); 727 enable_sep_cpu();
728#endif
564} 729}
565 730
566void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 731void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
567{ 732{
568 BUG_ON(c == &boot_cpu_data); 733 BUG_ON(c == &boot_cpu_data);
569 identify_cpu(c); 734 identify_cpu(c);
735#ifdef CONFIG_X86_32
570 enable_sep_cpu(); 736 enable_sep_cpu();
737#endif
571 mtrr_ap_init(); 738 mtrr_ap_init();
572} 739}
573 740
574#ifdef CONFIG_X86_HT 741struct msr_range {
575void __cpuinit detect_ht(struct cpuinfo_x86 *c) 742 unsigned min;
576{ 743 unsigned max;
577 u32 eax, ebx, ecx, edx; 744};
578 int index_msb, core_bits;
579
580 cpuid(1, &eax, &ebx, &ecx, &edx);
581
582 if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
583 return;
584
585 smp_num_siblings = (ebx & 0xff0000) >> 16;
586 745
587 if (smp_num_siblings == 1) { 746static struct msr_range msr_range_array[] __cpuinitdata = {
588 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); 747 { 0x00000000, 0x00000418},
589 } else if (smp_num_siblings > 1) { 748 { 0xc0000000, 0xc000040b},
749 { 0xc0010000, 0xc0010142},
750 { 0xc0011000, 0xc001103b},
751};
590 752
591 if (smp_num_siblings > NR_CPUS) { 753static void __cpuinit print_cpu_msr(void)
592 printk(KERN_WARNING "CPU: Unsupported number of the " 754{
593 "siblings %d", smp_num_siblings); 755 unsigned index;
594 smp_num_siblings = 1; 756 u64 val;
595 return; 757 int i;
758 unsigned index_min, index_max;
759
760 for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
761 index_min = msr_range_array[i].min;
762 index_max = msr_range_array[i].max;
763 for (index = index_min; index < index_max; index++) {
764 if (rdmsrl_amd_safe(index, &val))
765 continue;
766 printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
596 } 767 }
768 }
769}
597 770
598 index_msb = get_count_order(smp_num_siblings); 771static int show_msr __cpuinitdata;
599 c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb); 772static __init int setup_show_msr(char *arg)
600 773{
601 printk(KERN_INFO "CPU: Physical Processor ID: %d\n", 774 int num;
602 c->phys_proc_id);
603
604 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
605
606 index_msb = get_count_order(smp_num_siblings) ;
607
608 core_bits = get_count_order(c->x86_max_cores);
609 775
610 c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) & 776 get_option(&arg, &num);
611 ((1 << core_bits) - 1);
612 777
613 if (c->x86_max_cores > 1) 778 if (num > 0)
614 printk(KERN_INFO "CPU: Processor Core ID: %d\n", 779 show_msr = num;
615 c->cpu_core_id); 780 return 1;
616 }
617} 781}
618#endif 782__setup("show_msr=", setup_show_msr);
619 783
620static __init int setup_noclflush(char *arg) 784static __init int setup_noclflush(char *arg)
621{ 785{
@@ -634,17 +798,25 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
634 vendor = c->x86_vendor_id; 798 vendor = c->x86_vendor_id;
635 799
636 if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor))) 800 if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor)))
637 printk("%s ", vendor); 801 printk(KERN_CONT "%s ", vendor);
638 802
639 if (!c->x86_model_id[0]) 803 if (c->x86_model_id[0])
640 printk("%d86", c->x86); 804 printk(KERN_CONT "%s", c->x86_model_id);
641 else 805 else
642 printk("%s", c->x86_model_id); 806 printk(KERN_CONT "%d86", c->x86);
643 807
644 if (c->x86_mask || c->cpuid_level >= 0) 808 if (c->x86_mask || c->cpuid_level >= 0)
645 printk(" stepping %02x\n", c->x86_mask); 809 printk(KERN_CONT " stepping %02x\n", c->x86_mask);
646 else 810 else
647 printk("\n"); 811 printk(KERN_CONT "\n");
812
813#ifdef CONFIG_SMP
814 if (c->cpu_index < show_msr)
815 print_cpu_msr();
816#else
817 if (show_msr)
818 print_cpu_msr();
819#endif
648} 820}
649 821
650static __init int setup_disablecpuid(char *arg) 822static __init int setup_disablecpuid(char *arg)
@@ -660,19 +832,89 @@ __setup("clearcpuid=", setup_disablecpuid);
660 832
661cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; 833cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
662 834
663void __init early_cpu_init(void) 835#ifdef CONFIG_X86_64
836struct x8664_pda **_cpu_pda __read_mostly;
837EXPORT_SYMBOL(_cpu_pda);
838
839struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
840
841char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
842
843void __cpuinit pda_init(int cpu)
664{ 844{
665 struct cpu_vendor_dev *cvdev; 845 struct x8664_pda *pda = cpu_pda(cpu);
846
847 /* Setup up data that may be needed in __get_free_pages early */
848 loadsegment(fs, 0);
849 loadsegment(gs, 0);
850 /* Memory clobbers used to order PDA accessed */
851 mb();
852 wrmsrl(MSR_GS_BASE, pda);
853 mb();
854
855 pda->cpunumber = cpu;
856 pda->irqcount = -1;
857 pda->kernelstack = (unsigned long)stack_thread_info() -
858 PDA_STACKOFFSET + THREAD_SIZE;
859 pda->active_mm = &init_mm;
860 pda->mmu_state = 0;
861
862 if (cpu == 0) {
863 /* others are initialized in smpboot.c */
864 pda->pcurrent = &init_task;
865 pda->irqstackptr = boot_cpu_stack;
866 pda->irqstackptr += IRQSTACKSIZE - 64;
867 } else {
868 if (!pda->irqstackptr) {
869 pda->irqstackptr = (char *)
870 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
871 if (!pda->irqstackptr)
872 panic("cannot allocate irqstack for cpu %d",
873 cpu);
874 pda->irqstackptr += IRQSTACKSIZE - 64;
875 }
876
877 if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
878 pda->nodenumber = cpu_to_node(cpu);
879 }
880}
666 881
667 for (cvdev = __x86cpuvendor_start ; 882char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
668 cvdev < __x86cpuvendor_end ; 883 DEBUG_STKSZ] __page_aligned_bss;
669 cvdev++)
670 cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
671 884
672 early_cpu_detect(); 885extern asmlinkage void ignore_sysret(void);
673 validate_pat_support(&boot_cpu_data); 886
887/* May not be marked __init: used by software suspend */
888void syscall_init(void)
889{
890 /*
891 * LSTAR and STAR live in a bit strange symbiosis.
892 * They both write to the same internal register. STAR allows to
893 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
894 */
895 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
896 wrmsrl(MSR_LSTAR, system_call);
897 wrmsrl(MSR_CSTAR, ignore_sysret);
898
899#ifdef CONFIG_IA32_EMULATION
900 syscall32_cpu_init();
901#endif
902
903 /* Flags to clear on syscall */
904 wrmsrl(MSR_SYSCALL_MASK,
905 X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
674} 906}
675 907
908unsigned long kernel_eflags;
909
910/*
911 * Copies of the original ist values from the tss are only accessed during
912 * debugging, no special alignment required.
913 */
914DEFINE_PER_CPU(struct orig_ist, orig_ist);
915
916#else
917
676/* Make sure %fs is initialized properly in idle threads */ 918/* Make sure %fs is initialized properly in idle threads */
677struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) 919struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
678{ 920{
@@ -680,25 +922,136 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
680 regs->fs = __KERNEL_PERCPU; 922 regs->fs = __KERNEL_PERCPU;
681 return regs; 923 return regs;
682} 924}
683 925#endif
684/* Current gdt points %fs at the "master" per-cpu area: after this,
685 * it's on the real one. */
686void switch_to_new_gdt(void)
687{
688 struct desc_ptr gdt_descr;
689
690 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
691 gdt_descr.size = GDT_SIZE - 1;
692 load_gdt(&gdt_descr);
693 asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
694}
695 926
696/* 927/*
697 * cpu_init() initializes state that is per-CPU. Some data is already 928 * cpu_init() initializes state that is per-CPU. Some data is already
698 * initialized (naturally) in the bootstrap process, such as the GDT 929 * initialized (naturally) in the bootstrap process, such as the GDT
699 * and IDT. We reload them nevertheless, this function acts as a 930 * and IDT. We reload them nevertheless, this function acts as a
700 * 'CPU state barrier', nothing should get across. 931 * 'CPU state barrier', nothing should get across.
932 * A lot of state is already set up in PDA init for 64 bit
701 */ 933 */
934#ifdef CONFIG_X86_64
935void __cpuinit cpu_init(void)
936{
937 int cpu = stack_smp_processor_id();
938 struct tss_struct *t = &per_cpu(init_tss, cpu);
939 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
940 unsigned long v;
941 char *estacks = NULL;
942 struct task_struct *me;
943 int i;
944
945 /* CPU 0 is initialised in head64.c */
946 if (cpu != 0)
947 pda_init(cpu);
948 else
949 estacks = boot_exception_stacks;
950
951 me = current;
952
953 if (cpu_test_and_set(cpu, cpu_initialized))
954 panic("CPU#%d already initialized!\n", cpu);
955
956 printk(KERN_INFO "Initializing CPU#%d\n", cpu);
957
958 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
959
960 /*
961 * Initialize the per-CPU GDT with the boot GDT,
962 * and set up the GDT descriptor:
963 */
964
965 switch_to_new_gdt();
966 load_idt((const struct desc_ptr *)&idt_descr);
967
968 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
969 syscall_init();
970
971 wrmsrl(MSR_FS_BASE, 0);
972 wrmsrl(MSR_KERNEL_GS_BASE, 0);
973 barrier();
974
975 check_efer();
976 if (cpu != 0 && x2apic)
977 enable_x2apic();
978
979 /*
980 * set up and load the per-CPU TSS
981 */
982 if (!orig_ist->ist[0]) {
983 static const unsigned int order[N_EXCEPTION_STACKS] = {
984 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
985 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
986 };
987 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
988 if (cpu) {
989 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
990 if (!estacks)
991 panic("Cannot allocate exception "
992 "stack %ld %d\n", v, cpu);
993 }
994 estacks += PAGE_SIZE << order[v];
995 orig_ist->ist[v] = t->x86_tss.ist[v] =
996 (unsigned long)estacks;
997 }
998 }
999
1000 t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
1001 /*
1002 * <= is required because the CPU will access up to
1003 * 8 bits beyond the end of the IO permission bitmap.
1004 */
1005 for (i = 0; i <= IO_BITMAP_LONGS; i++)
1006 t->io_bitmap[i] = ~0UL;
1007
1008 atomic_inc(&init_mm.mm_count);
1009 me->active_mm = &init_mm;
1010 if (me->mm)
1011 BUG();
1012 enter_lazy_tlb(&init_mm, me);
1013
1014 load_sp0(t, &current->thread);
1015 set_tss_desc(cpu, t);
1016 load_TR_desc();
1017 load_LDT(&init_mm.context);
1018
1019#ifdef CONFIG_KGDB
1020 /*
1021 * If the kgdb is connected no debug regs should be altered. This
1022 * is only applicable when KGDB and a KGDB I/O module are built
1023 * into the kernel and you are using early debugging with
1024 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
1025 */
1026 if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
1027 arch_kgdb_ops.correct_hw_break();
1028 else {
1029#endif
1030 /*
1031 * Clear all 6 debug registers:
1032 */
1033
1034 set_debugreg(0UL, 0);
1035 set_debugreg(0UL, 1);
1036 set_debugreg(0UL, 2);
1037 set_debugreg(0UL, 3);
1038 set_debugreg(0UL, 6);
1039 set_debugreg(0UL, 7);
1040#ifdef CONFIG_KGDB
1041 /* If the kgdb is connected no debug regs should be altered. */
1042 }
1043#endif
1044
1045 fpu_init();
1046
1047 raw_local_save_flags(kernel_eflags);
1048
1049 if (is_uv_system())
1050 uv_cpu_init();
1051}
1052
1053#else
1054
702void __cpuinit cpu_init(void) 1055void __cpuinit cpu_init(void)
703{ 1056{
704 int cpu = smp_processor_id(); 1057 int cpu = smp_processor_id();
@@ -779,3 +1132,5 @@ void __cpuinit cpu_uninit(void)
779 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; 1132 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
780} 1133}
781#endif 1134#endif
1135
1136#endif
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
deleted file mode 100644
index 35d11efdf1fe..000000000000
--- a/arch/x86/kernel/cpu/common_64.c
+++ /dev/null
@@ -1,714 +0,0 @@
1#include <linux/init.h>
2#include <linux/kernel.h>
3#include <linux/sched.h>
4#include <linux/string.h>
5#include <linux/bootmem.h>
6#include <linux/bitops.h>
7#include <linux/module.h>
8#include <linux/kgdb.h>
9#include <linux/topology.h>
10#include <linux/delay.h>
11#include <linux/smp.h>
12#include <linux/percpu.h>
13#include <asm/i387.h>
14#include <asm/msr.h>
15#include <asm/io.h>
16#include <asm/linkage.h>
17#include <asm/mmu_context.h>
18#include <asm/mtrr.h>
19#include <asm/mce.h>
20#include <asm/pat.h>
21#include <asm/asm.h>
22#include <asm/numa.h>
23#ifdef CONFIG_X86_LOCAL_APIC
24#include <asm/mpspec.h>
25#include <asm/apic.h>
26#include <mach_apic.h>
27#endif
28#include <asm/pda.h>
29#include <asm/pgtable.h>
30#include <asm/processor.h>
31#include <asm/desc.h>
32#include <asm/atomic.h>
33#include <asm/proto.h>
34#include <asm/sections.h>
35#include <asm/setup.h>
36#include <asm/genapic.h>
37
38#include "cpu.h"
39
40/* We need valid kernel segments for data and code in long mode too
41 * IRET will check the segment types kkeil 2000/10/28
42 * Also sysret mandates a special GDT layout
43 */
44/* The TLS descriptors are currently at a different place compared to i386.
45 Hopefully nobody expects them at a fixed place (Wine?) */
46DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
47 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
48 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
49 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
50 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
51 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
52 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
53} };
54EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
55
56__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
57
58/* Current gdt points %fs at the "master" per-cpu area: after this,
59 * it's on the real one. */
60void switch_to_new_gdt(void)
61{
62 struct desc_ptr gdt_descr;
63
64 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
65 gdt_descr.size = GDT_SIZE - 1;
66 load_gdt(&gdt_descr);
67}
68
69struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
70
71static void __cpuinit default_init(struct cpuinfo_x86 *c)
72{
73 display_cacheinfo(c);
74}
75
76static struct cpu_dev __cpuinitdata default_cpu = {
77 .c_init = default_init,
78 .c_vendor = "Unknown",
79};
80static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
81
82int __cpuinit get_model_name(struct cpuinfo_x86 *c)
83{
84 unsigned int *v;
85
86 if (c->extended_cpuid_level < 0x80000004)
87 return 0;
88
89 v = (unsigned int *) c->x86_model_id;
90 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
91 cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
92 cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
93 c->x86_model_id[48] = 0;
94 return 1;
95}
96
97
98void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
99{
100 unsigned int n, dummy, ebx, ecx, edx;
101
102 n = c->extended_cpuid_level;
103
104 if (n >= 0x80000005) {
105 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
106 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
107 "D cache %dK (%d bytes/line)\n",
108 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
109 c->x86_cache_size = (ecx>>24) + (edx>>24);
110 /* On K8 L1 TLB is inclusive, so don't count it */
111 c->x86_tlbsize = 0;
112 }
113
114 if (n >= 0x80000006) {
115 cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
116 ecx = cpuid_ecx(0x80000006);
117 c->x86_cache_size = ecx >> 16;
118 c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
119
120 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
121 c->x86_cache_size, ecx & 0xFF);
122 }
123}
124
125void __cpuinit detect_ht(struct cpuinfo_x86 *c)
126{
127#ifdef CONFIG_SMP
128 u32 eax, ebx, ecx, edx;
129 int index_msb, core_bits;
130
131 cpuid(1, &eax, &ebx, &ecx, &edx);
132
133
134 if (!cpu_has(c, X86_FEATURE_HT))
135 return;
136 if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
137 goto out;
138
139 smp_num_siblings = (ebx & 0xff0000) >> 16;
140
141 if (smp_num_siblings == 1) {
142 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
143 } else if (smp_num_siblings > 1) {
144
145 if (smp_num_siblings > NR_CPUS) {
146 printk(KERN_WARNING "CPU: Unsupported number of "
147 "siblings %d", smp_num_siblings);
148 smp_num_siblings = 1;
149 return;
150 }
151
152 index_msb = get_count_order(smp_num_siblings);
153 c->phys_proc_id = phys_pkg_id(index_msb);
154
155 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
156
157 index_msb = get_count_order(smp_num_siblings);
158
159 core_bits = get_count_order(c->x86_max_cores);
160
161 c->cpu_core_id = phys_pkg_id(index_msb) &
162 ((1 << core_bits) - 1);
163 }
164out:
165 if ((c->x86_max_cores * smp_num_siblings) > 1) {
166 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
167 c->phys_proc_id);
168 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
169 c->cpu_core_id);
170 }
171
172#endif
173}
174
175static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
176{
177 char *v = c->x86_vendor_id;
178 int i;
179 static int printed;
180
181 for (i = 0; i < X86_VENDOR_NUM; i++) {
182 if (cpu_devs[i]) {
183 if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
184 (cpu_devs[i]->c_ident[1] &&
185 !strcmp(v, cpu_devs[i]->c_ident[1]))) {
186 c->x86_vendor = i;
187 this_cpu = cpu_devs[i];
188 return;
189 }
190 }
191 }
192 if (!printed) {
193 printed++;
194 printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
195 printk(KERN_ERR "CPU: Your system may be unstable.\n");
196 }
197 c->x86_vendor = X86_VENDOR_UNKNOWN;
198}
199
200static void __init early_cpu_support_print(void)
201{
202 int i,j;
203 struct cpu_dev *cpu_devx;
204
205 printk("KERNEL supported cpus:\n");
206 for (i = 0; i < X86_VENDOR_NUM; i++) {
207 cpu_devx = cpu_devs[i];
208 if (!cpu_devx)
209 continue;
210 for (j = 0; j < 2; j++) {
211 if (!cpu_devx->c_ident[j])
212 continue;
213 printk(" %s %s\n", cpu_devx->c_vendor,
214 cpu_devx->c_ident[j]);
215 }
216 }
217}
218
219/*
220 * The NOPL instruction is supposed to exist on all CPUs with
221 * family >= 6, unfortunately, that's not true in practice because
222 * of early VIA chips and (more importantly) broken virtualizers that
223 * are not easy to detect. Hence, probe for it based on first
224 * principles.
225 *
226 * Note: no 64-bit chip is known to lack these, but put the code here
227 * for consistency with 32 bits, and to make it utterly trivial to
228 * diagnose the problem should it ever surface.
229 */
230static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
231{
232 const u32 nopl_signature = 0x888c53b1; /* Random number */
233 u32 has_nopl = nopl_signature;
234
235 clear_cpu_cap(c, X86_FEATURE_NOPL);
236 if (c->x86 >= 6) {
237 asm volatile("\n"
238 "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */
239 "2:\n"
240 " .section .fixup,\"ax\"\n"
241 "3: xor %0,%0\n"
242 " jmp 2b\n"
243 " .previous\n"
244 _ASM_EXTABLE(1b,3b)
245 : "+a" (has_nopl));
246
247 if (has_nopl == nopl_signature)
248 set_cpu_cap(c, X86_FEATURE_NOPL);
249 }
250}
251
252static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
253
254void __init early_cpu_init(void)
255{
256 struct cpu_vendor_dev *cvdev;
257
258 for (cvdev = __x86cpuvendor_start ;
259 cvdev < __x86cpuvendor_end ;
260 cvdev++)
261 cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
262 early_cpu_support_print();
263 early_identify_cpu(&boot_cpu_data);
264}
265
266/* Do some early cpuid on the boot CPU to get some parameter that are
267 needed before check_bugs. Everything advanced is in identify_cpu
268 below. */
269static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
270{
271 u32 tfms, xlvl;
272
273 c->loops_per_jiffy = loops_per_jiffy;
274 c->x86_cache_size = -1;
275 c->x86_vendor = X86_VENDOR_UNKNOWN;
276 c->x86_model = c->x86_mask = 0; /* So far unknown... */
277 c->x86_vendor_id[0] = '\0'; /* Unset */
278 c->x86_model_id[0] = '\0'; /* Unset */
279 c->x86_clflush_size = 64;
280 c->x86_cache_alignment = c->x86_clflush_size;
281 c->x86_max_cores = 1;
282 c->x86_coreid_bits = 0;
283 c->extended_cpuid_level = 0;
284 memset(&c->x86_capability, 0, sizeof c->x86_capability);
285
286 /* Get vendor name */
287 cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
288 (unsigned int *)&c->x86_vendor_id[0],
289 (unsigned int *)&c->x86_vendor_id[8],
290 (unsigned int *)&c->x86_vendor_id[4]);
291
292 get_cpu_vendor(c);
293
294 /* Initialize the standard set of capabilities */
295 /* Note that the vendor-specific code below might override */
296
297 /* Intel-defined flags: level 0x00000001 */
298 if (c->cpuid_level >= 0x00000001) {
299 __u32 misc;
300 cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
301 &c->x86_capability[0]);
302 c->x86 = (tfms >> 8) & 0xf;
303 c->x86_model = (tfms >> 4) & 0xf;
304 c->x86_mask = tfms & 0xf;
305 if (c->x86 == 0xf)
306 c->x86 += (tfms >> 20) & 0xff;
307 if (c->x86 >= 0x6)
308 c->x86_model += ((tfms >> 16) & 0xF) << 4;
309 if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
310 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
311 } else {
312 /* Have CPUID level 0 only - unheard of */
313 c->x86 = 4;
314 }
315
316 c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
317#ifdef CONFIG_SMP
318 c->phys_proc_id = c->initial_apicid;
319#endif
320 /* AMD-defined flags: level 0x80000001 */
321 xlvl = cpuid_eax(0x80000000);
322 c->extended_cpuid_level = xlvl;
323 if ((xlvl & 0xffff0000) == 0x80000000) {
324 if (xlvl >= 0x80000001) {
325 c->x86_capability[1] = cpuid_edx(0x80000001);
326 c->x86_capability[6] = cpuid_ecx(0x80000001);
327 }
328 if (xlvl >= 0x80000004)
329 get_model_name(c); /* Default name */
330 }
331
332 /* Transmeta-defined flags: level 0x80860001 */
333 xlvl = cpuid_eax(0x80860000);
334 if ((xlvl & 0xffff0000) == 0x80860000) {
335 /* Don't set x86_cpuid_level here for now to not confuse. */
336 if (xlvl >= 0x80860001)
337 c->x86_capability[2] = cpuid_edx(0x80860001);
338 }
339
340 if (c->extended_cpuid_level >= 0x80000007)
341 c->x86_power = cpuid_edx(0x80000007);
342
343 if (c->extended_cpuid_level >= 0x80000008) {
344 u32 eax = cpuid_eax(0x80000008);
345
346 c->x86_virt_bits = (eax >> 8) & 0xff;
347 c->x86_phys_bits = eax & 0xff;
348 }
349
350 detect_nopl(c);
351
352 if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
353 cpu_devs[c->x86_vendor]->c_early_init)
354 cpu_devs[c->x86_vendor]->c_early_init(c);
355
356 validate_pat_support(c);
357}
358
359/*
360 * This does the hard work of actually picking apart the CPU stuff...
361 */
362static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
363{
364 int i;
365
366 early_identify_cpu(c);
367
368 init_scattered_cpuid_features(c);
369
370 c->apicid = phys_pkg_id(0);
371
372 /*
373 * Vendor-specific initialization. In this section we
374 * canonicalize the feature flags, meaning if there are
375 * features a certain CPU supports which CPUID doesn't
376 * tell us, CPUID claiming incorrect flags, or other bugs,
377 * we handle them here.
378 *
379 * At the end of this section, c->x86_capability better
380 * indicate the features this CPU genuinely supports!
381 */
382 if (this_cpu->c_init)
383 this_cpu->c_init(c);
384
385 detect_ht(c);
386
387 /*
388 * On SMP, boot_cpu_data holds the common feature set between
389 * all CPUs; so make sure that we indicate which features are
390 * common between the CPUs. The first time this routine gets
391 * executed, c == &boot_cpu_data.
392 */
393 if (c != &boot_cpu_data) {
394 /* AND the already accumulated flags with these */
395 for (i = 0; i < NCAPINTS; i++)
396 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
397 }
398
399 /* Clear all flags overriden by options */
400 for (i = 0; i < NCAPINTS; i++)
401 c->x86_capability[i] &= ~cleared_cpu_caps[i];
402
403#ifdef CONFIG_X86_MCE
404 mcheck_init(c);
405#endif
406 select_idle_routine(c);
407
408#ifdef CONFIG_NUMA
409 numa_add_cpu(smp_processor_id());
410#endif
411
412}
413
414void __cpuinit identify_boot_cpu(void)
415{
416 identify_cpu(&boot_cpu_data);
417}
418
419void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
420{
421 BUG_ON(c == &boot_cpu_data);
422 identify_cpu(c);
423 mtrr_ap_init();
424}
425
426static __init int setup_noclflush(char *arg)
427{
428 setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
429 return 1;
430}
431__setup("noclflush", setup_noclflush);
432
433void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
434{
435 if (c->x86_model_id[0])
436 printk(KERN_CONT "%s", c->x86_model_id);
437
438 if (c->x86_mask || c->cpuid_level >= 0)
439 printk(KERN_CONT " stepping %02x\n", c->x86_mask);
440 else
441 printk(KERN_CONT "\n");
442}
443
444static __init int setup_disablecpuid(char *arg)
445{
446 int bit;
447 if (get_option(&arg, &bit) && bit < NCAPINTS*32)
448 setup_clear_cpu_cap(bit);
449 else
450 return 0;
451 return 1;
452}
453__setup("clearcpuid=", setup_disablecpuid);
454
455cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
456
457struct x8664_pda **_cpu_pda __read_mostly;
458EXPORT_SYMBOL(_cpu_pda);
459
460struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
461
462char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
463
464unsigned long __supported_pte_mask __read_mostly = ~0UL;
465EXPORT_SYMBOL_GPL(__supported_pte_mask);
466
467static int do_not_nx __cpuinitdata;
468
469/* noexec=on|off
470Control non executable mappings for 64bit processes.
471
472on Enable(default)
473off Disable
474*/
475static int __init nonx_setup(char *str)
476{
477 if (!str)
478 return -EINVAL;
479 if (!strncmp(str, "on", 2)) {
480 __supported_pte_mask |= _PAGE_NX;
481 do_not_nx = 0;
482 } else if (!strncmp(str, "off", 3)) {
483 do_not_nx = 1;
484 __supported_pte_mask &= ~_PAGE_NX;
485 }
486 return 0;
487}
488early_param("noexec", nonx_setup);
489
490int force_personality32;
491
492/* noexec32=on|off
493Control non executable heap for 32bit processes.
494To control the stack too use noexec=off
495
496on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
497off PROT_READ implies PROT_EXEC
498*/
499static int __init nonx32_setup(char *str)
500{
501 if (!strcmp(str, "on"))
502 force_personality32 &= ~READ_IMPLIES_EXEC;
503 else if (!strcmp(str, "off"))
504 force_personality32 |= READ_IMPLIES_EXEC;
505 return 1;
506}
507__setup("noexec32=", nonx32_setup);
508
509void pda_init(int cpu)
510{
511 struct x8664_pda *pda = cpu_pda(cpu);
512
513 /* Setup up data that may be needed in __get_free_pages early */
514 loadsegment(fs, 0);
515 loadsegment(gs, 0);
516 /* Memory clobbers used to order PDA accessed */
517 mb();
518 wrmsrl(MSR_GS_BASE, pda);
519 mb();
520
521 pda->cpunumber = cpu;
522 pda->irqcount = -1;
523 pda->kernelstack = (unsigned long)stack_thread_info() -
524 PDA_STACKOFFSET + THREAD_SIZE;
525 pda->active_mm = &init_mm;
526 pda->mmu_state = 0;
527
528 if (cpu == 0) {
529 /* others are initialized in smpboot.c */
530 pda->pcurrent = &init_task;
531 pda->irqstackptr = boot_cpu_stack;
532 pda->irqstackptr += IRQSTACKSIZE - 64;
533 } else {
534 if (!pda->irqstackptr) {
535 pda->irqstackptr = (char *)
536 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
537 if (!pda->irqstackptr)
538 panic("cannot allocate irqstack for cpu %d",
539 cpu);
540 pda->irqstackptr += IRQSTACKSIZE - 64;
541 }
542
543 if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
544 pda->nodenumber = cpu_to_node(cpu);
545 }
546}
547
548char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
549 DEBUG_STKSZ] __page_aligned_bss;
550
551extern asmlinkage void ignore_sysret(void);
552
553/* May not be marked __init: used by software suspend */
554void syscall_init(void)
555{
556 /*
557 * LSTAR and STAR live in a bit strange symbiosis.
558 * They both write to the same internal register. STAR allows to
559 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
560 */
561 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
562 wrmsrl(MSR_LSTAR, system_call);
563 wrmsrl(MSR_CSTAR, ignore_sysret);
564
565#ifdef CONFIG_IA32_EMULATION
566 syscall32_cpu_init();
567#endif
568
569 /* Flags to clear on syscall */
570 wrmsrl(MSR_SYSCALL_MASK,
571 X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
572}
573
574void __cpuinit check_efer(void)
575{
576 unsigned long efer;
577
578 rdmsrl(MSR_EFER, efer);
579 if (!(efer & EFER_NX) || do_not_nx)
580 __supported_pte_mask &= ~_PAGE_NX;
581}
582
583unsigned long kernel_eflags;
584
585/*
586 * Copies of the original ist values from the tss are only accessed during
587 * debugging, no special alignment required.
588 */
589DEFINE_PER_CPU(struct orig_ist, orig_ist);
590
591/*
592 * cpu_init() initializes state that is per-CPU. Some data is already
593 * initialized (naturally) in the bootstrap process, such as the GDT
594 * and IDT. We reload them nevertheless, this function acts as a
595 * 'CPU state barrier', nothing should get across.
596 * A lot of state is already set up in PDA init.
597 */
598void __cpuinit cpu_init(void)
599{
600 int cpu = stack_smp_processor_id();
601 struct tss_struct *t = &per_cpu(init_tss, cpu);
602 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
603 unsigned long v;
604 char *estacks = NULL;
605 struct task_struct *me;
606 int i;
607
608 /* CPU 0 is initialised in head64.c */
609 if (cpu != 0)
610 pda_init(cpu);
611 else
612 estacks = boot_exception_stacks;
613
614 me = current;
615
616 if (cpu_test_and_set(cpu, cpu_initialized))
617 panic("CPU#%d already initialized!\n", cpu);
618
619 printk(KERN_INFO "Initializing CPU#%d\n", cpu);
620
621 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
622
623 /*
624 * Initialize the per-CPU GDT with the boot GDT,
625 * and set up the GDT descriptor:
626 */
627
628 switch_to_new_gdt();
629 load_idt((const struct desc_ptr *)&idt_descr);
630
631 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
632 syscall_init();
633
634 wrmsrl(MSR_FS_BASE, 0);
635 wrmsrl(MSR_KERNEL_GS_BASE, 0);
636 barrier();
637
638 check_efer();
639 if (cpu != 0 && x2apic)
640 enable_x2apic();
641
642 /*
643 * set up and load the per-CPU TSS
644 */
645 if (!orig_ist->ist[0]) {
646 static const unsigned int order[N_EXCEPTION_STACKS] = {
647 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
648 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
649 };
650 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
651 if (cpu) {
652 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
653 if (!estacks)
654 panic("Cannot allocate exception "
655 "stack %ld %d\n", v, cpu);
656 }
657 estacks += PAGE_SIZE << order[v];
658 orig_ist->ist[v] = t->x86_tss.ist[v] =
659 (unsigned long)estacks;
660 }
661 }
662
663 t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
664 /*
665 * <= is required because the CPU will access up to
666 * 8 bits beyond the end of the IO permission bitmap.
667 */
668 for (i = 0; i <= IO_BITMAP_LONGS; i++)
669 t->io_bitmap[i] = ~0UL;
670
671 atomic_inc(&init_mm.mm_count);
672 me->active_mm = &init_mm;
673 if (me->mm)
674 BUG();
675 enter_lazy_tlb(&init_mm, me);
676
677 load_sp0(t, &current->thread);
678 set_tss_desc(cpu, t);
679 load_TR_desc();
680 load_LDT(&init_mm.context);
681
682#ifdef CONFIG_KGDB
683 /*
684 * If the kgdb is connected no debug regs should be altered. This
685 * is only applicable when KGDB and a KGDB I/O module are built
686 * into the kernel and you are using early debugging with
687 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
688 */
689 if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
690 arch_kgdb_ops.correct_hw_break();
691 else {
692#endif
693 /*
694 * Clear all 6 debug registers:
695 */
696
697 set_debugreg(0UL, 0);
698 set_debugreg(0UL, 1);
699 set_debugreg(0UL, 2);
700 set_debugreg(0UL, 3);
701 set_debugreg(0UL, 6);
702 set_debugreg(0UL, 7);
703#ifdef CONFIG_KGDB
704 /* If the kgdb is connected no debug regs should be altered. */
705 }
706#endif
707
708 fpu_init();
709
710 raw_local_save_flags(kernel_eflags);
711
712 if (is_uv_system())
713 uv_cpu_init();
714}
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 4d894e8565fe..de4094a39210 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -21,23 +21,16 @@ struct cpu_dev {
21 void (*c_init)(struct cpuinfo_x86 * c); 21 void (*c_init)(struct cpuinfo_x86 * c);
22 void (*c_identify)(struct cpuinfo_x86 * c); 22 void (*c_identify)(struct cpuinfo_x86 * c);
23 unsigned int (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size); 23 unsigned int (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size);
24 int c_x86_vendor;
24}; 25};
25 26
26extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM]; 27#define cpu_dev_register(cpu_devX) \
28 static struct cpu_dev *__cpu_dev_##cpu_devX __used \
29 __attribute__((__section__(".x86_cpu_dev.init"))) = \
30 &cpu_devX;
27 31
28struct cpu_vendor_dev { 32extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[];
29 int vendor;
30 struct cpu_dev *cpu_dev;
31};
32
33#define cpu_vendor_dev_register(cpu_vendor_id, cpu_dev) \
34 static struct cpu_vendor_dev __cpu_vendor_dev_##cpu_vendor_id __used \
35 __attribute__((__section__(".x86cpuvendor.init"))) = \
36 { cpu_vendor_id, cpu_dev }
37
38extern struct cpu_vendor_dev __x86cpuvendor_start[], __x86cpuvendor_end[];
39 33
40extern int get_model_name(struct cpuinfo_x86 *c);
41extern void display_cacheinfo(struct cpuinfo_x86 *c); 34extern void display_cacheinfo(struct cpuinfo_x86 *c);
42 35
43#endif 36#endif
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index dd097b835839..c24c4a487b7c 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -256,7 +256,8 @@ static u32 get_cur_val(const cpumask_t *mask)
256 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and 256 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
257 * no meaning should be associated with absolute values of these MSRs. 257 * no meaning should be associated with absolute values of these MSRs.
258 */ 258 */
259static unsigned int get_measured_perf(unsigned int cpu) 259static unsigned int get_measured_perf(struct cpufreq_policy *policy,
260 unsigned int cpu)
260{ 261{
261 union { 262 union {
262 struct { 263 struct {
@@ -326,7 +327,7 @@ static unsigned int get_measured_perf(unsigned int cpu)
326 327
327#endif 328#endif
328 329
329 retval = per_cpu(drv_data, cpu)->max_freq * perf_percent / 100; 330 retval = per_cpu(drv_data, policy->cpu)->max_freq * perf_percent / 100;
330 331
331 put_cpu(); 332 put_cpu();
332 set_cpus_allowed_ptr(current, &saved_mask); 333 set_cpus_allowed_ptr(current, &saved_mask);
@@ -785,7 +786,11 @@ static int __init acpi_cpufreq_init(void)
785 if (ret) 786 if (ret)
786 return ret; 787 return ret;
787 788
788 return cpufreq_register_driver(&acpi_cpufreq_driver); 789 ret = cpufreq_register_driver(&acpi_cpufreq_driver);
790 if (ret)
791 free_percpu(acpi_perf_data);
792
793 return ret;
789} 794}
790 795
791static void __exit acpi_cpufreq_exit(void) 796static void __exit acpi_cpufreq_exit(void)
@@ -795,8 +800,6 @@ static void __exit acpi_cpufreq_exit(void)
795 cpufreq_unregister_driver(&acpi_cpufreq_driver); 800 cpufreq_unregister_driver(&acpi_cpufreq_driver);
796 801
797 free_percpu(acpi_perf_data); 802 free_percpu(acpi_perf_data);
798
799 return;
800} 803}
801 804
802module_param(acpi_pstate_strict, uint, 0644); 805module_param(acpi_pstate_strict, uint, 0644);
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
index e4a4bf870e94..fe613c93b366 100644
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
@@ -25,8 +25,8 @@
25#include <linux/cpufreq.h> 25#include <linux/cpufreq.h>
26 26
27#include <asm/msr.h> 27#include <asm/msr.h>
28#include <asm/timex.h> 28#include <linux/timex.h>
29#include <asm/io.h> 29#include <linux/io.h>
30 30
31#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */ 31#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */
32#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */ 32#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */
@@ -82,7 +82,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
82 u8 clockspeed_reg; /* Clock Speed Register */ 82 u8 clockspeed_reg; /* Clock Speed Register */
83 83
84 local_irq_disable(); 84 local_irq_disable();
85 outb_p(0x80,REG_CSCIR); 85 outb_p(0x80, REG_CSCIR);
86 clockspeed_reg = inb_p(REG_CSCDR); 86 clockspeed_reg = inb_p(REG_CSCDR);
87 local_irq_enable(); 87 local_irq_enable();
88 88
@@ -98,10 +98,10 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
98 } 98 }
99 99
100 /* 33 MHz is not 32 MHz... */ 100 /* 33 MHz is not 32 MHz... */
101 if ((clockspeed_reg & 0xE0)==0xA0) 101 if ((clockspeed_reg & 0xE0) == 0xA0)
102 return 33000; 102 return 33000;
103 103
104 return ((1<<((clockspeed_reg & 0xE0) >> 5)) * 1000); 104 return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000;
105} 105}
106 106
107 107
@@ -117,7 +117,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
117 * There is no return value. 117 * There is no return value.
118 */ 118 */
119 119
120static void elanfreq_set_cpu_state (unsigned int state) 120static void elanfreq_set_cpu_state(unsigned int state)
121{ 121{
122 struct cpufreq_freqs freqs; 122 struct cpufreq_freqs freqs;
123 123
@@ -144,20 +144,20 @@ static void elanfreq_set_cpu_state (unsigned int state)
144 */ 144 */
145 145
146 local_irq_disable(); 146 local_irq_disable();
147 outb_p(0x40,REG_CSCIR); /* Disable hyperspeed mode */ 147 outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */
148 outb_p(0x00,REG_CSCDR); 148 outb_p(0x00, REG_CSCDR);
149 local_irq_enable(); /* wait till internal pipelines and */ 149 local_irq_enable(); /* wait till internal pipelines and */
150 udelay(1000); /* buffers have cleaned up */ 150 udelay(1000); /* buffers have cleaned up */
151 151
152 local_irq_disable(); 152 local_irq_disable();
153 153
154 /* now, set the CPU clock speed register (0x80) */ 154 /* now, set the CPU clock speed register (0x80) */
155 outb_p(0x80,REG_CSCIR); 155 outb_p(0x80, REG_CSCIR);
156 outb_p(elan_multiplier[state].val80h,REG_CSCDR); 156 outb_p(elan_multiplier[state].val80h, REG_CSCDR);
157 157
158 /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */ 158 /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */
159 outb_p(0x40,REG_CSCIR); 159 outb_p(0x40, REG_CSCIR);
160 outb_p(elan_multiplier[state].val40h,REG_CSCDR); 160 outb_p(elan_multiplier[state].val40h, REG_CSCDR);
161 udelay(10000); 161 udelay(10000);
162 local_irq_enable(); 162 local_irq_enable();
163 163
@@ -173,12 +173,12 @@ static void elanfreq_set_cpu_state (unsigned int state)
173 * for the hardware supported by the driver. 173 * for the hardware supported by the driver.
174 */ 174 */
175 175
176static int elanfreq_verify (struct cpufreq_policy *policy) 176static int elanfreq_verify(struct cpufreq_policy *policy)
177{ 177{
178 return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]); 178 return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]);
179} 179}
180 180
181static int elanfreq_target (struct cpufreq_policy *policy, 181static int elanfreq_target(struct cpufreq_policy *policy,
182 unsigned int target_freq, 182 unsigned int target_freq,
183 unsigned int relation) 183 unsigned int relation)
184{ 184{
@@ -205,7 +205,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
205 205
206 /* capability check */ 206 /* capability check */
207 if ((c->x86_vendor != X86_VENDOR_AMD) || 207 if ((c->x86_vendor != X86_VENDOR_AMD) ||
208 (c->x86 != 4) || (c->x86_model!=10)) 208 (c->x86 != 4) || (c->x86_model != 10))
209 return -ENODEV; 209 return -ENODEV;
210 210
211 /* max freq */ 211 /* max freq */
@@ -213,7 +213,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
213 max_freq = elanfreq_get_cpu_frequency(0); 213 max_freq = elanfreq_get_cpu_frequency(0);
214 214
215 /* table init */ 215 /* table init */
216 for (i=0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) { 216 for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) {
217 if (elanfreq_table[i].frequency > max_freq) 217 if (elanfreq_table[i].frequency > max_freq)
218 elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID; 218 elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID;
219 } 219 }
@@ -224,7 +224,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
224 224
225 result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table); 225 result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table);
226 if (result) 226 if (result)
227 return (result); 227 return result;
228 228
229 cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu); 229 cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu);
230 return 0; 230 return 0;
@@ -260,7 +260,7 @@ __setup("elanfreq=", elanfreq_setup);
260#endif 260#endif
261 261
262 262
263static struct freq_attr* elanfreq_attr[] = { 263static struct freq_attr *elanfreq_attr[] = {
264 &cpufreq_freq_attr_scaling_available_freqs, 264 &cpufreq_freq_attr_scaling_available_freqs,
265 NULL, 265 NULL,
266}; 266};
@@ -284,9 +284,9 @@ static int __init elanfreq_init(void)
284 284
285 /* Test if we have the right hardware */ 285 /* Test if we have the right hardware */
286 if ((c->x86_vendor != X86_VENDOR_AMD) || 286 if ((c->x86_vendor != X86_VENDOR_AMD) ||
287 (c->x86 != 4) || (c->x86_model!=10)) { 287 (c->x86 != 4) || (c->x86_model != 10)) {
288 printk(KERN_INFO "elanfreq: error: no Elan processor found!\n"); 288 printk(KERN_INFO "elanfreq: error: no Elan processor found!\n");
289 return -ENODEV; 289 return -ENODEV;
290 } 290 }
291 return cpufreq_register_driver(&elanfreq_driver); 291 return cpufreq_register_driver(&elanfreq_driver);
292} 292}
@@ -298,7 +298,7 @@ static void __exit elanfreq_exit(void)
298} 298}
299 299
300 300
301module_param (max_freq, int, 0444); 301module_param(max_freq, int, 0444);
302 302
303MODULE_LICENSE("GPL"); 303MODULE_LICENSE("GPL");
304MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, Sven Geggus <sven@geggus.net>"); 304MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, Sven Geggus <sven@geggus.net>");
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index f1685fb91fbd..b8e05ee4f736 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -171,7 +171,7 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
171 } 171 }
172 172
173 if (c->x86 != 0xF) { 173 if (c->x86 != 0xF) {
174 printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@lists.linux.org.uk>\n"); 174 printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@vger.kernel.org>\n");
175 return 0; 175 return 0;
176 } 176 }
177 177
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
index eb9b62b0830c..b5ced806a316 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -15,12 +15,11 @@
15#include <linux/slab.h> 15#include <linux/slab.h>
16 16
17#include <asm/msr.h> 17#include <asm/msr.h>
18#include <asm/timex.h> 18#include <linux/timex.h>
19#include <asm/io.h> 19#include <linux/io.h>
20 20
21 21#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long
22#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long 22 as it is unused */
23 as it is unused */
24 23
25static unsigned int busfreq; /* FSB, in 10 kHz */ 24static unsigned int busfreq; /* FSB, in 10 kHz */
26static unsigned int max_multiplier; 25static unsigned int max_multiplier;
@@ -53,7 +52,7 @@ static int powernow_k6_get_cpu_multiplier(void)
53 52
54 msrval = POWERNOW_IOPORT + 0x1; 53 msrval = POWERNOW_IOPORT + 0x1;
55 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ 54 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
56 invalue=inl(POWERNOW_IOPORT + 0x8); 55 invalue = inl(POWERNOW_IOPORT + 0x8);
57 msrval = POWERNOW_IOPORT + 0x0; 56 msrval = POWERNOW_IOPORT + 0x0;
58 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ 57 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
59 58
@@ -67,9 +66,9 @@ static int powernow_k6_get_cpu_multiplier(void)
67 * 66 *
68 * Tries to change the PowerNow! multiplier 67 * Tries to change the PowerNow! multiplier
69 */ 68 */
70static void powernow_k6_set_state (unsigned int best_i) 69static void powernow_k6_set_state(unsigned int best_i)
71{ 70{
72 unsigned long outvalue=0, invalue=0; 71 unsigned long outvalue = 0, invalue = 0;
73 unsigned long msrval; 72 unsigned long msrval;
74 struct cpufreq_freqs freqs; 73 struct cpufreq_freqs freqs;
75 74
@@ -90,10 +89,10 @@ static void powernow_k6_set_state (unsigned int best_i)
90 89
91 msrval = POWERNOW_IOPORT + 0x1; 90 msrval = POWERNOW_IOPORT + 0x1;
92 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ 91 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
93 invalue=inl(POWERNOW_IOPORT + 0x8); 92 invalue = inl(POWERNOW_IOPORT + 0x8);
94 invalue = invalue & 0xf; 93 invalue = invalue & 0xf;
95 outvalue = outvalue | invalue; 94 outvalue = outvalue | invalue;
96 outl(outvalue ,(POWERNOW_IOPORT + 0x8)); 95 outl(outvalue , (POWERNOW_IOPORT + 0x8));
97 msrval = POWERNOW_IOPORT + 0x0; 96 msrval = POWERNOW_IOPORT + 0x0;
98 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ 97 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
99 98
@@ -124,7 +123,7 @@ static int powernow_k6_verify(struct cpufreq_policy *policy)
124 * 123 *
125 * sets a new CPUFreq policy 124 * sets a new CPUFreq policy
126 */ 125 */
127static int powernow_k6_target (struct cpufreq_policy *policy, 126static int powernow_k6_target(struct cpufreq_policy *policy,
128 unsigned int target_freq, 127 unsigned int target_freq,
129 unsigned int relation) 128 unsigned int relation)
130{ 129{
@@ -152,7 +151,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
152 busfreq = cpu_khz / max_multiplier; 151 busfreq = cpu_khz / max_multiplier;
153 152
154 /* table init */ 153 /* table init */
155 for (i=0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) { 154 for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) {
156 if (clock_ratio[i].index > max_multiplier) 155 if (clock_ratio[i].index > max_multiplier)
157 clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID; 156 clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID;
158 else 157 else
@@ -165,7 +164,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
165 164
166 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); 165 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
167 if (result) 166 if (result)
168 return (result); 167 return result;
169 168
170 cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu); 169 cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu);
171 170
@@ -176,8 +175,8 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
176static int powernow_k6_cpu_exit(struct cpufreq_policy *policy) 175static int powernow_k6_cpu_exit(struct cpufreq_policy *policy)
177{ 176{
178 unsigned int i; 177 unsigned int i;
179 for (i=0; i<8; i++) { 178 for (i = 0; i < 8; i++) {
180 if (i==max_multiplier) 179 if (i == max_multiplier)
181 powernow_k6_set_state(i); 180 powernow_k6_set_state(i);
182 } 181 }
183 cpufreq_frequency_table_put_attr(policy->cpu); 182 cpufreq_frequency_table_put_attr(policy->cpu);
@@ -189,7 +188,7 @@ static unsigned int powernow_k6_get(unsigned int cpu)
189 return busfreq * powernow_k6_get_cpu_multiplier(); 188 return busfreq * powernow_k6_get_cpu_multiplier();
190} 189}
191 190
192static struct freq_attr* powernow_k6_attr[] = { 191static struct freq_attr *powernow_k6_attr[] = {
193 &cpufreq_freq_attr_scaling_available_freqs, 192 &cpufreq_freq_attr_scaling_available_freqs,
194 NULL, 193 NULL,
195}; 194};
@@ -227,7 +226,7 @@ static int __init powernow_k6_init(void)
227 } 226 }
228 227
229 if (cpufreq_register_driver(&powernow_k6_driver)) { 228 if (cpufreq_register_driver(&powernow_k6_driver)) {
230 release_region (POWERNOW_IOPORT, 16); 229 release_region(POWERNOW_IOPORT, 16);
231 return -EINVAL; 230 return -EINVAL;
232 } 231 }
233 232
@@ -243,13 +242,13 @@ static int __init powernow_k6_init(void)
243static void __exit powernow_k6_exit(void) 242static void __exit powernow_k6_exit(void)
244{ 243{
245 cpufreq_unregister_driver(&powernow_k6_driver); 244 cpufreq_unregister_driver(&powernow_k6_driver);
246 release_region (POWERNOW_IOPORT, 16); 245 release_region(POWERNOW_IOPORT, 16);
247} 246}
248 247
249 248
250MODULE_AUTHOR ("Arjan van de Ven <arjanv@redhat.com>, Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>"); 249MODULE_AUTHOR("Arjan van de Ven <arjanv@redhat.com>, Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>");
251MODULE_DESCRIPTION ("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); 250MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
252MODULE_LICENSE ("GPL"); 251MODULE_LICENSE("GPL");
253 252
254module_init(powernow_k6_init); 253module_init(powernow_k6_init);
255module_exit(powernow_k6_exit); 254module_exit(powernow_k6_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index 15e13c01cc36..3b5f06423e77 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -26,7 +26,7 @@
26#include <asm/cpufeature.h> 26#include <asm/cpufeature.h>
27 27
28#define PFX "speedstep-centrino: " 28#define PFX "speedstep-centrino: "
29#define MAINTAINER "cpufreq@lists.linux.org.uk" 29#define MAINTAINER "cpufreq@vger.kernel.org"
30 30
31#define dprintk(msg...) \ 31#define dprintk(msg...) \
32 cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) 32 cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 13f8fa16b815..ffd0f5ed071a 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -301,7 +301,6 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
301 */ 301 */
302 if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f)) 302 if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f))
303 geode_configure(); 303 geode_configure();
304 get_model_name(c); /* get CPU marketing name */
305 return; 304 return;
306 } else { /* MediaGX */ 305 } else { /* MediaGX */
307 Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4'; 306 Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4';
@@ -442,14 +441,16 @@ static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
442 .c_early_init = early_init_cyrix, 441 .c_early_init = early_init_cyrix,
443 .c_init = init_cyrix, 442 .c_init = init_cyrix,
444 .c_identify = cyrix_identify, 443 .c_identify = cyrix_identify,
444 .c_x86_vendor = X86_VENDOR_CYRIX,
445}; 445};
446 446
447cpu_vendor_dev_register(X86_VENDOR_CYRIX, &cyrix_cpu_dev); 447cpu_dev_register(cyrix_cpu_dev);
448 448
449static struct cpu_dev nsc_cpu_dev __cpuinitdata = { 449static struct cpu_dev nsc_cpu_dev __cpuinitdata = {
450 .c_vendor = "NSC", 450 .c_vendor = "NSC",
451 .c_ident = { "Geode by NSC" }, 451 .c_ident = { "Geode by NSC" },
452 .c_init = init_nsc, 452 .c_init = init_nsc,
453 .c_x86_vendor = X86_VENDOR_NSC,
453}; 454};
454 455
455cpu_vendor_dev_register(X86_VENDOR_NSC, &nsc_cpu_dev); 456cpu_dev_register(nsc_cpu_dev);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 77618c717d76..99468dbd08da 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -15,6 +15,11 @@
15#include <asm/ds.h> 15#include <asm/ds.h>
16#include <asm/bugs.h> 16#include <asm/bugs.h>
17 17
18#ifdef CONFIG_X86_64
19#include <asm/topology.h>
20#include <asm/numa_64.h>
21#endif
22
18#include "cpu.h" 23#include "cpu.h"
19 24
20#ifdef CONFIG_X86_LOCAL_APIC 25#ifdef CONFIG_X86_LOCAL_APIC
@@ -25,14 +30,20 @@
25 30
26static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) 31static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
27{ 32{
28 /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
29 if (c->x86 == 15 && c->x86_cache_alignment == 64)
30 c->x86_cache_alignment = 128;
31 if ((c->x86 == 0xf && c->x86_model >= 0x03) || 33 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
32 (c->x86 == 0x6 && c->x86_model >= 0x0e)) 34 (c->x86 == 0x6 && c->x86_model >= 0x0e))
33 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 35 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
36
37#ifdef CONFIG_X86_64
38 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
39#else
40 /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
41 if (c->x86 == 15 && c->x86_cache_alignment == 64)
42 c->x86_cache_alignment = 128;
43#endif
34} 44}
35 45
46#ifdef CONFIG_X86_32
36/* 47/*
37 * Early probe support logic for ppro memory erratum #50 48 * Early probe support logic for ppro memory erratum #50
38 * 49 *
@@ -52,15 +63,54 @@ int __cpuinit ppro_with_ram_bug(void)
52 return 0; 63 return 0;
53} 64}
54 65
66#ifdef CONFIG_X86_F00F_BUG
67static void __cpuinit trap_init_f00f_bug(void)
68{
69 __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
55 70
56/* 71 /*
57 * P4 Xeon errata 037 workaround. 72 * Update the IDT descriptor and reload the IDT so that
58 * Hardware prefetcher may cause stale data to be loaded into the cache. 73 * it uses the read-only mapped virtual address.
59 */ 74 */
60static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c) 75 idt_descr.address = fix_to_virt(FIX_F00F_IDT);
76 load_idt(&idt_descr);
77}
78#endif
79
80static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
61{ 81{
62 unsigned long lo, hi; 82 unsigned long lo, hi;
63 83
84#ifdef CONFIG_X86_F00F_BUG
85 /*
86 * All current models of Pentium and Pentium with MMX technology CPUs
87 * have the F0 0F bug, which lets nonprivileged users lock up the system.
88 * Note that the workaround only should be initialized once...
89 */
90 c->f00f_bug = 0;
91 if (!paravirt_enabled() && c->x86 == 5) {
92 static int f00f_workaround_enabled;
93
94 c->f00f_bug = 1;
95 if (!f00f_workaround_enabled) {
96 trap_init_f00f_bug();
97 printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
98 f00f_workaround_enabled = 1;
99 }
100 }
101#endif
102
103 /*
104 * SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until
105 * model 3 mask 3
106 */
107 if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633)
108 clear_cpu_cap(c, X86_FEATURE_SEP);
109
110 /*
111 * P4 Xeon errata 037 workaround.
112 * Hardware prefetcher may cause stale data to be loaded into the cache.
113 */
64 if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { 114 if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
65 rdmsr(MSR_IA32_MISC_ENABLE, lo, hi); 115 rdmsr(MSR_IA32_MISC_ENABLE, lo, hi);
66 if ((lo & (1<<9)) == 0) { 116 if ((lo & (1<<9)) == 0) {
@@ -70,13 +120,68 @@ static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c)
70 wrmsr (MSR_IA32_MISC_ENABLE, lo, hi); 120 wrmsr (MSR_IA32_MISC_ENABLE, lo, hi);
71 } 121 }
72 } 122 }
123
124 /*
125 * See if we have a good local APIC by checking for buggy Pentia,
126 * i.e. all B steppings and the C2 stepping of P54C when using their
127 * integrated APIC (see 11AP erratum in "Pentium Processor
128 * Specification Update").
129 */
130 if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
131 (c->x86_mask < 0x6 || c->x86_mask == 0xb))
132 set_cpu_cap(c, X86_FEATURE_11AP);
133
134
135#ifdef CONFIG_X86_INTEL_USERCOPY
136 /*
137 * Set up the preferred alignment for movsl bulk memory moves
138 */
139 switch (c->x86) {
140 case 4: /* 486: untested */
141 break;
142 case 5: /* Old Pentia: untested */
143 break;
144 case 6: /* PII/PIII only like movsl with 8-byte alignment */
145 movsl_mask.mask = 7;
146 break;
147 case 15: /* P4 is OK down to 8-byte alignment */
148 movsl_mask.mask = 7;
149 break;
150 }
151#endif
152
153#ifdef CONFIG_X86_NUMAQ
154 numaq_tsc_disable();
155#endif
73} 156}
157#else
158static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
159{
160}
161#endif
74 162
163static void __cpuinit srat_detect_node(void)
164{
165#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
166 unsigned node;
167 int cpu = smp_processor_id();
168 int apicid = hard_smp_processor_id();
169
170 /* Don't do the funky fallback heuristics the AMD version employs
171 for now. */
172 node = apicid_to_node[apicid];
173 if (node == NUMA_NO_NODE || !node_online(node))
174 node = first_node(node_online_map);
175 numa_set_node(cpu, node);
176
177 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
178#endif
179}
75 180
76/* 181/*
77 * find out the number of processor cores on the die 182 * find out the number of processor cores on the die
78 */ 183 */
79static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c) 184static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
80{ 185{
81 unsigned int eax, ebx, ecx, edx; 186 unsigned int eax, ebx, ecx, edx;
82 187
@@ -91,45 +196,51 @@ static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c)
91 return 1; 196 return 1;
92} 197}
93 198
94#ifdef CONFIG_X86_F00F_BUG 199static void __cpuinit detect_vmx_virtcap(struct cpuinfo_x86 *c)
95static void __cpuinit trap_init_f00f_bug(void)
96{ 200{
97 __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); 201 /* Intel VMX MSR indicated features */
98 202#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
99 /* 203#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
100 * Update the IDT descriptor and reload the IDT so that 204#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
101 * it uses the read-only mapped virtual address. 205#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
102 */ 206#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
103 idt_descr.address = fix_to_virt(FIX_F00F_IDT); 207#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
104 load_idt(&idt_descr); 208
209 u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
210
211 clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
212 clear_cpu_cap(c, X86_FEATURE_VNMI);
213 clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
214 clear_cpu_cap(c, X86_FEATURE_EPT);
215 clear_cpu_cap(c, X86_FEATURE_VPID);
216
217 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
218 msr_ctl = vmx_msr_high | vmx_msr_low;
219 if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
220 set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
221 if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
222 set_cpu_cap(c, X86_FEATURE_VNMI);
223 if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
224 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
225 vmx_msr_low, vmx_msr_high);
226 msr_ctl2 = vmx_msr_high | vmx_msr_low;
227 if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
228 (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
229 set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
230 if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
231 set_cpu_cap(c, X86_FEATURE_EPT);
232 if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
233 set_cpu_cap(c, X86_FEATURE_VPID);
234 }
105} 235}
106#endif
107 236
108static void __cpuinit init_intel(struct cpuinfo_x86 *c) 237static void __cpuinit init_intel(struct cpuinfo_x86 *c)
109{ 238{
110 unsigned int l2 = 0; 239 unsigned int l2 = 0;
111 char *p = NULL;
112 240
113 early_init_intel(c); 241 early_init_intel(c);
114 242
115#ifdef CONFIG_X86_F00F_BUG 243 intel_workarounds(c);
116 /*
117 * All current models of Pentium and Pentium with MMX technology CPUs
118 * have the F0 0F bug, which lets nonprivileged users lock up the system.
119 * Note that the workaround only should be initialized once...
120 */
121 c->f00f_bug = 0;
122 if (!paravirt_enabled() && c->x86 == 5) {
123 static int f00f_workaround_enabled;
124
125 c->f00f_bug = 1;
126 if (!f00f_workaround_enabled) {
127 trap_init_f00f_bug();
128 printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
129 f00f_workaround_enabled = 1;
130 }
131 }
132#endif
133 244
134 l2 = init_intel_cacheinfo(c); 245 l2 = init_intel_cacheinfo(c);
135 if (c->cpuid_level > 9) { 246 if (c->cpuid_level > 9) {
@@ -139,16 +250,32 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
139 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); 250 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
140 } 251 }
141 252
142 /* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */ 253 if (cpu_has_xmm2)
143 if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633) 254 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
144 clear_cpu_cap(c, X86_FEATURE_SEP); 255 if (cpu_has_ds) {
256 unsigned int l1;
257 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
258 if (!(l1 & (1<<11)))
259 set_cpu_cap(c, X86_FEATURE_BTS);
260 if (!(l1 & (1<<12)))
261 set_cpu_cap(c, X86_FEATURE_PEBS);
262 ds_init_intel(c);
263 }
145 264
265#ifdef CONFIG_X86_64
266 if (c->x86 == 15)
267 c->x86_cache_alignment = c->x86_clflush_size * 2;
268 if (c->x86 == 6)
269 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
270#else
146 /* 271 /*
147 * Names for the Pentium II/Celeron processors 272 * Names for the Pentium II/Celeron processors
148 * detectable only by also checking the cache size. 273 * detectable only by also checking the cache size.
149 * Dixon is NOT a Celeron. 274 * Dixon is NOT a Celeron.
150 */ 275 */
151 if (c->x86 == 6) { 276 if (c->x86 == 6) {
277 char *p = NULL;
278
152 switch (c->x86_model) { 279 switch (c->x86_model) {
153 case 5: 280 case 5:
154 if (c->x86_mask == 0) { 281 if (c->x86_mask == 0) {
@@ -171,70 +298,41 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
171 p = "Celeron (Coppermine)"; 298 p = "Celeron (Coppermine)";
172 break; 299 break;
173 } 300 }
174 }
175
176 if (p)
177 strcpy(c->x86_model_id, p);
178 301
179 c->x86_max_cores = num_cpu_cores(c); 302 if (p)
180 303 strcpy(c->x86_model_id, p);
181 detect_ht(c);
182
183 /* Work around errata */
184 Intel_errata_workarounds(c);
185
186#ifdef CONFIG_X86_INTEL_USERCOPY
187 /*
188 * Set up the preferred alignment for movsl bulk memory moves
189 */
190 switch (c->x86) {
191 case 4: /* 486: untested */
192 break;
193 case 5: /* Old Pentia: untested */
194 break;
195 case 6: /* PII/PIII only like movsl with 8-byte alignment */
196 movsl_mask.mask = 7;
197 break;
198 case 15: /* P4 is OK down to 8-byte alignment */
199 movsl_mask.mask = 7;
200 break;
201 } 304 }
202#endif
203 305
204 if (cpu_has_xmm2) 306 if (c->x86 == 15)
205 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
206 if (c->x86 == 15) {
207 set_cpu_cap(c, X86_FEATURE_P4); 307 set_cpu_cap(c, X86_FEATURE_P4);
208 }
209 if (c->x86 == 6) 308 if (c->x86 == 6)
210 set_cpu_cap(c, X86_FEATURE_P3); 309 set_cpu_cap(c, X86_FEATURE_P3);
211 if (cpu_has_ds) {
212 unsigned int l1;
213 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
214 if (!(l1 & (1<<11)))
215 set_cpu_cap(c, X86_FEATURE_BTS);
216 if (!(l1 & (1<<12)))
217 set_cpu_cap(c, X86_FEATURE_PEBS);
218 }
219 310
220 if (cpu_has_bts) 311 if (cpu_has_bts)
221 ds_init_intel(c); 312 ptrace_bts_init_intel(c);
222 313
223 /* 314#endif
224 * See if we have a good local APIC by checking for buggy Pentia,
225 * i.e. all B steppings and the C2 stepping of P54C when using their
226 * integrated APIC (see 11AP erratum in "Pentium Processor
227 * Specification Update").
228 */
229 if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
230 (c->x86_mask < 0x6 || c->x86_mask == 0xb))
231 set_cpu_cap(c, X86_FEATURE_11AP);
232 315
233#ifdef CONFIG_X86_NUMAQ 316 detect_extended_topology(c);
234 numaq_tsc_disable(); 317 if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
318 /*
319 * let's use the legacy cpuid vector 0x1 and 0x4 for topology
320 * detection.
321 */
322 c->x86_max_cores = intel_num_cpu_cores(c);
323#ifdef CONFIG_X86_32
324 detect_ht(c);
235#endif 325#endif
326 }
327
328 /* Work around errata */
329 srat_detect_node();
330
331 if (cpu_has(c, X86_FEATURE_VMX))
332 detect_vmx_virtcap(c);
236} 333}
237 334
335#ifdef CONFIG_X86_32
238static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) 336static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
239{ 337{
240 /* 338 /*
@@ -247,10 +345,12 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i
247 size = 256; 345 size = 256;
248 return size; 346 return size;
249} 347}
348#endif
250 349
251static struct cpu_dev intel_cpu_dev __cpuinitdata = { 350static struct cpu_dev intel_cpu_dev __cpuinitdata = {
252 .c_vendor = "Intel", 351 .c_vendor = "Intel",
253 .c_ident = { "GenuineIntel" }, 352 .c_ident = { "GenuineIntel" },
353#ifdef CONFIG_X86_32
254 .c_models = { 354 .c_models = {
255 { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names = 355 { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names =
256 { 356 {
@@ -300,12 +400,12 @@ static struct cpu_dev intel_cpu_dev __cpuinitdata = {
300 } 400 }
301 }, 401 },
302 }, 402 },
403 .c_size_cache = intel_size_cache,
404#endif
303 .c_early_init = early_init_intel, 405 .c_early_init = early_init_intel,
304 .c_init = init_intel, 406 .c_init = init_intel,
305 .c_size_cache = intel_size_cache, 407 .c_x86_vendor = X86_VENDOR_INTEL,
306}; 408};
307 409
308cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev); 410cpu_dev_register(intel_cpu_dev);
309
310/* arch_initcall(intel_cpu_init); */
311 411
diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c
deleted file mode 100644
index 1019c58d39f0..000000000000
--- a/arch/x86/kernel/cpu/intel_64.c
+++ /dev/null
@@ -1,95 +0,0 @@
1#include <linux/init.h>
2#include <linux/smp.h>
3#include <asm/processor.h>
4#include <asm/ptrace.h>
5#include <asm/topology.h>
6#include <asm/numa_64.h>
7
8#include "cpu.h"
9
10static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
11{
12 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
13 (c->x86 == 0x6 && c->x86_model >= 0x0e))
14 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15
16 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
17}
18
19/*
20 * find out the number of processor cores on the die
21 */
22static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
23{
24 unsigned int eax, t;
25
26 if (c->cpuid_level < 4)
27 return 1;
28
29 cpuid_count(4, 0, &eax, &t, &t, &t);
30
31 if (eax & 0x1f)
32 return ((eax >> 26) + 1);
33 else
34 return 1;
35}
36
37static void __cpuinit srat_detect_node(void)
38{
39#ifdef CONFIG_NUMA
40 unsigned node;
41 int cpu = smp_processor_id();
42 int apicid = hard_smp_processor_id();
43
44 /* Don't do the funky fallback heuristics the AMD version employs
45 for now. */
46 node = apicid_to_node[apicid];
47 if (node == NUMA_NO_NODE || !node_online(node))
48 node = first_node(node_online_map);
49 numa_set_node(cpu, node);
50
51 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
52#endif
53}
54
55static void __cpuinit init_intel(struct cpuinfo_x86 *c)
56{
57 init_intel_cacheinfo(c);
58 if (c->cpuid_level > 9) {
59 unsigned eax = cpuid_eax(10);
60 /* Check for version and the number of counters */
61 if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
62 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
63 }
64
65 if (cpu_has_ds) {
66 unsigned int l1, l2;
67 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
68 if (!(l1 & (1<<11)))
69 set_cpu_cap(c, X86_FEATURE_BTS);
70 if (!(l1 & (1<<12)))
71 set_cpu_cap(c, X86_FEATURE_PEBS);
72 }
73
74
75 if (cpu_has_bts)
76 ds_init_intel(c);
77
78 if (c->x86 == 15)
79 c->x86_cache_alignment = c->x86_clflush_size * 2;
80 if (c->x86 == 6)
81 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
82 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
83 c->x86_max_cores = intel_num_cpu_cores(c);
84
85 srat_detect_node();
86}
87
88static struct cpu_dev intel_cpu_dev __cpuinitdata = {
89 .c_vendor = "Intel",
90 .c_ident = { "GenuineIntel" },
91 .c_early_init = early_init_intel,
92 .c_init = init_intel,
93};
94cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev);
95
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 726a5fcdf341..4b031a4ac856 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -860,7 +860,7 @@ error:
860 return err; 860 return err;
861} 861}
862 862
863static void mce_remove_device(unsigned int cpu) 863static __cpuinit void mce_remove_device(unsigned int cpu)
864{ 864{
865 int i; 865 int i;
866 866
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index cb7d3b6a80eb..4e8d77f01eeb 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -401,12 +401,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
401 tmp |= ~((1<<(hi - 1)) - 1); 401 tmp |= ~((1<<(hi - 1)) - 1);
402 402
403 if (tmp != mask_lo) { 403 if (tmp != mask_lo) {
404 static int once = 1; 404 WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");
405
406 if (once) {
407 printk(KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");
408 once = 0;
409 }
410 mask_lo = tmp; 405 mask_lo = tmp;
411 } 406 }
412 } 407 }
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 84c480bb3715..4c4214690dd1 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -405,9 +405,9 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
405 } 405 }
406 /* RED-PEN: base can be > 32bit */ 406 /* RED-PEN: base can be > 32bit */
407 len += seq_printf(seq, 407 len += seq_printf(seq,
408 "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n", 408 "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n",
409 i, base, base >> (20 - PAGE_SHIFT), size, factor, 409 i, base, base >> (20 - PAGE_SHIFT), size, factor,
410 mtrr_attrib_to_str(type), mtrr_usage_table[i]); 410 mtrr_usage_table[i], mtrr_attrib_to_str(type));
411 } 411 }
412 } 412 }
413 return 0; 413 return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 58ac5d3d4361..c78c04821ea1 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -759,7 +759,8 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
759 /* take out UC ranges */ 759 /* take out UC ranges */
760 for (i = 0; i < num_var_ranges; i++) { 760 for (i = 0; i < num_var_ranges; i++) {
761 type = range_state[i].type; 761 type = range_state[i].type;
762 if (type != MTRR_TYPE_UNCACHABLE) 762 if (type != MTRR_TYPE_UNCACHABLE &&
763 type != MTRR_TYPE_WRPROT)
763 continue; 764 continue;
764 size = range_state[i].size_pfn; 765 size = range_state[i].size_pfn;
765 if (!size) 766 if (!size)
@@ -834,7 +835,14 @@ static int __init enable_mtrr_cleanup_setup(char *str)
834 enable_mtrr_cleanup = 1; 835 enable_mtrr_cleanup = 1;
835 return 0; 836 return 0;
836} 837}
837early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup); 838early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
839
840static int __init mtrr_cleanup_debug_setup(char *str)
841{
842 debug_print = 1;
843 return 0;
844}
845early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
838 846
839struct var_mtrr_state { 847struct var_mtrr_state {
840 unsigned long range_startk; 848 unsigned long range_startk;
@@ -898,6 +906,27 @@ set_var_mtrr_all(unsigned int address_bits)
898 } 906 }
899} 907}
900 908
909static unsigned long to_size_factor(unsigned long sizek, char *factorp)
910{
911 char factor;
912 unsigned long base = sizek;
913
914 if (base & ((1<<10) - 1)) {
915 /* not MB alignment */
916 factor = 'K';
917 } else if (base & ((1<<20) - 1)){
918 factor = 'M';
919 base >>= 10;
920 } else {
921 factor = 'G';
922 base >>= 20;
923 }
924
925 *factorp = factor;
926
927 return base;
928}
929
901static unsigned int __init 930static unsigned int __init
902range_to_mtrr(unsigned int reg, unsigned long range_startk, 931range_to_mtrr(unsigned int reg, unsigned long range_startk,
903 unsigned long range_sizek, unsigned char type) 932 unsigned long range_sizek, unsigned char type)
@@ -919,13 +948,21 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
919 align = max_align; 948 align = max_align;
920 949
921 sizek = 1 << align; 950 sizek = 1 << align;
922 if (debug_print) 951 if (debug_print) {
952 char start_factor = 'K', size_factor = 'K';
953 unsigned long start_base, size_base;
954
955 start_base = to_size_factor(range_startk, &start_factor),
956 size_base = to_size_factor(sizek, &size_factor),
957
923 printk(KERN_DEBUG "Setting variable MTRR %d, " 958 printk(KERN_DEBUG "Setting variable MTRR %d, "
924 "base: %ldMB, range: %ldMB, type %s\n", 959 "base: %ld%cB, range: %ld%cB, type %s\n",
925 reg, range_startk >> 10, sizek >> 10, 960 reg, start_base, start_factor,
961 size_base, size_factor,
926 (type == MTRR_TYPE_UNCACHABLE)?"UC": 962 (type == MTRR_TYPE_UNCACHABLE)?"UC":
927 ((type == MTRR_TYPE_WRBACK)?"WB":"Other") 963 ((type == MTRR_TYPE_WRBACK)?"WB":"Other")
928 ); 964 );
965 }
929 save_var_mtrr(reg++, range_startk, sizek, type); 966 save_var_mtrr(reg++, range_startk, sizek, type);
930 range_startk += sizek; 967 range_startk += sizek;
931 range_sizek -= sizek; 968 range_sizek -= sizek;
@@ -970,6 +1007,8 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
970 /* try to append some small hole */ 1007 /* try to append some small hole */
971 range0_basek = state->range_startk; 1008 range0_basek = state->range_startk;
972 range0_sizek = ALIGN(state->range_sizek, chunk_sizek); 1009 range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
1010
1011 /* no increase */
973 if (range0_sizek == state->range_sizek) { 1012 if (range0_sizek == state->range_sizek) {
974 if (debug_print) 1013 if (debug_print)
975 printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", 1014 printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
@@ -980,13 +1019,40 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
980 return 0; 1019 return 0;
981 } 1020 }
982 1021
983 range0_sizek -= chunk_sizek; 1022 /* only cut back, when it is not the last */
984 if (range0_sizek && sizek) { 1023 if (sizek) {
985 while (range0_basek + range0_sizek > (basek + sizek)) { 1024 while (range0_basek + range0_sizek > (basek + sizek)) {
986 range0_sizek -= chunk_sizek; 1025 if (range0_sizek >= chunk_sizek)
987 if (!range0_sizek) 1026 range0_sizek -= chunk_sizek;
988 break; 1027 else
989 } 1028 range0_sizek = 0;
1029
1030 if (!range0_sizek)
1031 break;
1032 }
1033 }
1034
1035second_try:
1036 range_basek = range0_basek + range0_sizek;
1037
1038 /* one hole in the middle */
1039 if (range_basek > basek && range_basek <= (basek + sizek))
1040 second_sizek = range_basek - basek;
1041
1042 if (range0_sizek > state->range_sizek) {
1043
1044 /* one hole in middle or at end */
1045 hole_sizek = range0_sizek - state->range_sizek - second_sizek;
1046
1047 /* hole size should be less than half of range0 size */
1048 if (hole_sizek >= (range0_sizek >> 1) &&
1049 range0_sizek >= chunk_sizek) {
1050 range0_sizek -= chunk_sizek;
1051 second_sizek = 0;
1052 hole_sizek = 0;
1053
1054 goto second_try;
1055 }
990 } 1056 }
991 1057
992 if (range0_sizek) { 1058 if (range0_sizek) {
@@ -996,50 +1062,28 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
996 (range0_basek + range0_sizek)<<10); 1062 (range0_basek + range0_sizek)<<10);
997 state->reg = range_to_mtrr(state->reg, range0_basek, 1063 state->reg = range_to_mtrr(state->reg, range0_basek,
998 range0_sizek, MTRR_TYPE_WRBACK); 1064 range0_sizek, MTRR_TYPE_WRBACK);
999
1000 }
1001
1002 range_basek = range0_basek + range0_sizek;
1003 range_sizek = chunk_sizek;
1004
1005 if (range_basek + range_sizek > basek &&
1006 range_basek + range_sizek <= (basek + sizek)) {
1007 /* one hole */
1008 second_basek = basek;
1009 second_sizek = range_basek + range_sizek - basek;
1010 } 1065 }
1011 1066
1012 /* if last piece, only could one hole near end */ 1067 if (range0_sizek < state->range_sizek) {
1013 if ((second_basek || !basek) && 1068 /* need to handle left over */
1014 range_sizek - (state->range_sizek - range0_sizek) - second_sizek <
1015 (chunk_sizek >> 1)) {
1016 /*
1017 * one hole in middle (second_sizek is 0) or at end
1018 * (second_sizek is 0 )
1019 */
1020 hole_sizek = range_sizek - (state->range_sizek - range0_sizek)
1021 - second_sizek;
1022 hole_basek = range_basek + range_sizek - hole_sizek
1023 - second_sizek;
1024 } else {
1025 /* fallback for big hole, or several holes */
1026 range_sizek = state->range_sizek - range0_sizek; 1069 range_sizek = state->range_sizek - range0_sizek;
1027 second_basek = 0; 1070
1028 second_sizek = 0; 1071 if (debug_print)
1072 printk(KERN_DEBUG "range: %016lx - %016lx\n",
1073 range_basek<<10,
1074 (range_basek + range_sizek)<<10);
1075 state->reg = range_to_mtrr(state->reg, range_basek,
1076 range_sizek, MTRR_TYPE_WRBACK);
1029 } 1077 }
1030 1078
1031 if (debug_print)
1032 printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10,
1033 (range_basek + range_sizek)<<10);
1034 state->reg = range_to_mtrr(state->reg, range_basek, range_sizek,
1035 MTRR_TYPE_WRBACK);
1036 if (hole_sizek) { 1079 if (hole_sizek) {
1080 hole_basek = range_basek - hole_sizek - second_sizek;
1037 if (debug_print) 1081 if (debug_print)
1038 printk(KERN_DEBUG "hole: %016lx - %016lx\n", 1082 printk(KERN_DEBUG "hole: %016lx - %016lx\n",
1039 hole_basek<<10, (hole_basek + hole_sizek)<<10); 1083 hole_basek<<10,
1040 state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek, 1084 (hole_basek + hole_sizek)<<10);
1041 MTRR_TYPE_UNCACHABLE); 1085 state->reg = range_to_mtrr(state->reg, hole_basek,
1042 1086 hole_sizek, MTRR_TYPE_UNCACHABLE);
1043 } 1087 }
1044 1088
1045 return second_sizek; 1089 return second_sizek;
@@ -1154,11 +1198,11 @@ struct mtrr_cleanup_result {
1154}; 1198};
1155 1199
1156/* 1200/*
1157 * gran_size: 1M, 2M, ..., 2G 1201 * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
1158 * chunk size: gran_size, ..., 4G 1202 * chunk size: gran_size, ..., 2G
1159 * so we need (2+13)*6 1203 * so we need (1+16)*8
1160 */ 1204 */
1161#define NUM_RESULT 90 1205#define NUM_RESULT 136
1162#define PSHIFT (PAGE_SHIFT - 10) 1206#define PSHIFT (PAGE_SHIFT - 10)
1163 1207
1164static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; 1208static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
@@ -1168,13 +1212,14 @@ static unsigned long __initdata min_loss_pfn[RANGE_NUM];
1168static int __init mtrr_cleanup(unsigned address_bits) 1212static int __init mtrr_cleanup(unsigned address_bits)
1169{ 1213{
1170 unsigned long extra_remove_base, extra_remove_size; 1214 unsigned long extra_remove_base, extra_remove_size;
1171 unsigned long i, base, size, def, dummy; 1215 unsigned long base, size, def, dummy;
1172 mtrr_type type; 1216 mtrr_type type;
1173 int nr_range, nr_range_new; 1217 int nr_range, nr_range_new;
1174 u64 chunk_size, gran_size; 1218 u64 chunk_size, gran_size;
1175 unsigned long range_sums, range_sums_new; 1219 unsigned long range_sums, range_sums_new;
1176 int index_good; 1220 int index_good;
1177 int num_reg_good; 1221 int num_reg_good;
1222 int i;
1178 1223
1179 /* extra one for all 0 */ 1224 /* extra one for all 0 */
1180 int num[MTRR_NUM_TYPES + 1]; 1225 int num[MTRR_NUM_TYPES + 1];
@@ -1204,6 +1249,8 @@ static int __init mtrr_cleanup(unsigned address_bits)
1204 continue; 1249 continue;
1205 if (!size) 1250 if (!size)
1206 type = MTRR_NUM_TYPES; 1251 type = MTRR_NUM_TYPES;
1252 if (type == MTRR_TYPE_WRPROT)
1253 type = MTRR_TYPE_UNCACHABLE;
1207 num[type]++; 1254 num[type]++;
1208 } 1255 }
1209 1256
@@ -1216,23 +1263,57 @@ static int __init mtrr_cleanup(unsigned address_bits)
1216 num_var_ranges - num[MTRR_NUM_TYPES]) 1263 num_var_ranges - num[MTRR_NUM_TYPES])
1217 return 0; 1264 return 0;
1218 1265
1266 /* print original var MTRRs at first, for debugging: */
1267 printk(KERN_DEBUG "original variable MTRRs\n");
1268 for (i = 0; i < num_var_ranges; i++) {
1269 char start_factor = 'K', size_factor = 'K';
1270 unsigned long start_base, size_base;
1271
1272 size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
1273 if (!size_base)
1274 continue;
1275
1276 size_base = to_size_factor(size_base, &size_factor),
1277 start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
1278 start_base = to_size_factor(start_base, &start_factor),
1279 type = range_state[i].type;
1280
1281 printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
1282 i, start_base, start_factor,
1283 size_base, size_factor,
1284 (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
1285 ((type == MTRR_TYPE_WRPROT) ? "WP" :
1286 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
1287 );
1288 }
1289
1219 memset(range, 0, sizeof(range)); 1290 memset(range, 0, sizeof(range));
1220 extra_remove_size = 0; 1291 extra_remove_size = 0;
1221 if (mtrr_tom2) { 1292 extra_remove_base = 1 << (32 - PAGE_SHIFT);
1222 extra_remove_base = 1 << (32 - PAGE_SHIFT); 1293 if (mtrr_tom2)
1223 extra_remove_size = 1294 extra_remove_size =
1224 (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base; 1295 (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
1225 }
1226 nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, 1296 nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
1227 extra_remove_size); 1297 extra_remove_size);
1298 /*
1299 * [0, 1M) should always be coverred by var mtrr with WB
1300 * and fixed mtrrs should take effective before var mtrr for it
1301 */
1302 nr_range = add_range_with_merge(range, nr_range, 0,
1303 (1ULL<<(20 - PAGE_SHIFT)) - 1);
1304 /* sort the ranges */
1305 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
1306
1228 range_sums = sum_ranges(range, nr_range); 1307 range_sums = sum_ranges(range, nr_range);
1229 printk(KERN_INFO "total RAM coverred: %ldM\n", 1308 printk(KERN_INFO "total RAM coverred: %ldM\n",
1230 range_sums >> (20 - PAGE_SHIFT)); 1309 range_sums >> (20 - PAGE_SHIFT));
1231 1310
1232 if (mtrr_chunk_size && mtrr_gran_size) { 1311 if (mtrr_chunk_size && mtrr_gran_size) {
1233 int num_reg; 1312 int num_reg;
1313 char gran_factor, chunk_factor, lose_factor;
1314 unsigned long gran_base, chunk_base, lose_base;
1234 1315
1235 debug_print = 1; 1316 debug_print++;
1236 /* convert ranges to var ranges state */ 1317 /* convert ranges to var ranges state */
1237 num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size, 1318 num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size,
1238 mtrr_gran_size); 1319 mtrr_gran_size);
@@ -1256,34 +1337,48 @@ static int __init mtrr_cleanup(unsigned address_bits)
1256 result[i].lose_cover_sizek = 1337 result[i].lose_cover_sizek =
1257 (range_sums - range_sums_new) << PSHIFT; 1338 (range_sums - range_sums_new) << PSHIFT;
1258 1339
1259 printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", 1340 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1260 result[i].bad?"*BAD*":" ", result[i].gran_sizek >> 10, 1341 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1261 result[i].chunk_sizek >> 10); 1342 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1262 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ldM \n", 1343 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1344 result[i].bad?"*BAD*":" ",
1345 gran_base, gran_factor, chunk_base, chunk_factor);
1346 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1263 result[i].num_reg, result[i].bad?"-":"", 1347 result[i].num_reg, result[i].bad?"-":"",
1264 result[i].lose_cover_sizek >> 10); 1348 lose_base, lose_factor);
1265 if (!result[i].bad) { 1349 if (!result[i].bad) {
1266 set_var_mtrr_all(address_bits); 1350 set_var_mtrr_all(address_bits);
1267 return 1; 1351 return 1;
1268 } 1352 }
1269 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " 1353 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
1270 "will find optimal one\n"); 1354 "will find optimal one\n");
1271 debug_print = 0; 1355 debug_print--;
1272 memset(result, 0, sizeof(result[0])); 1356 memset(result, 0, sizeof(result[0]));
1273 } 1357 }
1274 1358
1275 i = 0; 1359 i = 0;
1276 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); 1360 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
1277 memset(result, 0, sizeof(result)); 1361 memset(result, 0, sizeof(result));
1278 for (gran_size = (1ULL<<20); gran_size < (1ULL<<32); gran_size <<= 1) { 1362 for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
1279 for (chunk_size = gran_size; chunk_size < (1ULL<<33); 1363 char gran_factor;
1364 unsigned long gran_base;
1365
1366 if (debug_print)
1367 gran_base = to_size_factor(gran_size >> 10, &gran_factor);
1368
1369 for (chunk_size = gran_size; chunk_size < (1ULL<<32);
1280 chunk_size <<= 1) { 1370 chunk_size <<= 1) {
1281 int num_reg; 1371 int num_reg;
1282 1372
1283 if (debug_print) 1373 if (debug_print) {
1284 printk(KERN_INFO 1374 char chunk_factor;
1285 "\ngran_size: %lldM chunk_size_size: %lldM\n", 1375 unsigned long chunk_base;
1286 gran_size >> 20, chunk_size >> 20); 1376
1377 chunk_base = to_size_factor(chunk_size>>10, &chunk_factor),
1378 printk(KERN_INFO "\n");
1379 printk(KERN_INFO "gran_size: %ld%c chunk_size: %ld%c \n",
1380 gran_base, gran_factor, chunk_base, chunk_factor);
1381 }
1287 if (i >= NUM_RESULT) 1382 if (i >= NUM_RESULT)
1288 continue; 1383 continue;
1289 1384
@@ -1326,12 +1421,18 @@ static int __init mtrr_cleanup(unsigned address_bits)
1326 1421
1327 /* print out all */ 1422 /* print out all */
1328 for (i = 0; i < NUM_RESULT; i++) { 1423 for (i = 0; i < NUM_RESULT; i++) {
1329 printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", 1424 char gran_factor, chunk_factor, lose_factor;
1330 result[i].bad?"*BAD* ":" ", result[i].gran_sizek >> 10, 1425 unsigned long gran_base, chunk_base, lose_base;
1331 result[i].chunk_sizek >> 10); 1426
1332 printk(KERN_CONT "num_reg: %d \tlose RAM: %s%ldM\n", 1427 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1333 result[i].num_reg, result[i].bad?"-":"", 1428 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1334 result[i].lose_cover_sizek >> 10); 1429 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1430 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1431 result[i].bad?"*BAD*":" ",
1432 gran_base, gran_factor, chunk_base, chunk_factor);
1433 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1434 result[i].num_reg, result[i].bad?"-":"",
1435 lose_base, lose_factor);
1335 } 1436 }
1336 1437
1337 /* try to find the optimal index */ 1438 /* try to find the optimal index */
@@ -1339,10 +1440,8 @@ static int __init mtrr_cleanup(unsigned address_bits)
1339 nr_mtrr_spare_reg = num_var_ranges - 1; 1440 nr_mtrr_spare_reg = num_var_ranges - 1;
1340 num_reg_good = -1; 1441 num_reg_good = -1;
1341 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { 1442 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
1342 if (!min_loss_pfn[i]) { 1443 if (!min_loss_pfn[i])
1343 num_reg_good = i; 1444 num_reg_good = i;
1344 break;
1345 }
1346 } 1445 }
1347 1446
1348 index_good = -1; 1447 index_good = -1;
@@ -1358,21 +1457,26 @@ static int __init mtrr_cleanup(unsigned address_bits)
1358 } 1457 }
1359 1458
1360 if (index_good != -1) { 1459 if (index_good != -1) {
1460 char gran_factor, chunk_factor, lose_factor;
1461 unsigned long gran_base, chunk_base, lose_base;
1462
1361 printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); 1463 printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
1362 i = index_good; 1464 i = index_good;
1363 printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t", 1465 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1364 result[i].gran_sizek >> 10, 1466 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1365 result[i].chunk_sizek >> 10); 1467 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1366 printk(KERN_CONT "num_reg: %d \tlose RAM: %ldM\n", 1468 printk(KERN_INFO "gran_size: %ld%c \tchunk_size: %ld%c \t",
1367 result[i].num_reg, 1469 gran_base, gran_factor, chunk_base, chunk_factor);
1368 result[i].lose_cover_sizek >> 10); 1470 printk(KERN_CONT "num_reg: %d \tlose RAM: %ld%c\n",
1471 result[i].num_reg, lose_base, lose_factor);
1369 /* convert ranges to var ranges state */ 1472 /* convert ranges to var ranges state */
1370 chunk_size = result[i].chunk_sizek; 1473 chunk_size = result[i].chunk_sizek;
1371 chunk_size <<= 10; 1474 chunk_size <<= 10;
1372 gran_size = result[i].gran_sizek; 1475 gran_size = result[i].gran_sizek;
1373 gran_size <<= 10; 1476 gran_size <<= 10;
1374 debug_print = 1; 1477 debug_print++;
1375 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); 1478 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
1479 debug_print--;
1376 set_var_mtrr_all(address_bits); 1480 set_var_mtrr_all(address_bits);
1377 return 1; 1481 return 1;
1378 } 1482 }
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 05cc22dbd4ff..6bff382094f5 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -295,13 +295,19 @@ static int setup_k7_watchdog(unsigned nmi_hz)
295 /* setup the timer */ 295 /* setup the timer */
296 wrmsr(evntsel_msr, evntsel, 0); 296 wrmsr(evntsel_msr, evntsel, 0);
297 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz); 297 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz);
298 apic_write(APIC_LVTPC, APIC_DM_NMI);
299 evntsel |= K7_EVNTSEL_ENABLE;
300 wrmsr(evntsel_msr, evntsel, 0);
301 298
299 /* initialize the wd struct before enabling */
302 wd->perfctr_msr = perfctr_msr; 300 wd->perfctr_msr = perfctr_msr;
303 wd->evntsel_msr = evntsel_msr; 301 wd->evntsel_msr = evntsel_msr;
304 wd->cccr_msr = 0; /* unused */ 302 wd->cccr_msr = 0; /* unused */
303
304 /* ok, everything is initialized, announce that we're set */
305 cpu_nmi_set_wd_enabled();
306
307 apic_write(APIC_LVTPC, APIC_DM_NMI);
308 evntsel |= K7_EVNTSEL_ENABLE;
309 wrmsr(evntsel_msr, evntsel, 0);
310
305 return 1; 311 return 1;
306} 312}
307 313
@@ -379,13 +385,19 @@ static int setup_p6_watchdog(unsigned nmi_hz)
379 wrmsr(evntsel_msr, evntsel, 0); 385 wrmsr(evntsel_msr, evntsel, 0);
380 nmi_hz = adjust_for_32bit_ctr(nmi_hz); 386 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
381 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz); 387 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz);
382 apic_write(APIC_LVTPC, APIC_DM_NMI);
383 evntsel |= P6_EVNTSEL0_ENABLE;
384 wrmsr(evntsel_msr, evntsel, 0);
385 388
389 /* initialize the wd struct before enabling */
386 wd->perfctr_msr = perfctr_msr; 390 wd->perfctr_msr = perfctr_msr;
387 wd->evntsel_msr = evntsel_msr; 391 wd->evntsel_msr = evntsel_msr;
388 wd->cccr_msr = 0; /* unused */ 392 wd->cccr_msr = 0; /* unused */
393
394 /* ok, everything is initialized, announce that we're set */
395 cpu_nmi_set_wd_enabled();
396
397 apic_write(APIC_LVTPC, APIC_DM_NMI);
398 evntsel |= P6_EVNTSEL0_ENABLE;
399 wrmsr(evntsel_msr, evntsel, 0);
400
389 return 1; 401 return 1;
390} 402}
391 403
@@ -432,6 +444,27 @@ static const struct wd_ops p6_wd_ops = {
432#define P4_CCCR_ENABLE (1 << 12) 444#define P4_CCCR_ENABLE (1 << 12)
433#define P4_CCCR_OVF (1 << 31) 445#define P4_CCCR_OVF (1 << 31)
434 446
447#define P4_CONTROLS 18
448static unsigned int p4_controls[18] = {
449 MSR_P4_BPU_CCCR0,
450 MSR_P4_BPU_CCCR1,
451 MSR_P4_BPU_CCCR2,
452 MSR_P4_BPU_CCCR3,
453 MSR_P4_MS_CCCR0,
454 MSR_P4_MS_CCCR1,
455 MSR_P4_MS_CCCR2,
456 MSR_P4_MS_CCCR3,
457 MSR_P4_FLAME_CCCR0,
458 MSR_P4_FLAME_CCCR1,
459 MSR_P4_FLAME_CCCR2,
460 MSR_P4_FLAME_CCCR3,
461 MSR_P4_IQ_CCCR0,
462 MSR_P4_IQ_CCCR1,
463 MSR_P4_IQ_CCCR2,
464 MSR_P4_IQ_CCCR3,
465 MSR_P4_IQ_CCCR4,
466 MSR_P4_IQ_CCCR5,
467};
435/* 468/*
436 * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter 469 * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
437 * CRU_ESCR0 (with any non-null event selector) through a complemented 470 * CRU_ESCR0 (with any non-null event selector) through a complemented
@@ -473,6 +506,26 @@ static int setup_p4_watchdog(unsigned nmi_hz)
473 evntsel_msr = MSR_P4_CRU_ESCR0; 506 evntsel_msr = MSR_P4_CRU_ESCR0;
474 cccr_msr = MSR_P4_IQ_CCCR0; 507 cccr_msr = MSR_P4_IQ_CCCR0;
475 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); 508 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
509
510 /*
511 * If we're on the kdump kernel or other situation, we may
512 * still have other performance counter registers set to
513 * interrupt and they'll keep interrupting forever because
514 * of the P4_CCCR_OVF quirk. So we need to ACK all the
515 * pending interrupts and disable all the registers here,
516 * before reenabling the NMI delivery. Refer to p4_rearm()
517 * about the P4_CCCR_OVF quirk.
518 */
519 if (reset_devices) {
520 unsigned int low, high;
521 int i;
522
523 for (i = 0; i < P4_CONTROLS; i++) {
524 rdmsr(p4_controls[i], low, high);
525 low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF);
526 wrmsr(p4_controls[i], low, high);
527 }
528 }
476 } else { 529 } else {
477 /* logical cpu 1 */ 530 /* logical cpu 1 */
478 perfctr_msr = MSR_P4_IQ_PERFCTR1; 531 perfctr_msr = MSR_P4_IQ_PERFCTR1;
@@ -499,12 +552,17 @@ static int setup_p4_watchdog(unsigned nmi_hz)
499 wrmsr(evntsel_msr, evntsel, 0); 552 wrmsr(evntsel_msr, evntsel, 0);
500 wrmsr(cccr_msr, cccr_val, 0); 553 wrmsr(cccr_msr, cccr_val, 0);
501 write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz); 554 write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
502 apic_write(APIC_LVTPC, APIC_DM_NMI); 555
503 cccr_val |= P4_CCCR_ENABLE;
504 wrmsr(cccr_msr, cccr_val, 0);
505 wd->perfctr_msr = perfctr_msr; 556 wd->perfctr_msr = perfctr_msr;
506 wd->evntsel_msr = evntsel_msr; 557 wd->evntsel_msr = evntsel_msr;
507 wd->cccr_msr = cccr_msr; 558 wd->cccr_msr = cccr_msr;
559
560 /* ok, everything is initialized, announce that we're set */
561 cpu_nmi_set_wd_enabled();
562
563 apic_write(APIC_LVTPC, APIC_DM_NMI);
564 cccr_val |= P4_CCCR_ENABLE;
565 wrmsr(cccr_msr, cccr_val, 0);
508 return 1; 566 return 1;
509} 567}
510 568
@@ -620,13 +678,17 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz)
620 wrmsr(evntsel_msr, evntsel, 0); 678 wrmsr(evntsel_msr, evntsel, 0);
621 nmi_hz = adjust_for_32bit_ctr(nmi_hz); 679 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
622 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz); 680 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
623 apic_write(APIC_LVTPC, APIC_DM_NMI);
624 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
625 wrmsr(evntsel_msr, evntsel, 0);
626 681
627 wd->perfctr_msr = perfctr_msr; 682 wd->perfctr_msr = perfctr_msr;
628 wd->evntsel_msr = evntsel_msr; 683 wd->evntsel_msr = evntsel_msr;
629 wd->cccr_msr = 0; /* unused */ 684 wd->cccr_msr = 0; /* unused */
685
686 /* ok, everything is initialized, announce that we're set */
687 cpu_nmi_set_wd_enabled();
688
689 apic_write(APIC_LVTPC, APIC_DM_NMI);
690 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
691 wrmsr(evntsel_msr, evntsel, 0);
630 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); 692 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
631 return 1; 693 return 1;
632} 694}
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index b911a2c61b8f..52b3fefbd5af 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -5,6 +5,18 @@
5#include <asm/msr.h> 5#include <asm/msr.h>
6#include "cpu.h" 6#include "cpu.h"
7 7
8static void __cpuinit early_init_transmeta(struct cpuinfo_x86 *c)
9{
10 u32 xlvl;
11
12 /* Transmeta-defined flags: level 0x80860001 */
13 xlvl = cpuid_eax(0x80860000);
14 if ((xlvl & 0xffff0000) == 0x80860000) {
15 if (xlvl >= 0x80860001)
16 c->x86_capability[2] = cpuid_edx(0x80860001);
17 }
18}
19
8static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) 20static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
9{ 21{
10 unsigned int cap_mask, uk, max, dummy; 22 unsigned int cap_mask, uk, max, dummy;
@@ -12,7 +24,8 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
12 unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev; 24 unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev;
13 char cpu_info[65]; 25 char cpu_info[65];
14 26
15 get_model_name(c); /* Same as AMD/Cyrix */ 27 early_init_transmeta(c);
28
16 display_cacheinfo(c); 29 display_cacheinfo(c);
17 30
18 /* Print CMS and CPU revision */ 31 /* Print CMS and CPU revision */
@@ -85,23 +98,12 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
85#endif 98#endif
86} 99}
87 100
88static void __cpuinit transmeta_identify(struct cpuinfo_x86 *c)
89{
90 u32 xlvl;
91
92 /* Transmeta-defined flags: level 0x80860001 */
93 xlvl = cpuid_eax(0x80860000);
94 if ((xlvl & 0xffff0000) == 0x80860000) {
95 if (xlvl >= 0x80860001)
96 c->x86_capability[2] = cpuid_edx(0x80860001);
97 }
98}
99
100static struct cpu_dev transmeta_cpu_dev __cpuinitdata = { 101static struct cpu_dev transmeta_cpu_dev __cpuinitdata = {
101 .c_vendor = "Transmeta", 102 .c_vendor = "Transmeta",
102 .c_ident = { "GenuineTMx86", "TransmetaCPU" }, 103 .c_ident = { "GenuineTMx86", "TransmetaCPU" },
104 .c_early_init = early_init_transmeta,
103 .c_init = init_transmeta, 105 .c_init = init_transmeta,
104 .c_identify = transmeta_identify, 106 .c_x86_vendor = X86_VENDOR_TRANSMETA,
105}; 107};
106 108
107cpu_vendor_dev_register(X86_VENDOR_TRANSMETA, &transmeta_cpu_dev); 109cpu_dev_register(transmeta_cpu_dev);
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
index b1fc90989d75..e777f79e0960 100644
--- a/arch/x86/kernel/cpu/umc.c
+++ b/arch/x86/kernel/cpu/umc.c
@@ -19,7 +19,8 @@ static struct cpu_dev umc_cpu_dev __cpuinitdata = {
19 } 19 }
20 }, 20 },
21 }, 21 },
22 .c_x86_vendor = X86_VENDOR_UMC,
22}; 23};
23 24
24cpu_vendor_dev_register(X86_VENDOR_UMC, &umc_cpu_dev); 25cpu_dev_register(umc_cpu_dev);
25 26
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index d3e524c84527..e90a60ef10c2 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -32,14 +32,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
32 return 0; 32 return 0;
33 33
34 vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); 34 vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
35 if (!vaddr)
36 return -ENOMEM;
35 37
36 if (userbuf) { 38 if (userbuf) {
37 if (copy_to_user(buf, (vaddr + offset), csize)) { 39 if (copy_to_user(buf, vaddr + offset, csize)) {
38 iounmap(vaddr); 40 iounmap(vaddr);
39 return -EFAULT; 41 return -EFAULT;
40 } 42 }
41 } else 43 } else
42 memcpy(buf, (vaddr + offset), csize); 44 memcpy(buf, vaddr + offset, csize);
43 45
44 iounmap(vaddr); 46 iounmap(vaddr);
45 return csize; 47 return csize;
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 11c11b8ec48d..2b69994fd3a8 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -2,26 +2,49 @@
2 * Debug Store support 2 * Debug Store support
3 * 3 *
4 * This provides a low-level interface to the hardware's Debug Store 4 * This provides a low-level interface to the hardware's Debug Store
5 * feature that is used for last branch recording (LBR) and 5 * feature that is used for branch trace store (BTS) and
6 * precise-event based sampling (PEBS). 6 * precise-event based sampling (PEBS).
7 * 7 *
8 * Different architectures use a different DS layout/pointer size. 8 * It manages:
9 * The below functions therefore work on a void*. 9 * - per-thread and per-cpu allocation of BTS and PEBS
10 * - buffer memory allocation (optional)
11 * - buffer overflow handling
12 * - buffer access
10 * 13 *
14 * It assumes:
15 * - get_task_struct on all parameter tasks
16 * - current is allowed to trace parameter tasks
11 * 17 *
12 * Since there is no user for PEBS, yet, only LBR (or branch
13 * trace store, BTS) is supported.
14 * 18 *
15 * 19 * Copyright (C) 2007-2008 Intel Corporation.
16 * Copyright (C) 2007 Intel Corporation. 20 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008
17 * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
18 */ 21 */
19 22
23
24#ifdef CONFIG_X86_DS
25
20#include <asm/ds.h> 26#include <asm/ds.h>
21 27
22#include <linux/errno.h> 28#include <linux/errno.h>
23#include <linux/string.h> 29#include <linux/string.h>
24#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/sched.h>
32#include <linux/mm.h>
33
34
35/*
36 * The configuration for a particular DS hardware implementation.
37 */
38struct ds_configuration {
39 /* the size of the DS structure in bytes */
40 unsigned char sizeof_ds;
41 /* the size of one pointer-typed field in the DS structure in bytes;
42 this covers the first 8 fields related to buffer management. */
43 unsigned char sizeof_field;
44 /* the size of a BTS/PEBS record in bytes */
45 unsigned char sizeof_rec[2];
46};
47static struct ds_configuration ds_cfg;
25 48
26 49
27/* 50/*
@@ -44,378 +67,747 @@
44 * (interrupt occurs when write pointer passes interrupt pointer) 67 * (interrupt occurs when write pointer passes interrupt pointer)
45 * - value to which counter is reset following counter overflow 68 * - value to which counter is reset following counter overflow
46 * 69 *
47 * On later architectures, the last branch recording hardware uses 70 * Later architectures use 64bit pointers throughout, whereas earlier
48 * 64bit pointers even in 32bit mode. 71 * architectures use 32bit pointers in 32bit mode.
49 *
50 *
51 * Branch Trace Store (BTS) records store information about control
52 * flow changes. They at least provide the following information:
53 * - source linear address
54 * - destination linear address
55 * 72 *
56 * Netburst supported a predicated bit that had been dropped in later
57 * architectures. We do not suppor it.
58 * 73 *
74 * We compute the base address for the first 8 fields based on:
75 * - the field size stored in the DS configuration
76 * - the relative field position
77 * - an offset giving the start of the respective region
59 * 78 *
60 * In order to abstract from the actual DS and BTS layout, we describe 79 * This offset is further used to index various arrays holding
61 * the access to the relevant fields. 80 * information for BTS and PEBS at the respective index.
62 * Thanks to Andi Kleen for proposing this design.
63 * 81 *
64 * The implementation, however, is not as general as it might seem. In 82 * On later 32bit processors, we only access the lower 32bit of the
65 * order to stay somewhat simple and efficient, we assume an 83 * 64bit pointer fields. The upper halves will be zeroed out.
66 * underlying unsigned type (mostly a pointer type) and we expect the
67 * field to be at least as big as that type.
68 */ 84 */
69 85
70/* 86enum ds_field {
71 * A special from_ip address to indicate that the BTS record is an 87 ds_buffer_base = 0,
72 * info record that needs to be interpreted or skipped. 88 ds_index,
73 */ 89 ds_absolute_maximum,
74#define BTS_ESCAPE_ADDRESS (-1) 90 ds_interrupt_threshold,
91};
75 92
76/* 93enum ds_qualifier {
77 * A field access descriptor 94 ds_bts = 0,
78 */ 95 ds_pebs
79struct access_desc {
80 unsigned char offset;
81 unsigned char size;
82}; 96};
83 97
98static inline unsigned long ds_get(const unsigned char *base,
99 enum ds_qualifier qual, enum ds_field field)
100{
101 base += (ds_cfg.sizeof_field * (field + (4 * qual)));
102 return *(unsigned long *)base;
103}
104
105static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
106 enum ds_field field, unsigned long value)
107{
108 base += (ds_cfg.sizeof_field * (field + (4 * qual)));
109 (*(unsigned long *)base) = value;
110}
111
112
84/* 113/*
85 * The configuration for a particular DS/BTS hardware implementation. 114 * Locking is done only for allocating BTS or PEBS resources and for
115 * guarding context and buffer memory allocation.
116 *
117 * Most functions require the current task to own the ds context part
118 * they are going to access. All the locking is done when validating
119 * access to the context.
86 */ 120 */
87struct ds_configuration { 121static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
88 /* the DS configuration */
89 unsigned char sizeof_ds;
90 struct access_desc bts_buffer_base;
91 struct access_desc bts_index;
92 struct access_desc bts_absolute_maximum;
93 struct access_desc bts_interrupt_threshold;
94 /* the BTS configuration */
95 unsigned char sizeof_bts;
96 struct access_desc from_ip;
97 struct access_desc to_ip;
98 /* BTS variants used to store additional information like
99 timestamps */
100 struct access_desc info_type;
101 struct access_desc info_data;
102 unsigned long debugctl_mask;
103};
104 122
105/* 123/*
106 * The global configuration used by the below accessor functions 124 * Validate that the current task is allowed to access the BTS/PEBS
125 * buffer of the parameter task.
126 *
127 * Returns 0, if access is granted; -Eerrno, otherwise.
107 */ 128 */
108static struct ds_configuration ds_cfg; 129static inline int ds_validate_access(struct ds_context *context,
130 enum ds_qualifier qual)
131{
132 if (!context)
133 return -EPERM;
134
135 if (context->owner[qual] == current)
136 return 0;
137
138 return -EPERM;
139}
140
109 141
110/* 142/*
111 * Accessor functions for some DS and BTS fields using the above 143 * We either support (system-wide) per-cpu or per-thread allocation.
112 * global ptrace_bts_cfg. 144 * We distinguish the two based on the task_struct pointer, where a
145 * NULL pointer indicates per-cpu allocation for the current cpu.
146 *
147 * Allocations are use-counted. As soon as resources are allocated,
148 * further allocations must be of the same type (per-cpu or
149 * per-thread). We model this by counting allocations (i.e. the number
150 * of tracers of a certain type) for one type negatively:
151 * =0 no tracers
152 * >0 number of per-thread tracers
153 * <0 number of per-cpu tracers
154 *
155 * The below functions to get and put tracers and to check the
156 * allocation type require the ds_lock to be held by the caller.
157 *
158 * Tracers essentially gives the number of ds contexts for a certain
159 * type of allocation.
113 */ 160 */
114static inline unsigned long get_bts_buffer_base(char *base) 161static long tracers;
162
163static inline void get_tracer(struct task_struct *task)
115{ 164{
116 return *(unsigned long *)(base + ds_cfg.bts_buffer_base.offset); 165 tracers += (task ? 1 : -1);
117} 166}
118static inline void set_bts_buffer_base(char *base, unsigned long value) 167
168static inline void put_tracer(struct task_struct *task)
119{ 169{
120 (*(unsigned long *)(base + ds_cfg.bts_buffer_base.offset)) = value; 170 tracers -= (task ? 1 : -1);
121} 171}
122static inline unsigned long get_bts_index(char *base) 172
173static inline int check_tracer(struct task_struct *task)
123{ 174{
124 return *(unsigned long *)(base + ds_cfg.bts_index.offset); 175 return (task ? (tracers >= 0) : (tracers <= 0));
125} 176}
126static inline void set_bts_index(char *base, unsigned long value) 177
178
179/*
180 * The DS context is either attached to a thread or to a cpu:
181 * - in the former case, the thread_struct contains a pointer to the
182 * attached context.
183 * - in the latter case, we use a static array of per-cpu context
184 * pointers.
185 *
186 * Contexts are use-counted. They are allocated on first access and
187 * deallocated when the last user puts the context.
188 *
189 * We distinguish between an allocating and a non-allocating get of a
190 * context:
191 * - the allocating get is used for requesting BTS/PEBS resources. It
192 * requires the caller to hold the global ds_lock.
193 * - the non-allocating get is used for all other cases. A
194 * non-existing context indicates an error. It acquires and releases
195 * the ds_lock itself for obtaining the context.
196 *
197 * A context and its DS configuration are allocated and deallocated
198 * together. A context always has a DS configuration of the
199 * appropriate size.
200 */
201static DEFINE_PER_CPU(struct ds_context *, system_context);
202
203#define this_system_context per_cpu(system_context, smp_processor_id())
204
205/*
206 * Returns the pointer to the parameter task's context or to the
207 * system-wide context, if task is NULL.
208 *
209 * Increases the use count of the returned context, if not NULL.
210 */
211static inline struct ds_context *ds_get_context(struct task_struct *task)
127{ 212{
128 (*(unsigned long *)(base + ds_cfg.bts_index.offset)) = value; 213 struct ds_context *context;
214
215 spin_lock(&ds_lock);
216
217 context = (task ? task->thread.ds_ctx : this_system_context);
218 if (context)
219 context->count++;
220
221 spin_unlock(&ds_lock);
222
223 return context;
129} 224}
130static inline unsigned long get_bts_absolute_maximum(char *base) 225
226/*
227 * Same as ds_get_context, but allocates the context and it's DS
228 * structure, if necessary; returns NULL; if out of memory.
229 *
230 * pre: requires ds_lock to be held
231 */
232static inline struct ds_context *ds_alloc_context(struct task_struct *task)
131{ 233{
132 return *(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset); 234 struct ds_context **p_context =
235 (task ? &task->thread.ds_ctx : &this_system_context);
236 struct ds_context *context = *p_context;
237
238 if (!context) {
239 context = kzalloc(sizeof(*context), GFP_KERNEL);
240
241 if (!context)
242 return NULL;
243
244 context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
245 if (!context->ds) {
246 kfree(context);
247 return NULL;
248 }
249
250 *p_context = context;
251
252 context->this = p_context;
253 context->task = task;
254
255 if (task)
256 set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
257
258 if (!task || (task == current))
259 wrmsr(MSR_IA32_DS_AREA, (unsigned long)context->ds, 0);
260
261 get_tracer(task);
262 }
263
264 context->count++;
265
266 return context;
133} 267}
134static inline void set_bts_absolute_maximum(char *base, unsigned long value) 268
269/*
270 * Decreases the use count of the parameter context, if not NULL.
271 * Deallocates the context, if the use count reaches zero.
272 */
273static inline void ds_put_context(struct ds_context *context)
135{ 274{
136 (*(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset)) = value; 275 if (!context)
276 return;
277
278 spin_lock(&ds_lock);
279
280 if (--context->count)
281 goto out;
282
283 *(context->this) = NULL;
284
285 if (context->task)
286 clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
287
288 if (!context->task || (context->task == current))
289 wrmsrl(MSR_IA32_DS_AREA, 0);
290
291 put_tracer(context->task);
292
293 /* free any leftover buffers from tracers that did not
294 * deallocate them properly. */
295 kfree(context->buffer[ds_bts]);
296 kfree(context->buffer[ds_pebs]);
297 kfree(context->ds);
298 kfree(context);
299 out:
300 spin_unlock(&ds_lock);
137} 301}
138static inline unsigned long get_bts_interrupt_threshold(char *base) 302
303
304/*
305 * Handle a buffer overflow
306 *
307 * task: the task whose buffers are overflowing;
308 * NULL for a buffer overflow on the current cpu
309 * context: the ds context
310 * qual: the buffer type
311 */
312static void ds_overflow(struct task_struct *task, struct ds_context *context,
313 enum ds_qualifier qual)
139{ 314{
140 return *(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset); 315 if (!context)
316 return;
317
318 if (context->callback[qual])
319 (*context->callback[qual])(task);
320
321 /* todo: do some more overflow handling */
141} 322}
142static inline void set_bts_interrupt_threshold(char *base, unsigned long value) 323
324
325/*
326 * Allocate a non-pageable buffer of the parameter size.
327 * Checks the memory and the locked memory rlimit.
328 *
329 * Returns the buffer, if successful;
330 * NULL, if out of memory or rlimit exceeded.
331 *
332 * size: the requested buffer size in bytes
333 * pages (out): if not NULL, contains the number of pages reserved
334 */
335static inline void *ds_allocate_buffer(size_t size, unsigned int *pages)
143{ 336{
144 (*(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset)) = value; 337 unsigned long rlim, vm, pgsz;
338 void *buffer;
339
340 pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
341
342 rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
343 vm = current->mm->total_vm + pgsz;
344 if (rlim < vm)
345 return NULL;
346
347 rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
348 vm = current->mm->locked_vm + pgsz;
349 if (rlim < vm)
350 return NULL;
351
352 buffer = kzalloc(size, GFP_KERNEL);
353 if (!buffer)
354 return NULL;
355
356 current->mm->total_vm += pgsz;
357 current->mm->locked_vm += pgsz;
358
359 if (pages)
360 *pages = pgsz;
361
362 return buffer;
145} 363}
146static inline unsigned long get_from_ip(char *base) 364
365static int ds_request(struct task_struct *task, void *base, size_t size,
366 ds_ovfl_callback_t ovfl, enum ds_qualifier qual)
147{ 367{
148 return *(unsigned long *)(base + ds_cfg.from_ip.offset); 368 struct ds_context *context;
369 unsigned long buffer, adj;
370 const unsigned long alignment = (1 << 3);
371 int error = 0;
372
373 if (!ds_cfg.sizeof_ds)
374 return -EOPNOTSUPP;
375
376 /* we require some space to do alignment adjustments below */
377 if (size < (alignment + ds_cfg.sizeof_rec[qual]))
378 return -EINVAL;
379
380 /* buffer overflow notification is not yet implemented */
381 if (ovfl)
382 return -EOPNOTSUPP;
383
384
385 spin_lock(&ds_lock);
386
387 if (!check_tracer(task))
388 return -EPERM;
389
390 error = -ENOMEM;
391 context = ds_alloc_context(task);
392 if (!context)
393 goto out_unlock;
394
395 error = -EALREADY;
396 if (context->owner[qual] == current)
397 goto out_unlock;
398 error = -EPERM;
399 if (context->owner[qual] != NULL)
400 goto out_unlock;
401 context->owner[qual] = current;
402
403 spin_unlock(&ds_lock);
404
405
406 error = -ENOMEM;
407 if (!base) {
408 base = ds_allocate_buffer(size, &context->pages[qual]);
409 if (!base)
410 goto out_release;
411
412 context->buffer[qual] = base;
413 }
414 error = 0;
415
416 context->callback[qual] = ovfl;
417
418 /* adjust the buffer address and size to meet alignment
419 * constraints:
420 * - buffer is double-word aligned
421 * - size is multiple of record size
422 *
423 * We checked the size at the very beginning; we have enough
424 * space to do the adjustment.
425 */
426 buffer = (unsigned long)base;
427
428 adj = ALIGN(buffer, alignment) - buffer;
429 buffer += adj;
430 size -= adj;
431
432 size /= ds_cfg.sizeof_rec[qual];
433 size *= ds_cfg.sizeof_rec[qual];
434
435 ds_set(context->ds, qual, ds_buffer_base, buffer);
436 ds_set(context->ds, qual, ds_index, buffer);
437 ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
438
439 if (ovfl) {
440 /* todo: select a suitable interrupt threshold */
441 } else
442 ds_set(context->ds, qual,
443 ds_interrupt_threshold, buffer + size + 1);
444
445 /* we keep the context until ds_release */
446 return error;
447
448 out_release:
449 context->owner[qual] = NULL;
450 ds_put_context(context);
451 return error;
452
453 out_unlock:
454 spin_unlock(&ds_lock);
455 ds_put_context(context);
456 return error;
149} 457}
150static inline void set_from_ip(char *base, unsigned long value) 458
459int ds_request_bts(struct task_struct *task, void *base, size_t size,
460 ds_ovfl_callback_t ovfl)
151{ 461{
152 (*(unsigned long *)(base + ds_cfg.from_ip.offset)) = value; 462 return ds_request(task, base, size, ovfl, ds_bts);
153} 463}
154static inline unsigned long get_to_ip(char *base) 464
465int ds_request_pebs(struct task_struct *task, void *base, size_t size,
466 ds_ovfl_callback_t ovfl)
155{ 467{
156 return *(unsigned long *)(base + ds_cfg.to_ip.offset); 468 return ds_request(task, base, size, ovfl, ds_pebs);
157} 469}
158static inline void set_to_ip(char *base, unsigned long value) 470
471static int ds_release(struct task_struct *task, enum ds_qualifier qual)
159{ 472{
160 (*(unsigned long *)(base + ds_cfg.to_ip.offset)) = value; 473 struct ds_context *context;
474 int error;
475
476 context = ds_get_context(task);
477 error = ds_validate_access(context, qual);
478 if (error < 0)
479 goto out;
480
481 kfree(context->buffer[qual]);
482 context->buffer[qual] = NULL;
483
484 current->mm->total_vm -= context->pages[qual];
485 current->mm->locked_vm -= context->pages[qual];
486 context->pages[qual] = 0;
487 context->owner[qual] = NULL;
488
489 /*
490 * we put the context twice:
491 * once for the ds_get_context
492 * once for the corresponding ds_request
493 */
494 ds_put_context(context);
495 out:
496 ds_put_context(context);
497 return error;
161} 498}
162static inline unsigned char get_info_type(char *base) 499
500int ds_release_bts(struct task_struct *task)
163{ 501{
164 return *(unsigned char *)(base + ds_cfg.info_type.offset); 502 return ds_release(task, ds_bts);
165} 503}
166static inline void set_info_type(char *base, unsigned char value) 504
505int ds_release_pebs(struct task_struct *task)
167{ 506{
168 (*(unsigned char *)(base + ds_cfg.info_type.offset)) = value; 507 return ds_release(task, ds_pebs);
169} 508}
170static inline unsigned long get_info_data(char *base) 509
510static int ds_get_index(struct task_struct *task, size_t *pos,
511 enum ds_qualifier qual)
171{ 512{
172 return *(unsigned long *)(base + ds_cfg.info_data.offset); 513 struct ds_context *context;
514 unsigned long base, index;
515 int error;
516
517 context = ds_get_context(task);
518 error = ds_validate_access(context, qual);
519 if (error < 0)
520 goto out;
521
522 base = ds_get(context->ds, qual, ds_buffer_base);
523 index = ds_get(context->ds, qual, ds_index);
524
525 error = ((index - base) / ds_cfg.sizeof_rec[qual]);
526 if (pos)
527 *pos = error;
528 out:
529 ds_put_context(context);
530 return error;
173} 531}
174static inline void set_info_data(char *base, unsigned long value) 532
533int ds_get_bts_index(struct task_struct *task, size_t *pos)
175{ 534{
176 (*(unsigned long *)(base + ds_cfg.info_data.offset)) = value; 535 return ds_get_index(task, pos, ds_bts);
177} 536}
178 537
538int ds_get_pebs_index(struct task_struct *task, size_t *pos)
539{
540 return ds_get_index(task, pos, ds_pebs);
541}
179 542
180int ds_allocate(void **dsp, size_t bts_size_in_bytes) 543static int ds_get_end(struct task_struct *task, size_t *pos,
544 enum ds_qualifier qual)
181{ 545{
182 size_t bts_size_in_records; 546 struct ds_context *context;
183 unsigned long bts; 547 unsigned long base, end;
184 void *ds; 548 int error;
549
550 context = ds_get_context(task);
551 error = ds_validate_access(context, qual);
552 if (error < 0)
553 goto out;
554
555 base = ds_get(context->ds, qual, ds_buffer_base);
556 end = ds_get(context->ds, qual, ds_absolute_maximum);
557
558 error = ((end - base) / ds_cfg.sizeof_rec[qual]);
559 if (pos)
560 *pos = error;
561 out:
562 ds_put_context(context);
563 return error;
564}
185 565
186 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) 566int ds_get_bts_end(struct task_struct *task, size_t *pos)
187 return -EOPNOTSUPP; 567{
568 return ds_get_end(task, pos, ds_bts);
569}
188 570
189 if (bts_size_in_bytes < 0) 571int ds_get_pebs_end(struct task_struct *task, size_t *pos)
190 return -EINVAL; 572{
573 return ds_get_end(task, pos, ds_pebs);
574}
191 575
192 bts_size_in_records = 576static int ds_access(struct task_struct *task, size_t index,
193 bts_size_in_bytes / ds_cfg.sizeof_bts; 577 const void **record, enum ds_qualifier qual)
194 bts_size_in_bytes = 578{
195 bts_size_in_records * ds_cfg.sizeof_bts; 579 struct ds_context *context;
580 unsigned long base, idx;
581 int error;
196 582
197 if (bts_size_in_bytes <= 0) 583 if (!record)
198 return -EINVAL; 584 return -EINVAL;
199 585
200 bts = (unsigned long)kzalloc(bts_size_in_bytes, GFP_KERNEL); 586 context = ds_get_context(task);
201 587 error = ds_validate_access(context, qual);
202 if (!bts) 588 if (error < 0)
203 return -ENOMEM; 589 goto out;
204 590
205 ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); 591 base = ds_get(context->ds, qual, ds_buffer_base);
592 idx = base + (index * ds_cfg.sizeof_rec[qual]);
206 593
207 if (!ds) { 594 error = -EINVAL;
208 kfree((void *)bts); 595 if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
209 return -ENOMEM; 596 goto out;
210 }
211
212 set_bts_buffer_base(ds, bts);
213 set_bts_index(ds, bts);
214 set_bts_absolute_maximum(ds, bts + bts_size_in_bytes);
215 set_bts_interrupt_threshold(ds, bts + bts_size_in_bytes + 1);
216 597
217 *dsp = ds; 598 *record = (const void *)idx;
218 return 0; 599 error = ds_cfg.sizeof_rec[qual];
600 out:
601 ds_put_context(context);
602 return error;
219} 603}
220 604
221int ds_free(void **dsp) 605int ds_access_bts(struct task_struct *task, size_t index, const void **record)
222{ 606{
223 if (*dsp) { 607 return ds_access(task, index, record, ds_bts);
224 kfree((void *)get_bts_buffer_base(*dsp));
225 kfree(*dsp);
226 *dsp = NULL;
227 }
228 return 0;
229} 608}
230 609
231int ds_get_bts_size(void *ds) 610int ds_access_pebs(struct task_struct *task, size_t index, const void **record)
232{ 611{
233 int size_in_bytes; 612 return ds_access(task, index, record, ds_pebs);
234
235 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
236 return -EOPNOTSUPP;
237
238 if (!ds)
239 return 0;
240
241 size_in_bytes =
242 get_bts_absolute_maximum(ds) -
243 get_bts_buffer_base(ds);
244 return size_in_bytes;
245} 613}
246 614
247int ds_get_bts_end(void *ds) 615static int ds_write(struct task_struct *task, const void *record, size_t size,
616 enum ds_qualifier qual, int force)
248{ 617{
249 int size_in_bytes = ds_get_bts_size(ds); 618 struct ds_context *context;
250 619 int error;
251 if (size_in_bytes <= 0)
252 return size_in_bytes;
253 620
254 return size_in_bytes / ds_cfg.sizeof_bts; 621 if (!record)
255} 622 return -EINVAL;
256 623
257int ds_get_bts_index(void *ds) 624 error = -EPERM;
258{ 625 context = ds_get_context(task);
259 int index_offset_in_bytes; 626 if (!context)
627 goto out;
260 628
261 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) 629 if (!force) {
262 return -EOPNOTSUPP; 630 error = ds_validate_access(context, qual);
631 if (error < 0)
632 goto out;
633 }
263 634
264 index_offset_in_bytes = 635 error = 0;
265 get_bts_index(ds) - 636 while (size) {
266 get_bts_buffer_base(ds); 637 unsigned long base, index, end, write_end, int_th;
638 unsigned long write_size, adj_write_size;
639
640 /*
641 * write as much as possible without producing an
642 * overflow interrupt.
643 *
644 * interrupt_threshold must either be
645 * - bigger than absolute_maximum or
646 * - point to a record between buffer_base and absolute_maximum
647 *
648 * index points to a valid record.
649 */
650 base = ds_get(context->ds, qual, ds_buffer_base);
651 index = ds_get(context->ds, qual, ds_index);
652 end = ds_get(context->ds, qual, ds_absolute_maximum);
653 int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
654
655 write_end = min(end, int_th);
656
657 /* if we are already beyond the interrupt threshold,
658 * we fill the entire buffer */
659 if (write_end <= index)
660 write_end = end;
661
662 if (write_end <= index)
663 goto out;
664
665 write_size = min((unsigned long) size, write_end - index);
666 memcpy((void *)index, record, write_size);
667
668 record = (const char *)record + write_size;
669 size -= write_size;
670 error += write_size;
671
672 adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
673 adj_write_size *= ds_cfg.sizeof_rec[qual];
674
675 /* zero out trailing bytes */
676 memset((char *)index + write_size, 0,
677 adj_write_size - write_size);
678 index += adj_write_size;
679
680 if (index >= end)
681 index = base;
682 ds_set(context->ds, qual, ds_index, index);
683
684 if (index >= int_th)
685 ds_overflow(task, context, qual);
686 }
267 687
268 return index_offset_in_bytes / ds_cfg.sizeof_bts; 688 out:
689 ds_put_context(context);
690 return error;
269} 691}
270 692
271int ds_set_overflow(void *ds, int method) 693int ds_write_bts(struct task_struct *task, const void *record, size_t size)
272{ 694{
273 switch (method) { 695 return ds_write(task, record, size, ds_bts, /* force = */ 0);
274 case DS_O_SIGNAL:
275 return -EOPNOTSUPP;
276 case DS_O_WRAP:
277 return 0;
278 default:
279 return -EINVAL;
280 }
281} 696}
282 697
283int ds_get_overflow(void *ds) 698int ds_write_pebs(struct task_struct *task, const void *record, size_t size)
284{ 699{
285 return DS_O_WRAP; 700 return ds_write(task, record, size, ds_pebs, /* force = */ 0);
286} 701}
287 702
288int ds_clear(void *ds) 703int ds_unchecked_write_bts(struct task_struct *task,
704 const void *record, size_t size)
289{ 705{
290 int bts_size = ds_get_bts_size(ds); 706 return ds_write(task, record, size, ds_bts, /* force = */ 1);
291 unsigned long bts_base;
292
293 if (bts_size <= 0)
294 return bts_size;
295
296 bts_base = get_bts_buffer_base(ds);
297 memset((void *)bts_base, 0, bts_size);
298
299 set_bts_index(ds, bts_base);
300 return 0;
301} 707}
302 708
303int ds_read_bts(void *ds, int index, struct bts_struct *out) 709int ds_unchecked_write_pebs(struct task_struct *task,
710 const void *record, size_t size)
304{ 711{
305 void *bts; 712 return ds_write(task, record, size, ds_pebs, /* force = */ 1);
713}
306 714
307 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) 715static int ds_reset_or_clear(struct task_struct *task,
308 return -EOPNOTSUPP; 716 enum ds_qualifier qual, int clear)
717{
718 struct ds_context *context;
719 unsigned long base, end;
720 int error;
309 721
310 if (index < 0) 722 context = ds_get_context(task);
311 return -EINVAL; 723 error = ds_validate_access(context, qual);
724 if (error < 0)
725 goto out;
312 726
313 if (index >= ds_get_bts_size(ds)) 727 base = ds_get(context->ds, qual, ds_buffer_base);
314 return -EINVAL; 728 end = ds_get(context->ds, qual, ds_absolute_maximum);
315 729
316 bts = (void *)(get_bts_buffer_base(ds) + (index * ds_cfg.sizeof_bts)); 730 if (clear)
731 memset((void *)base, 0, end - base);
317 732
318 memset(out, 0, sizeof(*out)); 733 ds_set(context->ds, qual, ds_index, base);
319 if (get_from_ip(bts) == BTS_ESCAPE_ADDRESS) {
320 out->qualifier = get_info_type(bts);
321 out->variant.jiffies = get_info_data(bts);
322 } else {
323 out->qualifier = BTS_BRANCH;
324 out->variant.lbr.from_ip = get_from_ip(bts);
325 out->variant.lbr.to_ip = get_to_ip(bts);
326 }
327 734
328 return sizeof(*out);; 735 error = 0;
736 out:
737 ds_put_context(context);
738 return error;
329} 739}
330 740
331int ds_write_bts(void *ds, const struct bts_struct *in) 741int ds_reset_bts(struct task_struct *task)
332{ 742{
333 unsigned long bts; 743 return ds_reset_or_clear(task, ds_bts, /* clear = */ 0);
334 744}
335 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
336 return -EOPNOTSUPP;
337
338 if (ds_get_bts_size(ds) <= 0)
339 return -ENXIO;
340 745
341 bts = get_bts_index(ds); 746int ds_reset_pebs(struct task_struct *task)
747{
748 return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0);
749}
342 750
343 memset((void *)bts, 0, ds_cfg.sizeof_bts); 751int ds_clear_bts(struct task_struct *task)
344 switch (in->qualifier) { 752{
345 case BTS_INVALID: 753 return ds_reset_or_clear(task, ds_bts, /* clear = */ 1);
346 break; 754}
347 755
348 case BTS_BRANCH: 756int ds_clear_pebs(struct task_struct *task)
349 set_from_ip((void *)bts, in->variant.lbr.from_ip); 757{
350 set_to_ip((void *)bts, in->variant.lbr.to_ip); 758 return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1);
351 break; 759}
352 760
353 case BTS_TASK_ARRIVES: 761int ds_get_pebs_reset(struct task_struct *task, u64 *value)
354 case BTS_TASK_DEPARTS: 762{
355 set_from_ip((void *)bts, BTS_ESCAPE_ADDRESS); 763 struct ds_context *context;
356 set_info_type((void *)bts, in->qualifier); 764 int error;
357 set_info_data((void *)bts, in->variant.jiffies);
358 break;
359 765
360 default: 766 if (!value)
361 return -EINVAL; 767 return -EINVAL;
362 }
363 768
364 bts = bts + ds_cfg.sizeof_bts; 769 context = ds_get_context(task);
365 if (bts >= get_bts_absolute_maximum(ds)) 770 error = ds_validate_access(context, ds_pebs);
366 bts = get_bts_buffer_base(ds); 771 if (error < 0)
367 set_bts_index(ds, bts); 772 goto out;
368 773
369 return ds_cfg.sizeof_bts; 774 *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8));
775
776 error = 0;
777 out:
778 ds_put_context(context);
779 return error;
370} 780}
371 781
372unsigned long ds_debugctl_mask(void) 782int ds_set_pebs_reset(struct task_struct *task, u64 value)
373{ 783{
374 return ds_cfg.debugctl_mask; 784 struct ds_context *context;
375} 785 int error;
376 786
377#ifdef __i386__ 787 context = ds_get_context(task);
378static const struct ds_configuration ds_cfg_netburst = { 788 error = ds_validate_access(context, ds_pebs);
379 .sizeof_ds = 9 * 4, 789 if (error < 0)
380 .bts_buffer_base = { 0, 4 }, 790 goto out;
381 .bts_index = { 4, 4 },
382 .bts_absolute_maximum = { 8, 4 },
383 .bts_interrupt_threshold = { 12, 4 },
384 .sizeof_bts = 3 * 4,
385 .from_ip = { 0, 4 },
386 .to_ip = { 4, 4 },
387 .info_type = { 4, 1 },
388 .info_data = { 8, 4 },
389 .debugctl_mask = (1<<2)|(1<<3)
390};
391 791
392static const struct ds_configuration ds_cfg_pentium_m = { 792 *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value;
393 .sizeof_ds = 9 * 4, 793
394 .bts_buffer_base = { 0, 4 }, 794 error = 0;
395 .bts_index = { 4, 4 }, 795 out:
396 .bts_absolute_maximum = { 8, 4 }, 796 ds_put_context(context);
397 .bts_interrupt_threshold = { 12, 4 }, 797 return error;
398 .sizeof_bts = 3 * 4, 798}
399 .from_ip = { 0, 4 }, 799
400 .to_ip = { 4, 4 }, 800static const struct ds_configuration ds_cfg_var = {
401 .info_type = { 4, 1 }, 801 .sizeof_ds = sizeof(long) * 12,
402 .info_data = { 8, 4 }, 802 .sizeof_field = sizeof(long),
403 .debugctl_mask = (1<<6)|(1<<7) 803 .sizeof_rec[ds_bts] = sizeof(long) * 3,
804 .sizeof_rec[ds_pebs] = sizeof(long) * 10
404}; 805};
405#endif /* _i386_ */ 806static const struct ds_configuration ds_cfg_64 = {
406 807 .sizeof_ds = 8 * 12,
407static const struct ds_configuration ds_cfg_core2 = { 808 .sizeof_field = 8,
408 .sizeof_ds = 9 * 8, 809 .sizeof_rec[ds_bts] = 8 * 3,
409 .bts_buffer_base = { 0, 8 }, 810 .sizeof_rec[ds_pebs] = 8 * 10
410 .bts_index = { 8, 8 },
411 .bts_absolute_maximum = { 16, 8 },
412 .bts_interrupt_threshold = { 24, 8 },
413 .sizeof_bts = 3 * 8,
414 .from_ip = { 0, 8 },
415 .to_ip = { 8, 8 },
416 .info_type = { 8, 1 },
417 .info_data = { 16, 8 },
418 .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
419}; 811};
420 812
421static inline void 813static inline void
@@ -429,14 +821,13 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
429 switch (c->x86) { 821 switch (c->x86) {
430 case 0x6: 822 case 0x6:
431 switch (c->x86_model) { 823 switch (c->x86_model) {
432#ifdef __i386__
433 case 0xD: 824 case 0xD:
434 case 0xE: /* Pentium M */ 825 case 0xE: /* Pentium M */
435 ds_configure(&ds_cfg_pentium_m); 826 ds_configure(&ds_cfg_var);
436 break; 827 break;
437#endif /* _i386_ */
438 case 0xF: /* Core2 */ 828 case 0xF: /* Core2 */
439 ds_configure(&ds_cfg_core2); 829 case 0x1C: /* Atom */
830 ds_configure(&ds_cfg_64);
440 break; 831 break;
441 default: 832 default:
442 /* sorry, don't know about them */ 833 /* sorry, don't know about them */
@@ -445,13 +836,11 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
445 break; 836 break;
446 case 0xF: 837 case 0xF:
447 switch (c->x86_model) { 838 switch (c->x86_model) {
448#ifdef __i386__
449 case 0x0: 839 case 0x0:
450 case 0x1: 840 case 0x1:
451 case 0x2: /* Netburst */ 841 case 0x2: /* Netburst */
452 ds_configure(&ds_cfg_netburst); 842 ds_configure(&ds_cfg_var);
453 break; 843 break;
454#endif /* _i386_ */
455 default: 844 default:
456 /* sorry, don't know about them */ 845 /* sorry, don't know about them */
457 break; 846 break;
@@ -462,3 +851,14 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
462 break; 851 break;
463 } 852 }
464} 853}
854
855void ds_free(struct ds_context *context)
856{
857 /* This is called when the task owning the parameter context
858 * is dying. There should not be any user of that context left
859 * to disturb us, anymore. */
860 unsigned long leftovers = context->count;
861 while (leftovers--)
862 ds_put_context(context);
863}
864#endif /* CONFIG_X86_DS */
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 66e48aa2dd1b..78e642feac30 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -148,6 +148,9 @@ void __init e820_print_map(char *who)
148 case E820_NVS: 148 case E820_NVS:
149 printk(KERN_CONT "(ACPI NVS)\n"); 149 printk(KERN_CONT "(ACPI NVS)\n");
150 break; 150 break;
151 case E820_UNUSABLE:
152 printk("(unusable)\n");
153 break;
151 default: 154 default:
152 printk(KERN_CONT "type %u\n", e820.map[i].type); 155 printk(KERN_CONT "type %u\n", e820.map[i].type);
153 break; 156 break;
@@ -1260,6 +1263,7 @@ static inline const char *e820_type_to_string(int e820_type)
1260 case E820_RAM: return "System RAM"; 1263 case E820_RAM: return "System RAM";
1261 case E820_ACPI: return "ACPI Tables"; 1264 case E820_ACPI: return "ACPI Tables";
1262 case E820_NVS: return "ACPI Non-volatile Storage"; 1265 case E820_NVS: return "ACPI Non-volatile Storage";
1266 case E820_UNUSABLE: return "Unusable memory";
1263 default: return "reserved"; 1267 default: return "reserved";
1264 } 1268 }
1265} 1269}
@@ -1267,6 +1271,7 @@ static inline const char *e820_type_to_string(int e820_type)
1267/* 1271/*
1268 * Mark e820 reserved areas as busy for the resource manager. 1272 * Mark e820 reserved areas as busy for the resource manager.
1269 */ 1273 */
1274static struct resource __initdata *e820_res;
1270void __init e820_reserve_resources(void) 1275void __init e820_reserve_resources(void)
1271{ 1276{
1272 int i; 1277 int i;
@@ -1274,6 +1279,7 @@ void __init e820_reserve_resources(void)
1274 u64 end; 1279 u64 end;
1275 1280
1276 res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); 1281 res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
1282 e820_res = res;
1277 for (i = 0; i < e820.nr_map; i++) { 1283 for (i = 0; i < e820.nr_map; i++) {
1278 end = e820.map[i].addr + e820.map[i].size - 1; 1284 end = e820.map[i].addr + e820.map[i].size - 1;
1279#ifndef CONFIG_RESOURCES_64BIT 1285#ifndef CONFIG_RESOURCES_64BIT
@@ -1287,7 +1293,14 @@ void __init e820_reserve_resources(void)
1287 res->end = end; 1293 res->end = end;
1288 1294
1289 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; 1295 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
1290 insert_resource(&iomem_resource, res); 1296
1297 /*
1298 * don't register the region that could be conflicted with
1299 * pci device BAR resource and insert them later in
1300 * pcibios_resource_survey()
1301 */
1302 if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20))
1303 insert_resource(&iomem_resource, res);
1291 res++; 1304 res++;
1292 } 1305 }
1293 1306
@@ -1299,6 +1312,19 @@ void __init e820_reserve_resources(void)
1299 } 1312 }
1300} 1313}
1301 1314
1315void __init e820_reserve_resources_late(void)
1316{
1317 int i;
1318 struct resource *res;
1319
1320 res = e820_res;
1321 for (i = 0; i < e820.nr_map; i++) {
1322 if (!res->parent && res->end)
1323 reserve_region_with_split(&iomem_resource, res->start, res->end, res->name);
1324 res++;
1325 }
1326}
1327
1302char *__init default_machine_specific_memory_setup(void) 1328char *__init default_machine_specific_memory_setup(void)
1303{ 1329{
1304 char *who = "BIOS-e820"; 1330 char *who = "BIOS-e820";
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 4353cf5e6fac..24bb5faf5efa 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -95,6 +95,20 @@ static void __init nvidia_bugs(int num, int slot, int func)
95 95
96} 96}
97 97
98#ifdef CONFIG_DMAR
99static void __init intel_g33_dmar(int num, int slot, int func)
100{
101 struct acpi_table_header *dmar_tbl;
102 acpi_status status;
103
104 status = acpi_get_table(ACPI_SIG_DMAR, 0, &dmar_tbl);
105 if (ACPI_SUCCESS(status)) {
106 printk(KERN_INFO "BIOS BUG: DMAR advertised on Intel G31/G33 chipset -- ignoring\n");
107 dmar_disabled = 1;
108 }
109}
110#endif
111
98#define QFLAG_APPLY_ONCE 0x1 112#define QFLAG_APPLY_ONCE 0x1
99#define QFLAG_APPLIED 0x2 113#define QFLAG_APPLIED 0x2
100#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) 114#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -114,6 +128,10 @@ static struct chipset early_qrk[] __initdata = {
114 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs }, 128 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs },
115 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, 129 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
116 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, 130 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config },
131#ifdef CONFIG_DMAR
132 { PCI_VENDOR_ID_INTEL, 0x29c0,
133 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, intel_g33_dmar },
134#endif
117 {} 135 {}
118}; 136};
119 137
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 06cc8d4254b1..945a31cdd81f 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -414,9 +414,11 @@ void __init efi_init(void)
414 if (memmap.map == NULL) 414 if (memmap.map == NULL)
415 printk(KERN_ERR "Could not map the EFI memory map!\n"); 415 printk(KERN_ERR "Could not map the EFI memory map!\n");
416 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); 416 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
417
417 if (memmap.desc_size != sizeof(efi_memory_desc_t)) 418 if (memmap.desc_size != sizeof(efi_memory_desc_t))
418 printk(KERN_WARNING "Kernel-defined memdesc" 419 printk(KERN_WARNING
419 "doesn't match the one from EFI!\n"); 420 "Kernel-defined memdesc doesn't match the one from EFI!\n");
421
420 if (add_efi_memmap) 422 if (add_efi_memmap)
421 do_add_efi_memmap(); 423 do_add_efi_memmap();
422 424
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 89434d439605..cf3a0b2d0059 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -275,9 +275,9 @@ ENTRY(native_usergs_sysret64)
275ENTRY(ret_from_fork) 275ENTRY(ret_from_fork)
276 CFI_DEFAULT_STACK 276 CFI_DEFAULT_STACK
277 push kernel_eflags(%rip) 277 push kernel_eflags(%rip)
278 CFI_ADJUST_CFA_OFFSET 4 278 CFI_ADJUST_CFA_OFFSET 8
279 popf # reset kernel eflags 279 popf # reset kernel eflags
280 CFI_ADJUST_CFA_OFFSET -4 280 CFI_ADJUST_CFA_OFFSET -8
281 call schedule_tail 281 call schedule_tail
282 GET_THREAD_INFO(%rcx) 282 GET_THREAD_INFO(%rcx)
283 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx) 283 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c
new file mode 100644
index 000000000000..849e5cd485b8
--- /dev/null
+++ b/arch/x86/kernel/es7000_32.c
@@ -0,0 +1,345 @@
1/*
2 * Written by: Garry Forsgren, Unisys Corporation
3 * Natalie Protasevich, Unisys Corporation
4 * This file contains the code to configure and interface
5 * with Unisys ES7000 series hardware system manager.
6 *
7 * Copyright (c) 2003 Unisys Corporation. All Rights Reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of version 2 of the GNU General Public License as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it would be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
16 *
17 * You should have received a copy of the GNU General Public License along
18 * with this program; if not, write the Free Software Foundation, Inc., 59
19 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
20 *
21 * Contact information: Unisys Corporation, Township Line & Union Meeting
22 * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or:
23 *
24 * http://www.unisys.com
25 */
26
27#include <linux/module.h>
28#include <linux/types.h>
29#include <linux/kernel.h>
30#include <linux/smp.h>
31#include <linux/string.h>
32#include <linux/spinlock.h>
33#include <linux/errno.h>
34#include <linux/notifier.h>
35#include <linux/reboot.h>
36#include <linux/init.h>
37#include <linux/acpi.h>
38#include <asm/io.h>
39#include <asm/nmi.h>
40#include <asm/smp.h>
41#include <asm/apicdef.h>
42#include <mach_mpparse.h>
43
44/*
45 * ES7000 chipsets
46 */
47
48#define NON_UNISYS 0
49#define ES7000_CLASSIC 1
50#define ES7000_ZORRO 2
51
52
53#define MIP_REG 1
54#define MIP_PSAI_REG 4
55
56#define MIP_BUSY 1
57#define MIP_SPIN 0xf0000
58#define MIP_VALID 0x0100000000000000ULL
59#define MIP_PORT(VALUE) ((VALUE >> 32) & 0xffff)
60
61#define MIP_RD_LO(VALUE) (VALUE & 0xffffffff)
62
63struct mip_reg_info {
64 unsigned long long mip_info;
65 unsigned long long delivery_info;
66 unsigned long long host_reg;
67 unsigned long long mip_reg;
68};
69
70struct part_info {
71 unsigned char type;
72 unsigned char length;
73 unsigned char part_id;
74 unsigned char apic_mode;
75 unsigned long snum;
76 char ptype[16];
77 char sname[64];
78 char pname[64];
79};
80
81struct psai {
82 unsigned long long entry_type;
83 unsigned long long addr;
84 unsigned long long bep_addr;
85};
86
87struct es7000_mem_info {
88 unsigned char type;
89 unsigned char length;
90 unsigned char resv[6];
91 unsigned long long start;
92 unsigned long long size;
93};
94
95struct es7000_oem_table {
96 unsigned long long hdr;
97 struct mip_reg_info mip;
98 struct part_info pif;
99 struct es7000_mem_info shm;
100 struct psai psai;
101};
102
103#ifdef CONFIG_ACPI
104
105struct oem_table {
106 struct acpi_table_header Header;
107 u32 OEMTableAddr;
108 u32 OEMTableSize;
109};
110
111extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
112#endif
113
114struct mip_reg {
115 unsigned long long off_0;
116 unsigned long long off_8;
117 unsigned long long off_10;
118 unsigned long long off_18;
119 unsigned long long off_20;
120 unsigned long long off_28;
121 unsigned long long off_30;
122 unsigned long long off_38;
123};
124
125#define MIP_SW_APIC 0x1020b
126#define MIP_FUNC(VALUE) (VALUE & 0xff)
127
128/*
129 * ES7000 Globals
130 */
131
132static volatile unsigned long *psai = NULL;
133static struct mip_reg *mip_reg;
134static struct mip_reg *host_reg;
135static int mip_port;
136static unsigned long mip_addr, host_addr;
137
138int es7000_plat;
139
140/*
141 * GSI override for ES7000 platforms.
142 */
143
144static unsigned int base;
145
146static int
147es7000_rename_gsi(int ioapic, int gsi)
148{
149 if (es7000_plat == ES7000_ZORRO)
150 return gsi;
151
152 if (!base) {
153 int i;
154 for (i = 0; i < nr_ioapics; i++)
155 base += nr_ioapic_registers[i];
156 }
157
158 if (!ioapic && (gsi < 16))
159 gsi += base;
160 return gsi;
161}
162
163void __init
164setup_unisys(void)
165{
166 /*
167 * Determine the generation of the ES7000 currently running.
168 *
169 * es7000_plat = 1 if the machine is a 5xx ES7000 box
170 * es7000_plat = 2 if the machine is a x86_64 ES7000 box
171 *
172 */
173 if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2))
174 es7000_plat = ES7000_ZORRO;
175 else
176 es7000_plat = ES7000_CLASSIC;
177 ioapic_renumber_irq = es7000_rename_gsi;
178}
179
180/*
181 * Parse the OEM Table
182 */
183
184int __init
185parse_unisys_oem (char *oemptr)
186{
187 int i;
188 int success = 0;
189 unsigned char type, size;
190 unsigned long val;
191 char *tp = NULL;
192 struct psai *psaip = NULL;
193 struct mip_reg_info *mi;
194 struct mip_reg *host, *mip;
195
196 tp = oemptr;
197
198 tp += 8;
199
200 for (i=0; i <= 6; i++) {
201 type = *tp++;
202 size = *tp++;
203 tp -= 2;
204 switch (type) {
205 case MIP_REG:
206 mi = (struct mip_reg_info *)tp;
207 val = MIP_RD_LO(mi->host_reg);
208 host_addr = val;
209 host = (struct mip_reg *)val;
210 host_reg = __va(host);
211 val = MIP_RD_LO(mi->mip_reg);
212 mip_port = MIP_PORT(mi->mip_info);
213 mip_addr = val;
214 mip = (struct mip_reg *)val;
215 mip_reg = __va(mip);
216 pr_debug("es7000_mipcfg: host_reg = 0x%lx \n",
217 (unsigned long)host_reg);
218 pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n",
219 (unsigned long)mip_reg);
220 success++;
221 break;
222 case MIP_PSAI_REG:
223 psaip = (struct psai *)tp;
224 if (tp != NULL) {
225 if (psaip->addr)
226 psai = __va(psaip->addr);
227 else
228 psai = NULL;
229 success++;
230 }
231 break;
232 default:
233 break;
234 }
235 tp += size;
236 }
237
238 if (success < 2) {
239 es7000_plat = NON_UNISYS;
240 } else
241 setup_unisys();
242 return es7000_plat;
243}
244
245#ifdef CONFIG_ACPI
246int __init
247find_unisys_acpi_oem_table(unsigned long *oem_addr)
248{
249 struct acpi_table_header *header = NULL;
250 int i = 0;
251 while (ACPI_SUCCESS(acpi_get_table("OEM1", i++, &header))) {
252 if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) {
253 struct oem_table *t = (struct oem_table *)header;
254 *oem_addr = (unsigned long)__acpi_map_table(t->OEMTableAddr,
255 t->OEMTableSize);
256 return 0;
257 }
258 }
259 return -1;
260}
261#endif
262
263static void
264es7000_spin(int n)
265{
266 int i = 0;
267
268 while (i++ < n)
269 rep_nop();
270}
271
272static int __init
273es7000_mip_write(struct mip_reg *mip_reg)
274{
275 int status = 0;
276 int spin;
277
278 spin = MIP_SPIN;
279 while (((unsigned long long)host_reg->off_38 &
280 (unsigned long long)MIP_VALID) != 0) {
281 if (--spin <= 0) {
282 printk("es7000_mip_write: Timeout waiting for Host Valid Flag");
283 return -1;
284 }
285 es7000_spin(MIP_SPIN);
286 }
287
288 memcpy(host_reg, mip_reg, sizeof(struct mip_reg));
289 outb(1, mip_port);
290
291 spin = MIP_SPIN;
292
293 while (((unsigned long long)mip_reg->off_38 &
294 (unsigned long long)MIP_VALID) == 0) {
295 if (--spin <= 0) {
296 printk("es7000_mip_write: Timeout waiting for MIP Valid Flag");
297 return -1;
298 }
299 es7000_spin(MIP_SPIN);
300 }
301
302 status = ((unsigned long long)mip_reg->off_0 &
303 (unsigned long long)0xffff0000000000ULL) >> 48;
304 mip_reg->off_38 = ((unsigned long long)mip_reg->off_38 &
305 (unsigned long long)~MIP_VALID);
306 return status;
307}
308
309int
310es7000_start_cpu(int cpu, unsigned long eip)
311{
312 unsigned long vect = 0, psaival = 0;
313
314 if (psai == NULL)
315 return -1;
316
317 vect = ((unsigned long)__pa(eip)/0x1000) << 16;
318 psaival = (0x1000000 | vect | cpu);
319
320 while (*psai & 0x1000000)
321 ;
322
323 *psai = psaival;
324
325 return 0;
326
327}
328
329void __init
330es7000_sw_apic(void)
331{
332 if (es7000_plat) {
333 int mip_status;
334 struct mip_reg es7000_mip_reg;
335
336 printk("ES7000: Enabling APIC mode.\n");
337 memset(&es7000_mip_reg, 0, sizeof(struct mip_reg));
338 es7000_mip_reg.off_0 = MIP_SW_APIC;
339 es7000_mip_reg.off_38 = (MIP_VALID);
340 while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0)
341 printk("es7000_sw_apic: command failed, status = %x\n",
342 mip_status);
343 return;
344 }
345}
diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c
index fed9f68efd66..e4bf2cc0d743 100644
--- a/arch/x86/kernel/genx2apic_cluster.c
+++ b/arch/x86/kernel/genx2apic_cluster.c
@@ -120,14 +120,9 @@ static unsigned long set_apic_id(unsigned int id)
120 return x; 120 return x;
121} 121}
122 122
123static unsigned int x2apic_read_id(void)
124{
125 return apic_read(APIC_ID);
126}
127
128static unsigned int phys_pkg_id(int index_msb) 123static unsigned int phys_pkg_id(int index_msb)
129{ 124{
130 return x2apic_read_id() >> index_msb; 125 return current_cpu_data.initial_apicid >> index_msb;
131} 126}
132 127
133static void x2apic_send_IPI_self(int vector) 128static void x2apic_send_IPI_self(int vector)
diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c
index 958d537b4cc9..8f1343df2627 100644
--- a/arch/x86/kernel/genx2apic_phys.c
+++ b/arch/x86/kernel/genx2apic_phys.c
@@ -118,14 +118,9 @@ static unsigned long set_apic_id(unsigned int id)
118 return x; 118 return x;
119} 119}
120 120
121static unsigned int x2apic_read_id(void)
122{
123 return apic_read(APIC_ID);
124}
125
126static unsigned int phys_pkg_id(int index_msb) 121static unsigned int phys_pkg_id(int index_msb)
127{ 122{
128 return x2apic_read_id() >> index_msb; 123 return current_cpu_data.initial_apicid >> index_msb;
129} 124}
130 125
131void x2apic_send_IPI_self(int vector) 126void x2apic_send_IPI_self(int vector)
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index 99269beabc7c..ae2ffc8a400c 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -54,6 +54,7 @@ int is_uv_system(void)
54{ 54{
55 return uv_system_type != UV_NONE; 55 return uv_system_type != UV_NONE;
56} 56}
57EXPORT_SYMBOL_GPL(is_uv_system);
57 58
58DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); 59DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
59EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info); 60EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info);
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 9bfc4d72fb2e..d16084f90649 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -108,12 +108,11 @@ void __init x86_64_start_kernel(char * real_mode_data)
108 } 108 }
109 load_idt((const struct desc_ptr *)&idt_descr); 109 load_idt((const struct desc_ptr *)&idt_descr);
110 110
111 early_printk("Kernel alive\n"); 111 if (console_loglevel == 10)
112 early_printk("Kernel alive\n");
112 113
113 x86_64_init_pda(); 114 x86_64_init_pda();
114 115
115 early_printk("Kernel really alive\n");
116
117 x86_64_start_reservations(real_mode_data); 116 x86_64_start_reservations(real_mode_data);
118} 117}
119 118
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index a7010c3a377a..e835b4eea70b 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -172,10 +172,6 @@ num_subarch_entries = (. - subarch_entries) / 4
172 * 172 *
173 * Note that the stack is not yet set up! 173 * Note that the stack is not yet set up!
174 */ 174 */
175#define PTE_ATTR 0x007 /* PRESENT+RW+USER */
176#define PDE_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */
177#define PGD_ATTR 0x001 /* PRESENT (no other attributes) */
178
179default_entry: 175default_entry:
180#ifdef CONFIG_X86_PAE 176#ifdef CONFIG_X86_PAE
181 177
@@ -196,9 +192,9 @@ default_entry:
196 movl $pa(pg0), %edi 192 movl $pa(pg0), %edi
197 movl %edi, pa(init_pg_tables_start) 193 movl %edi, pa(init_pg_tables_start)
198 movl $pa(swapper_pg_pmd), %edx 194 movl $pa(swapper_pg_pmd), %edx
199 movl $PTE_ATTR, %eax 195 movl $PTE_IDENT_ATTR, %eax
20010: 19610:
201 leal PDE_ATTR(%edi),%ecx /* Create PMD entry */ 197 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */
202 movl %ecx,(%edx) /* Store PMD entry */ 198 movl %ecx,(%edx) /* Store PMD entry */
203 /* Upper half already zero */ 199 /* Upper half already zero */
204 addl $8,%edx 200 addl $8,%edx
@@ -215,7 +211,7 @@ default_entry:
215 * End condition: we must map up to and including INIT_MAP_BEYOND_END 211 * End condition: we must map up to and including INIT_MAP_BEYOND_END
216 * bytes beyond the end of our own page tables. 212 * bytes beyond the end of our own page tables.
217 */ 213 */
218 leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp 214 leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
219 cmpl %ebp,%eax 215 cmpl %ebp,%eax
220 jb 10b 216 jb 10b
2211: 2171:
@@ -224,7 +220,7 @@ default_entry:
224 movl %eax, pa(max_pfn_mapped) 220 movl %eax, pa(max_pfn_mapped)
225 221
226 /* Do early initialization of the fixmap area */ 222 /* Do early initialization of the fixmap area */
227 movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax 223 movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
228 movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8) 224 movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8)
229#else /* Not PAE */ 225#else /* Not PAE */
230 226
@@ -233,9 +229,9 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
233 movl $pa(pg0), %edi 229 movl $pa(pg0), %edi
234 movl %edi, pa(init_pg_tables_start) 230 movl %edi, pa(init_pg_tables_start)
235 movl $pa(swapper_pg_dir), %edx 231 movl $pa(swapper_pg_dir), %edx
236 movl $PTE_ATTR, %eax 232 movl $PTE_IDENT_ATTR, %eax
23710: 23310:
238 leal PDE_ATTR(%edi),%ecx /* Create PDE entry */ 234 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */
239 movl %ecx,(%edx) /* Store identity PDE entry */ 235 movl %ecx,(%edx) /* Store identity PDE entry */
240 movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ 236 movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
241 addl $4,%edx 237 addl $4,%edx
@@ -249,7 +245,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
249 * bytes beyond the end of our own page tables; the +0x007 is 245 * bytes beyond the end of our own page tables; the +0x007 is
250 * the attribute bits 246 * the attribute bits
251 */ 247 */
252 leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp 248 leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
253 cmpl %ebp,%eax 249 cmpl %ebp,%eax
254 jb 10b 250 jb 10b
255 movl %edi,pa(init_pg_tables_end) 251 movl %edi,pa(init_pg_tables_end)
@@ -257,7 +253,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
257 movl %eax, pa(max_pfn_mapped) 253 movl %eax, pa(max_pfn_mapped)
258 254
259 /* Do early initialization of the fixmap area */ 255 /* Do early initialization of the fixmap area */
260 movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax 256 movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
261 movl %eax,pa(swapper_pg_dir+0xffc) 257 movl %eax,pa(swapper_pg_dir+0xffc)
262#endif 258#endif
263 jmp 3f 259 jmp 3f
@@ -634,19 +630,19 @@ ENTRY(empty_zero_page)
634 /* Page-aligned for the benefit of paravirt? */ 630 /* Page-aligned for the benefit of paravirt? */
635 .align PAGE_SIZE_asm 631 .align PAGE_SIZE_asm
636ENTRY(swapper_pg_dir) 632ENTRY(swapper_pg_dir)
637 .long pa(swapper_pg_pmd+PGD_ATTR),0 /* low identity map */ 633 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
638# if KPMDS == 3 634# if KPMDS == 3
639 .long pa(swapper_pg_pmd+PGD_ATTR),0 635 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
640 .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 636 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
641 .long pa(swapper_pg_pmd+PGD_ATTR+0x2000),0 637 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0
642# elif KPMDS == 2 638# elif KPMDS == 2
643 .long 0,0 639 .long 0,0
644 .long pa(swapper_pg_pmd+PGD_ATTR),0 640 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
645 .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 641 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
646# elif KPMDS == 1 642# elif KPMDS == 1
647 .long 0,0 643 .long 0,0
648 .long 0,0 644 .long 0,0
649 .long pa(swapper_pg_pmd+PGD_ATTR),0 645 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
650# else 646# else
651# error "Kernel PMDs should be 1, 2 or 3" 647# error "Kernel PMDs should be 1, 2 or 3"
652# endif 648# endif
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index db3280afe886..26cfdc1d7c7f 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -110,7 +110,7 @@ startup_64:
110 movq %rdi, %rax 110 movq %rdi, %rax
111 shrq $PMD_SHIFT, %rax 111 shrq $PMD_SHIFT, %rax
112 andq $(PTRS_PER_PMD - 1), %rax 112 andq $(PTRS_PER_PMD - 1), %rax
113 leaq __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx 113 leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
114 leaq level2_spare_pgt(%rip), %rbx 114 leaq level2_spare_pgt(%rip), %rbx
115 movq %rdx, 0(%rbx, %rax, 8) 115 movq %rdx, 0(%rbx, %rax, 8)
116ident_complete: 116ident_complete:
@@ -374,7 +374,7 @@ NEXT_PAGE(level2_ident_pgt)
374 /* Since I easily can, map the first 1G. 374 /* Since I easily can, map the first 1G.
375 * Don't set NX because code runs from these pages. 375 * Don't set NX because code runs from these pages.
376 */ 376 */
377 PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) 377 PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
378 378
379NEXT_PAGE(level2_kernel_pgt) 379NEXT_PAGE(level2_kernel_pgt)
380 /* 380 /*
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 26ea3ea3fb30..e710289f673e 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -51,6 +51,8 @@
51#include <mach_apic.h> 51#include <mach_apic.h>
52#include <mach_apicdef.h> 52#include <mach_apicdef.h>
53 53
54#define __apicdebuginit(type) static type __init
55
54int (*ioapic_renumber_irq)(int ioapic, int irq); 56int (*ioapic_renumber_irq)(int ioapic, int irq);
55atomic_t irq_mis_count; 57atomic_t irq_mis_count;
56 58
@@ -1342,7 +1344,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
1342 ioapic_write_entry(apic, pin, entry); 1344 ioapic_write_entry(apic, pin, entry);
1343} 1345}
1344 1346
1345void __init print_IO_APIC(void) 1347
1348__apicdebuginit(void) print_IO_APIC(void)
1346{ 1349{
1347 int apic, i; 1350 int apic, i;
1348 union IO_APIC_reg_00 reg_00; 1351 union IO_APIC_reg_00 reg_00;
@@ -1457,9 +1460,7 @@ void __init print_IO_APIC(void)
1457 return; 1460 return;
1458} 1461}
1459 1462
1460#if 0 1463__apicdebuginit(void) print_APIC_bitfield(int base)
1461
1462static void print_APIC_bitfield(int base)
1463{ 1464{
1464 unsigned int v; 1465 unsigned int v;
1465 int i, j; 1466 int i, j;
@@ -1480,9 +1481,10 @@ static void print_APIC_bitfield(int base)
1480 } 1481 }
1481} 1482}
1482 1483
1483void /*__init*/ print_local_APIC(void *dummy) 1484__apicdebuginit(void) print_local_APIC(void *dummy)
1484{ 1485{
1485 unsigned int v, ver, maxlvt; 1486 unsigned int v, ver, maxlvt;
1487 u64 icr;
1486 1488
1487 if (apic_verbosity == APIC_QUIET) 1489 if (apic_verbosity == APIC_QUIET)
1488 return; 1490 return;
@@ -1533,10 +1535,9 @@ void /*__init*/ print_local_APIC(void *dummy)
1533 printk(KERN_DEBUG "... APIC ESR: %08x\n", v); 1535 printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
1534 } 1536 }
1535 1537
1536 v = apic_read(APIC_ICR); 1538 icr = apic_icr_read();
1537 printk(KERN_DEBUG "... APIC ICR: %08x\n", v); 1539 printk(KERN_DEBUG "... APIC ICR: %08x\n", icr);
1538 v = apic_read(APIC_ICR2); 1540 printk(KERN_DEBUG "... APIC ICR2: %08x\n", icr >> 32);
1539 printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
1540 1541
1541 v = apic_read(APIC_LVTT); 1542 v = apic_read(APIC_LVTT);
1542 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); 1543 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
@@ -1564,12 +1565,12 @@ void /*__init*/ print_local_APIC(void *dummy)
1564 printk("\n"); 1565 printk("\n");
1565} 1566}
1566 1567
1567void print_all_local_APICs(void) 1568__apicdebuginit(void) print_all_local_APICs(void)
1568{ 1569{
1569 on_each_cpu(print_local_APIC, NULL, 1); 1570 on_each_cpu(print_local_APIC, NULL, 1);
1570} 1571}
1571 1572
1572void /*__init*/ print_PIC(void) 1573__apicdebuginit(void) print_PIC(void)
1573{ 1574{
1574 unsigned int v; 1575 unsigned int v;
1575 unsigned long flags; 1576 unsigned long flags;
@@ -1601,7 +1602,17 @@ void /*__init*/ print_PIC(void)
1601 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); 1602 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1602} 1603}
1603 1604
1604#endif /* 0 */ 1605__apicdebuginit(int) print_all_ICs(void)
1606{
1607 print_PIC();
1608 print_all_local_APICs();
1609 print_IO_APIC();
1610
1611 return 0;
1612}
1613
1614fs_initcall(print_all_ICs);
1615
1605 1616
1606static void __init enable_IO_APIC(void) 1617static void __init enable_IO_APIC(void)
1607{ 1618{
@@ -2327,8 +2338,6 @@ void __init setup_IO_APIC(void)
2327 setup_IO_APIC_irqs(); 2338 setup_IO_APIC_irqs();
2328 init_IO_APIC_traps(); 2339 init_IO_APIC_traps();
2329 check_timer(); 2340 check_timer();
2330 if (!acpi_ioapic)
2331 print_IO_APIC();
2332} 2341}
2333 2342
2334/* 2343/*
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index e63282e78864..a1bec2969c6a 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -55,6 +55,8 @@
55#include <mach_ipi.h> 55#include <mach_ipi.h>
56#include <mach_apic.h> 56#include <mach_apic.h>
57 57
58#define __apicdebuginit(type) static type __init
59
58struct irq_cfg { 60struct irq_cfg {
59 cpumask_t domain; 61 cpumask_t domain;
60 cpumask_t old_domain; 62 cpumask_t old_domain;
@@ -89,8 +91,6 @@ int first_system_vector = 0xfe;
89 91
90char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; 92char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
91 93
92#define __apicdebuginit __init
93
94int sis_apic_bug; /* not actually supported, dummy for compile */ 94int sis_apic_bug; /* not actually supported, dummy for compile */
95 95
96static int no_timer_check; 96static int no_timer_check;
@@ -1117,7 +1117,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
1117 ioapic_write_entry(apic, pin, entry); 1117 ioapic_write_entry(apic, pin, entry);
1118} 1118}
1119 1119
1120void __apicdebuginit print_IO_APIC(void) 1120
1121__apicdebuginit(void) print_IO_APIC(void)
1121{ 1122{
1122 int apic, i; 1123 int apic, i;
1123 union IO_APIC_reg_00 reg_00; 1124 union IO_APIC_reg_00 reg_00;
@@ -1211,9 +1212,7 @@ void __apicdebuginit print_IO_APIC(void)
1211 return; 1212 return;
1212} 1213}
1213 1214
1214#if 0 1215__apicdebuginit(void) print_APIC_bitfield(int base)
1215
1216static __apicdebuginit void print_APIC_bitfield (int base)
1217{ 1216{
1218 unsigned int v; 1217 unsigned int v;
1219 int i, j; 1218 int i, j;
@@ -1234,7 +1233,7 @@ static __apicdebuginit void print_APIC_bitfield (int base)
1234 } 1233 }
1235} 1234}
1236 1235
1237void __apicdebuginit print_local_APIC(void * dummy) 1236__apicdebuginit(void) print_local_APIC(void *dummy)
1238{ 1237{
1239 unsigned int v, ver, maxlvt; 1238 unsigned int v, ver, maxlvt;
1240 unsigned long icr; 1239 unsigned long icr;
@@ -1311,12 +1310,12 @@ void __apicdebuginit print_local_APIC(void * dummy)
1311 printk("\n"); 1310 printk("\n");
1312} 1311}
1313 1312
1314void print_all_local_APICs (void) 1313__apicdebuginit(void) print_all_local_APICs(void)
1315{ 1314{
1316 on_each_cpu(print_local_APIC, NULL, 1); 1315 on_each_cpu(print_local_APIC, NULL, 1);
1317} 1316}
1318 1317
1319void __apicdebuginit print_PIC(void) 1318__apicdebuginit(void) print_PIC(void)
1320{ 1319{
1321 unsigned int v; 1320 unsigned int v;
1322 unsigned long flags; 1321 unsigned long flags;
@@ -1348,7 +1347,17 @@ void __apicdebuginit print_PIC(void)
1348 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); 1347 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1349} 1348}
1350 1349
1351#endif /* 0 */ 1350__apicdebuginit(int) print_all_ICs(void)
1351{
1352 print_PIC();
1353 print_all_local_APICs();
1354 print_IO_APIC();
1355
1356 return 0;
1357}
1358
1359fs_initcall(print_all_ICs);
1360
1352 1361
1353void __init enable_IO_APIC(void) 1362void __init enable_IO_APIC(void)
1354{ 1363{
@@ -2172,8 +2181,6 @@ void __init setup_IO_APIC(void)
2172 setup_IO_APIC_irqs(); 2181 setup_IO_APIC_irqs();
2173 init_IO_APIC_traps(); 2182 init_IO_APIC_traps();
2174 check_timer(); 2183 check_timer();
2175 if (!acpi_ioapic)
2176 print_IO_APIC();
2177} 2184}
2178 2185
2179struct sysfs_ioapic_data { 2186struct sysfs_ioapic_data {
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index d66914287ee1..9200a1e2752d 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -74,6 +74,15 @@ void __init init_ISA_irqs (void)
74 } 74 }
75} 75}
76 76
77/*
78 * IRQ2 is cascade interrupt to second interrupt controller
79 */
80static struct irqaction irq2 = {
81 .handler = no_action,
82 .mask = CPU_MASK_NONE,
83 .name = "cascade",
84};
85
77/* Overridden in paravirt.c */ 86/* Overridden in paravirt.c */
78void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); 87void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
79 88
@@ -98,6 +107,46 @@ void __init native_init_IRQ(void)
98 set_intr_gate(vector, interrupt[i]); 107 set_intr_gate(vector, interrupt[i]);
99 } 108 }
100 109
110#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
111 /*
112 * IRQ0 must be given a fixed assignment and initialized,
113 * because it's used before the IO-APIC is set up.
114 */
115 set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
116
117 /*
118 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
119 * IPI, driven by wakeup.
120 */
121 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
122
123 /* IPI for invalidation */
124 alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
125
126 /* IPI for generic function call */
127 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
128
129 /* IPI for single call function */
130 set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt);
131#endif
132
133#ifdef CONFIG_X86_LOCAL_APIC
134 /* self generated IPI for local APIC timer */
135 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
136
137 /* IPI vectors for APIC spurious and error interrupts */
138 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
139 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
140#endif
141
142#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
143 /* thermal monitor LVT interrupt */
144 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
145#endif
146
147 if (!acpi_ioapic)
148 setup_irq(2, &irq2);
149
101 /* setup after call gates are initialised (usually add in 150 /* setup after call gates are initialised (usually add in
102 * the architecture specific gates) 151 * the architecture specific gates)
103 */ 152 */
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c
index 7377ccb21335..304d8bad6559 100644
--- a/arch/x86/kernel/k8.c
+++ b/arch/x86/kernel/k8.c
@@ -16,8 +16,9 @@ EXPORT_SYMBOL(num_k8_northbridges);
16static u32 *flush_words; 16static u32 *flush_words;
17 17
18struct pci_device_id k8_nb_ids[] = { 18struct pci_device_id k8_nb_ids[] = {
19 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, 19 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
20 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, 20 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
21 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_11H_NB_MISC) },
21 {} 22 {}
22}; 23};
23EXPORT_SYMBOL(k8_nb_ids); 24EXPORT_SYMBOL(k8_nb_ids);
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index f47f0eb886b8..10435a120d22 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -69,6 +69,9 @@ static int gdb_x86vector = -1;
69 */ 69 */
70void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) 70void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
71{ 71{
72#ifndef CONFIG_X86_32
73 u32 *gdb_regs32 = (u32 *)gdb_regs;
74#endif
72 gdb_regs[GDB_AX] = regs->ax; 75 gdb_regs[GDB_AX] = regs->ax;
73 gdb_regs[GDB_BX] = regs->bx; 76 gdb_regs[GDB_BX] = regs->bx;
74 gdb_regs[GDB_CX] = regs->cx; 77 gdb_regs[GDB_CX] = regs->cx;
@@ -76,9 +79,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
76 gdb_regs[GDB_SI] = regs->si; 79 gdb_regs[GDB_SI] = regs->si;
77 gdb_regs[GDB_DI] = regs->di; 80 gdb_regs[GDB_DI] = regs->di;
78 gdb_regs[GDB_BP] = regs->bp; 81 gdb_regs[GDB_BP] = regs->bp;
79 gdb_regs[GDB_PS] = regs->flags;
80 gdb_regs[GDB_PC] = regs->ip; 82 gdb_regs[GDB_PC] = regs->ip;
81#ifdef CONFIG_X86_32 83#ifdef CONFIG_X86_32
84 gdb_regs[GDB_PS] = regs->flags;
82 gdb_regs[GDB_DS] = regs->ds; 85 gdb_regs[GDB_DS] = regs->ds;
83 gdb_regs[GDB_ES] = regs->es; 86 gdb_regs[GDB_ES] = regs->es;
84 gdb_regs[GDB_CS] = regs->cs; 87 gdb_regs[GDB_CS] = regs->cs;
@@ -94,6 +97,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
94 gdb_regs[GDB_R13] = regs->r13; 97 gdb_regs[GDB_R13] = regs->r13;
95 gdb_regs[GDB_R14] = regs->r14; 98 gdb_regs[GDB_R14] = regs->r14;
96 gdb_regs[GDB_R15] = regs->r15; 99 gdb_regs[GDB_R15] = regs->r15;
100 gdb_regs32[GDB_PS] = regs->flags;
101 gdb_regs32[GDB_CS] = regs->cs;
102 gdb_regs32[GDB_SS] = regs->ss;
97#endif 103#endif
98 gdb_regs[GDB_SP] = regs->sp; 104 gdb_regs[GDB_SP] = regs->sp;
99} 105}
@@ -112,6 +118,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
112 */ 118 */
113void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) 119void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
114{ 120{
121#ifndef CONFIG_X86_32
122 u32 *gdb_regs32 = (u32 *)gdb_regs;
123#endif
115 gdb_regs[GDB_AX] = 0; 124 gdb_regs[GDB_AX] = 0;
116 gdb_regs[GDB_BX] = 0; 125 gdb_regs[GDB_BX] = 0;
117 gdb_regs[GDB_CX] = 0; 126 gdb_regs[GDB_CX] = 0;
@@ -129,8 +138,10 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
129 gdb_regs[GDB_FS] = 0xFFFF; 138 gdb_regs[GDB_FS] = 0xFFFF;
130 gdb_regs[GDB_GS] = 0xFFFF; 139 gdb_regs[GDB_GS] = 0xFFFF;
131#else 140#else
132 gdb_regs[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); 141 gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8);
133 gdb_regs[GDB_PC] = 0; 142 gdb_regs32[GDB_CS] = __KERNEL_CS;
143 gdb_regs32[GDB_SS] = __KERNEL_DS;
144 gdb_regs[GDB_PC] = p->thread.ip;
134 gdb_regs[GDB_R8] = 0; 145 gdb_regs[GDB_R8] = 0;
135 gdb_regs[GDB_R9] = 0; 146 gdb_regs[GDB_R9] = 0;
136 gdb_regs[GDB_R10] = 0; 147 gdb_regs[GDB_R10] = 0;
@@ -153,6 +164,9 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
153 */ 164 */
154void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) 165void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
155{ 166{
167#ifndef CONFIG_X86_32
168 u32 *gdb_regs32 = (u32 *)gdb_regs;
169#endif
156 regs->ax = gdb_regs[GDB_AX]; 170 regs->ax = gdb_regs[GDB_AX];
157 regs->bx = gdb_regs[GDB_BX]; 171 regs->bx = gdb_regs[GDB_BX];
158 regs->cx = gdb_regs[GDB_CX]; 172 regs->cx = gdb_regs[GDB_CX];
@@ -160,9 +174,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
160 regs->si = gdb_regs[GDB_SI]; 174 regs->si = gdb_regs[GDB_SI];
161 regs->di = gdb_regs[GDB_DI]; 175 regs->di = gdb_regs[GDB_DI];
162 regs->bp = gdb_regs[GDB_BP]; 176 regs->bp = gdb_regs[GDB_BP];
163 regs->flags = gdb_regs[GDB_PS];
164 regs->ip = gdb_regs[GDB_PC]; 177 regs->ip = gdb_regs[GDB_PC];
165#ifdef CONFIG_X86_32 178#ifdef CONFIG_X86_32
179 regs->flags = gdb_regs[GDB_PS];
166 regs->ds = gdb_regs[GDB_DS]; 180 regs->ds = gdb_regs[GDB_DS];
167 regs->es = gdb_regs[GDB_ES]; 181 regs->es = gdb_regs[GDB_ES];
168 regs->cs = gdb_regs[GDB_CS]; 182 regs->cs = gdb_regs[GDB_CS];
@@ -175,6 +189,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
175 regs->r13 = gdb_regs[GDB_R13]; 189 regs->r13 = gdb_regs[GDB_R13];
176 regs->r14 = gdb_regs[GDB_R14]; 190 regs->r14 = gdb_regs[GDB_R14];
177 regs->r15 = gdb_regs[GDB_R15]; 191 regs->r15 = gdb_regs[GDB_R15];
192 regs->flags = gdb_regs32[GDB_PS];
193 regs->cs = gdb_regs32[GDB_CS];
194 regs->ss = gdb_regs32[GDB_SS];
178#endif 195#endif
179} 196}
180 197
@@ -378,10 +395,8 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
378 if (remcomInBuffer[0] == 's') { 395 if (remcomInBuffer[0] == 's') {
379 linux_regs->flags |= X86_EFLAGS_TF; 396 linux_regs->flags |= X86_EFLAGS_TF;
380 kgdb_single_step = 1; 397 kgdb_single_step = 1;
381 if (kgdb_contthread) { 398 atomic_set(&kgdb_cpu_doing_single_step,
382 atomic_set(&kgdb_cpu_doing_single_step, 399 raw_smp_processor_id());
383 raw_smp_processor_id());
384 }
385 } 400 }
386 401
387 get_debugreg(dr6, 6); 402 get_debugreg(dr6, 6);
@@ -440,12 +455,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
440 return NOTIFY_DONE; 455 return NOTIFY_DONE;
441 456
442 case DIE_NMI_IPI: 457 case DIE_NMI_IPI:
443 if (atomic_read(&kgdb_active) != -1) { 458 /* Just ignore, we will handle the roundup on DIE_NMI. */
444 /* KGDB CPU roundup */
445 kgdb_nmicallback(raw_smp_processor_id(), regs);
446 was_in_debug_nmi[raw_smp_processor_id()] = 1;
447 touch_nmi_watchdog();
448 }
449 return NOTIFY_DONE; 459 return NOTIFY_DONE;
450 460
451 case DIE_NMIUNKNOWN: 461 case DIE_NMIUNKNOWN:
@@ -466,9 +476,15 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
466 476
467 case DIE_DEBUG: 477 case DIE_DEBUG:
468 if (atomic_read(&kgdb_cpu_doing_single_step) == 478 if (atomic_read(&kgdb_cpu_doing_single_step) ==
469 raw_smp_processor_id() && 479 raw_smp_processor_id()) {
470 user_mode(regs)) 480 if (user_mode(regs))
471 return single_step_cont(regs, args); 481 return single_step_cont(regs, args);
482 break;
483 } else if (test_thread_flag(TIF_SINGLESTEP))
484 /* This means a user thread is single stepping
485 * a system call which should be ignored
486 */
487 return NOTIFY_DONE;
472 /* fall through */ 488 /* fall through */
473 default: 489 default:
474 if (user_mode(regs)) 490 if (user_mode(regs))
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 8b7a3cf37d2b..478bca986eca 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -178,7 +178,7 @@ static void kvm_flush_tlb(void)
178 kvm_deferred_mmu_op(&ftlb, sizeof ftlb); 178 kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
179} 179}
180 180
181static void kvm_release_pt(u32 pfn) 181static void kvm_release_pt(unsigned long pfn)
182{ 182{
183 struct kvm_mmu_op_release_pt rpt = { 183 struct kvm_mmu_op_release_pt rpt = {
184 .header.op = KVM_MMU_OP_RELEASE_PT, 184 .header.op = KVM_MMU_OP_RELEASE_PT,
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index abb78a2cc4ad..2c97f07f1c2c 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -299,6 +299,15 @@ void acpi_nmi_disable(void)
299 on_each_cpu(__acpi_nmi_disable, NULL, 1); 299 on_each_cpu(__acpi_nmi_disable, NULL, 1);
300} 300}
301 301
302/*
303 * This function is called as soon the LAPIC NMI watchdog driver has everything
304 * in place and it's ready to check if the NMIs belong to the NMI watchdog
305 */
306void cpu_nmi_set_wd_enabled(void)
307{
308 __get_cpu_var(wd_enabled) = 1;
309}
310
302void setup_apic_nmi_watchdog(void *unused) 311void setup_apic_nmi_watchdog(void *unused)
303{ 312{
304 if (__get_cpu_var(wd_enabled)) 313 if (__get_cpu_var(wd_enabled))
@@ -311,8 +320,6 @@ void setup_apic_nmi_watchdog(void *unused)
311 320
312 switch (nmi_watchdog) { 321 switch (nmi_watchdog) {
313 case NMI_LOCAL_APIC: 322 case NMI_LOCAL_APIC:
314 /* enable it before to avoid race with handler */
315 __get_cpu_var(wd_enabled) = 1;
316 if (lapic_watchdog_init(nmi_hz) < 0) { 323 if (lapic_watchdog_init(nmi_hz) < 0) {
317 __get_cpu_var(wd_enabled) = 0; 324 __get_cpu_var(wd_enabled) = 0;
318 return; 325 return;
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 3e6672274807..7a13fac63a1f 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -190,12 +190,12 @@ EXPORT_SYMBOL_GPL(olpc_ec_cmd);
190static void __init platform_detect(void) 190static void __init platform_detect(void)
191{ 191{
192 size_t propsize; 192 size_t propsize;
193 u32 rev; 193 __be32 rev;
194 194
195 if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4, 195 if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4,
196 &propsize) || propsize != 4) { 196 &propsize) || propsize != 4) {
197 printk(KERN_ERR "ofw: getprop call failed!\n"); 197 printk(KERN_ERR "ofw: getprop call failed!\n");
198 rev = 0; 198 rev = cpu_to_be32(0);
199 } 199 }
200 olpc_platform_info.boardrev = be32_to_cpu(rev); 200 olpc_platform_info.boardrev = be32_to_cpu(rev);
201} 201}
@@ -203,7 +203,7 @@ static void __init platform_detect(void)
203static void __init platform_detect(void) 203static void __init platform_detect(void)
204{ 204{
205 /* stopgap until OFW support is added to the kernel */ 205 /* stopgap until OFW support is added to the kernel */
206 olpc_platform_info.boardrev = be32_to_cpu(0xc2); 206 olpc_platform_info.boardrev = 0xc2;
207} 207}
208#endif 208#endif
209 209
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 4090cd6f8436..6b0bb73998dd 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -330,6 +330,7 @@ struct pv_cpu_ops pv_cpu_ops = {
330#endif 330#endif
331 .wbinvd = native_wbinvd, 331 .wbinvd = native_wbinvd,
332 .read_msr = native_read_msr_safe, 332 .read_msr = native_read_msr_safe,
333 .read_msr_amd = native_read_msr_amd_safe,
333 .write_msr = native_write_msr_safe, 334 .write_msr = native_write_msr_safe,
334 .read_tsc = native_read_tsc, 335 .read_tsc = native_read_tsc,
335 .read_pmc = native_read_pmc, 336 .read_pmc = native_read_pmc,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index dcdac6c826e9..080d1d27f37a 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -261,7 +261,7 @@ static void iommu_range_reserve(struct iommu_table *tbl,
261 badbit, tbl, start_addr, npages); 261 badbit, tbl, start_addr, npages);
262 } 262 }
263 263
264 set_bit_string(tbl->it_map, index, npages); 264 iommu_area_reserve(tbl->it_map, index, npages);
265 265
266 spin_unlock_irqrestore(&tbl->it_lock, flags); 266 spin_unlock_irqrestore(&tbl->it_lock, flags);
267} 267}
@@ -491,6 +491,8 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
491 npages = size >> PAGE_SHIFT; 491 npages = size >> PAGE_SHIFT;
492 order = get_order(size); 492 order = get_order(size);
493 493
494 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
495
494 /* alloc enough pages (and possibly more) */ 496 /* alloc enough pages (and possibly more) */
495 ret = (void *)__get_free_pages(flag, order); 497 ret = (void *)__get_free_pages(flag, order);
496 if (!ret) 498 if (!ret)
@@ -510,8 +512,22 @@ error:
510 return ret; 512 return ret;
511} 513}
512 514
515static void calgary_free_coherent(struct device *dev, size_t size,
516 void *vaddr, dma_addr_t dma_handle)
517{
518 unsigned int npages;
519 struct iommu_table *tbl = find_iommu_table(dev);
520
521 size = PAGE_ALIGN(size);
522 npages = size >> PAGE_SHIFT;
523
524 iommu_free(tbl, dma_handle, npages);
525 free_pages((unsigned long)vaddr, get_order(size));
526}
527
513static struct dma_mapping_ops calgary_dma_ops = { 528static struct dma_mapping_ops calgary_dma_ops = {
514 .alloc_coherent = calgary_alloc_coherent, 529 .alloc_coherent = calgary_alloc_coherent,
530 .free_coherent = calgary_free_coherent,
515 .map_single = calgary_map_single, 531 .map_single = calgary_map_single,
516 .unmap_single = calgary_unmap_single, 532 .unmap_single = calgary_unmap_single,
517 .map_sg = calgary_map_sg, 533 .map_sg = calgary_map_sg,
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index f704cb51ff82..0a3824e837b4 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -41,11 +41,12 @@ EXPORT_SYMBOL(bad_dma_address);
41/* Dummy device used for NULL arguments (normally ISA). Better would 41/* Dummy device used for NULL arguments (normally ISA). Better would
42 be probably a smaller DMA mask, but this is bug-to-bug compatible 42 be probably a smaller DMA mask, but this is bug-to-bug compatible
43 to older i386. */ 43 to older i386. */
44struct device fallback_dev = { 44struct device x86_dma_fallback_dev = {
45 .bus_id = "fallback device", 45 .bus_id = "fallback device",
46 .coherent_dma_mask = DMA_32BIT_MASK, 46 .coherent_dma_mask = DMA_32BIT_MASK,
47 .dma_mask = &fallback_dev.coherent_dma_mask, 47 .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask,
48}; 48};
49EXPORT_SYMBOL(x86_dma_fallback_dev);
49 50
50int dma_set_mask(struct device *dev, u64 mask) 51int dma_set_mask(struct device *dev, u64 mask)
51{ 52{
@@ -133,6 +134,37 @@ unsigned long iommu_num_pages(unsigned long addr, unsigned long len)
133EXPORT_SYMBOL(iommu_num_pages); 134EXPORT_SYMBOL(iommu_num_pages);
134#endif 135#endif
135 136
137void *dma_generic_alloc_coherent(struct device *dev, size_t size,
138 dma_addr_t *dma_addr, gfp_t flag)
139{
140 unsigned long dma_mask;
141 struct page *page;
142 dma_addr_t addr;
143
144 dma_mask = dma_alloc_coherent_mask(dev, flag);
145
146 flag |= __GFP_ZERO;
147again:
148 page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
149 if (!page)
150 return NULL;
151
152 addr = page_to_phys(page);
153 if (!is_buffer_dma_capable(dma_mask, addr, size)) {
154 __free_pages(page, get_order(size));
155
156 if (dma_mask < DMA_32BIT_MASK && !(flag & GFP_DMA)) {
157 flag = (flag & ~GFP_DMA32) | GFP_DMA;
158 goto again;
159 }
160
161 return NULL;
162 }
163
164 *dma_addr = addr;
165 return page_address(page);
166}
167
136/* 168/*
137 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter 169 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
138 * documentation. 170 * documentation.
@@ -241,147 +273,6 @@ int dma_supported(struct device *dev, u64 mask)
241} 273}
242EXPORT_SYMBOL(dma_supported); 274EXPORT_SYMBOL(dma_supported);
243 275
244/* Allocate DMA memory on node near device */
245static noinline struct page *
246dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
247{
248 int node;
249
250 node = dev_to_node(dev);
251
252 return alloc_pages_node(node, gfp, order);
253}
254
255/*
256 * Allocate memory for a coherent mapping.
257 */
258void *
259dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
260 gfp_t gfp)
261{
262 struct dma_mapping_ops *ops = get_dma_ops(dev);
263 void *memory = NULL;
264 struct page *page;
265 unsigned long dma_mask = 0;
266 dma_addr_t bus;
267 int noretry = 0;
268
269 /* ignore region specifiers */
270 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
271
272 if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
273 return memory;
274
275 if (!dev) {
276 dev = &fallback_dev;
277 gfp |= GFP_DMA;
278 }
279 dma_mask = dev->coherent_dma_mask;
280 if (dma_mask == 0)
281 dma_mask = (gfp & GFP_DMA) ? DMA_24BIT_MASK : DMA_32BIT_MASK;
282
283 /* Device not DMA able */
284 if (dev->dma_mask == NULL)
285 return NULL;
286
287 /* Don't invoke OOM killer or retry in lower 16MB DMA zone */
288 if (gfp & __GFP_DMA)
289 noretry = 1;
290
291#ifdef CONFIG_X86_64
292 /* Why <=? Even when the mask is smaller than 4GB it is often
293 larger than 16MB and in this case we have a chance of
294 finding fitting memory in the next higher zone first. If
295 not retry with true GFP_DMA. -AK */
296 if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
297 gfp |= GFP_DMA32;
298 if (dma_mask < DMA_32BIT_MASK)
299 noretry = 1;
300 }
301#endif
302
303 again:
304 page = dma_alloc_pages(dev,
305 noretry ? gfp | __GFP_NORETRY : gfp, get_order(size));
306 if (page == NULL)
307 return NULL;
308
309 {
310 int high, mmu;
311 bus = page_to_phys(page);
312 memory = page_address(page);
313 high = (bus + size) >= dma_mask;
314 mmu = high;
315 if (force_iommu && !(gfp & GFP_DMA))
316 mmu = 1;
317 else if (high) {
318 free_pages((unsigned long)memory,
319 get_order(size));
320
321 /* Don't use the 16MB ZONE_DMA unless absolutely
322 needed. It's better to use remapping first. */
323 if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
324 gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
325 goto again;
326 }
327
328 /* Let low level make its own zone decisions */
329 gfp &= ~(GFP_DMA32|GFP_DMA);
330
331 if (ops->alloc_coherent)
332 return ops->alloc_coherent(dev, size,
333 dma_handle, gfp);
334 return NULL;
335 }
336
337 memset(memory, 0, size);
338 if (!mmu) {
339 *dma_handle = bus;
340 return memory;
341 }
342 }
343
344 if (ops->alloc_coherent) {
345 free_pages((unsigned long)memory, get_order(size));
346 gfp &= ~(GFP_DMA|GFP_DMA32);
347 return ops->alloc_coherent(dev, size, dma_handle, gfp);
348 }
349
350 if (ops->map_simple) {
351 *dma_handle = ops->map_simple(dev, virt_to_phys(memory),
352 size,
353 PCI_DMA_BIDIRECTIONAL);
354 if (*dma_handle != bad_dma_address)
355 return memory;
356 }
357
358 if (panic_on_overflow)
359 panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",
360 (unsigned long)size);
361 free_pages((unsigned long)memory, get_order(size));
362 return NULL;
363}
364EXPORT_SYMBOL(dma_alloc_coherent);
365
366/*
367 * Unmap coherent memory.
368 * The caller must ensure that the device has finished accessing the mapping.
369 */
370void dma_free_coherent(struct device *dev, size_t size,
371 void *vaddr, dma_addr_t bus)
372{
373 struct dma_mapping_ops *ops = get_dma_ops(dev);
374
375 int order = get_order(size);
376 WARN_ON(irqs_disabled()); /* for portability */
377 if (dma_release_from_coherent(dev, order, vaddr))
378 return;
379 if (ops->unmap_single)
380 ops->unmap_single(dev, bus, size, 0);
381 free_pages((unsigned long)vaddr, order);
382}
383EXPORT_SYMBOL(dma_free_coherent);
384
385static int __init pci_iommu_init(void) 276static int __init pci_iommu_init(void)
386{ 277{
387 calgary_iommu_init(); 278 calgary_iommu_init();
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 49285f8fd4d5..145f1c83369f 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -27,8 +27,8 @@
27#include <linux/scatterlist.h> 27#include <linux/scatterlist.h>
28#include <linux/iommu-helper.h> 28#include <linux/iommu-helper.h>
29#include <linux/sysdev.h> 29#include <linux/sysdev.h>
30#include <linux/io.h>
30#include <asm/atomic.h> 31#include <asm/atomic.h>
31#include <asm/io.h>
32#include <asm/mtrr.h> 32#include <asm/mtrr.h>
33#include <asm/pgtable.h> 33#include <asm/pgtable.h>
34#include <asm/proto.h> 34#include <asm/proto.h>
@@ -80,9 +80,10 @@ AGPEXTERN int agp_memory_reserved;
80AGPEXTERN __u32 *agp_gatt_table; 80AGPEXTERN __u32 *agp_gatt_table;
81 81
82static unsigned long next_bit; /* protected by iommu_bitmap_lock */ 82static unsigned long next_bit; /* protected by iommu_bitmap_lock */
83static int need_flush; /* global flush state. set for each gart wrap */ 83static bool need_flush; /* global flush state. set for each gart wrap */
84 84
85static unsigned long alloc_iommu(struct device *dev, int size) 85static unsigned long alloc_iommu(struct device *dev, int size,
86 unsigned long align_mask)
86{ 87{
87 unsigned long offset, flags; 88 unsigned long offset, flags;
88 unsigned long boundary_size; 89 unsigned long boundary_size;
@@ -90,26 +91,27 @@ static unsigned long alloc_iommu(struct device *dev, int size)
90 91
91 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), 92 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
92 PAGE_SIZE) >> PAGE_SHIFT; 93 PAGE_SIZE) >> PAGE_SHIFT;
93 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, 94 boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1,
94 PAGE_SIZE) >> PAGE_SHIFT; 95 PAGE_SIZE) >> PAGE_SHIFT;
95 96
96 spin_lock_irqsave(&iommu_bitmap_lock, flags); 97 spin_lock_irqsave(&iommu_bitmap_lock, flags);
97 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit, 98 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit,
98 size, base_index, boundary_size, 0); 99 size, base_index, boundary_size, align_mask);
99 if (offset == -1) { 100 if (offset == -1) {
100 need_flush = 1; 101 need_flush = true;
101 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0, 102 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0,
102 size, base_index, boundary_size, 0); 103 size, base_index, boundary_size,
104 align_mask);
103 } 105 }
104 if (offset != -1) { 106 if (offset != -1) {
105 next_bit = offset+size; 107 next_bit = offset+size;
106 if (next_bit >= iommu_pages) { 108 if (next_bit >= iommu_pages) {
107 next_bit = 0; 109 next_bit = 0;
108 need_flush = 1; 110 need_flush = true;
109 } 111 }
110 } 112 }
111 if (iommu_fullflush) 113 if (iommu_fullflush)
112 need_flush = 1; 114 need_flush = true;
113 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 115 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
114 116
115 return offset; 117 return offset;
@@ -134,7 +136,7 @@ static void flush_gart(void)
134 spin_lock_irqsave(&iommu_bitmap_lock, flags); 136 spin_lock_irqsave(&iommu_bitmap_lock, flags);
135 if (need_flush) { 137 if (need_flush) {
136 k8_flush_garts(); 138 k8_flush_garts();
137 need_flush = 0; 139 need_flush = false;
138 } 140 }
139 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 141 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
140} 142}
@@ -173,7 +175,8 @@ static void dump_leak(void)
173 iommu_leak_pages); 175 iommu_leak_pages);
174 for (i = 0; i < iommu_leak_pages; i += 2) { 176 for (i = 0; i < iommu_leak_pages; i += 2) {
175 printk(KERN_DEBUG "%lu: ", iommu_pages-i); 177 printk(KERN_DEBUG "%lu: ", iommu_pages-i);
176 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], 0); 178 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i],
179 0);
177 printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' '); 180 printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' ');
178 } 181 }
179 printk(KERN_DEBUG "\n"); 182 printk(KERN_DEBUG "\n");
@@ -212,34 +215,24 @@ static void iommu_full(struct device *dev, size_t size, int dir)
212static inline int 215static inline int
213need_iommu(struct device *dev, unsigned long addr, size_t size) 216need_iommu(struct device *dev, unsigned long addr, size_t size)
214{ 217{
215 u64 mask = *dev->dma_mask; 218 return force_iommu ||
216 int high = addr + size > mask; 219 !is_buffer_dma_capable(*dev->dma_mask, addr, size);
217 int mmu = high;
218
219 if (force_iommu)
220 mmu = 1;
221
222 return mmu;
223} 220}
224 221
225static inline int 222static inline int
226nonforced_iommu(struct device *dev, unsigned long addr, size_t size) 223nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
227{ 224{
228 u64 mask = *dev->dma_mask; 225 return !is_buffer_dma_capable(*dev->dma_mask, addr, size);
229 int high = addr + size > mask;
230 int mmu = high;
231
232 return mmu;
233} 226}
234 227
235/* Map a single continuous physical area into the IOMMU. 228/* Map a single continuous physical area into the IOMMU.
236 * Caller needs to check if the iommu is needed and flush. 229 * Caller needs to check if the iommu is needed and flush.
237 */ 230 */
238static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, 231static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
239 size_t size, int dir) 232 size_t size, int dir, unsigned long align_mask)
240{ 233{
241 unsigned long npages = iommu_num_pages(phys_mem, size); 234 unsigned long npages = iommu_num_pages(phys_mem, size);
242 unsigned long iommu_page = alloc_iommu(dev, npages); 235 unsigned long iommu_page = alloc_iommu(dev, npages, align_mask);
243 int i; 236 int i;
244 237
245 if (iommu_page == -1) { 238 if (iommu_page == -1) {
@@ -259,16 +252,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
259 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); 252 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
260} 253}
261 254
262static dma_addr_t
263gart_map_simple(struct device *dev, phys_addr_t paddr, size_t size, int dir)
264{
265 dma_addr_t map = dma_map_area(dev, paddr, size, dir);
266
267 flush_gart();
268
269 return map;
270}
271
272/* Map a single area into the IOMMU */ 255/* Map a single area into the IOMMU */
273static dma_addr_t 256static dma_addr_t
274gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir) 257gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
@@ -276,12 +259,13 @@ gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
276 unsigned long bus; 259 unsigned long bus;
277 260
278 if (!dev) 261 if (!dev)
279 dev = &fallback_dev; 262 dev = &x86_dma_fallback_dev;
280 263
281 if (!need_iommu(dev, paddr, size)) 264 if (!need_iommu(dev, paddr, size))
282 return paddr; 265 return paddr;
283 266
284 bus = gart_map_simple(dev, paddr, size, dir); 267 bus = dma_map_area(dev, paddr, size, dir, 0);
268 flush_gart();
285 269
286 return bus; 270 return bus;
287} 271}
@@ -340,7 +324,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
340 unsigned long addr = sg_phys(s); 324 unsigned long addr = sg_phys(s);
341 325
342 if (nonforced_iommu(dev, addr, s->length)) { 326 if (nonforced_iommu(dev, addr, s->length)) {
343 addr = dma_map_area(dev, addr, s->length, dir); 327 addr = dma_map_area(dev, addr, s->length, dir, 0);
344 if (addr == bad_dma_address) { 328 if (addr == bad_dma_address) {
345 if (i > 0) 329 if (i > 0)
346 gart_unmap_sg(dev, sg, i, dir); 330 gart_unmap_sg(dev, sg, i, dir);
@@ -362,7 +346,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
362 int nelems, struct scatterlist *sout, 346 int nelems, struct scatterlist *sout,
363 unsigned long pages) 347 unsigned long pages)
364{ 348{
365 unsigned long iommu_start = alloc_iommu(dev, pages); 349 unsigned long iommu_start = alloc_iommu(dev, pages, 0);
366 unsigned long iommu_page = iommu_start; 350 unsigned long iommu_page = iommu_start;
367 struct scatterlist *s; 351 struct scatterlist *s;
368 int i; 352 int i;
@@ -427,7 +411,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
427 return 0; 411 return 0;
428 412
429 if (!dev) 413 if (!dev)
430 dev = &fallback_dev; 414 dev = &x86_dma_fallback_dev;
431 415
432 out = 0; 416 out = 0;
433 start = 0; 417 start = 0;
@@ -499,6 +483,46 @@ error:
499 return 0; 483 return 0;
500} 484}
501 485
486/* allocate and map a coherent mapping */
487static void *
488gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
489 gfp_t flag)
490{
491 dma_addr_t paddr;
492 unsigned long align_mask;
493 struct page *page;
494
495 if (force_iommu && !(flag & GFP_DMA)) {
496 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
497 page = alloc_pages(flag | __GFP_ZERO, get_order(size));
498 if (!page)
499 return NULL;
500
501 align_mask = (1UL << get_order(size)) - 1;
502 paddr = dma_map_area(dev, page_to_phys(page), size,
503 DMA_BIDIRECTIONAL, align_mask);
504
505 flush_gart();
506 if (paddr != bad_dma_address) {
507 *dma_addr = paddr;
508 return page_address(page);
509 }
510 __free_pages(page, get_order(size));
511 } else
512 return dma_generic_alloc_coherent(dev, size, dma_addr, flag);
513
514 return NULL;
515}
516
517/* free a coherent mapping */
518static void
519gart_free_coherent(struct device *dev, size_t size, void *vaddr,
520 dma_addr_t dma_addr)
521{
522 gart_unmap_single(dev, dma_addr, size, DMA_BIDIRECTIONAL);
523 free_pages((unsigned long)vaddr, get_order(size));
524}
525
502static int no_agp; 526static int no_agp;
503 527
504static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) 528static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
@@ -626,7 +650,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
626 struct pci_dev *dev; 650 struct pci_dev *dev;
627 void *gatt; 651 void *gatt;
628 int i, error; 652 int i, error;
629 unsigned long start_pfn, end_pfn;
630 653
631 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); 654 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
632 aper_size = aper_base = info->aper_size = 0; 655 aper_size = aper_base = info->aper_size = 0;
@@ -650,13 +673,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
650 info->aper_size = aper_size >> 20; 673 info->aper_size = aper_size >> 20;
651 674
652 gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); 675 gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32);
653 gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); 676 gatt = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
677 get_order(gatt_size));
654 if (!gatt) 678 if (!gatt)
655 panic("Cannot allocate GATT table"); 679 panic("Cannot allocate GATT table");
656 if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT)) 680 if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT))
657 panic("Could not set GART PTEs to uncacheable pages"); 681 panic("Could not set GART PTEs to uncacheable pages");
658 682
659 memset(gatt, 0, gatt_size);
660 agp_gatt_table = gatt; 683 agp_gatt_table = gatt;
661 684
662 enable_gart_translations(); 685 enable_gart_translations();
@@ -665,19 +688,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
665 if (!error) 688 if (!error)
666 error = sysdev_register(&device_gart); 689 error = sysdev_register(&device_gart);
667 if (error) 690 if (error)
668 panic("Could not register gart_sysdev -- would corrupt data on next suspend"); 691 panic("Could not register gart_sysdev -- "
692 "would corrupt data on next suspend");
669 693
670 flush_gart(); 694 flush_gart();
671 695
672 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", 696 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n",
673 aper_base, aper_size>>10); 697 aper_base, aper_size>>10);
674 698
675 /* need to map that range */
676 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
677 if (end_pfn > max_low_pfn_mapped) {
678 start_pfn = (aper_base>>PAGE_SHIFT);
679 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
680 }
681 return 0; 699 return 0;
682 700
683 nommu: 701 nommu:
@@ -687,20 +705,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
687 return -1; 705 return -1;
688} 706}
689 707
690extern int agp_amd64_init(void);
691
692static struct dma_mapping_ops gart_dma_ops = { 708static struct dma_mapping_ops gart_dma_ops = {
693 .map_single = gart_map_single, 709 .map_single = gart_map_single,
694 .map_simple = gart_map_simple,
695 .unmap_single = gart_unmap_single, 710 .unmap_single = gart_unmap_single,
696 .sync_single_for_cpu = NULL,
697 .sync_single_for_device = NULL,
698 .sync_single_range_for_cpu = NULL,
699 .sync_single_range_for_device = NULL,
700 .sync_sg_for_cpu = NULL,
701 .sync_sg_for_device = NULL,
702 .map_sg = gart_map_sg, 711 .map_sg = gart_map_sg,
703 .unmap_sg = gart_unmap_sg, 712 .unmap_sg = gart_unmap_sg,
713 .alloc_coherent = gart_alloc_coherent,
714 .free_coherent = gart_free_coherent,
704}; 715};
705 716
706void gart_iommu_shutdown(void) 717void gart_iommu_shutdown(void)
@@ -727,7 +738,8 @@ void __init gart_iommu_init(void)
727{ 738{
728 struct agp_kern_info info; 739 struct agp_kern_info info;
729 unsigned long iommu_start; 740 unsigned long iommu_start;
730 unsigned long aper_size; 741 unsigned long aper_base, aper_size;
742 unsigned long start_pfn, end_pfn;
731 unsigned long scratch; 743 unsigned long scratch;
732 long i; 744 long i;
733 745
@@ -759,30 +771,35 @@ void __init gart_iommu_init(void)
759 (no_agp && init_k8_gatt(&info) < 0)) { 771 (no_agp && init_k8_gatt(&info) < 0)) {
760 if (max_pfn > MAX_DMA32_PFN) { 772 if (max_pfn > MAX_DMA32_PFN) {
761 printk(KERN_WARNING "More than 4GB of memory " 773 printk(KERN_WARNING "More than 4GB of memory "
762 "but GART IOMMU not available.\n" 774 "but GART IOMMU not available.\n");
763 KERN_WARNING "falling back to iommu=soft.\n"); 775 printk(KERN_WARNING "falling back to iommu=soft.\n");
764 } 776 }
765 return; 777 return;
766 } 778 }
767 779
780 /* need to map that range */
781 aper_size = info.aper_size << 20;
782 aper_base = info.aper_base;
783 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
784 if (end_pfn > max_low_pfn_mapped) {
785 start_pfn = (aper_base>>PAGE_SHIFT);
786 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
787 }
788
768 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); 789 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
769 aper_size = info.aper_size * 1024 * 1024;
770 iommu_size = check_iommu_size(info.aper_base, aper_size); 790 iommu_size = check_iommu_size(info.aper_base, aper_size);
771 iommu_pages = iommu_size >> PAGE_SHIFT; 791 iommu_pages = iommu_size >> PAGE_SHIFT;
772 792
773 iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL, 793 iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO,
774 get_order(iommu_pages/8)); 794 get_order(iommu_pages/8));
775 if (!iommu_gart_bitmap) 795 if (!iommu_gart_bitmap)
776 panic("Cannot allocate iommu bitmap\n"); 796 panic("Cannot allocate iommu bitmap\n");
777 memset(iommu_gart_bitmap, 0, iommu_pages/8);
778 797
779#ifdef CONFIG_IOMMU_LEAK 798#ifdef CONFIG_IOMMU_LEAK
780 if (leak_trace) { 799 if (leak_trace) {
781 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, 800 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
782 get_order(iommu_pages*sizeof(void *))); 801 get_order(iommu_pages*sizeof(void *)));
783 if (iommu_leak_tab) 802 if (!iommu_leak_tab)
784 memset(iommu_leak_tab, 0, iommu_pages * 8);
785 else
786 printk(KERN_DEBUG 803 printk(KERN_DEBUG
787 "PCI-DMA: Cannot allocate leak trace area\n"); 804 "PCI-DMA: Cannot allocate leak trace area\n");
788 } 805 }
@@ -792,7 +809,7 @@ void __init gart_iommu_init(void)
792 * Out of IOMMU space handling. 809 * Out of IOMMU space handling.
793 * Reserve some invalid pages at the beginning of the GART. 810 * Reserve some invalid pages at the beginning of the GART.
794 */ 811 */
795 set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); 812 iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
796 813
797 agp_memory_reserved = iommu_size; 814 agp_memory_reserved = iommu_size;
798 printk(KERN_INFO 815 printk(KERN_INFO
@@ -850,7 +867,8 @@ void __init gart_parse_options(char *p)
850 if (!strncmp(p, "leak", 4)) { 867 if (!strncmp(p, "leak", 4)) {
851 leak_trace = 1; 868 leak_trace = 1;
852 p += 4; 869 p += 4;
853 if (*p == '=') ++p; 870 if (*p == '=')
871 ++p;
854 if (isdigit(*p) && get_option(&p, &arg)) 872 if (isdigit(*p) && get_option(&p, &arg))
855 iommu_leak_pages = arg; 873 iommu_leak_pages = arg;
856 } 874 }
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 3f91f71cdc3e..c70ab5a5d4c8 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -14,7 +14,7 @@
14static int 14static int
15check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) 15check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
16{ 16{
17 if (hwdev && bus + size > *hwdev->dma_mask) { 17 if (hwdev && !is_buffer_dma_capable(*hwdev->dma_mask, bus, size)) {
18 if (*hwdev->dma_mask >= DMA_32BIT_MASK) 18 if (*hwdev->dma_mask >= DMA_32BIT_MASK)
19 printk(KERN_ERR 19 printk(KERN_ERR
20 "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", 20 "nommu_%s: overflow %Lx+%zu of device mask %Lx\n",
@@ -72,7 +72,15 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
72 return nents; 72 return nents;
73} 73}
74 74
75static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
76 dma_addr_t dma_addr)
77{
78 free_pages((unsigned long)vaddr, get_order(size));
79}
80
75struct dma_mapping_ops nommu_dma_ops = { 81struct dma_mapping_ops nommu_dma_ops = {
82 .alloc_coherent = dma_generic_alloc_coherent,
83 .free_coherent = nommu_free_coherent,
76 .map_single = nommu_map_single, 84 .map_single = nommu_map_single,
77 .map_sg = nommu_map_sg, 85 .map_sg = nommu_map_sg,
78 .is_phys = 1, 86 .is_phys = 1,
diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c
index bc1f2d3ea277..a311ffcaad16 100644
--- a/arch/x86/kernel/pcspeaker.c
+++ b/arch/x86/kernel/pcspeaker.c
@@ -1,20 +1,13 @@
1#include <linux/platform_device.h> 1#include <linux/platform_device.h>
2#include <linux/errno.h> 2#include <linux/err.h>
3#include <linux/init.h> 3#include <linux/init.h>
4 4
5static __init int add_pcspkr(void) 5static __init int add_pcspkr(void)
6{ 6{
7 struct platform_device *pd; 7 struct platform_device *pd;
8 int ret;
9 8
10 pd = platform_device_alloc("pcspkr", -1); 9 pd = platform_device_register_simple("pcspkr", -1, NULL, 0);
11 if (!pd)
12 return -ENOMEM;
13 10
14 ret = platform_device_add(pd); 11 return IS_ERR(pd) ? PTR_ERR(pd) : 0;
15 if (ret)
16 platform_device_put(pd);
17
18 return ret;
19} 12}
20device_initcall(add_pcspkr); 13device_initcall(add_pcspkr);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 7fc4d5b0a6a0..c622772744d8 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -15,7 +15,6 @@ unsigned long idle_nomwait;
15EXPORT_SYMBOL(idle_nomwait); 15EXPORT_SYMBOL(idle_nomwait);
16 16
17struct kmem_cache *task_xstate_cachep; 17struct kmem_cache *task_xstate_cachep;
18static int force_mwait __cpuinitdata;
19 18
20int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 19int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
21{ 20{
@@ -185,7 +184,8 @@ static void mwait_idle(void)
185static void poll_idle(void) 184static void poll_idle(void)
186{ 185{
187 local_irq_enable(); 186 local_irq_enable();
188 cpu_relax(); 187 while (!need_resched())
188 cpu_relax();
189} 189}
190 190
191/* 191/*
@@ -246,6 +246,14 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
246 return 1; 246 return 1;
247} 247}
248 248
249static cpumask_t c1e_mask = CPU_MASK_NONE;
250static int c1e_detected;
251
252void c1e_remove_cpu(int cpu)
253{
254 cpu_clear(cpu, c1e_mask);
255}
256
249/* 257/*
250 * C1E aware idle routine. We check for C1E active in the interrupt 258 * C1E aware idle routine. We check for C1E active in the interrupt
251 * pending message MSR. If we detect C1E, then we handle it the same 259 * pending message MSR. If we detect C1E, then we handle it the same
@@ -253,9 +261,6 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
253 */ 261 */
254static void c1e_idle(void) 262static void c1e_idle(void)
255{ 263{
256 static cpumask_t c1e_mask = CPU_MASK_NONE;
257 static int c1e_detected;
258
259 if (need_resched()) 264 if (need_resched())
260 return; 265 return;
261 266
@@ -265,8 +270,10 @@ static void c1e_idle(void)
265 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); 270 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
266 if (lo & K8_INTP_C1E_ACTIVE_MASK) { 271 if (lo & K8_INTP_C1E_ACTIVE_MASK) {
267 c1e_detected = 1; 272 c1e_detected = 1;
268 mark_tsc_unstable("TSC halt in C1E"); 273 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
269 printk(KERN_INFO "System has C1E enabled\n"); 274 mark_tsc_unstable("TSC halt in AMD C1E");
275 printk(KERN_INFO "System has AMD C1E enabled\n");
276 set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
270 } 277 }
271 } 278 }
272 279
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 2c9abc95e026..205188db9626 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -37,6 +37,7 @@
37#include <linux/tick.h> 37#include <linux/tick.h>
38#include <linux/percpu.h> 38#include <linux/percpu.h>
39#include <linux/prctl.h> 39#include <linux/prctl.h>
40#include <linux/dmi.h>
40 41
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42#include <asm/pgtable.h> 43#include <asm/pgtable.h>
@@ -55,6 +56,7 @@
55#include <asm/tlbflush.h> 56#include <asm/tlbflush.h>
56#include <asm/cpu.h> 57#include <asm/cpu.h>
57#include <asm/kdebug.h> 58#include <asm/kdebug.h>
59#include <asm/idle.h>
58#include <asm/syscalls.h> 60#include <asm/syscalls.h>
59#include <asm/smp.h> 61#include <asm/smp.h>
60 62
@@ -90,6 +92,7 @@ static void cpu_exit_clear(void)
90 cpu_clear(cpu, cpu_callin_map); 92 cpu_clear(cpu, cpu_callin_map);
91 93
92 numa_remove_cpu(cpu); 94 numa_remove_cpu(cpu);
95 c1e_remove_cpu(cpu);
93} 96}
94 97
95/* We don't actually take CPU down, just spin without interrupts. */ 98/* We don't actually take CPU down, just spin without interrupts. */
@@ -161,6 +164,7 @@ void __show_registers(struct pt_regs *regs, int all)
161 unsigned long d0, d1, d2, d3, d6, d7; 164 unsigned long d0, d1, d2, d3, d6, d7;
162 unsigned long sp; 165 unsigned long sp;
163 unsigned short ss, gs; 166 unsigned short ss, gs;
167 const char *board;
164 168
165 if (user_mode_vm(regs)) { 169 if (user_mode_vm(regs)) {
166 sp = regs->sp; 170 sp = regs->sp;
@@ -173,11 +177,15 @@ void __show_registers(struct pt_regs *regs, int all)
173 } 177 }
174 178
175 printk("\n"); 179 printk("\n");
176 printk("Pid: %d, comm: %s %s (%s %.*s)\n", 180
181 board = dmi_get_system_info(DMI_PRODUCT_NAME);
182 if (!board)
183 board = "";
184 printk("Pid: %d, comm: %s %s (%s %.*s) %s\n",
177 task_pid_nr(current), current->comm, 185 task_pid_nr(current), current->comm,
178 print_tainted(), init_utsname()->release, 186 print_tainted(), init_utsname()->release,
179 (int)strcspn(init_utsname()->version, " "), 187 (int)strcspn(init_utsname()->version, " "),
180 init_utsname()->version); 188 init_utsname()->version, board);
181 189
182 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", 190 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
183 (u16)regs->cs, regs->ip, regs->flags, 191 (u16)regs->cs, regs->ip, regs->flags,
@@ -277,6 +285,14 @@ void exit_thread(void)
277 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; 285 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
278 put_cpu(); 286 put_cpu();
279 } 287 }
288#ifdef CONFIG_X86_DS
289 /* Free any DS contexts that have not been properly released. */
290 if (unlikely(current->thread.ds_ctx)) {
291 /* we clear debugctl to make sure DS is not used. */
292 update_debugctlmsr(0);
293 ds_free(current->thread.ds_ctx);
294 }
295#endif /* CONFIG_X86_DS */
280} 296}
281 297
282void flush_thread(void) 298void flush_thread(void)
@@ -438,6 +454,35 @@ int set_tsc_mode(unsigned int val)
438 return 0; 454 return 0;
439} 455}
440 456
457#ifdef CONFIG_X86_DS
458static int update_debugctl(struct thread_struct *prev,
459 struct thread_struct *next, unsigned long debugctl)
460{
461 unsigned long ds_prev = 0;
462 unsigned long ds_next = 0;
463
464 if (prev->ds_ctx)
465 ds_prev = (unsigned long)prev->ds_ctx->ds;
466 if (next->ds_ctx)
467 ds_next = (unsigned long)next->ds_ctx->ds;
468
469 if (ds_next != ds_prev) {
470 /* we clear debugctl to make sure DS
471 * is not in use when we change it */
472 debugctl = 0;
473 update_debugctlmsr(0);
474 wrmsr(MSR_IA32_DS_AREA, ds_next, 0);
475 }
476 return debugctl;
477}
478#else
479static int update_debugctl(struct thread_struct *prev,
480 struct thread_struct *next, unsigned long debugctl)
481{
482 return debugctl;
483}
484#endif /* CONFIG_X86_DS */
485
441static noinline void 486static noinline void
442__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, 487__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
443 struct tss_struct *tss) 488 struct tss_struct *tss)
@@ -448,14 +493,7 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
448 prev = &prev_p->thread; 493 prev = &prev_p->thread;
449 next = &next_p->thread; 494 next = &next_p->thread;
450 495
451 debugctl = prev->debugctlmsr; 496 debugctl = update_debugctl(prev, next, prev->debugctlmsr);
452 if (next->ds_area_msr != prev->ds_area_msr) {
453 /* we clear debugctl to make sure DS
454 * is not in use when we change it */
455 debugctl = 0;
456 update_debugctlmsr(0);
457 wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
458 }
459 497
460 if (next->debugctlmsr != debugctl) 498 if (next->debugctlmsr != debugctl)
461 update_debugctlmsr(next->debugctlmsr); 499 update_debugctlmsr(next->debugctlmsr);
@@ -479,13 +517,13 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
479 hard_enable_TSC(); 517 hard_enable_TSC();
480 } 518 }
481 519
482#ifdef X86_BTS 520#ifdef CONFIG_X86_PTRACE_BTS
483 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) 521 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
484 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); 522 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
485 523
486 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) 524 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
487 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); 525 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
488#endif 526#endif /* CONFIG_X86_PTRACE_BTS */
489 527
490 528
491 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { 529 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 7502508a7664..b6b508ea7110 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -94,6 +94,8 @@ DECLARE_PER_CPU(int, cpu_state);
94static inline void play_dead(void) 94static inline void play_dead(void)
95{ 95{
96 idle_task_exit(); 96 idle_task_exit();
97 c1e_remove_cpu(raw_smp_processor_id());
98
97 mb(); 99 mb();
98 /* Ack it */ 100 /* Ack it */
99 __get_cpu_var(cpu_state) = CPU_DEAD; 101 __get_cpu_var(cpu_state) = CPU_DEAD;
@@ -241,6 +243,14 @@ void exit_thread(void)
241 t->io_bitmap_max = 0; 243 t->io_bitmap_max = 0;
242 put_cpu(); 244 put_cpu();
243 } 245 }
246#ifdef CONFIG_X86_DS
247 /* Free any DS contexts that have not been properly released. */
248 if (unlikely(t->ds_ctx)) {
249 /* we clear debugctl to make sure DS is not used. */
250 update_debugctlmsr(0);
251 ds_free(t->ds_ctx);
252 }
253#endif /* CONFIG_X86_DS */
244} 254}
245 255
246void flush_thread(void) 256void flush_thread(void)
@@ -474,13 +484,27 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
474 next = &next_p->thread; 484 next = &next_p->thread;
475 485
476 debugctl = prev->debugctlmsr; 486 debugctl = prev->debugctlmsr;
477 if (next->ds_area_msr != prev->ds_area_msr) { 487
478 /* we clear debugctl to make sure DS 488#ifdef CONFIG_X86_DS
479 * is not in use when we change it */ 489 {
480 debugctl = 0; 490 unsigned long ds_prev = 0, ds_next = 0;
481 update_debugctlmsr(0); 491
482 wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr); 492 if (prev->ds_ctx)
493 ds_prev = (unsigned long)prev->ds_ctx->ds;
494 if (next->ds_ctx)
495 ds_next = (unsigned long)next->ds_ctx->ds;
496
497 if (ds_next != ds_prev) {
498 /*
499 * We clear debugctl to make sure DS
500 * is not in use when we change it:
501 */
502 debugctl = 0;
503 update_debugctlmsr(0);
504 wrmsrl(MSR_IA32_DS_AREA, ds_next);
505 }
483 } 506 }
507#endif /* CONFIG_X86_DS */
484 508
485 if (next->debugctlmsr != debugctl) 509 if (next->debugctlmsr != debugctl)
486 update_debugctlmsr(next->debugctlmsr); 510 update_debugctlmsr(next->debugctlmsr);
@@ -518,13 +542,13 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
518 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); 542 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
519 } 543 }
520 544
521#ifdef X86_BTS 545#ifdef CONFIG_X86_PTRACE_BTS
522 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) 546 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
523 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); 547 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
524 548
525 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) 549 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
526 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); 550 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
527#endif 551#endif /* CONFIG_X86_PTRACE_BTS */
528} 552}
529 553
530/* 554/*
@@ -730,12 +754,12 @@ unsigned long get_wchan(struct task_struct *p)
730 if (!p || p == current || p->state == TASK_RUNNING) 754 if (!p || p == current || p->state == TASK_RUNNING)
731 return 0; 755 return 0;
732 stack = (unsigned long)task_stack_page(p); 756 stack = (unsigned long)task_stack_page(p);
733 if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE) 757 if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
734 return 0; 758 return 0;
735 fp = *(u64 *)(p->thread.sp); 759 fp = *(u64 *)(p->thread.sp);
736 do { 760 do {
737 if (fp < (unsigned long)stack || 761 if (fp < (unsigned long)stack ||
738 fp > (unsigned long)stack+THREAD_SIZE) 762 fp >= (unsigned long)stack+THREAD_SIZE)
739 return 0; 763 return 0;
740 ip = *(u64 *)(fp+8); 764 ip = *(u64 *)(fp+8);
741 if (!in_sched_functions(ip)) 765 if (!in_sched_functions(ip))
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index bf45cdf1aaca..42ec4421e10b 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -555,45 +555,115 @@ static int ptrace_set_debugreg(struct task_struct *child,
555 return 0; 555 return 0;
556} 556}
557 557
558#ifdef X86_BTS 558#ifdef CONFIG_X86_PTRACE_BTS
559/*
560 * The configuration for a particular BTS hardware implementation.
561 */
562struct bts_configuration {
563 /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */
564 unsigned char sizeof_bts;
565 /* the size of a field in the BTS record in bytes */
566 unsigned char sizeof_field;
567 /* a bitmask to enable/disable BTS in DEBUGCTL MSR */
568 unsigned long debugctl_mask;
569};
570static struct bts_configuration bts_cfg;
571
572#define BTS_MAX_RECORD_SIZE (8 * 3)
573
574
575/*
576 * Branch Trace Store (BTS) uses the following format. Different
577 * architectures vary in the size of those fields.
578 * - source linear address
579 * - destination linear address
580 * - flags
581 *
582 * Later architectures use 64bit pointers throughout, whereas earlier
583 * architectures use 32bit pointers in 32bit mode.
584 *
585 * We compute the base address for the first 8 fields based on:
586 * - the field size stored in the DS configuration
587 * - the relative field position
588 *
589 * In order to store additional information in the BTS buffer, we use
590 * a special source address to indicate that the record requires
591 * special interpretation.
592 *
593 * Netburst indicated via a bit in the flags field whether the branch
594 * was predicted; this is ignored.
595 */
596
597enum bts_field {
598 bts_from = 0,
599 bts_to,
600 bts_flags,
601
602 bts_escape = (unsigned long)-1,
603 bts_qual = bts_to,
604 bts_jiffies = bts_flags
605};
559 606
560static int ptrace_bts_get_size(struct task_struct *child) 607static inline unsigned long bts_get(const char *base, enum bts_field field)
561{ 608{
562 if (!child->thread.ds_area_msr) 609 base += (bts_cfg.sizeof_field * field);
563 return -ENXIO; 610 return *(unsigned long *)base;
611}
612
613static inline void bts_set(char *base, enum bts_field field, unsigned long val)
614{
615 base += (bts_cfg.sizeof_field * field);;
616 (*(unsigned long *)base) = val;
617}
564 618
565 return ds_get_bts_index((void *)child->thread.ds_area_msr); 619/*
620 * Translate a BTS record from the raw format into the bts_struct format
621 *
622 * out (out): bts_struct interpretation
623 * raw: raw BTS record
624 */
625static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw)
626{
627 memset(out, 0, sizeof(*out));
628 if (bts_get(raw, bts_from) == bts_escape) {
629 out->qualifier = bts_get(raw, bts_qual);
630 out->variant.jiffies = bts_get(raw, bts_jiffies);
631 } else {
632 out->qualifier = BTS_BRANCH;
633 out->variant.lbr.from_ip = bts_get(raw, bts_from);
634 out->variant.lbr.to_ip = bts_get(raw, bts_to);
635 }
566} 636}
567 637
568static int ptrace_bts_read_record(struct task_struct *child, 638static int ptrace_bts_read_record(struct task_struct *child, size_t index,
569 long index,
570 struct bts_struct __user *out) 639 struct bts_struct __user *out)
571{ 640{
572 struct bts_struct ret; 641 struct bts_struct ret;
573 int retval; 642 const void *bts_record;
574 int bts_end; 643 size_t bts_index, bts_end;
575 int bts_index; 644 int error;
576 645
577 if (!child->thread.ds_area_msr) 646 error = ds_get_bts_end(child, &bts_end);
578 return -ENXIO; 647 if (error < 0)
648 return error;
579 649
580 if (index < 0)
581 return -EINVAL;
582
583 bts_end = ds_get_bts_end((void *)child->thread.ds_area_msr);
584 if (bts_end <= index) 650 if (bts_end <= index)
585 return -EINVAL; 651 return -EINVAL;
586 652
653 error = ds_get_bts_index(child, &bts_index);
654 if (error < 0)
655 return error;
656
587 /* translate the ptrace bts index into the ds bts index */ 657 /* translate the ptrace bts index into the ds bts index */
588 bts_index = ds_get_bts_index((void *)child->thread.ds_area_msr); 658 bts_index += bts_end - (index + 1);
589 bts_index -= (index + 1); 659 if (bts_end <= bts_index)
590 if (bts_index < 0) 660 bts_index -= bts_end;
591 bts_index += bts_end; 661
662 error = ds_access_bts(child, bts_index, &bts_record);
663 if (error < 0)
664 return error;
592 665
593 retval = ds_read_bts((void *)child->thread.ds_area_msr, 666 ptrace_bts_translate_record(&ret, bts_record);
594 bts_index, &ret);
595 if (retval < 0)
596 return retval;
597 667
598 if (copy_to_user(out, &ret, sizeof(ret))) 668 if (copy_to_user(out, &ret, sizeof(ret)))
599 return -EFAULT; 669 return -EFAULT;
@@ -601,101 +671,106 @@ static int ptrace_bts_read_record(struct task_struct *child,
601 return sizeof(ret); 671 return sizeof(ret);
602} 672}
603 673
604static int ptrace_bts_clear(struct task_struct *child)
605{
606 if (!child->thread.ds_area_msr)
607 return -ENXIO;
608
609 return ds_clear((void *)child->thread.ds_area_msr);
610}
611
612static int ptrace_bts_drain(struct task_struct *child, 674static int ptrace_bts_drain(struct task_struct *child,
613 long size, 675 long size,
614 struct bts_struct __user *out) 676 struct bts_struct __user *out)
615{ 677{
616 int end, i; 678 struct bts_struct ret;
617 void *ds = (void *)child->thread.ds_area_msr; 679 const unsigned char *raw;
618 680 size_t end, i;
619 if (!ds) 681 int error;
620 return -ENXIO;
621 682
622 end = ds_get_bts_index(ds); 683 error = ds_get_bts_index(child, &end);
623 if (end <= 0) 684 if (error < 0)
624 return end; 685 return error;
625 686
626 if (size < (end * sizeof(struct bts_struct))) 687 if (size < (end * sizeof(struct bts_struct)))
627 return -EIO; 688 return -EIO;
628 689
629 for (i = 0; i < end; i++, out++) { 690 error = ds_access_bts(child, 0, (const void **)&raw);
630 struct bts_struct ret; 691 if (error < 0)
631 int retval; 692 return error;
632 693
633 retval = ds_read_bts(ds, i, &ret); 694 for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) {
634 if (retval < 0) 695 ptrace_bts_translate_record(&ret, raw);
635 return retval;
636 696
637 if (copy_to_user(out, &ret, sizeof(ret))) 697 if (copy_to_user(out, &ret, sizeof(ret)))
638 return -EFAULT; 698 return -EFAULT;
639 } 699 }
640 700
641 ds_clear(ds); 701 error = ds_clear_bts(child);
702 if (error < 0)
703 return error;
642 704
643 return end; 705 return end;
644} 706}
645 707
708static void ptrace_bts_ovfl(struct task_struct *child)
709{
710 send_sig(child->thread.bts_ovfl_signal, child, 0);
711}
712
646static int ptrace_bts_config(struct task_struct *child, 713static int ptrace_bts_config(struct task_struct *child,
647 long cfg_size, 714 long cfg_size,
648 const struct ptrace_bts_config __user *ucfg) 715 const struct ptrace_bts_config __user *ucfg)
649{ 716{
650 struct ptrace_bts_config cfg; 717 struct ptrace_bts_config cfg;
651 int bts_size, ret = 0; 718 int error = 0;
652 void *ds; 719
720 error = -EOPNOTSUPP;
721 if (!bts_cfg.sizeof_bts)
722 goto errout;
653 723
724 error = -EIO;
654 if (cfg_size < sizeof(cfg)) 725 if (cfg_size < sizeof(cfg))
655 return -EIO; 726 goto errout;
656 727
728 error = -EFAULT;
657 if (copy_from_user(&cfg, ucfg, sizeof(cfg))) 729 if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
658 return -EFAULT; 730 goto errout;
659 731
660 if ((int)cfg.size < 0) 732 error = -EINVAL;
661 return -EINVAL; 733 if ((cfg.flags & PTRACE_BTS_O_SIGNAL) &&
734 !(cfg.flags & PTRACE_BTS_O_ALLOC))
735 goto errout;
662 736
663 bts_size = 0; 737 if (cfg.flags & PTRACE_BTS_O_ALLOC) {
664 ds = (void *)child->thread.ds_area_msr; 738 ds_ovfl_callback_t ovfl = NULL;
665 if (ds) { 739 unsigned int sig = 0;
666 bts_size = ds_get_bts_size(ds); 740
667 if (bts_size < 0) 741 /* we ignore the error in case we were not tracing child */
668 return bts_size; 742 (void)ds_release_bts(child);
669 } 743
670 cfg.size = PAGE_ALIGN(cfg.size); 744 if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
745 if (!cfg.signal)
746 goto errout;
747
748 sig = cfg.signal;
749 ovfl = ptrace_bts_ovfl;
750 }
671 751
672 if (bts_size != cfg.size) { 752 error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl);
673 ret = ptrace_bts_realloc(child, cfg.size, 753 if (error < 0)
674 cfg.flags & PTRACE_BTS_O_CUT_SIZE);
675 if (ret < 0)
676 goto errout; 754 goto errout;
677 755
678 ds = (void *)child->thread.ds_area_msr; 756 child->thread.bts_ovfl_signal = sig;
679 } 757 }
680 758
681 if (cfg.flags & PTRACE_BTS_O_SIGNAL) 759 error = -EINVAL;
682 ret = ds_set_overflow(ds, DS_O_SIGNAL); 760 if (!child->thread.ds_ctx && cfg.flags)
683 else
684 ret = ds_set_overflow(ds, DS_O_WRAP);
685 if (ret < 0)
686 goto errout; 761 goto errout;
687 762
688 if (cfg.flags & PTRACE_BTS_O_TRACE) 763 if (cfg.flags & PTRACE_BTS_O_TRACE)
689 child->thread.debugctlmsr |= ds_debugctl_mask(); 764 child->thread.debugctlmsr |= bts_cfg.debugctl_mask;
690 else 765 else
691 child->thread.debugctlmsr &= ~ds_debugctl_mask(); 766 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
692 767
693 if (cfg.flags & PTRACE_BTS_O_SCHED) 768 if (cfg.flags & PTRACE_BTS_O_SCHED)
694 set_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 769 set_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
695 else 770 else
696 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 771 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
697 772
698 ret = sizeof(cfg); 773 error = sizeof(cfg);
699 774
700out: 775out:
701 if (child->thread.debugctlmsr) 776 if (child->thread.debugctlmsr)
@@ -703,10 +778,10 @@ out:
703 else 778 else
704 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 779 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
705 780
706 return ret; 781 return error;
707 782
708errout: 783errout:
709 child->thread.debugctlmsr &= ~ds_debugctl_mask(); 784 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
710 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 785 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
711 goto out; 786 goto out;
712} 787}
@@ -715,29 +790,40 @@ static int ptrace_bts_status(struct task_struct *child,
715 long cfg_size, 790 long cfg_size,
716 struct ptrace_bts_config __user *ucfg) 791 struct ptrace_bts_config __user *ucfg)
717{ 792{
718 void *ds = (void *)child->thread.ds_area_msr;
719 struct ptrace_bts_config cfg; 793 struct ptrace_bts_config cfg;
794 size_t end;
795 const void *base, *max;
796 int error;
720 797
721 if (cfg_size < sizeof(cfg)) 798 if (cfg_size < sizeof(cfg))
722 return -EIO; 799 return -EIO;
723 800
724 memset(&cfg, 0, sizeof(cfg)); 801 error = ds_get_bts_end(child, &end);
802 if (error < 0)
803 return error;
725 804
726 if (ds) { 805 error = ds_access_bts(child, /* index = */ 0, &base);
727 cfg.size = ds_get_bts_size(ds); 806 if (error < 0)
807 return error;
728 808
729 if (ds_get_overflow(ds) == DS_O_SIGNAL) 809 error = ds_access_bts(child, /* index = */ end, &max);
730 cfg.flags |= PTRACE_BTS_O_SIGNAL; 810 if (error < 0)
811 return error;
731 812
732 if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && 813 memset(&cfg, 0, sizeof(cfg));
733 child->thread.debugctlmsr & ds_debugctl_mask()) 814 cfg.size = (max - base);
734 cfg.flags |= PTRACE_BTS_O_TRACE; 815 cfg.signal = child->thread.bts_ovfl_signal;
816 cfg.bts_size = sizeof(struct bts_struct);
735 817
736 if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) 818 if (cfg.signal)
737 cfg.flags |= PTRACE_BTS_O_SCHED; 819 cfg.flags |= PTRACE_BTS_O_SIGNAL;
738 }
739 820
740 cfg.bts_size = sizeof(struct bts_struct); 821 if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) &&
822 child->thread.debugctlmsr & bts_cfg.debugctl_mask)
823 cfg.flags |= PTRACE_BTS_O_TRACE;
824
825 if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS))
826 cfg.flags |= PTRACE_BTS_O_SCHED;
741 827
742 if (copy_to_user(ucfg, &cfg, sizeof(cfg))) 828 if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
743 return -EFAULT; 829 return -EFAULT;
@@ -745,89 +831,38 @@ static int ptrace_bts_status(struct task_struct *child,
745 return sizeof(cfg); 831 return sizeof(cfg);
746} 832}
747 833
748
749static int ptrace_bts_write_record(struct task_struct *child, 834static int ptrace_bts_write_record(struct task_struct *child,
750 const struct bts_struct *in) 835 const struct bts_struct *in)
751{ 836{
752 int retval; 837 unsigned char bts_record[BTS_MAX_RECORD_SIZE];
753 838
754 if (!child->thread.ds_area_msr) 839 BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts);
755 return -ENXIO;
756 840
757 retval = ds_write_bts((void *)child->thread.ds_area_msr, in); 841 memset(bts_record, 0, bts_cfg.sizeof_bts);
758 if (retval) 842 switch (in->qualifier) {
759 return retval; 843 case BTS_INVALID:
844 break;
760 845
761 return sizeof(*in); 846 case BTS_BRANCH:
762} 847 bts_set(bts_record, bts_from, in->variant.lbr.from_ip);
848 bts_set(bts_record, bts_to, in->variant.lbr.to_ip);
849 break;
763 850
764static int ptrace_bts_realloc(struct task_struct *child, 851 case BTS_TASK_ARRIVES:
765 int size, int reduce_size) 852 case BTS_TASK_DEPARTS:
766{ 853 bts_set(bts_record, bts_from, bts_escape);
767 unsigned long rlim, vm; 854 bts_set(bts_record, bts_qual, in->qualifier);
768 int ret, old_size; 855 bts_set(bts_record, bts_jiffies, in->variant.jiffies);
856 break;
769 857
770 if (size < 0) 858 default:
771 return -EINVAL; 859 return -EINVAL;
772
773 old_size = ds_get_bts_size((void *)child->thread.ds_area_msr);
774 if (old_size < 0)
775 return old_size;
776
777 ret = ds_free((void **)&child->thread.ds_area_msr);
778 if (ret < 0)
779 goto out;
780
781 size >>= PAGE_SHIFT;
782 old_size >>= PAGE_SHIFT;
783
784 current->mm->total_vm -= old_size;
785 current->mm->locked_vm -= old_size;
786
787 if (size == 0)
788 goto out;
789
790 rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
791 vm = current->mm->total_vm + size;
792 if (rlim < vm) {
793 ret = -ENOMEM;
794
795 if (!reduce_size)
796 goto out;
797
798 size = rlim - current->mm->total_vm;
799 if (size <= 0)
800 goto out;
801 } 860 }
802 861
803 rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; 862 /* The writing task will be the switched-to task on a context
804 vm = current->mm->locked_vm + size; 863 * switch. It needs to write into the switched-from task's BTS
805 if (rlim < vm) { 864 * buffer. */
806 ret = -ENOMEM; 865 return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts);
807
808 if (!reduce_size)
809 goto out;
810
811 size = rlim - current->mm->locked_vm;
812 if (size <= 0)
813 goto out;
814 }
815
816 ret = ds_allocate((void **)&child->thread.ds_area_msr,
817 size << PAGE_SHIFT);
818 if (ret < 0)
819 goto out;
820
821 current->mm->total_vm += size;
822 current->mm->locked_vm += size;
823
824out:
825 if (child->thread.ds_area_msr)
826 set_tsk_thread_flag(child, TIF_DS_AREA_MSR);
827 else
828 clear_tsk_thread_flag(child, TIF_DS_AREA_MSR);
829
830 return ret;
831} 866}
832 867
833void ptrace_bts_take_timestamp(struct task_struct *tsk, 868void ptrace_bts_take_timestamp(struct task_struct *tsk,
@@ -840,7 +875,66 @@ void ptrace_bts_take_timestamp(struct task_struct *tsk,
840 875
841 ptrace_bts_write_record(tsk, &rec); 876 ptrace_bts_write_record(tsk, &rec);
842} 877}
843#endif /* X86_BTS */ 878
879static const struct bts_configuration bts_cfg_netburst = {
880 .sizeof_bts = sizeof(long) * 3,
881 .sizeof_field = sizeof(long),
882 .debugctl_mask = (1<<2)|(1<<3)|(1<<5)
883};
884
885static const struct bts_configuration bts_cfg_pentium_m = {
886 .sizeof_bts = sizeof(long) * 3,
887 .sizeof_field = sizeof(long),
888 .debugctl_mask = (1<<6)|(1<<7)
889};
890
891static const struct bts_configuration bts_cfg_core2 = {
892 .sizeof_bts = 8 * 3,
893 .sizeof_field = 8,
894 .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
895};
896
897static inline void bts_configure(const struct bts_configuration *cfg)
898{
899 bts_cfg = *cfg;
900}
901
902void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c)
903{
904 switch (c->x86) {
905 case 0x6:
906 switch (c->x86_model) {
907 case 0xD:
908 case 0xE: /* Pentium M */
909 bts_configure(&bts_cfg_pentium_m);
910 break;
911 case 0xF: /* Core2 */
912 case 0x1C: /* Atom */
913 bts_configure(&bts_cfg_core2);
914 break;
915 default:
916 /* sorry, don't know about them */
917 break;
918 }
919 break;
920 case 0xF:
921 switch (c->x86_model) {
922 case 0x0:
923 case 0x1:
924 case 0x2: /* Netburst */
925 bts_configure(&bts_cfg_netburst);
926 break;
927 default:
928 /* sorry, don't know about them */
929 break;
930 }
931 break;
932 default:
933 /* sorry, don't know about them */
934 break;
935 }
936}
937#endif /* CONFIG_X86_PTRACE_BTS */
844 938
845/* 939/*
846 * Called by kernel/ptrace.c when detaching.. 940 * Called by kernel/ptrace.c when detaching..
@@ -853,15 +947,15 @@ void ptrace_disable(struct task_struct *child)
853#ifdef TIF_SYSCALL_EMU 947#ifdef TIF_SYSCALL_EMU
854 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); 948 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
855#endif 949#endif
856 if (child->thread.ds_area_msr) { 950#ifdef CONFIG_X86_PTRACE_BTS
857#ifdef X86_BTS 951 (void)ds_release_bts(child);
858 ptrace_bts_realloc(child, 0, 0); 952
859#endif 953 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
860 child->thread.debugctlmsr &= ~ds_debugctl_mask(); 954 if (!child->thread.debugctlmsr)
861 if (!child->thread.debugctlmsr) 955 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
862 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 956
863 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 957 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
864 } 958#endif /* CONFIG_X86_PTRACE_BTS */
865} 959}
866 960
867#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 961#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
@@ -981,7 +1075,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
981 /* 1075 /*
982 * These bits need more cooking - not enabled yet: 1076 * These bits need more cooking - not enabled yet:
983 */ 1077 */
984#ifdef X86_BTS 1078#ifdef CONFIG_X86_PTRACE_BTS
985 case PTRACE_BTS_CONFIG: 1079 case PTRACE_BTS_CONFIG:
986 ret = ptrace_bts_config 1080 ret = ptrace_bts_config
987 (child, data, (struct ptrace_bts_config __user *)addr); 1081 (child, data, (struct ptrace_bts_config __user *)addr);
@@ -993,7 +1087,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
993 break; 1087 break;
994 1088
995 case PTRACE_BTS_SIZE: 1089 case PTRACE_BTS_SIZE:
996 ret = ptrace_bts_get_size(child); 1090 ret = ds_get_bts_index(child, /* pos = */ NULL);
997 break; 1091 break;
998 1092
999 case PTRACE_BTS_GET: 1093 case PTRACE_BTS_GET:
@@ -1002,14 +1096,14 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
1002 break; 1096 break;
1003 1097
1004 case PTRACE_BTS_CLEAR: 1098 case PTRACE_BTS_CLEAR:
1005 ret = ptrace_bts_clear(child); 1099 ret = ds_clear_bts(child);
1006 break; 1100 break;
1007 1101
1008 case PTRACE_BTS_DRAIN: 1102 case PTRACE_BTS_DRAIN:
1009 ret = ptrace_bts_drain 1103 ret = ptrace_bts_drain
1010 (child, data, (struct bts_struct __user *) addr); 1104 (child, data, (struct bts_struct __user *) addr);
1011 break; 1105 break;
1012#endif 1106#endif /* CONFIG_X86_PTRACE_BTS */
1013 1107
1014 default: 1108 default:
1015 ret = ptrace_request(child, request, addr, data); 1109 ret = ptrace_request(child, request, addr, data);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 724adfc63cb9..f4c93f1cfc19 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -29,7 +29,11 @@ EXPORT_SYMBOL(pm_power_off);
29 29
30static const struct desc_ptr no_idt = {}; 30static const struct desc_ptr no_idt = {};
31static int reboot_mode; 31static int reboot_mode;
32enum reboot_type reboot_type = BOOT_KBD; 32/*
33 * Keyboard reset and triple fault may result in INIT, not RESET, which
34 * doesn't work when we're in vmx root mode. Try ACPI first.
35 */
36enum reboot_type reboot_type = BOOT_ACPI;
33int reboot_force; 37int reboot_force;
34 38
35#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) 39#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index c6b9330c1bff..46c98efbbf8d 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -223,6 +223,9 @@ unsigned long saved_video_mode;
223#define RAMDISK_LOAD_FLAG 0x4000 223#define RAMDISK_LOAD_FLAG 0x4000
224 224
225static char __initdata command_line[COMMAND_LINE_SIZE]; 225static char __initdata command_line[COMMAND_LINE_SIZE];
226#ifdef CONFIG_CMDLINE_BOOL
227static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
228#endif
226 229
227#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) 230#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
228struct edd edd; 231struct edd edd;
@@ -665,6 +668,19 @@ void __init setup_arch(char **cmdline_p)
665 bss_resource.start = virt_to_phys(&__bss_start); 668 bss_resource.start = virt_to_phys(&__bss_start);
666 bss_resource.end = virt_to_phys(&__bss_stop)-1; 669 bss_resource.end = virt_to_phys(&__bss_stop)-1;
667 670
671#ifdef CONFIG_CMDLINE_BOOL
672#ifdef CONFIG_CMDLINE_OVERRIDE
673 strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
674#else
675 if (builtin_cmdline[0]) {
676 /* append boot loader cmdline to builtin */
677 strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
678 strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
679 strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
680 }
681#endif
682#endif
683
668 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); 684 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
669 *cmdline_p = command_line; 685 *cmdline_p = command_line;
670 686
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 76e305e064f9..0e67f72d9316 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -162,9 +162,16 @@ void __init setup_per_cpu_areas(void)
162 printk(KERN_INFO 162 printk(KERN_INFO
163 "cpu %d has no node %d or node-local memory\n", 163 "cpu %d has no node %d or node-local memory\n",
164 cpu, node); 164 cpu, node);
165 if (ptr)
166 printk(KERN_DEBUG "per cpu data for cpu%d at %016lx\n",
167 cpu, __pa(ptr));
165 } 168 }
166 else 169 else {
167 ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); 170 ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
171 if (ptr)
172 printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n",
173 cpu, node, __pa(ptr));
174 }
168#endif 175#endif
169 per_cpu_offset(cpu) = ptr - __per_cpu_start; 176 per_cpu_offset(cpu) = ptr - __per_cpu_start;
170 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); 177 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 2ff0bbcd5bd1..9056f7e272c0 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -258,6 +258,7 @@ static void __cpuinit smp_callin(void)
258 end_local_APIC_setup(); 258 end_local_APIC_setup();
259 map_cpu_to_logical_apicid(); 259 map_cpu_to_logical_apicid();
260 260
261 notify_cpu_starting(cpuid);
261 /* 262 /*
262 * Get our bogomips. 263 * Get our bogomips.
263 * 264 *
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index 011d8e1fac6e..9c0ac0cab013 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -340,9 +340,8 @@ static void
340show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 340show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
341 unsigned long *stack, unsigned long bp, char *log_lvl) 341 unsigned long *stack, unsigned long bp, char *log_lvl)
342{ 342{
343 printk("\nCall Trace:\n"); 343 printk("Call Trace:\n");
344 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); 344 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
345 printk("\n");
346} 345}
347 346
348void show_trace(struct task_struct *task, struct pt_regs *regs, 347void show_trace(struct task_struct *task, struct pt_regs *regs,
@@ -391,6 +390,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
391 printk(" %016lx", *stack++); 390 printk(" %016lx", *stack++);
392 touch_nmi_watchdog(); 391 touch_nmi_watchdog();
393 } 392 }
393 printk("\n");
394 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 394 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
395} 395}
396 396
@@ -447,7 +447,6 @@ void show_registers(struct pt_regs *regs)
447 printk("Stack: "); 447 printk("Stack: ");
448 show_stack_log_lvl(NULL, regs, (unsigned long *)sp, 448 show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
449 regs->bp, ""); 449 regs->bp, "");
450 printk("\n");
451 450
452 printk(KERN_EMERG "Code: "); 451 printk(KERN_EMERG "Code: ");
453 452
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 8f98e9de1b82..161bb850fc47 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -104,7 +104,7 @@ __setup("notsc", notsc_setup);
104/* 104/*
105 * Read TSC and the reference counters. Take care of SMI disturbance 105 * Read TSC and the reference counters. Take care of SMI disturbance
106 */ 106 */
107static u64 tsc_read_refs(u64 *pm, u64 *hpet) 107static u64 tsc_read_refs(u64 *p, int hpet)
108{ 108{
109 u64 t1, t2; 109 u64 t1, t2;
110 int i; 110 int i;
@@ -112,9 +112,9 @@ static u64 tsc_read_refs(u64 *pm, u64 *hpet)
112 for (i = 0; i < MAX_RETRIES; i++) { 112 for (i = 0; i < MAX_RETRIES; i++) {
113 t1 = get_cycles(); 113 t1 = get_cycles();
114 if (hpet) 114 if (hpet)
115 *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; 115 *p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
116 else 116 else
117 *pm = acpi_pm_read_early(); 117 *p = acpi_pm_read_early();
118 t2 = get_cycles(); 118 t2 = get_cycles();
119 if ((t2 - t1) < SMI_TRESHOLD) 119 if ((t2 - t1) < SMI_TRESHOLD)
120 return t2; 120 return t2;
@@ -123,13 +123,59 @@ static u64 tsc_read_refs(u64 *pm, u64 *hpet)
123} 123}
124 124
125/* 125/*
126 * Calculate the TSC frequency from HPET reference
127 */
128static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2)
129{
130 u64 tmp;
131
132 if (hpet2 < hpet1)
133 hpet2 += 0x100000000ULL;
134 hpet2 -= hpet1;
135 tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
136 do_div(tmp, 1000000);
137 do_div(deltatsc, tmp);
138
139 return (unsigned long) deltatsc;
140}
141
142/*
143 * Calculate the TSC frequency from PMTimer reference
144 */
145static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
146{
147 u64 tmp;
148
149 if (!pm1 && !pm2)
150 return ULONG_MAX;
151
152 if (pm2 < pm1)
153 pm2 += (u64)ACPI_PM_OVRRUN;
154 pm2 -= pm1;
155 tmp = pm2 * 1000000000LL;
156 do_div(tmp, PMTMR_TICKS_PER_SEC);
157 do_div(deltatsc, tmp);
158
159 return (unsigned long) deltatsc;
160}
161
162#define CAL_MS 10
163#define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS))
164#define CAL_PIT_LOOPS 1000
165
166#define CAL2_MS 50
167#define CAL2_LATCH (CLOCK_TICK_RATE / (1000 / CAL2_MS))
168#define CAL2_PIT_LOOPS 5000
169
170
171/*
126 * Try to calibrate the TSC against the Programmable 172 * Try to calibrate the TSC against the Programmable
127 * Interrupt Timer and return the frequency of the TSC 173 * Interrupt Timer and return the frequency of the TSC
128 * in kHz. 174 * in kHz.
129 * 175 *
130 * Return ULONG_MAX on failure to calibrate. 176 * Return ULONG_MAX on failure to calibrate.
131 */ 177 */
132static unsigned long pit_calibrate_tsc(void) 178static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
133{ 179{
134 u64 tsc, t1, t2, delta; 180 u64 tsc, t1, t2, delta;
135 unsigned long tscmin, tscmax; 181 unsigned long tscmin, tscmax;
@@ -144,8 +190,8 @@ static unsigned long pit_calibrate_tsc(void)
144 * (LSB then MSB) to begin countdown. 190 * (LSB then MSB) to begin countdown.
145 */ 191 */
146 outb(0xb0, 0x43); 192 outb(0xb0, 0x43);
147 outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); 193 outb(latch & 0xff, 0x42);
148 outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); 194 outb(latch >> 8, 0x42);
149 195
150 tsc = t1 = t2 = get_cycles(); 196 tsc = t1 = t2 = get_cycles();
151 197
@@ -166,31 +212,154 @@ static unsigned long pit_calibrate_tsc(void)
166 /* 212 /*
167 * Sanity checks: 213 * Sanity checks:
168 * 214 *
169 * If we were not able to read the PIT more than 5000 215 * If we were not able to read the PIT more than loopmin
170 * times, then we have been hit by a massive SMI 216 * times, then we have been hit by a massive SMI
171 * 217 *
172 * If the maximum is 10 times larger than the minimum, 218 * If the maximum is 10 times larger than the minimum,
173 * then we got hit by an SMI as well. 219 * then we got hit by an SMI as well.
174 */ 220 */
175 if (pitcnt < 5000 || tscmax > 10 * tscmin) 221 if (pitcnt < loopmin || tscmax > 10 * tscmin)
176 return ULONG_MAX; 222 return ULONG_MAX;
177 223
178 /* Calculate the PIT value */ 224 /* Calculate the PIT value */
179 delta = t2 - t1; 225 delta = t2 - t1;
180 do_div(delta, 50); 226 do_div(delta, ms);
181 return delta; 227 return delta;
182} 228}
183 229
230/*
231 * This reads the current MSB of the PIT counter, and
232 * checks if we are running on sufficiently fast and
233 * non-virtualized hardware.
234 *
235 * Our expectations are:
236 *
237 * - the PIT is running at roughly 1.19MHz
238 *
239 * - each IO is going to take about 1us on real hardware,
240 * but we allow it to be much faster (by a factor of 10) or
241 * _slightly_ slower (ie we allow up to a 2us read+counter
242 * update - anything else implies a unacceptably slow CPU
243 * or PIT for the fast calibration to work.
244 *
245 * - with 256 PIT ticks to read the value, we have 214us to
246 * see the same MSB (and overhead like doing a single TSC
247 * read per MSB value etc).
248 *
249 * - We're doing 2 reads per loop (LSB, MSB), and we expect
250 * them each to take about a microsecond on real hardware.
251 * So we expect a count value of around 100. But we'll be
252 * generous, and accept anything over 50.
253 *
254 * - if the PIT is stuck, and we see *many* more reads, we
255 * return early (and the next caller of pit_expect_msb()
256 * then consider it a failure when they don't see the
257 * next expected value).
258 *
259 * These expectations mean that we know that we have seen the
260 * transition from one expected value to another with a fairly
261 * high accuracy, and we didn't miss any events. We can thus
262 * use the TSC value at the transitions to calculate a pretty
263 * good value for the TSC frequencty.
264 */
265static inline int pit_expect_msb(unsigned char val)
266{
267 int count = 0;
268
269 for (count = 0; count < 50000; count++) {
270 /* Ignore LSB */
271 inb(0x42);
272 if (inb(0x42) != val)
273 break;
274 }
275 return count > 50;
276}
277
278/*
279 * How many MSB values do we want to see? We aim for a
280 * 15ms calibration, which assuming a 2us counter read
281 * error should give us roughly 150 ppm precision for
282 * the calibration.
283 */
284#define QUICK_PIT_MS 15
285#define QUICK_PIT_ITERATIONS (QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
286
287static unsigned long quick_pit_calibrate(void)
288{
289 /* Set the Gate high, disable speaker */
290 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
291
292 /*
293 * Counter 2, mode 0 (one-shot), binary count
294 *
295 * NOTE! Mode 2 decrements by two (and then the
296 * output is flipped each time, giving the same
297 * final output frequency as a decrement-by-one),
298 * so mode 0 is much better when looking at the
299 * individual counts.
300 */
301 outb(0xb0, 0x43);
302
303 /* Start at 0xffff */
304 outb(0xff, 0x42);
305 outb(0xff, 0x42);
306
307 if (pit_expect_msb(0xff)) {
308 int i;
309 u64 t1, t2, delta;
310 unsigned char expect = 0xfe;
311
312 t1 = get_cycles();
313 for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) {
314 if (!pit_expect_msb(expect))
315 goto failed;
316 }
317 t2 = get_cycles();
318
319 /*
320 * Make sure we can rely on the second TSC timestamp:
321 */
322 if (!pit_expect_msb(expect))
323 goto failed;
324
325 /*
326 * Ok, if we get here, then we've seen the
327 * MSB of the PIT decrement QUICK_PIT_ITERATIONS
328 * times, and each MSB had many hits, so we never
329 * had any sudden jumps.
330 *
331 * As a result, we can depend on there not being
332 * any odd delays anywhere, and the TSC reads are
333 * reliable.
334 *
335 * kHz = ticks / time-in-seconds / 1000;
336 * kHz = (t2 - t1) / (QPI * 256 / PIT_TICK_RATE) / 1000
337 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (QPI * 256 * 1000)
338 */
339 delta = (t2 - t1)*PIT_TICK_RATE;
340 do_div(delta, QUICK_PIT_ITERATIONS*256*1000);
341 printk("Fast TSC calibration using PIT\n");
342 return delta;
343 }
344failed:
345 return 0;
346}
184 347
185/** 348/**
186 * native_calibrate_tsc - calibrate the tsc on boot 349 * native_calibrate_tsc - calibrate the tsc on boot
187 */ 350 */
188unsigned long native_calibrate_tsc(void) 351unsigned long native_calibrate_tsc(void)
189{ 352{
190 u64 tsc1, tsc2, delta, pm1, pm2, hpet1, hpet2; 353 u64 tsc1, tsc2, delta, ref1, ref2;
191 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; 354 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
192 unsigned long flags; 355 unsigned long flags, latch, ms, fast_calibrate;
193 int hpet = is_hpet_enabled(), i; 356 int hpet = is_hpet_enabled(), i, loopmin;
357
358 local_irq_save(flags);
359 fast_calibrate = quick_pit_calibrate();
360 local_irq_restore(flags);
361 if (fast_calibrate)
362 return fast_calibrate;
194 363
195 /* 364 /*
196 * Run 5 calibration loops to get the lowest frequency value 365 * Run 5 calibration loops to get the lowest frequency value
@@ -216,7 +385,13 @@ unsigned long native_calibrate_tsc(void)
216 * calibration delay loop as we have to wait for a certain 385 * calibration delay loop as we have to wait for a certain
217 * amount of time anyway. 386 * amount of time anyway.
218 */ 387 */
219 for (i = 0; i < 5; i++) { 388
389 /* Preset PIT loop values */
390 latch = CAL_LATCH;
391 ms = CAL_MS;
392 loopmin = CAL_PIT_LOOPS;
393
394 for (i = 0; i < 3; i++) {
220 unsigned long tsc_pit_khz; 395 unsigned long tsc_pit_khz;
221 396
222 /* 397 /*
@@ -226,16 +401,16 @@ unsigned long native_calibrate_tsc(void)
226 * read the end value. 401 * read the end value.
227 */ 402 */
228 local_irq_save(flags); 403 local_irq_save(flags);
229 tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL); 404 tsc1 = tsc_read_refs(&ref1, hpet);
230 tsc_pit_khz = pit_calibrate_tsc(); 405 tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin);
231 tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); 406 tsc2 = tsc_read_refs(&ref2, hpet);
232 local_irq_restore(flags); 407 local_irq_restore(flags);
233 408
234 /* Pick the lowest PIT TSC calibration so far */ 409 /* Pick the lowest PIT TSC calibration so far */
235 tsc_pit_min = min(tsc_pit_min, tsc_pit_khz); 410 tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
236 411
237 /* hpet or pmtimer available ? */ 412 /* hpet or pmtimer available ? */
238 if (!hpet && !pm1 && !pm2) 413 if (!hpet && !ref1 && !ref2)
239 continue; 414 continue;
240 415
241 /* Check, whether the sampling was disturbed by an SMI */ 416 /* Check, whether the sampling was disturbed by an SMI */
@@ -243,23 +418,41 @@ unsigned long native_calibrate_tsc(void)
243 continue; 418 continue;
244 419
245 tsc2 = (tsc2 - tsc1) * 1000000LL; 420 tsc2 = (tsc2 - tsc1) * 1000000LL;
421 if (hpet)
422 tsc2 = calc_hpet_ref(tsc2, ref1, ref2);
423 else
424 tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2);
246 425
247 if (hpet) { 426 tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);
248 if (hpet2 < hpet1) 427
249 hpet2 += 0x100000000ULL; 428 /* Check the reference deviation */
250 hpet2 -= hpet1; 429 delta = ((u64) tsc_pit_min) * 100;
251 tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); 430 do_div(delta, tsc_ref_min);
252 do_div(tsc1, 1000000); 431
253 } else { 432 /*
254 if (pm2 < pm1) 433 * If both calibration results are inside a 10% window
255 pm2 += (u64)ACPI_PM_OVRRUN; 434 * then we can be sure, that the calibration
256 pm2 -= pm1; 435 * succeeded. We break out of the loop right away. We
257 tsc1 = pm2 * 1000000000LL; 436 * use the reference value, as it is more precise.
258 do_div(tsc1, PMTMR_TICKS_PER_SEC); 437 */
438 if (delta >= 90 && delta <= 110) {
439 printk(KERN_INFO
440 "TSC: PIT calibration matches %s. %d loops\n",
441 hpet ? "HPET" : "PMTIMER", i + 1);
442 return tsc_ref_min;
259 } 443 }
260 444
261 do_div(tsc2, tsc1); 445 /*
262 tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2); 446 * Check whether PIT failed more than once. This
447 * happens in virtualized environments. We need to
448 * give the virtual PC a slightly longer timeframe for
449 * the HPET/PMTIMER to make the result precise.
450 */
451 if (i == 1 && tsc_pit_min == ULONG_MAX) {
452 latch = CAL2_LATCH;
453 ms = CAL2_MS;
454 loopmin = CAL2_PIT_LOOPS;
455 }
263 } 456 }
264 457
265 /* 458 /*
@@ -270,7 +463,7 @@ unsigned long native_calibrate_tsc(void)
270 printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n"); 463 printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n");
271 464
272 /* We don't have an alternative source, disable TSC */ 465 /* We don't have an alternative source, disable TSC */
273 if (!hpet && !pm1 && !pm2) { 466 if (!hpet && !ref1 && !ref2) {
274 printk("TSC: No reference (HPET/PMTIMER) available\n"); 467 printk("TSC: No reference (HPET/PMTIMER) available\n");
275 return 0; 468 return 0;
276 } 469 }
@@ -278,7 +471,7 @@ unsigned long native_calibrate_tsc(void)
278 /* The alternative source failed as well, disable TSC */ 471 /* The alternative source failed as well, disable TSC */
279 if (tsc_ref_min == ULONG_MAX) { 472 if (tsc_ref_min == ULONG_MAX) {
280 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration " 473 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration "
281 "failed due to SMI disturbance.\n"); 474 "failed.\n");
282 return 0; 475 return 0;
283 } 476 }
284 477
@@ -290,44 +483,25 @@ unsigned long native_calibrate_tsc(void)
290 } 483 }
291 484
292 /* We don't have an alternative source, use the PIT calibration value */ 485 /* We don't have an alternative source, use the PIT calibration value */
293 if (!hpet && !pm1 && !pm2) { 486 if (!hpet && !ref1 && !ref2) {
294 printk(KERN_INFO "TSC: Using PIT calibration value\n"); 487 printk(KERN_INFO "TSC: Using PIT calibration value\n");
295 return tsc_pit_min; 488 return tsc_pit_min;
296 } 489 }
297 490
298 /* The alternative source failed, use the PIT calibration value */ 491 /* The alternative source failed, use the PIT calibration value */
299 if (tsc_ref_min == ULONG_MAX) { 492 if (tsc_ref_min == ULONG_MAX) {
300 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed due " 493 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. "
301 "to SMI disturbance. Using PIT calibration\n"); 494 "Using PIT calibration\n");
302 return tsc_pit_min; 495 return tsc_pit_min;
303 } 496 }
304 497
305 /* Check the reference deviation */
306 delta = ((u64) tsc_pit_min) * 100;
307 do_div(delta, tsc_ref_min);
308
309 /*
310 * If both calibration results are inside a 5% window, the we
311 * use the lower frequency of those as it is probably the
312 * closest estimate.
313 */
314 if (delta >= 95 && delta <= 105) {
315 printk(KERN_INFO "TSC: PIT calibration confirmed by %s.\n",
316 hpet ? "HPET" : "PMTIMER");
317 printk(KERN_INFO "TSC: using %s calibration value\n",
318 tsc_pit_min <= tsc_ref_min ? "PIT" :
319 hpet ? "HPET" : "PMTIMER");
320 return tsc_pit_min <= tsc_ref_min ? tsc_pit_min : tsc_ref_min;
321 }
322
323 printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
324 hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
325
326 /* 498 /*
327 * The calibration values differ too much. In doubt, we use 499 * The calibration values differ too much. In doubt, we use
328 * the PIT value as we know that there are PMTIMERs around 500 * the PIT value as we know that there are PMTIMERs around
329 * running at double speed. 501 * running at double speed. At least we let the user know:
330 */ 502 */
503 printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
504 hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
331 printk(KERN_INFO "TSC: Using PIT calibration value\n"); 505 printk(KERN_INFO "TSC: Using PIT calibration value\n");
332 return tsc_pit_min; 506 return tsc_pit_min;
333} 507}
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 61531d5c9507..8b6c393ab9fd 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -235,7 +235,7 @@ static void vmi_write_ldt_entry(struct desc_struct *dt, int entry,
235 const void *desc) 235 const void *desc)
236{ 236{
237 u32 *ldt_entry = (u32 *)desc; 237 u32 *ldt_entry = (u32 *)desc;
238 vmi_ops.write_idt_entry(dt, entry, ldt_entry[0], ldt_entry[1]); 238 vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
239} 239}
240 240
241static void vmi_load_sp0(struct tss_struct *tss, 241static void vmi_load_sp0(struct tss_struct *tss,
@@ -393,13 +393,13 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
393} 393}
394#endif 394#endif
395 395
396static void vmi_allocate_pte(struct mm_struct *mm, u32 pfn) 396static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
397{ 397{
398 vmi_set_page_type(pfn, VMI_PAGE_L1); 398 vmi_set_page_type(pfn, VMI_PAGE_L1);
399 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); 399 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
400} 400}
401 401
402static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn) 402static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
403{ 403{
404 /* 404 /*
405 * This call comes in very early, before mem_map is setup. 405 * This call comes in very early, before mem_map is setup.
@@ -410,20 +410,20 @@ static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn)
410 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); 410 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
411} 411}
412 412
413static void vmi_allocate_pmd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count) 413static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
414{ 414{
415 vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE); 415 vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
416 vmi_check_page_type(clonepfn, VMI_PAGE_L2); 416 vmi_check_page_type(clonepfn, VMI_PAGE_L2);
417 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); 417 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
418} 418}
419 419
420static void vmi_release_pte(u32 pfn) 420static void vmi_release_pte(unsigned long pfn)
421{ 421{
422 vmi_ops.release_page(pfn, VMI_PAGE_L1); 422 vmi_ops.release_page(pfn, VMI_PAGE_L1);
423 vmi_set_page_type(pfn, VMI_PAGE_NORMAL); 423 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
424} 424}
425 425
426static void vmi_release_pmd(u32 pfn) 426static void vmi_release_pmd(unsigned long pfn)
427{ 427{
428 vmi_ops.release_page(pfn, VMI_PAGE_L2); 428 vmi_ops.release_page(pfn, VMI_PAGE_L2);
429 vmi_set_page_type(pfn, VMI_PAGE_NORMAL); 429 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index af5bdad84604..a9b8560adbc2 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -140,10 +140,10 @@ SECTIONS
140 *(.con_initcall.init) 140 *(.con_initcall.init)
141 __con_initcall_end = .; 141 __con_initcall_end = .;
142 } 142 }
143 .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) { 143 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
144 __x86cpuvendor_start = .; 144 __x86_cpu_dev_start = .;
145 *(.x86cpuvendor.init) 145 *(.x86_cpu_dev.init)
146 __x86cpuvendor_end = .; 146 __x86_cpu_dev_end = .;
147 } 147 }
148 SECURITY_INIT 148 SECURITY_INIT
149 . = ALIGN(4); 149 . = ALIGN(4);
@@ -180,6 +180,7 @@ SECTIONS
180 . = ALIGN(PAGE_SIZE); 180 . = ALIGN(PAGE_SIZE);
181 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { 181 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
182 __per_cpu_start = .; 182 __per_cpu_start = .;
183 *(.data.percpu.page_aligned)
183 *(.data.percpu) 184 *(.data.percpu)
184 *(.data.percpu.shared_aligned) 185 *(.data.percpu.shared_aligned)
185 __per_cpu_end = .; 186 __per_cpu_end = .;
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 63e5c1a22e88..46e05447405b 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -168,12 +168,11 @@ SECTIONS
168 *(.con_initcall.init) 168 *(.con_initcall.init)
169 } 169 }
170 __con_initcall_end = .; 170 __con_initcall_end = .;
171 . = ALIGN(16); 171 __x86_cpu_dev_start = .;
172 __x86cpuvendor_start = .; 172 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
173 .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) { 173 *(.x86_cpu_dev.init)
174 *(.x86cpuvendor.init)
175 } 174 }
176 __x86cpuvendor_end = .; 175 __x86_cpu_dev_end = .;
177 SECURITY_INIT 176 SECURITY_INIT
178 177
179 . = ALIGN(8); 178 . = ALIGN(8);
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 0c029e8959c7..7766d36983fc 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -61,7 +61,7 @@ static void vsmp_irq_enable(void)
61 native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC)); 61 native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
62} 62}
63 63
64static unsigned __init vsmp_patch(u8 type, u16 clobbers, void *ibuf, 64static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf,
65 unsigned long addr, unsigned len) 65 unsigned long addr, unsigned len)
66{ 66{
67 switch (type) { 67 switch (type) {