aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/acpi/boot.c25
-rw-r--r--arch/x86/kernel/amd_iommu.c350
-rw-r--r--arch/x86/kernel/amd_iommu_init.c194
-rw-r--r--arch/x86/kernel/aperture_64.c6
-rw-r--r--arch/x86/kernel/apic_32.c332
-rw-r--r--arch/x86/kernel/apic_64.c389
-rw-r--r--arch/x86/kernel/apm_32.c4
-rw-r--r--arch/x86/kernel/bios_uv.c10
-rw-r--r--arch/x86/kernel/cpu/Makefile12
-rw-r--r--arch/x86/kernel/cpu/amd.c545
-rw-r--r--arch/x86/kernel/cpu/amd_64.c224
-rw-r--r--arch/x86/kernel/cpu/centaur.c1
-rw-r--r--arch/x86/kernel/cpu/centaur_64.c3
-rw-r--r--arch/x86/kernel/cpu/common.c495
-rw-r--r--arch/x86/kernel/cpu/common_64.c816
-rw-r--r--arch/x86/kernel/cpu/cpu.h1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c13
-rw-r--r--arch/x86/kernel/cpu/cpufreq/elanfreq.c42
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k6.c41
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c2
-rw-r--r--arch/x86/kernel/cpu/cyrix.c1
-rw-r--r--arch/x86/kernel/cpu/intel.c292
-rw-r--r--arch/x86/kernel/cpu/intel_64.c99
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c7
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c4
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c274
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c86
-rw-r--r--arch/x86/kernel/cpu/transmeta.c29
-rw-r--r--arch/x86/kernel/cpuid.c1
-rw-r--r--arch/x86/kernel/crash_dump_64.c13
-rw-r--r--arch/x86/kernel/ds.c954
-rw-r--r--arch/x86/kernel/e820.c2
-rw-r--r--arch/x86/kernel/early-quirks.c18
-rw-r--r--arch/x86/kernel/efi.c6
-rw-r--r--arch/x86/kernel/head64.c5
-rw-r--r--arch/x86/kernel/head_32.S34
-rw-r--r--arch/x86/kernel/head_64.S4
-rw-r--r--arch/x86/kernel/hpet.c19
-rw-r--r--arch/x86/kernel/io_delay.c8
-rw-r--r--arch/x86/kernel/irq_32.c2
-rw-r--r--arch/x86/kernel/irq_64.c2
-rw-r--r--arch/x86/kernel/k8.c5
-rw-r--r--arch/x86/kernel/kdebugfs.c1
-rw-r--r--arch/x86/kernel/kgdb.c50
-rw-r--r--arch/x86/kernel/kvm.c2
-rw-r--r--arch/x86/kernel/nmi.c11
-rw-r--r--arch/x86/kernel/olpc.c6
-rw-r--r--arch/x86/kernel/paravirt_patch_32.c2
-rw-r--r--arch/x86/kernel/pci-calgary_64.c18
-rw-r--r--arch/x86/kernel/pci-dma.c179
-rw-r--r--arch/x86/kernel/pci-gart_64.c162
-rw-r--r--arch/x86/kernel/pci-nommu.c10
-rw-r--r--arch/x86/kernel/pcspeaker.c13
-rw-r--r--arch/x86/kernel/process.c21
-rw-r--r--arch/x86/kernel/process_32.c62
-rw-r--r--arch/x86/kernel/process_64.c171
-rw-r--r--arch/x86/kernel/ptrace.c478
-rw-r--r--arch/x86/kernel/reboot.c6
-rw-r--r--arch/x86/kernel/setup.c21
-rw-r--r--arch/x86/kernel/sigframe.h5
-rw-r--r--arch/x86/kernel/signal_32.c11
-rw-r--r--arch/x86/kernel/signal_64.c105
-rw-r--r--arch/x86/kernel/smpboot.c6
-rw-r--r--arch/x86/kernel/sys_x86_64.c43
-rw-r--r--arch/x86/kernel/traps_64.c61
-rw-r--r--arch/x86/kernel/tsc.c293
-rw-r--r--arch/x86/kernel/visws_quirks.c16
-rw-r--r--arch/x86/kernel/vmi_32.c12
-rw-r--r--arch/x86/kernel/vsmp_64.c2
71 files changed, 4073 insertions, 3068 deletions
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 27ef365e757d..c2ac1b4515a0 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -58,7 +58,6 @@ EXPORT_SYMBOL(acpi_disabled);
58#ifdef CONFIG_X86_64 58#ifdef CONFIG_X86_64
59 59
60#include <asm/proto.h> 60#include <asm/proto.h>
61#include <asm/genapic.h>
62 61
63#else /* X86 */ 62#else /* X86 */
64 63
@@ -97,8 +96,6 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
97#warning ACPI uses CMPXCHG, i486 and later hardware 96#warning ACPI uses CMPXCHG, i486 and later hardware
98#endif 97#endif
99 98
100static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
101
102/* -------------------------------------------------------------------------- 99/* --------------------------------------------------------------------------
103 Boot-time Configuration 100 Boot-time Configuration
104 -------------------------------------------------------------------------- */ 101 -------------------------------------------------------------------------- */
@@ -160,6 +157,8 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
160struct acpi_mcfg_allocation *pci_mmcfg_config; 157struct acpi_mcfg_allocation *pci_mmcfg_config;
161int pci_mmcfg_config_num; 158int pci_mmcfg_config_num;
162 159
160static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
161
163static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg) 162static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg)
164{ 163{
165 if (!strcmp(mcfg->header.oem_id, "SGI")) 164 if (!strcmp(mcfg->header.oem_id, "SGI"))
@@ -253,10 +252,8 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled)
253 return; 252 return;
254 } 253 }
255 254
256#ifdef CONFIG_X86_32
257 if (boot_cpu_physical_apicid != -1U) 255 if (boot_cpu_physical_apicid != -1U)
258 ver = apic_version[boot_cpu_physical_apicid]; 256 ver = apic_version[boot_cpu_physical_apicid];
259#endif
260 257
261 generic_processor_info(id, ver); 258 generic_processor_info(id, ver);
262} 259}
@@ -776,10 +773,8 @@ static void __init acpi_register_lapic_address(unsigned long address)
776 set_fixmap_nocache(FIX_APIC_BASE, address); 773 set_fixmap_nocache(FIX_APIC_BASE, address);
777 if (boot_cpu_physical_apicid == -1U) { 774 if (boot_cpu_physical_apicid == -1U) {
778 boot_cpu_physical_apicid = read_apic_id(); 775 boot_cpu_physical_apicid = read_apic_id();
779#ifdef CONFIG_X86_32
780 apic_version[boot_cpu_physical_apicid] = 776 apic_version[boot_cpu_physical_apicid] =
781 GET_APIC_VERSION(apic_read(APIC_LVR)); 777 GET_APIC_VERSION(apic_read(APIC_LVR));
782#endif
783 } 778 }
784} 779}
785 780
@@ -1607,6 +1602,14 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
1607 */ 1602 */
1608 { 1603 {
1609 .callback = dmi_ignore_irq0_timer_override, 1604 .callback = dmi_ignore_irq0_timer_override,
1605 .ident = "HP nx6115 laptop",
1606 .matches = {
1607 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
1608 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6115"),
1609 },
1610 },
1611 {
1612 .callback = dmi_ignore_irq0_timer_override,
1610 .ident = "HP NX6125 laptop", 1613 .ident = "HP NX6125 laptop",
1611 .matches = { 1614 .matches = {
1612 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), 1615 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
@@ -1621,6 +1624,14 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
1621 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"), 1624 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"),
1622 }, 1625 },
1623 }, 1626 },
1627 {
1628 .callback = dmi_ignore_irq0_timer_override,
1629 .ident = "HP 6715b laptop",
1630 .matches = {
1631 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
1632 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"),
1633 },
1634 },
1624 {} 1635 {}
1625}; 1636};
1626 1637
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 69b4d060b21c..34e4d112b1ef 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -33,6 +33,10 @@
33 33
34static DEFINE_RWLOCK(amd_iommu_devtable_lock); 34static DEFINE_RWLOCK(amd_iommu_devtable_lock);
35 35
36/* A list of preallocated protection domains */
37static LIST_HEAD(iommu_pd_list);
38static DEFINE_SPINLOCK(iommu_pd_list_lock);
39
36/* 40/*
37 * general struct to manage commands send to an IOMMU 41 * general struct to manage commands send to an IOMMU
38 */ 42 */
@@ -51,6 +55,102 @@ static int iommu_has_npcache(struct amd_iommu *iommu)
51 55
52/**************************************************************************** 56/****************************************************************************
53 * 57 *
58 * Interrupt handling functions
59 *
60 ****************************************************************************/
61
62static void iommu_print_event(void *__evt)
63{
64 u32 *event = __evt;
65 int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK;
66 int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
67 int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
68 int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
69 u64 address = (u64)(((u64)event[3]) << 32) | event[2];
70
71 printk(KERN_ERR "AMD IOMMU: Event logged [");
72
73 switch (type) {
74 case EVENT_TYPE_ILL_DEV:
75 printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x "
76 "address=0x%016llx flags=0x%04x]\n",
77 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
78 address, flags);
79 break;
80 case EVENT_TYPE_IO_FAULT:
81 printk("IO_PAGE_FAULT device=%02x:%02x.%x "
82 "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
83 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
84 domid, address, flags);
85 break;
86 case EVENT_TYPE_DEV_TAB_ERR:
87 printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
88 "address=0x%016llx flags=0x%04x]\n",
89 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
90 address, flags);
91 break;
92 case EVENT_TYPE_PAGE_TAB_ERR:
93 printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
94 "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
95 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
96 domid, address, flags);
97 break;
98 case EVENT_TYPE_ILL_CMD:
99 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
100 break;
101 case EVENT_TYPE_CMD_HARD_ERR:
102 printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
103 "flags=0x%04x]\n", address, flags);
104 break;
105 case EVENT_TYPE_IOTLB_INV_TO:
106 printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x "
107 "address=0x%016llx]\n",
108 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
109 address);
110 break;
111 case EVENT_TYPE_INV_DEV_REQ:
112 printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x "
113 "address=0x%016llx flags=0x%04x]\n",
114 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
115 address, flags);
116 break;
117 default:
118 printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type);
119 }
120}
121
122static void iommu_poll_events(struct amd_iommu *iommu)
123{
124 u32 head, tail;
125 unsigned long flags;
126
127 spin_lock_irqsave(&iommu->lock, flags);
128
129 head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
130 tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
131
132 while (head != tail) {
133 iommu_print_event(iommu->evt_buf + head);
134 head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
135 }
136
137 writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
138
139 spin_unlock_irqrestore(&iommu->lock, flags);
140}
141
142irqreturn_t amd_iommu_int_handler(int irq, void *data)
143{
144 struct amd_iommu *iommu;
145
146 list_for_each_entry(iommu, &amd_iommu_list, list)
147 iommu_poll_events(iommu);
148
149 return IRQ_HANDLED;
150}
151
152/****************************************************************************
153 *
54 * IOMMU command queuing functions 154 * IOMMU command queuing functions
55 * 155 *
56 ****************************************************************************/ 156 ****************************************************************************/
@@ -101,10 +201,10 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
101 */ 201 */
102static int iommu_completion_wait(struct amd_iommu *iommu) 202static int iommu_completion_wait(struct amd_iommu *iommu)
103{ 203{
104 int ret, ready = 0; 204 int ret = 0, ready = 0;
105 unsigned status = 0; 205 unsigned status = 0;
106 struct iommu_cmd cmd; 206 struct iommu_cmd cmd;
107 unsigned long i = 0; 207 unsigned long flags, i = 0;
108 208
109 memset(&cmd, 0, sizeof(cmd)); 209 memset(&cmd, 0, sizeof(cmd));
110 cmd.data[0] = CMD_COMPL_WAIT_INT_MASK; 210 cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
@@ -112,10 +212,12 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
112 212
113 iommu->need_sync = 0; 213 iommu->need_sync = 0;
114 214
115 ret = iommu_queue_command(iommu, &cmd); 215 spin_lock_irqsave(&iommu->lock, flags);
216
217 ret = __iommu_queue_command(iommu, &cmd);
116 218
117 if (ret) 219 if (ret)
118 return ret; 220 goto out;
119 221
120 while (!ready && (i < EXIT_LOOP_COUNT)) { 222 while (!ready && (i < EXIT_LOOP_COUNT)) {
121 ++i; 223 ++i;
@@ -130,6 +232,8 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
130 232
131 if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit())) 233 if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit()))
132 printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n"); 234 printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n");
235out:
236 spin_unlock_irqrestore(&iommu->lock, flags);
133 237
134 return 0; 238 return 0;
135} 239}
@@ -140,6 +244,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
140static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) 244static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
141{ 245{
142 struct iommu_cmd cmd; 246 struct iommu_cmd cmd;
247 int ret;
143 248
144 BUG_ON(iommu == NULL); 249 BUG_ON(iommu == NULL);
145 250
@@ -147,9 +252,11 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
147 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); 252 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
148 cmd.data[0] = devid; 253 cmd.data[0] = devid;
149 254
255 ret = iommu_queue_command(iommu, &cmd);
256
150 iommu->need_sync = 1; 257 iommu->need_sync = 1;
151 258
152 return iommu_queue_command(iommu, &cmd); 259 return ret;
153} 260}
154 261
155/* 262/*
@@ -159,6 +266,7 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
159 u64 address, u16 domid, int pde, int s) 266 u64 address, u16 domid, int pde, int s)
160{ 267{
161 struct iommu_cmd cmd; 268 struct iommu_cmd cmd;
269 int ret;
162 270
163 memset(&cmd, 0, sizeof(cmd)); 271 memset(&cmd, 0, sizeof(cmd));
164 address &= PAGE_MASK; 272 address &= PAGE_MASK;
@@ -171,9 +279,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
171 if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ 279 if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
172 cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; 280 cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
173 281
282 ret = iommu_queue_command(iommu, &cmd);
283
174 iommu->need_sync = 1; 284 iommu->need_sync = 1;
175 285
176 return iommu_queue_command(iommu, &cmd); 286 return ret;
177} 287}
178 288
179/* 289/*
@@ -203,6 +313,14 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
203 return 0; 313 return 0;
204} 314}
205 315
316/* Flush the whole IO/TLB for a given protection domain */
317static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
318{
319 u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
320
321 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1);
322}
323
206/**************************************************************************** 324/****************************************************************************
207 * 325 *
208 * The functions below are used the create the page table mappings for 326 * The functions below are used the create the page table mappings for
@@ -362,11 +480,6 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
362 * efficient allocator. 480 * efficient allocator.
363 * 481 *
364 ****************************************************************************/ 482 ****************************************************************************/
365static unsigned long dma_mask_to_pages(unsigned long mask)
366{
367 return (mask >> PAGE_SHIFT) +
368 (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT);
369}
370 483
371/* 484/*
372 * The address allocator core function. 485 * The address allocator core function.
@@ -375,25 +488,31 @@ static unsigned long dma_mask_to_pages(unsigned long mask)
375 */ 488 */
376static unsigned long dma_ops_alloc_addresses(struct device *dev, 489static unsigned long dma_ops_alloc_addresses(struct device *dev,
377 struct dma_ops_domain *dom, 490 struct dma_ops_domain *dom,
378 unsigned int pages) 491 unsigned int pages,
492 unsigned long align_mask,
493 u64 dma_mask)
379{ 494{
380 unsigned long limit = dma_mask_to_pages(*dev->dma_mask); 495 unsigned long limit;
381 unsigned long address; 496 unsigned long address;
382 unsigned long size = dom->aperture_size >> PAGE_SHIFT;
383 unsigned long boundary_size; 497 unsigned long boundary_size;
384 498
385 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, 499 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
386 PAGE_SIZE) >> PAGE_SHIFT; 500 PAGE_SIZE) >> PAGE_SHIFT;
387 limit = limit < size ? limit : size; 501 limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0,
502 dma_mask >> PAGE_SHIFT);
388 503
389 if (dom->next_bit >= limit) 504 if (dom->next_bit >= limit) {
390 dom->next_bit = 0; 505 dom->next_bit = 0;
506 dom->need_flush = true;
507 }
391 508
392 address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages, 509 address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
393 0 , boundary_size, 0); 510 0 , boundary_size, align_mask);
394 if (address == -1) 511 if (address == -1) {
395 address = iommu_area_alloc(dom->bitmap, limit, 0, pages, 512 address = iommu_area_alloc(dom->bitmap, limit, 0, pages,
396 0, boundary_size, 0); 513 0, boundary_size, align_mask);
514 dom->need_flush = true;
515 }
397 516
398 if (likely(address != -1)) { 517 if (likely(address != -1)) {
399 dom->next_bit = address + pages; 518 dom->next_bit = address + pages;
@@ -459,7 +578,7 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
459 if (start_page + pages > last_page) 578 if (start_page + pages > last_page)
460 pages = last_page - start_page; 579 pages = last_page - start_page;
461 580
462 set_bit_string(dom->bitmap, start_page, pages); 581 iommu_area_reserve(dom->bitmap, start_page, pages);
463} 582}
464 583
465static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) 584static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
@@ -553,6 +672,9 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
553 dma_dom->bitmap[0] = 1; 672 dma_dom->bitmap[0] = 1;
554 dma_dom->next_bit = 0; 673 dma_dom->next_bit = 0;
555 674
675 dma_dom->need_flush = false;
676 dma_dom->target_dev = 0xffff;
677
556 /* Intialize the exclusion range if necessary */ 678 /* Intialize the exclusion range if necessary */
557 if (iommu->exclusion_start && 679 if (iommu->exclusion_start &&
558 iommu->exclusion_start < dma_dom->aperture_size) { 680 iommu->exclusion_start < dma_dom->aperture_size) {
@@ -623,12 +745,13 @@ static void set_device_domain(struct amd_iommu *iommu,
623 745
624 u64 pte_root = virt_to_phys(domain->pt_root); 746 u64 pte_root = virt_to_phys(domain->pt_root);
625 747
626 pte_root |= (domain->mode & 0x07) << 9; 748 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
627 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | 2; 749 << DEV_ENTRY_MODE_SHIFT;
750 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
628 751
629 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 752 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
630 amd_iommu_dev_table[devid].data[0] = pte_root; 753 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
631 amd_iommu_dev_table[devid].data[1] = pte_root >> 32; 754 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
632 amd_iommu_dev_table[devid].data[2] = domain->id; 755 amd_iommu_dev_table[devid].data[2] = domain->id;
633 756
634 amd_iommu_pd_table[devid] = domain; 757 amd_iommu_pd_table[devid] = domain;
@@ -646,6 +769,45 @@ static void set_device_domain(struct amd_iommu *iommu,
646 *****************************************************************************/ 769 *****************************************************************************/
647 770
648/* 771/*
772 * This function checks if the driver got a valid device from the caller to
773 * avoid dereferencing invalid pointers.
774 */
775static bool check_device(struct device *dev)
776{
777 if (!dev || !dev->dma_mask)
778 return false;
779
780 return true;
781}
782
783/*
784 * In this function the list of preallocated protection domains is traversed to
785 * find the domain for a specific device
786 */
787static struct dma_ops_domain *find_protection_domain(u16 devid)
788{
789 struct dma_ops_domain *entry, *ret = NULL;
790 unsigned long flags;
791
792 if (list_empty(&iommu_pd_list))
793 return NULL;
794
795 spin_lock_irqsave(&iommu_pd_list_lock, flags);
796
797 list_for_each_entry(entry, &iommu_pd_list, list) {
798 if (entry->target_dev == devid) {
799 ret = entry;
800 list_del(&ret->list);
801 break;
802 }
803 }
804
805 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
806
807 return ret;
808}
809
810/*
649 * In the dma_ops path we only have the struct device. This function 811 * In the dma_ops path we only have the struct device. This function
650 * finds the corresponding IOMMU, the protection domain and the 812 * finds the corresponding IOMMU, the protection domain and the
651 * requestor id for a given device. 813 * requestor id for a given device.
@@ -661,27 +823,30 @@ static int get_device_resources(struct device *dev,
661 struct pci_dev *pcidev; 823 struct pci_dev *pcidev;
662 u16 _bdf; 824 u16 _bdf;
663 825
664 BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask); 826 *iommu = NULL;
827 *domain = NULL;
828 *bdf = 0xffff;
829
830 if (dev->bus != &pci_bus_type)
831 return 0;
665 832
666 pcidev = to_pci_dev(dev); 833 pcidev = to_pci_dev(dev);
667 _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); 834 _bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
668 835
669 /* device not translated by any IOMMU in the system? */ 836 /* device not translated by any IOMMU in the system? */
670 if (_bdf > amd_iommu_last_bdf) { 837 if (_bdf > amd_iommu_last_bdf)
671 *iommu = NULL;
672 *domain = NULL;
673 *bdf = 0xffff;
674 return 0; 838 return 0;
675 }
676 839
677 *bdf = amd_iommu_alias_table[_bdf]; 840 *bdf = amd_iommu_alias_table[_bdf];
678 841
679 *iommu = amd_iommu_rlookup_table[*bdf]; 842 *iommu = amd_iommu_rlookup_table[*bdf];
680 if (*iommu == NULL) 843 if (*iommu == NULL)
681 return 0; 844 return 0;
682 dma_dom = (*iommu)->default_dom;
683 *domain = domain_for_device(*bdf); 845 *domain = domain_for_device(*bdf);
684 if (*domain == NULL) { 846 if (*domain == NULL) {
847 dma_dom = find_protection_domain(*bdf);
848 if (!dma_dom)
849 dma_dom = (*iommu)->default_dom;
685 *domain = &dma_dom->domain; 850 *domain = &dma_dom->domain;
686 set_device_domain(*iommu, *domain, *bdf); 851 set_device_domain(*iommu, *domain, *bdf);
687 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " 852 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
@@ -760,17 +925,24 @@ static dma_addr_t __map_single(struct device *dev,
760 struct dma_ops_domain *dma_dom, 925 struct dma_ops_domain *dma_dom,
761 phys_addr_t paddr, 926 phys_addr_t paddr,
762 size_t size, 927 size_t size,
763 int dir) 928 int dir,
929 bool align,
930 u64 dma_mask)
764{ 931{
765 dma_addr_t offset = paddr & ~PAGE_MASK; 932 dma_addr_t offset = paddr & ~PAGE_MASK;
766 dma_addr_t address, start; 933 dma_addr_t address, start;
767 unsigned int pages; 934 unsigned int pages;
935 unsigned long align_mask = 0;
768 int i; 936 int i;
769 937
770 pages = iommu_num_pages(paddr, size); 938 pages = iommu_num_pages(paddr, size);
771 paddr &= PAGE_MASK; 939 paddr &= PAGE_MASK;
772 940
773 address = dma_ops_alloc_addresses(dev, dma_dom, pages); 941 if (align)
942 align_mask = (1UL << get_order(size)) - 1;
943
944 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
945 dma_mask);
774 if (unlikely(address == bad_dma_address)) 946 if (unlikely(address == bad_dma_address))
775 goto out; 947 goto out;
776 948
@@ -782,6 +954,12 @@ static dma_addr_t __map_single(struct device *dev,
782 } 954 }
783 address += offset; 955 address += offset;
784 956
957 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
958 iommu_flush_tlb(iommu, dma_dom->domain.id);
959 dma_dom->need_flush = false;
960 } else if (unlikely(iommu_has_npcache(iommu)))
961 iommu_flush_pages(iommu, dma_dom->domain.id, address, size);
962
785out: 963out:
786 return address; 964 return address;
787} 965}
@@ -812,6 +990,9 @@ static void __unmap_single(struct amd_iommu *iommu,
812 } 990 }
813 991
814 dma_ops_free_addresses(dma_dom, dma_addr, pages); 992 dma_ops_free_addresses(dma_dom, dma_addr, pages);
993
994 if (amd_iommu_unmap_flush)
995 iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size);
815} 996}
816 997
817/* 998/*
@@ -825,6 +1006,12 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
825 struct protection_domain *domain; 1006 struct protection_domain *domain;
826 u16 devid; 1007 u16 devid;
827 dma_addr_t addr; 1008 dma_addr_t addr;
1009 u64 dma_mask;
1010
1011 if (!check_device(dev))
1012 return bad_dma_address;
1013
1014 dma_mask = *dev->dma_mask;
828 1015
829 get_device_resources(dev, &iommu, &domain, &devid); 1016 get_device_resources(dev, &iommu, &domain, &devid);
830 1017
@@ -833,14 +1020,12 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
833 return (dma_addr_t)paddr; 1020 return (dma_addr_t)paddr;
834 1021
835 spin_lock_irqsave(&domain->lock, flags); 1022 spin_lock_irqsave(&domain->lock, flags);
836 addr = __map_single(dev, iommu, domain->priv, paddr, size, dir); 1023 addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false,
1024 dma_mask);
837 if (addr == bad_dma_address) 1025 if (addr == bad_dma_address)
838 goto out; 1026 goto out;
839 1027
840 if (iommu_has_npcache(iommu)) 1028 if (unlikely(iommu->need_sync))
841 iommu_flush_pages(iommu, domain->id, addr, size);
842
843 if (iommu->need_sync)
844 iommu_completion_wait(iommu); 1029 iommu_completion_wait(iommu);
845 1030
846out: 1031out:
@@ -860,7 +1045,8 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
860 struct protection_domain *domain; 1045 struct protection_domain *domain;
861 u16 devid; 1046 u16 devid;
862 1047
863 if (!get_device_resources(dev, &iommu, &domain, &devid)) 1048 if (!check_device(dev) ||
1049 !get_device_resources(dev, &iommu, &domain, &devid))
864 /* device not handled by any AMD IOMMU */ 1050 /* device not handled by any AMD IOMMU */
865 return; 1051 return;
866 1052
@@ -868,9 +1054,7 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
868 1054
869 __unmap_single(iommu, domain->priv, dma_addr, size, dir); 1055 __unmap_single(iommu, domain->priv, dma_addr, size, dir);
870 1056
871 iommu_flush_pages(iommu, domain->id, dma_addr, size); 1057 if (unlikely(iommu->need_sync))
872
873 if (iommu->need_sync)
874 iommu_completion_wait(iommu); 1058 iommu_completion_wait(iommu);
875 1059
876 spin_unlock_irqrestore(&domain->lock, flags); 1060 spin_unlock_irqrestore(&domain->lock, flags);
@@ -909,6 +1093,12 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
909 struct scatterlist *s; 1093 struct scatterlist *s;
910 phys_addr_t paddr; 1094 phys_addr_t paddr;
911 int mapped_elems = 0; 1095 int mapped_elems = 0;
1096 u64 dma_mask;
1097
1098 if (!check_device(dev))
1099 return 0;
1100
1101 dma_mask = *dev->dma_mask;
912 1102
913 get_device_resources(dev, &iommu, &domain, &devid); 1103 get_device_resources(dev, &iommu, &domain, &devid);
914 1104
@@ -921,19 +1111,17 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
921 paddr = sg_phys(s); 1111 paddr = sg_phys(s);
922 1112
923 s->dma_address = __map_single(dev, iommu, domain->priv, 1113 s->dma_address = __map_single(dev, iommu, domain->priv,
924 paddr, s->length, dir); 1114 paddr, s->length, dir, false,
1115 dma_mask);
925 1116
926 if (s->dma_address) { 1117 if (s->dma_address) {
927 s->dma_length = s->length; 1118 s->dma_length = s->length;
928 mapped_elems++; 1119 mapped_elems++;
929 } else 1120 } else
930 goto unmap; 1121 goto unmap;
931 if (iommu_has_npcache(iommu))
932 iommu_flush_pages(iommu, domain->id, s->dma_address,
933 s->dma_length);
934 } 1122 }
935 1123
936 if (iommu->need_sync) 1124 if (unlikely(iommu->need_sync))
937 iommu_completion_wait(iommu); 1125 iommu_completion_wait(iommu);
938 1126
939out: 1127out:
@@ -967,7 +1155,8 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
967 u16 devid; 1155 u16 devid;
968 int i; 1156 int i;
969 1157
970 if (!get_device_resources(dev, &iommu, &domain, &devid)) 1158 if (!check_device(dev) ||
1159 !get_device_resources(dev, &iommu, &domain, &devid))
971 return; 1160 return;
972 1161
973 spin_lock_irqsave(&domain->lock, flags); 1162 spin_lock_irqsave(&domain->lock, flags);
@@ -975,12 +1164,10 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
975 for_each_sg(sglist, s, nelems, i) { 1164 for_each_sg(sglist, s, nelems, i) {
976 __unmap_single(iommu, domain->priv, s->dma_address, 1165 __unmap_single(iommu, domain->priv, s->dma_address,
977 s->dma_length, dir); 1166 s->dma_length, dir);
978 iommu_flush_pages(iommu, domain->id, s->dma_address,
979 s->dma_length);
980 s->dma_address = s->dma_length = 0; 1167 s->dma_address = s->dma_length = 0;
981 } 1168 }
982 1169
983 if (iommu->need_sync) 1170 if (unlikely(iommu->need_sync))
984 iommu_completion_wait(iommu); 1171 iommu_completion_wait(iommu);
985 1172
986 spin_unlock_irqrestore(&domain->lock, flags); 1173 spin_unlock_irqrestore(&domain->lock, flags);
@@ -998,25 +1185,33 @@ static void *alloc_coherent(struct device *dev, size_t size,
998 struct protection_domain *domain; 1185 struct protection_domain *domain;
999 u16 devid; 1186 u16 devid;
1000 phys_addr_t paddr; 1187 phys_addr_t paddr;
1188 u64 dma_mask = dev->coherent_dma_mask;
1189
1190 if (!check_device(dev))
1191 return NULL;
1001 1192
1193 if (!get_device_resources(dev, &iommu, &domain, &devid))
1194 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
1195
1196 flag |= __GFP_ZERO;
1002 virt_addr = (void *)__get_free_pages(flag, get_order(size)); 1197 virt_addr = (void *)__get_free_pages(flag, get_order(size));
1003 if (!virt_addr) 1198 if (!virt_addr)
1004 return 0; 1199 return 0;
1005 1200
1006 memset(virt_addr, 0, size);
1007 paddr = virt_to_phys(virt_addr); 1201 paddr = virt_to_phys(virt_addr);
1008 1202
1009 get_device_resources(dev, &iommu, &domain, &devid);
1010
1011 if (!iommu || !domain) { 1203 if (!iommu || !domain) {
1012 *dma_addr = (dma_addr_t)paddr; 1204 *dma_addr = (dma_addr_t)paddr;
1013 return virt_addr; 1205 return virt_addr;
1014 } 1206 }
1015 1207
1208 if (!dma_mask)
1209 dma_mask = *dev->dma_mask;
1210
1016 spin_lock_irqsave(&domain->lock, flags); 1211 spin_lock_irqsave(&domain->lock, flags);
1017 1212
1018 *dma_addr = __map_single(dev, iommu, domain->priv, paddr, 1213 *dma_addr = __map_single(dev, iommu, domain->priv, paddr,
1019 size, DMA_BIDIRECTIONAL); 1214 size, DMA_BIDIRECTIONAL, true, dma_mask);
1020 1215
1021 if (*dma_addr == bad_dma_address) { 1216 if (*dma_addr == bad_dma_address) {
1022 free_pages((unsigned long)virt_addr, get_order(size)); 1217 free_pages((unsigned long)virt_addr, get_order(size));
@@ -1024,10 +1219,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
1024 goto out; 1219 goto out;
1025 } 1220 }
1026 1221
1027 if (iommu_has_npcache(iommu)) 1222 if (unlikely(iommu->need_sync))
1028 iommu_flush_pages(iommu, domain->id, *dma_addr, size);
1029
1030 if (iommu->need_sync)
1031 iommu_completion_wait(iommu); 1223 iommu_completion_wait(iommu);
1032 1224
1033out: 1225out:
@@ -1038,8 +1230,6 @@ out:
1038 1230
1039/* 1231/*
1040 * The exported free_coherent function for dma_ops. 1232 * The exported free_coherent function for dma_ops.
1041 * FIXME: fix the generic x86 DMA layer so that it actually calls that
1042 * function.
1043 */ 1233 */
1044static void free_coherent(struct device *dev, size_t size, 1234static void free_coherent(struct device *dev, size_t size,
1045 void *virt_addr, dma_addr_t dma_addr) 1235 void *virt_addr, dma_addr_t dma_addr)
@@ -1049,6 +1239,9 @@ static void free_coherent(struct device *dev, size_t size,
1049 struct protection_domain *domain; 1239 struct protection_domain *domain;
1050 u16 devid; 1240 u16 devid;
1051 1241
1242 if (!check_device(dev))
1243 return;
1244
1052 get_device_resources(dev, &iommu, &domain, &devid); 1245 get_device_resources(dev, &iommu, &domain, &devid);
1053 1246
1054 if (!iommu || !domain) 1247 if (!iommu || !domain)
@@ -1057,9 +1250,8 @@ static void free_coherent(struct device *dev, size_t size,
1057 spin_lock_irqsave(&domain->lock, flags); 1250 spin_lock_irqsave(&domain->lock, flags);
1058 1251
1059 __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); 1252 __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
1060 iommu_flush_pages(iommu, domain->id, dma_addr, size);
1061 1253
1062 if (iommu->need_sync) 1254 if (unlikely(iommu->need_sync))
1063 iommu_completion_wait(iommu); 1255 iommu_completion_wait(iommu);
1064 1256
1065 spin_unlock_irqrestore(&domain->lock, flags); 1257 spin_unlock_irqrestore(&domain->lock, flags);
@@ -1069,6 +1261,30 @@ free_mem:
1069} 1261}
1070 1262
1071/* 1263/*
1264 * This function is called by the DMA layer to find out if we can handle a
1265 * particular device. It is part of the dma_ops.
1266 */
1267static int amd_iommu_dma_supported(struct device *dev, u64 mask)
1268{
1269 u16 bdf;
1270 struct pci_dev *pcidev;
1271
1272 /* No device or no PCI device */
1273 if (!dev || dev->bus != &pci_bus_type)
1274 return 0;
1275
1276 pcidev = to_pci_dev(dev);
1277
1278 bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
1279
1280 /* Out of our scope? */
1281 if (bdf > amd_iommu_last_bdf)
1282 return 0;
1283
1284 return 1;
1285}
1286
1287/*
1072 * The function for pre-allocating protection domains. 1288 * The function for pre-allocating protection domains.
1073 * 1289 *
1074 * If the driver core informs the DMA layer if a driver grabs a device 1290 * If the driver core informs the DMA layer if a driver grabs a device
@@ -1097,10 +1313,9 @@ void prealloc_protection_domains(void)
1097 if (!dma_dom) 1313 if (!dma_dom)
1098 continue; 1314 continue;
1099 init_unity_mappings_for_device(dma_dom, devid); 1315 init_unity_mappings_for_device(dma_dom, devid);
1100 set_device_domain(iommu, &dma_dom->domain, devid); 1316 dma_dom->target_dev = devid;
1101 printk(KERN_INFO "AMD IOMMU: Allocated domain %d for device ", 1317
1102 dma_dom->domain.id); 1318 list_add_tail(&dma_dom->list, &iommu_pd_list);
1103 print_devid(devid, 1);
1104 } 1319 }
1105} 1320}
1106 1321
@@ -1111,6 +1326,7 @@ static struct dma_mapping_ops amd_iommu_dma_ops = {
1111 .unmap_single = unmap_single, 1326 .unmap_single = unmap_single,
1112 .map_sg = map_sg, 1327 .map_sg = map_sg,
1113 .unmap_sg = unmap_sg, 1328 .unmap_sg = unmap_sg,
1329 .dma_supported = amd_iommu_dma_supported,
1114}; 1330};
1115 1331
1116/* 1332/*
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index a69cc0f52042..148fcfe22f17 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -22,6 +22,8 @@
22#include <linux/gfp.h> 22#include <linux/gfp.h>
23#include <linux/list.h> 23#include <linux/list.h>
24#include <linux/sysdev.h> 24#include <linux/sysdev.h>
25#include <linux/interrupt.h>
26#include <linux/msi.h>
25#include <asm/pci-direct.h> 27#include <asm/pci-direct.h>
26#include <asm/amd_iommu_types.h> 28#include <asm/amd_iommu_types.h>
27#include <asm/amd_iommu.h> 29#include <asm/amd_iommu.h>
@@ -30,7 +32,6 @@
30/* 32/*
31 * definitions for the ACPI scanning code 33 * definitions for the ACPI scanning code
32 */ 34 */
33#define PCI_BUS(x) (((x) >> 8) & 0xff)
34#define IVRS_HEADER_LENGTH 48 35#define IVRS_HEADER_LENGTH 48
35 36
36#define ACPI_IVHD_TYPE 0x10 37#define ACPI_IVHD_TYPE 0x10
@@ -121,6 +122,7 @@ LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
121 we find in ACPI */ 122 we find in ACPI */
122unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ 123unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
123int amd_iommu_isolate; /* if 1, device isolation is enabled */ 124int amd_iommu_isolate; /* if 1, device isolation is enabled */
125bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
124 126
125LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the 127LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
126 system */ 128 system */
@@ -234,7 +236,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
234{ 236{
235 u32 ctrl; 237 u32 ctrl;
236 238
237 ctrl = (u64)readl(iommu->mmio_base + MMIO_CONTROL_OFFSET); 239 ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
238 ctrl &= ~(1 << bit); 240 ctrl &= ~(1 << bit);
239 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); 241 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
240} 242}
@@ -242,13 +244,23 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
242/* Function to enable the hardware */ 244/* Function to enable the hardware */
243void __init iommu_enable(struct amd_iommu *iommu) 245void __init iommu_enable(struct amd_iommu *iommu)
244{ 246{
245 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at "); 247 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU "
246 print_devid(iommu->devid, 0); 248 "at %02x:%02x.%x cap 0x%hx\n",
247 printk(" cap 0x%hx\n", iommu->cap_ptr); 249 iommu->dev->bus->number,
250 PCI_SLOT(iommu->dev->devfn),
251 PCI_FUNC(iommu->dev->devfn),
252 iommu->cap_ptr);
248 253
249 iommu_feature_enable(iommu, CONTROL_IOMMU_EN); 254 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
250} 255}
251 256
257/* Function to enable IOMMU event logging and event interrupts */
258void __init iommu_enable_event_logging(struct amd_iommu *iommu)
259{
260 iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
261 iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
262}
263
252/* 264/*
253 * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in 265 * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
254 * the system has one. 266 * the system has one.
@@ -286,6 +298,14 @@ static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
286 ****************************************************************************/ 298 ****************************************************************************/
287 299
288/* 300/*
301 * This function calculates the length of a given IVHD entry
302 */
303static inline int ivhd_entry_length(u8 *ivhd)
304{
305 return 0x04 << (*ivhd >> 6);
306}
307
308/*
289 * This function reads the last device id the IOMMU has to handle from the PCI 309 * This function reads the last device id the IOMMU has to handle from the PCI
290 * capability header for this IOMMU 310 * capability header for this IOMMU
291 */ 311 */
@@ -329,7 +349,7 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
329 default: 349 default:
330 break; 350 break;
331 } 351 }
332 p += 0x04 << (*p >> 6); 352 p += ivhd_entry_length(p);
333 } 353 }
334 354
335 WARN_ON(p != end); 355 WARN_ON(p != end);
@@ -414,7 +434,32 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
414 434
415static void __init free_command_buffer(struct amd_iommu *iommu) 435static void __init free_command_buffer(struct amd_iommu *iommu)
416{ 436{
417 free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE)); 437 free_pages((unsigned long)iommu->cmd_buf,
438 get_order(iommu->cmd_buf_size));
439}
440
441/* allocates the memory where the IOMMU will log its events to */
442static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
443{
444 u64 entry;
445 iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
446 get_order(EVT_BUFFER_SIZE));
447
448 if (iommu->evt_buf == NULL)
449 return NULL;
450
451 entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
452 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
453 &entry, sizeof(entry));
454
455 iommu->evt_buf_size = EVT_BUFFER_SIZE;
456
457 return iommu->evt_buf;
458}
459
460static void __init free_event_buffer(struct amd_iommu *iommu)
461{
462 free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE));
418} 463}
419 464
420/* sets a specific bit in the device table entry. */ 465/* sets a specific bit in the device table entry. */
@@ -487,19 +532,21 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
487 */ 532 */
488static void __init init_iommu_from_pci(struct amd_iommu *iommu) 533static void __init init_iommu_from_pci(struct amd_iommu *iommu)
489{ 534{
490 int bus = PCI_BUS(iommu->devid);
491 int dev = PCI_SLOT(iommu->devid);
492 int fn = PCI_FUNC(iommu->devid);
493 int cap_ptr = iommu->cap_ptr; 535 int cap_ptr = iommu->cap_ptr;
494 u32 range; 536 u32 range, misc;
495 537
496 iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET); 538 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
539 &iommu->cap);
540 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET,
541 &range);
542 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET,
543 &misc);
497 544
498 range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
499 iommu->first_device = calc_devid(MMIO_GET_BUS(range), 545 iommu->first_device = calc_devid(MMIO_GET_BUS(range),
500 MMIO_GET_FD(range)); 546 MMIO_GET_FD(range));
501 iommu->last_device = calc_devid(MMIO_GET_BUS(range), 547 iommu->last_device = calc_devid(MMIO_GET_BUS(range),
502 MMIO_GET_LD(range)); 548 MMIO_GET_LD(range));
549 iommu->evt_msi_num = MMIO_MSI_NUM(misc);
503} 550}
504 551
505/* 552/*
@@ -604,7 +651,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
604 break; 651 break;
605 } 652 }
606 653
607 p += 0x04 << (e->type >> 6); 654 p += ivhd_entry_length(p);
608 } 655 }
609} 656}
610 657
@@ -622,6 +669,7 @@ static int __init init_iommu_devices(struct amd_iommu *iommu)
622static void __init free_iommu_one(struct amd_iommu *iommu) 669static void __init free_iommu_one(struct amd_iommu *iommu)
623{ 670{
624 free_command_buffer(iommu); 671 free_command_buffer(iommu);
672 free_event_buffer(iommu);
625 iommu_unmap_mmio_space(iommu); 673 iommu_unmap_mmio_space(iommu);
626} 674}
627 675
@@ -649,8 +697,12 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
649 /* 697 /*
650 * Copy data from ACPI table entry to the iommu struct 698 * Copy data from ACPI table entry to the iommu struct
651 */ 699 */
652 iommu->devid = h->devid; 700 iommu->dev = pci_get_bus_and_slot(PCI_BUS(h->devid), h->devid & 0xff);
701 if (!iommu->dev)
702 return 1;
703
653 iommu->cap_ptr = h->cap_ptr; 704 iommu->cap_ptr = h->cap_ptr;
705 iommu->pci_seg = h->pci_seg;
654 iommu->mmio_phys = h->mmio_phys; 706 iommu->mmio_phys = h->mmio_phys;
655 iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys); 707 iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
656 if (!iommu->mmio_base) 708 if (!iommu->mmio_base)
@@ -661,10 +713,18 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
661 if (!iommu->cmd_buf) 713 if (!iommu->cmd_buf)
662 return -ENOMEM; 714 return -ENOMEM;
663 715
716 iommu->evt_buf = alloc_event_buffer(iommu);
717 if (!iommu->evt_buf)
718 return -ENOMEM;
719
720 iommu->int_enabled = false;
721
664 init_iommu_from_pci(iommu); 722 init_iommu_from_pci(iommu);
665 init_iommu_from_acpi(iommu, h); 723 init_iommu_from_acpi(iommu, h);
666 init_iommu_devices(iommu); 724 init_iommu_devices(iommu);
667 725
726 pci_enable_device(iommu->dev);
727
668 return 0; 728 return 0;
669} 729}
670 730
@@ -706,6 +766,95 @@ static int __init init_iommu_all(struct acpi_table_header *table)
706 766
707/**************************************************************************** 767/****************************************************************************
708 * 768 *
769 * The following functions initialize the MSI interrupts for all IOMMUs
770 * in the system. Its a bit challenging because there could be multiple
771 * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per
772 * pci_dev.
773 *
774 ****************************************************************************/
775
776static int __init iommu_setup_msix(struct amd_iommu *iommu)
777{
778 struct amd_iommu *curr;
779 struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */
780 int nvec = 0, i;
781
782 list_for_each_entry(curr, &amd_iommu_list, list) {
783 if (curr->dev == iommu->dev) {
784 entries[nvec].entry = curr->evt_msi_num;
785 entries[nvec].vector = 0;
786 curr->int_enabled = true;
787 nvec++;
788 }
789 }
790
791 if (pci_enable_msix(iommu->dev, entries, nvec)) {
792 pci_disable_msix(iommu->dev);
793 return 1;
794 }
795
796 for (i = 0; i < nvec; ++i) {
797 int r = request_irq(entries->vector, amd_iommu_int_handler,
798 IRQF_SAMPLE_RANDOM,
799 "AMD IOMMU",
800 NULL);
801 if (r)
802 goto out_free;
803 }
804
805 return 0;
806
807out_free:
808 for (i -= 1; i >= 0; --i)
809 free_irq(entries->vector, NULL);
810
811 pci_disable_msix(iommu->dev);
812
813 return 1;
814}
815
816static int __init iommu_setup_msi(struct amd_iommu *iommu)
817{
818 int r;
819 struct amd_iommu *curr;
820
821 list_for_each_entry(curr, &amd_iommu_list, list) {
822 if (curr->dev == iommu->dev)
823 curr->int_enabled = true;
824 }
825
826
827 if (pci_enable_msi(iommu->dev))
828 return 1;
829
830 r = request_irq(iommu->dev->irq, amd_iommu_int_handler,
831 IRQF_SAMPLE_RANDOM,
832 "AMD IOMMU",
833 NULL);
834
835 if (r) {
836 pci_disable_msi(iommu->dev);
837 return 1;
838 }
839
840 return 0;
841}
842
843static int __init iommu_init_msi(struct amd_iommu *iommu)
844{
845 if (iommu->int_enabled)
846 return 0;
847
848 if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX))
849 return iommu_setup_msix(iommu);
850 else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
851 return iommu_setup_msi(iommu);
852
853 return 1;
854}
855
856/****************************************************************************
857 *
709 * The next functions belong to the third pass of parsing the ACPI 858 * The next functions belong to the third pass of parsing the ACPI
710 * table. In this last pass the memory mapping requirements are 859 * table. In this last pass the memory mapping requirements are
711 * gathered (like exclusion and unity mapping reanges). 860 * gathered (like exclusion and unity mapping reanges).
@@ -811,7 +960,6 @@ static void init_device_table(void)
811 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) { 960 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
812 set_dev_entry_bit(devid, DEV_ENTRY_VALID); 961 set_dev_entry_bit(devid, DEV_ENTRY_VALID);
813 set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION); 962 set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION);
814 set_dev_entry_bit(devid, DEV_ENTRY_NO_PAGE_FAULT);
815 } 963 }
816} 964}
817 965
@@ -825,6 +973,8 @@ static void __init enable_iommus(void)
825 973
826 list_for_each_entry(iommu, &amd_iommu_list, list) { 974 list_for_each_entry(iommu, &amd_iommu_list, list) {
827 iommu_set_exclusion_range(iommu); 975 iommu_set_exclusion_range(iommu);
976 iommu_init_msi(iommu);
977 iommu_enable_event_logging(iommu);
828 iommu_enable(iommu); 978 iommu_enable(iommu);
829 } 979 }
830} 980}
@@ -995,11 +1145,17 @@ int __init amd_iommu_init(void)
995 else 1145 else
996 printk("disabled\n"); 1146 printk("disabled\n");
997 1147
1148 if (amd_iommu_unmap_flush)
1149 printk(KERN_INFO "AMD IOMMU: IO/TLB flush on unmap enabled\n");
1150 else
1151 printk(KERN_INFO "AMD IOMMU: Lazy IO/TLB flushing enabled\n");
1152
998out: 1153out:
999 return ret; 1154 return ret;
1000 1155
1001free: 1156free:
1002 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1); 1157 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
1158 get_order(MAX_DOMAIN_ID/8));
1003 1159
1004 free_pages((unsigned long)amd_iommu_pd_table, 1160 free_pages((unsigned long)amd_iommu_pd_table,
1005 get_order(rlookup_table_size)); 1161 get_order(rlookup_table_size));
@@ -1057,8 +1213,10 @@ void __init amd_iommu_detect(void)
1057static int __init parse_amd_iommu_options(char *str) 1213static int __init parse_amd_iommu_options(char *str)
1058{ 1214{
1059 for (; *str; ++str) { 1215 for (; *str; ++str) {
1060 if (strcmp(str, "isolate") == 0) 1216 if (strncmp(str, "isolate", 7) == 0)
1061 amd_iommu_isolate = 1; 1217 amd_iommu_isolate = 1;
1218 if (strncmp(str, "fullflush", 11) == 0)
1219 amd_iommu_unmap_flush = true;
1062 } 1220 }
1063 1221
1064 return 1; 1222 return 1;
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 44e21826db11..9a32b37ee2ee 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -455,11 +455,11 @@ out:
455 force_iommu || 455 force_iommu ||
456 valid_agp || 456 valid_agp ||
457 fallback_aper_force) { 457 fallback_aper_force) {
458 printk(KERN_ERR 458 printk(KERN_INFO
459 "Your BIOS doesn't leave a aperture memory hole\n"); 459 "Your BIOS doesn't leave a aperture memory hole\n");
460 printk(KERN_ERR 460 printk(KERN_INFO
461 "Please enable the IOMMU option in the BIOS setup\n"); 461 "Please enable the IOMMU option in the BIOS setup\n");
462 printk(KERN_ERR 462 printk(KERN_INFO
463 "This costs you %d MB of RAM\n", 463 "This costs you %d MB of RAM\n",
464 32 << fallback_aper_order); 464 32 << fallback_aper_order);
465 465
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index 584272105051..a91c57cb666a 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -60,10 +60,8 @@ unsigned long mp_lapic_addr;
60static int force_enable_local_apic; 60static int force_enable_local_apic;
61int disable_apic; 61int disable_apic;
62 62
63/* Local APIC timer verification ok */
64static int local_apic_timer_verify_ok;
65/* Disable local APIC timer from the kernel commandline or via dmi quirk */ 63/* Disable local APIC timer from the kernel commandline or via dmi quirk */
66static int local_apic_timer_disabled; 64static int disable_apic_timer __cpuinitdata;
67/* Local APIC timer works in C2 */ 65/* Local APIC timer works in C2 */
68int local_apic_timer_c2_ok; 66int local_apic_timer_c2_ok;
69EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); 67EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
@@ -130,7 +128,11 @@ static inline int lapic_get_version(void)
130 */ 128 */
131static inline int lapic_is_integrated(void) 129static inline int lapic_is_integrated(void)
132{ 130{
131#ifdef CONFIG_X86_64
132 return 1;
133#else
133 return APIC_INTEGRATED(lapic_get_version()); 134 return APIC_INTEGRATED(lapic_get_version());
135#endif
134} 136}
135 137
136/* 138/*
@@ -244,8 +246,12 @@ int lapic_get_maxlvt(void)
244 * Local APIC timer 246 * Local APIC timer
245 */ 247 */
246 248
247/* Clock divisor is set to 16 */ 249/* Clock divisor */
250#ifdef CONFG_X86_64
251#define APIC_DIVISOR 1
252#else
248#define APIC_DIVISOR 16 253#define APIC_DIVISOR 16
254#endif
249 255
250/* 256/*
251 * This function sets up the local APIC timer, with a timeout of 257 * This function sets up the local APIC timer, with a timeout of
@@ -253,6 +259,9 @@ int lapic_get_maxlvt(void)
253 * this function twice on the boot CPU, once with a bogus timeout 259 * this function twice on the boot CPU, once with a bogus timeout
254 * value, second time for real. The other (noncalibrating) CPUs 260 * value, second time for real. The other (noncalibrating) CPUs
255 * call this function only once, with the real, calibrated value. 261 * call this function only once, with the real, calibrated value.
262 *
263 * We do reads before writes even if unnecessary, to get around the
264 * P5 APIC double write bug.
256 */ 265 */
257static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) 266static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
258{ 267{
@@ -274,14 +283,44 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
274 */ 283 */
275 tmp_value = apic_read(APIC_TDCR); 284 tmp_value = apic_read(APIC_TDCR);
276 apic_write(APIC_TDCR, 285 apic_write(APIC_TDCR,
277 (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | 286 (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
278 APIC_TDR_DIV_16); 287 APIC_TDR_DIV_16);
279 288
280 if (!oneshot) 289 if (!oneshot)
281 apic_write(APIC_TMICT, clocks / APIC_DIVISOR); 290 apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
282} 291}
283 292
284/* 293/*
294 * Setup extended LVT, AMD specific (K8, family 10h)
295 *
296 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
297 * MCE interrupts are supported. Thus MCE offset must be set to 0.
298 */
299
300#define APIC_EILVT_LVTOFF_MCE 0
301#define APIC_EILVT_LVTOFF_IBS 1
302
303static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
304{
305 unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
306 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
307
308 apic_write(reg, v);
309}
310
311u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
312{
313 setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
314 return APIC_EILVT_LVTOFF_MCE;
315}
316
317u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
318{
319 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
320 return APIC_EILVT_LVTOFF_IBS;
321}
322
323/*
285 * Program the next event, relative to now 324 * Program the next event, relative to now
286 */ 325 */
287static int lapic_next_event(unsigned long delta, 326static int lapic_next_event(unsigned long delta,
@@ -300,8 +339,8 @@ static void lapic_timer_setup(enum clock_event_mode mode,
300 unsigned long flags; 339 unsigned long flags;
301 unsigned int v; 340 unsigned int v;
302 341
303 /* Lapic used for broadcast ? */ 342 /* Lapic used as dummy for broadcast ? */
304 if (!local_apic_timer_verify_ok) 343 if (evt->features & CLOCK_EVT_FEAT_DUMMY)
305 return; 344 return;
306 345
307 local_irq_save(flags); 346 local_irq_save(flags);
@@ -514,7 +553,7 @@ static int __init calibrate_APIC_clock(void)
514 return -1; 553 return -1;
515 } 554 }
516 555
517 local_apic_timer_verify_ok = 1; 556 levt->features &= ~CLOCK_EVT_FEAT_DUMMY;
518 557
519 /* We trust the pm timer based calibration */ 558 /* We trust the pm timer based calibration */
520 if (!pm_referenced) { 559 if (!pm_referenced) {
@@ -548,11 +587,11 @@ static int __init calibrate_APIC_clock(void)
548 if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) 587 if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2)
549 apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); 588 apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
550 else 589 else
551 local_apic_timer_verify_ok = 0; 590 levt->features |= CLOCK_EVT_FEAT_DUMMY;
552 } else 591 } else
553 local_irq_enable(); 592 local_irq_enable();
554 593
555 if (!local_apic_timer_verify_ok) { 594 if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
556 printk(KERN_WARNING 595 printk(KERN_WARNING
557 "APIC timer disabled due to verification failure.\n"); 596 "APIC timer disabled due to verification failure.\n");
558 return -1; 597 return -1;
@@ -574,7 +613,8 @@ void __init setup_boot_APIC_clock(void)
574 * timer as a dummy clock event source on SMP systems, so the 613 * timer as a dummy clock event source on SMP systems, so the
575 * broadcast mechanism is used. On UP systems simply ignore it. 614 * broadcast mechanism is used. On UP systems simply ignore it.
576 */ 615 */
577 if (local_apic_timer_disabled) { 616 if (disable_apic_timer) {
617 printk(KERN_INFO "Disabling APIC timer\n");
578 /* No broadcast on UP ! */ 618 /* No broadcast on UP ! */
579 if (num_possible_cpus() > 1) { 619 if (num_possible_cpus() > 1) {
580 lapic_clockevent.mult = 1; 620 lapic_clockevent.mult = 1;
@@ -643,7 +683,11 @@ static void local_apic_timer_interrupt(void)
643 /* 683 /*
644 * the NMI deadlock-detector uses this. 684 * the NMI deadlock-detector uses this.
645 */ 685 */
686#ifdef CONFIG_X86_64
687 add_pda(apic_timer_irqs, 1);
688#else
646 per_cpu(irq_stat, cpu).apic_timer_irqs++; 689 per_cpu(irq_stat, cpu).apic_timer_irqs++;
690#endif
647 691
648 evt->event_handler(evt); 692 evt->event_handler(evt);
649} 693}
@@ -683,35 +727,6 @@ int setup_profiling_timer(unsigned int multiplier)
683} 727}
684 728
685/* 729/*
686 * Setup extended LVT, AMD specific (K8, family 10h)
687 *
688 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
689 * MCE interrupts are supported. Thus MCE offset must be set to 0.
690 */
691
692#define APIC_EILVT_LVTOFF_MCE 0
693#define APIC_EILVT_LVTOFF_IBS 1
694
695static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
696{
697 unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
698 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
699 apic_write(reg, v);
700}
701
702u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
703{
704 setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
705 return APIC_EILVT_LVTOFF_MCE;
706}
707
708u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
709{
710 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
711 return APIC_EILVT_LVTOFF_IBS;
712}
713
714/*
715 * Local APIC start and shutdown 730 * Local APIC start and shutdown
716 */ 731 */
717 732
@@ -756,7 +771,7 @@ void clear_local_APIC(void)
756 } 771 }
757 772
758 /* lets not touch this if we didn't frob it */ 773 /* lets not touch this if we didn't frob it */
759#ifdef CONFIG_X86_MCE_P4THERMAL 774#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL)
760 if (maxlvt >= 5) { 775 if (maxlvt >= 5) {
761 v = apic_read(APIC_LVTTHMR); 776 v = apic_read(APIC_LVTTHMR);
762 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); 777 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
@@ -773,10 +788,6 @@ void clear_local_APIC(void)
773 if (maxlvt >= 4) 788 if (maxlvt >= 4)
774 apic_write(APIC_LVTPC, APIC_LVT_MASKED); 789 apic_write(APIC_LVTPC, APIC_LVT_MASKED);
775 790
776#ifdef CONFIG_X86_MCE_P4THERMAL
777 if (maxlvt >= 5)
778 apic_write(APIC_LVTTHMR, APIC_LVT_MASKED);
779#endif
780 /* Integrated APIC (!82489DX) ? */ 791 /* Integrated APIC (!82489DX) ? */
781 if (lapic_is_integrated()) { 792 if (lapic_is_integrated()) {
782 if (maxlvt > 3) 793 if (maxlvt > 3)
@@ -791,7 +802,7 @@ void clear_local_APIC(void)
791 */ 802 */
792void disable_local_APIC(void) 803void disable_local_APIC(void)
793{ 804{
794 unsigned long value; 805 unsigned int value;
795 806
796 clear_local_APIC(); 807 clear_local_APIC();
797 808
@@ -803,6 +814,7 @@ void disable_local_APIC(void)
803 value &= ~APIC_SPIV_APIC_ENABLED; 814 value &= ~APIC_SPIV_APIC_ENABLED;
804 apic_write(APIC_SPIV, value); 815 apic_write(APIC_SPIV, value);
805 816
817#ifdef CONFIG_X86_32
806 /* 818 /*
807 * When LAPIC was disabled by the BIOS and enabled by the kernel, 819 * When LAPIC was disabled by the BIOS and enabled by the kernel,
808 * restore the disabled state. 820 * restore the disabled state.
@@ -814,6 +826,7 @@ void disable_local_APIC(void)
814 l &= ~MSR_IA32_APICBASE_ENABLE; 826 l &= ~MSR_IA32_APICBASE_ENABLE;
815 wrmsr(MSR_IA32_APICBASE, l, h); 827 wrmsr(MSR_IA32_APICBASE, l, h);
816 } 828 }
829#endif
817} 830}
818 831
819/* 832/*
@@ -830,11 +843,15 @@ void lapic_shutdown(void)
830 return; 843 return;
831 844
832 local_irq_save(flags); 845 local_irq_save(flags);
833 clear_local_APIC();
834 846
835 if (enabled_via_apicbase) 847#ifdef CONFIG_X86_32
848 if (!enabled_via_apicbase)
849 clear_local_APIC();
850 else
851#endif
836 disable_local_APIC(); 852 disable_local_APIC();
837 853
854
838 local_irq_restore(flags); 855 local_irq_restore(flags);
839} 856}
840 857
@@ -879,6 +896,12 @@ int __init verify_local_APIC(void)
879 */ 896 */
880 reg0 = apic_read(APIC_ID); 897 reg0 = apic_read(APIC_ID);
881 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); 898 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
899 apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
900 reg1 = apic_read(APIC_ID);
901 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
902 apic_write(APIC_ID, reg0);
903 if (reg1 != (reg0 ^ APIC_ID_MASK))
904 return 0;
882 905
883 /* 906 /*
884 * The next two are just to see if we have sane values. 907 * The next two are just to see if we have sane values.
@@ -904,14 +927,15 @@ void __init sync_Arb_IDs(void)
904 */ 927 */
905 if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD) 928 if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
906 return; 929 return;
930
907 /* 931 /*
908 * Wait for idle. 932 * Wait for idle.
909 */ 933 */
910 apic_wait_icr_idle(); 934 apic_wait_icr_idle();
911 935
912 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); 936 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
913 apic_write(APIC_ICR, 937 apic_write(APIC_ICR, APIC_DEST_ALLINC |
914 APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT); 938 APIC_INT_LEVELTRIG | APIC_DM_INIT);
915} 939}
916 940
917/* 941/*
@@ -919,7 +943,7 @@ void __init sync_Arb_IDs(void)
919 */ 943 */
920void __init init_bsp_APIC(void) 944void __init init_bsp_APIC(void)
921{ 945{
922 unsigned long value; 946 unsigned int value;
923 947
924 /* 948 /*
925 * Don't do the setup now if we have a SMP BIOS as the 949 * Don't do the setup now if we have a SMP BIOS as the
@@ -940,11 +964,13 @@ void __init init_bsp_APIC(void)
940 value &= ~APIC_VECTOR_MASK; 964 value &= ~APIC_VECTOR_MASK;
941 value |= APIC_SPIV_APIC_ENABLED; 965 value |= APIC_SPIV_APIC_ENABLED;
942 966
967#ifdef CONFIG_X86_32
943 /* This bit is reserved on P4/Xeon and should be cleared */ 968 /* This bit is reserved on P4/Xeon and should be cleared */
944 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && 969 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
945 (boot_cpu_data.x86 == 15)) 970 (boot_cpu_data.x86 == 15))
946 value &= ~APIC_SPIV_FOCUS_DISABLED; 971 value &= ~APIC_SPIV_FOCUS_DISABLED;
947 else 972 else
973#endif
948 value |= APIC_SPIV_FOCUS_DISABLED; 974 value |= APIC_SPIV_FOCUS_DISABLED;
949 value |= SPURIOUS_APIC_VECTOR; 975 value |= SPURIOUS_APIC_VECTOR;
950 apic_write(APIC_SPIV, value); 976 apic_write(APIC_SPIV, value);
@@ -963,6 +989,16 @@ static void __cpuinit lapic_setup_esr(void)
963{ 989{
964 unsigned long oldvalue, value, maxlvt; 990 unsigned long oldvalue, value, maxlvt;
965 if (lapic_is_integrated() && !esr_disable) { 991 if (lapic_is_integrated() && !esr_disable) {
992 if (esr_disable) {
993 /*
994 * Something untraceable is creating bad interrupts on
995 * secondary quads ... for the moment, just leave the
996 * ESR disabled - we can't do anything useful with the
997 * errors anyway - mbligh
998 */
999 printk(KERN_INFO "Leaving ESR disabled.\n");
1000 return;
1001 }
966 /* !82489DX */ 1002 /* !82489DX */
967 maxlvt = lapic_get_maxlvt(); 1003 maxlvt = lapic_get_maxlvt();
968 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ 1004 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
@@ -983,16 +1019,7 @@ static void __cpuinit lapic_setup_esr(void)
983 "vector: 0x%08lx after: 0x%08lx\n", 1019 "vector: 0x%08lx after: 0x%08lx\n",
984 oldvalue, value); 1020 oldvalue, value);
985 } else { 1021 } else {
986 if (esr_disable) 1022 printk(KERN_INFO "No ESR for 82489DX.\n");
987 /*
988 * Something untraceable is creating bad interrupts on
989 * secondary quads ... for the moment, just leave the
990 * ESR disabled - we can't do anything useful with the
991 * errors anyway - mbligh
992 */
993 printk(KERN_INFO "Leaving ESR disabled.\n");
994 else
995 printk(KERN_INFO "No ESR for 82489DX.\n");
996 } 1023 }
997} 1024}
998 1025
@@ -1130,13 +1157,17 @@ void __cpuinit setup_local_APIC(void)
1130 1157
1131void __cpuinit end_local_APIC_setup(void) 1158void __cpuinit end_local_APIC_setup(void)
1132{ 1159{
1133 unsigned long value;
1134
1135 lapic_setup_esr(); 1160 lapic_setup_esr();
1136 /* Disable the local apic timer */ 1161
1137 value = apic_read(APIC_LVTT); 1162#ifdef CONFIG_X86_32
1138 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 1163 {
1139 apic_write(APIC_LVTT, value); 1164 unsigned int value;
1165 /* Disable the local apic timer */
1166 value = apic_read(APIC_LVTT);
1167 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
1168 apic_write(APIC_LVTT, value);
1169 }
1170#endif
1140 1171
1141 setup_apic_nmi_watchdog(NULL); 1172 setup_apic_nmi_watchdog(NULL);
1142 apic_pm_activate(); 1173 apic_pm_activate();
@@ -1367,6 +1398,7 @@ void smp_error_interrupt(struct pt_regs *regs)
1367 */ 1398 */
1368void __init connect_bsp_APIC(void) 1399void __init connect_bsp_APIC(void)
1369{ 1400{
1401#ifdef CONFIG_X86_32
1370 if (pic_mode) { 1402 if (pic_mode) {
1371 /* 1403 /*
1372 * Do not trust the local APIC being empty at bootup. 1404 * Do not trust the local APIC being empty at bootup.
@@ -1381,6 +1413,7 @@ void __init connect_bsp_APIC(void)
1381 outb(0x70, 0x22); 1413 outb(0x70, 0x22);
1382 outb(0x01, 0x23); 1414 outb(0x01, 0x23);
1383 } 1415 }
1416#endif
1384 enable_apic_mode(); 1417 enable_apic_mode();
1385} 1418}
1386 1419
@@ -1393,6 +1426,9 @@ void __init connect_bsp_APIC(void)
1393 */ 1426 */
1394void disconnect_bsp_APIC(int virt_wire_setup) 1427void disconnect_bsp_APIC(int virt_wire_setup)
1395{ 1428{
1429 unsigned int value;
1430
1431#ifdef CONFIG_X86_32
1396 if (pic_mode) { 1432 if (pic_mode) {
1397 /* 1433 /*
1398 * Put the board back into PIC mode (has an effect only on 1434 * Put the board back into PIC mode (has an effect only on
@@ -1404,54 +1440,53 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1404 "entering PIC mode.\n"); 1440 "entering PIC mode.\n");
1405 outb(0x70, 0x22); 1441 outb(0x70, 0x22);
1406 outb(0x00, 0x23); 1442 outb(0x00, 0x23);
1407 } else { 1443 return;
1408 /* Go back to Virtual Wire compatibility mode */ 1444 }
1409 unsigned long value; 1445#endif
1410 1446
1411 /* For the spurious interrupt use vector F, and enable it */ 1447 /* Go back to Virtual Wire compatibility mode */
1412 value = apic_read(APIC_SPIV);
1413 value &= ~APIC_VECTOR_MASK;
1414 value |= APIC_SPIV_APIC_ENABLED;
1415 value |= 0xf;
1416 apic_write(APIC_SPIV, value);
1417 1448
1418 if (!virt_wire_setup) { 1449 /* For the spurious interrupt use vector F, and enable it */
1419 /* 1450 value = apic_read(APIC_SPIV);
1420 * For LVT0 make it edge triggered, active high, 1451 value &= ~APIC_VECTOR_MASK;
1421 * external and enabled 1452 value |= APIC_SPIV_APIC_ENABLED;
1422 */ 1453 value |= 0xf;
1423 value = apic_read(APIC_LVT0); 1454 apic_write(APIC_SPIV, value);
1424 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1425 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1426 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1427 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1428 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1429 apic_write(APIC_LVT0, value);
1430 } else {
1431 /* Disable LVT0 */
1432 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1433 }
1434 1455
1456 if (!virt_wire_setup) {
1435 /* 1457 /*
1436 * For LVT1 make it edge triggered, active high, nmi and 1458 * For LVT0 make it edge triggered, active high,
1437 * enabled 1459 * external and enabled
1438 */ 1460 */
1439 value = apic_read(APIC_LVT1); 1461 value = apic_read(APIC_LVT0);
1440 value &= ~( 1462 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1441 APIC_MODE_MASK | APIC_SEND_PENDING |
1442 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | 1463 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1443 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); 1464 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1444 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; 1465 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1445 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); 1466 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1446 apic_write(APIC_LVT1, value); 1467 apic_write(APIC_LVT0, value);
1468 } else {
1469 /* Disable LVT0 */
1470 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1447 } 1471 }
1472
1473 /*
1474 * For LVT1 make it edge triggered, active high,
1475 * nmi and enabled
1476 */
1477 value = apic_read(APIC_LVT1);
1478 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1479 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1480 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1481 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1482 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
1483 apic_write(APIC_LVT1, value);
1448} 1484}
1449 1485
1450void __cpuinit generic_processor_info(int apicid, int version) 1486void __cpuinit generic_processor_info(int apicid, int version)
1451{ 1487{
1452 int cpu; 1488 int cpu;
1453 cpumask_t tmp_map; 1489 cpumask_t tmp_map;
1454 physid_mask_t phys_cpu;
1455 1490
1456 /* 1491 /*
1457 * Validate version 1492 * Validate version
@@ -1464,9 +1499,6 @@ void __cpuinit generic_processor_info(int apicid, int version)
1464 } 1499 }
1465 apic_version[apicid] = version; 1500 apic_version[apicid] = version;
1466 1501
1467 phys_cpu = apicid_to_cpu_present(apicid);
1468 physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
1469
1470 if (num_processors >= NR_CPUS) { 1502 if (num_processors >= NR_CPUS) {
1471 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." 1503 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
1472 " Processor ignored.\n", NR_CPUS); 1504 " Processor ignored.\n", NR_CPUS);
@@ -1477,17 +1509,19 @@ void __cpuinit generic_processor_info(int apicid, int version)
1477 cpus_complement(tmp_map, cpu_present_map); 1509 cpus_complement(tmp_map, cpu_present_map);
1478 cpu = first_cpu(tmp_map); 1510 cpu = first_cpu(tmp_map);
1479 1511
1480 if (apicid == boot_cpu_physical_apicid) 1512 physid_set(apicid, phys_cpu_present_map);
1513 if (apicid == boot_cpu_physical_apicid) {
1481 /* 1514 /*
1482 * x86_bios_cpu_apicid is required to have processors listed 1515 * x86_bios_cpu_apicid is required to have processors listed
1483 * in same order as logical cpu numbers. Hence the first 1516 * in same order as logical cpu numbers. Hence the first
1484 * entry is BSP, and so on. 1517 * entry is BSP, and so on.
1485 */ 1518 */
1486 cpu = 0; 1519 cpu = 0;
1487 1520 }
1488 if (apicid > max_physical_apicid) 1521 if (apicid > max_physical_apicid)
1489 max_physical_apicid = apicid; 1522 max_physical_apicid = apicid;
1490 1523
1524#ifdef CONFIG_X86_32
1491 /* 1525 /*
1492 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y 1526 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
1493 * but we need to work other dependencies like SMP_SUSPEND etc 1527 * but we need to work other dependencies like SMP_SUSPEND etc
@@ -1507,7 +1541,9 @@ void __cpuinit generic_processor_info(int apicid, int version)
1507 def_to_bigsmp = 1; 1541 def_to_bigsmp = 1;
1508 } 1542 }
1509 } 1543 }
1510#ifdef CONFIG_SMP 1544#endif
1545
1546#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
1511 /* are we being called early in kernel startup? */ 1547 /* are we being called early in kernel startup? */
1512 if (early_per_cpu_ptr(x86_cpu_to_apicid)) { 1548 if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
1513 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); 1549 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
@@ -1520,6 +1556,7 @@ void __cpuinit generic_processor_info(int apicid, int version)
1520 per_cpu(x86_bios_cpu_apicid, cpu) = apicid; 1556 per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1521 } 1557 }
1522#endif 1558#endif
1559
1523 cpu_set(cpu, cpu_possible_map); 1560 cpu_set(cpu, cpu_possible_map);
1524 cpu_set(cpu, cpu_present_map); 1561 cpu_set(cpu, cpu_present_map);
1525} 1562}
@@ -1530,6 +1567,11 @@ void __cpuinit generic_processor_info(int apicid, int version)
1530#ifdef CONFIG_PM 1567#ifdef CONFIG_PM
1531 1568
1532static struct { 1569static struct {
1570 /*
1571 * 'active' is true if the local APIC was enabled by us and
1572 * not the BIOS; this signifies that we are also responsible
1573 * for disabling it before entering apm/acpi suspend
1574 */
1533 int active; 1575 int active;
1534 /* r/w apic fields */ 1576 /* r/w apic fields */
1535 unsigned int apic_id; 1577 unsigned int apic_id;
@@ -1570,7 +1612,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1570 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); 1612 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
1571 apic_pm_state.apic_tmict = apic_read(APIC_TMICT); 1613 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
1572 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); 1614 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
1573#ifdef CONFIG_X86_MCE_P4THERMAL 1615#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1574 if (maxlvt >= 5) 1616 if (maxlvt >= 5)
1575 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); 1617 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
1576#endif 1618#endif
@@ -1594,16 +1636,23 @@ static int lapic_resume(struct sys_device *dev)
1594 1636
1595 local_irq_save(flags); 1637 local_irq_save(flags);
1596 1638
1597 /* 1639#ifdef CONFIG_X86_64
1598 * Make sure the APICBASE points to the right address 1640 if (x2apic)
1599 * 1641 enable_x2apic();
1600 * FIXME! This will be wrong if we ever support suspend on 1642 else
1601 * SMP! We'll need to do this as part of the CPU restore! 1643#endif
1602 */ 1644 {
1603 rdmsr(MSR_IA32_APICBASE, l, h); 1645 /*
1604 l &= ~MSR_IA32_APICBASE_BASE; 1646 * Make sure the APICBASE points to the right address
1605 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; 1647 *
1606 wrmsr(MSR_IA32_APICBASE, l, h); 1648 * FIXME! This will be wrong if we ever support suspend on
1649 * SMP! We'll need to do this as part of the CPU restore!
1650 */
1651 rdmsr(MSR_IA32_APICBASE, l, h);
1652 l &= ~MSR_IA32_APICBASE_BASE;
1653 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
1654 wrmsr(MSR_IA32_APICBASE, l, h);
1655 }
1607 1656
1608 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); 1657 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
1609 apic_write(APIC_ID, apic_pm_state.apic_id); 1658 apic_write(APIC_ID, apic_pm_state.apic_id);
@@ -1613,7 +1662,7 @@ static int lapic_resume(struct sys_device *dev)
1613 apic_write(APIC_SPIV, apic_pm_state.apic_spiv); 1662 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
1614 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); 1663 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
1615 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); 1664 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
1616#ifdef CONFIG_X86_MCE_P4THERMAL 1665#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1617 if (maxlvt >= 5) 1666 if (maxlvt >= 5)
1618 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); 1667 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
1619#endif 1668#endif
@@ -1627,7 +1676,9 @@ static int lapic_resume(struct sys_device *dev)
1627 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); 1676 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
1628 apic_write(APIC_ESR, 0); 1677 apic_write(APIC_ESR, 0);
1629 apic_read(APIC_ESR); 1678 apic_read(APIC_ESR);
1679
1630 local_irq_restore(flags); 1680 local_irq_restore(flags);
1681
1631 return 0; 1682 return 0;
1632} 1683}
1633 1684
@@ -1683,20 +1734,20 @@ static int __init parse_lapic(char *arg)
1683} 1734}
1684early_param("lapic", parse_lapic); 1735early_param("lapic", parse_lapic);
1685 1736
1686static int __init parse_nolapic(char *arg) 1737static int __init setup_disableapic(char *arg)
1687{ 1738{
1688 disable_apic = 1; 1739 disable_apic = 1;
1689 setup_clear_cpu_cap(X86_FEATURE_APIC); 1740 setup_clear_cpu_cap(X86_FEATURE_APIC);
1690 return 0; 1741 return 0;
1691} 1742}
1692early_param("nolapic", parse_nolapic); 1743early_param("disableapic", setup_disableapic);
1693 1744
1694static int __init parse_disable_lapic_timer(char *arg) 1745/* same as disableapic, for compatibility */
1746static int __init setup_nolapic(char *arg)
1695{ 1747{
1696 local_apic_timer_disabled = 1; 1748 return setup_disableapic(arg);
1697 return 0;
1698} 1749}
1699early_param("nolapic_timer", parse_disable_lapic_timer); 1750early_param("nolapic", setup_nolapic);
1700 1751
1701static int __init parse_lapic_timer_c2_ok(char *arg) 1752static int __init parse_lapic_timer_c2_ok(char *arg)
1702{ 1753{
@@ -1705,15 +1756,40 @@ static int __init parse_lapic_timer_c2_ok(char *arg)
1705} 1756}
1706early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); 1757early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1707 1758
1759static int __init parse_disable_apic_timer(char *arg)
1760{
1761 disable_apic_timer = 1;
1762 return 0;
1763}
1764early_param("noapictimer", parse_disable_apic_timer);
1765
1766static int __init parse_nolapic_timer(char *arg)
1767{
1768 disable_apic_timer = 1;
1769 return 0;
1770}
1771early_param("nolapic_timer", parse_nolapic_timer);
1772
1708static int __init apic_set_verbosity(char *arg) 1773static int __init apic_set_verbosity(char *arg)
1709{ 1774{
1710 if (!arg) 1775 if (!arg) {
1776#ifdef CONFIG_X86_64
1777 skip_ioapic_setup = 0;
1778 ioapic_force = 1;
1779 return 0;
1780#endif
1711 return -EINVAL; 1781 return -EINVAL;
1782 }
1712 1783
1713 if (strcmp(arg, "debug") == 0) 1784 if (strcmp("debug", arg) == 0)
1714 apic_verbosity = APIC_DEBUG; 1785 apic_verbosity = APIC_DEBUG;
1715 else if (strcmp(arg, "verbose") == 0) 1786 else if (strcmp("verbose", arg) == 0)
1716 apic_verbosity = APIC_VERBOSE; 1787 apic_verbosity = APIC_VERBOSE;
1788 else {
1789 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
1790 " use apic=verbose or apic=debug\n", arg);
1791 return -EINVAL;
1792 }
1717 1793
1718 return 0; 1794 return 0;
1719} 1795}
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index 1a6011855af3..53898b65a6ae 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -45,6 +45,7 @@
45#include <mach_ipi.h> 45#include <mach_ipi.h>
46#include <mach_apic.h> 46#include <mach_apic.h>
47 47
48/* Disable local APIC timer from the kernel commandline or via dmi quirk */
48static int disable_apic_timer __cpuinitdata; 49static int disable_apic_timer __cpuinitdata;
49static int apic_calibrate_pmtmr __initdata; 50static int apic_calibrate_pmtmr __initdata;
50int disable_apic; 51int disable_apic;
@@ -80,6 +81,9 @@ static void lapic_timer_setup(enum clock_event_mode mode,
80static void lapic_timer_broadcast(cpumask_t mask); 81static void lapic_timer_broadcast(cpumask_t mask);
81static void apic_pm_activate(void); 82static void apic_pm_activate(void);
82 83
84/*
85 * The local apic timer can be used for any function which is CPU local.
86 */
83static struct clock_event_device lapic_clockevent = { 87static struct clock_event_device lapic_clockevent = {
84 .name = "lapic", 88 .name = "lapic",
85 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT 89 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
@@ -106,11 +110,15 @@ static inline int lapic_get_version(void)
106} 110}
107 111
108/* 112/*
109 * Check, if the APIC is integrated or a seperate chip 113 * Check, if the APIC is integrated or a separate chip
110 */ 114 */
111static inline int lapic_is_integrated(void) 115static inline int lapic_is_integrated(void)
112{ 116{
117#ifdef CONFIG_X86_64
113 return 1; 118 return 1;
119#else
120 return APIC_INTEGRATED(lapic_get_version());
121#endif
114} 122}
115 123
116/* 124/*
@@ -125,6 +133,11 @@ static int modern_apic(void)
125 return lapic_get_version() >= 0x14; 133 return lapic_get_version() >= 0x14;
126} 134}
127 135
136/*
137 * Paravirt kernels also might be using these below ops. So we still
138 * use generic apic_read()/apic_write(), which might be pointing to different
139 * ops in PARAVIRT case.
140 */
128void xapic_wait_icr_idle(void) 141void xapic_wait_icr_idle(void)
129{ 142{
130 while (apic_read(APIC_ICR) & APIC_ICR_BUSY) 143 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
@@ -149,7 +162,7 @@ u32 safe_xapic_wait_icr_idle(void)
149 162
150void xapic_icr_write(u32 low, u32 id) 163void xapic_icr_write(u32 low, u32 id)
151{ 164{
152 apic_write(APIC_ICR2, id << 24); 165 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
153 apic_write(APIC_ICR, low); 166 apic_write(APIC_ICR, low);
154} 167}
155 168
@@ -160,7 +173,7 @@ u64 xapic_icr_read(void)
160 icr2 = apic_read(APIC_ICR2); 173 icr2 = apic_read(APIC_ICR2);
161 icr1 = apic_read(APIC_ICR); 174 icr1 = apic_read(APIC_ICR);
162 175
163 return (icr1 | ((u64)icr2 << 32)); 176 return icr1 | ((u64)icr2 << 32);
164} 177}
165 178
166static struct apic_ops xapic_ops = { 179static struct apic_ops xapic_ops = {
@@ -173,7 +186,6 @@ static struct apic_ops xapic_ops = {
173}; 186};
174 187
175struct apic_ops __read_mostly *apic_ops = &xapic_ops; 188struct apic_ops __read_mostly *apic_ops = &xapic_ops;
176
177EXPORT_SYMBOL_GPL(apic_ops); 189EXPORT_SYMBOL_GPL(apic_ops);
178 190
179static void x2apic_wait_icr_idle(void) 191static void x2apic_wait_icr_idle(void)
@@ -243,6 +255,17 @@ int lapic_get_maxlvt(void)
243} 255}
244 256
245/* 257/*
258 * Local APIC timer
259 */
260
261/* Clock divisor */
262#ifdef CONFG_X86_64
263#define APIC_DIVISOR 1
264#else
265#define APIC_DIVISOR 16
266#endif
267
268/*
246 * This function sets up the local APIC timer, with a timeout of 269 * This function sets up the local APIC timer, with a timeout of
247 * 'clocks' APIC bus clock. During calibration we actually call 270 * 'clocks' APIC bus clock. During calibration we actually call
248 * this function twice on the boot CPU, once with a bogus timeout 271 * this function twice on the boot CPU, once with a bogus timeout
@@ -252,7 +275,6 @@ int lapic_get_maxlvt(void)
252 * We do reads before writes even if unnecessary, to get around the 275 * We do reads before writes even if unnecessary, to get around the
253 * P5 APIC double write bug. 276 * P5 APIC double write bug.
254 */ 277 */
255
256static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) 278static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
257{ 279{
258 unsigned int lvtt_value, tmp_value; 280 unsigned int lvtt_value, tmp_value;
@@ -260,6 +282,9 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
260 lvtt_value = LOCAL_TIMER_VECTOR; 282 lvtt_value = LOCAL_TIMER_VECTOR;
261 if (!oneshot) 283 if (!oneshot)
262 lvtt_value |= APIC_LVT_TIMER_PERIODIC; 284 lvtt_value |= APIC_LVT_TIMER_PERIODIC;
285 if (!lapic_is_integrated())
286 lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
287
263 if (!irqen) 288 if (!irqen)
264 lvtt_value |= APIC_LVT_MASKED; 289 lvtt_value |= APIC_LVT_MASKED;
265 290
@@ -269,12 +294,12 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
269 * Divide PICLK by 16 294 * Divide PICLK by 16
270 */ 295 */
271 tmp_value = apic_read(APIC_TDCR); 296 tmp_value = apic_read(APIC_TDCR);
272 apic_write(APIC_TDCR, (tmp_value 297 apic_write(APIC_TDCR,
273 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) 298 (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
274 | APIC_TDR_DIV_16); 299 APIC_TDR_DIV_16);
275 300
276 if (!oneshot) 301 if (!oneshot)
277 apic_write(APIC_TMICT, clocks); 302 apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
278} 303}
279 304
280/* 305/*
@@ -444,7 +469,7 @@ static int __init calibrate_APIC_clock(void)
444 lapic_clockevent.min_delta_ns = 469 lapic_clockevent.min_delta_ns =
445 clockevent_delta2ns(0xF, &lapic_clockevent); 470 clockevent_delta2ns(0xF, &lapic_clockevent);
446 471
447 calibration_result = result / HZ; 472 calibration_result = (result * APIC_DIVISOR) / HZ;
448 473
449 /* 474 /*
450 * Do a sanity check on the APIC calibration result 475 * Do a sanity check on the APIC calibration result
@@ -466,10 +491,10 @@ static int __init calibrate_APIC_clock(void)
466void __init setup_boot_APIC_clock(void) 491void __init setup_boot_APIC_clock(void)
467{ 492{
468 /* 493 /*
469 * The local apic timer can be disabled via the kernel commandline. 494 * The local apic timer can be disabled via the kernel
470 * Register the lapic timer as a dummy clock event source on SMP 495 * commandline or from the CPU detection code. Register the lapic
471 * systems, so the broadcast mechanism is used. On UP systems simply 496 * timer as a dummy clock event source on SMP systems, so the
472 * ignore it. 497 * broadcast mechanism is used. On UP systems simply ignore it.
473 */ 498 */
474 if (disable_apic_timer) { 499 if (disable_apic_timer) {
475 printk(KERN_INFO "Disabling APIC timer\n"); 500 printk(KERN_INFO "Disabling APIC timer\n");
@@ -481,7 +506,9 @@ void __init setup_boot_APIC_clock(void)
481 return; 506 return;
482 } 507 }
483 508
484 printk(KERN_INFO "Using local APIC timer interrupts.\n"); 509 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
510 "calibrating APIC timer ...\n");
511
485 if (calibrate_APIC_clock()) { 512 if (calibrate_APIC_clock()) {
486 /* No broadcast on UP ! */ 513 /* No broadcast on UP ! */
487 if (num_possible_cpus() > 1) 514 if (num_possible_cpus() > 1)
@@ -500,6 +527,7 @@ void __init setup_boot_APIC_clock(void)
500 printk(KERN_WARNING "APIC timer registered as dummy," 527 printk(KERN_WARNING "APIC timer registered as dummy,"
501 " due to nmi_watchdog=%d!\n", nmi_watchdog); 528 " due to nmi_watchdog=%d!\n", nmi_watchdog);
502 529
530 /* Setup the lapic or request the broadcast */
503 setup_APIC_timer(); 531 setup_APIC_timer();
504} 532}
505 533
@@ -538,7 +566,11 @@ static void local_apic_timer_interrupt(void)
538 /* 566 /*
539 * the NMI deadlock-detector uses this. 567 * the NMI deadlock-detector uses this.
540 */ 568 */
569#ifdef CONFIG_X86_64
541 add_pda(apic_timer_irqs, 1); 570 add_pda(apic_timer_irqs, 1);
571#else
572 per_cpu(irq_stat, cpu).apic_timer_irqs++;
573#endif
542 574
543 evt->event_handler(evt); 575 evt->event_handler(evt);
544} 576}
@@ -569,6 +601,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs)
569 irq_enter(); 601 irq_enter();
570 local_apic_timer_interrupt(); 602 local_apic_timer_interrupt();
571 irq_exit(); 603 irq_exit();
604
572 set_irq_regs(old_regs); 605 set_irq_regs(old_regs);
573} 606}
574 607
@@ -622,6 +655,13 @@ void clear_local_APIC(void)
622 apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); 655 apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
623 } 656 }
624 657
658 /* lets not touch this if we didn't frob it */
659#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL)
660 if (maxlvt >= 5) {
661 v = apic_read(APIC_LVTTHMR);
662 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
663 }
664#endif
625 /* 665 /*
626 * Clean APIC state for other OSs: 666 * Clean APIC state for other OSs:
627 */ 667 */
@@ -632,8 +672,14 @@ void clear_local_APIC(void)
632 apic_write(APIC_LVTERR, APIC_LVT_MASKED); 672 apic_write(APIC_LVTERR, APIC_LVT_MASKED);
633 if (maxlvt >= 4) 673 if (maxlvt >= 4)
634 apic_write(APIC_LVTPC, APIC_LVT_MASKED); 674 apic_write(APIC_LVTPC, APIC_LVT_MASKED);
635 apic_write(APIC_ESR, 0); 675
636 apic_read(APIC_ESR); 676 /* Integrated APIC (!82489DX) ? */
677 if (lapic_is_integrated()) {
678 if (maxlvt > 3)
679 /* Clear ESR due to Pentium errata 3AP and 11AP */
680 apic_write(APIC_ESR, 0);
681 apic_read(APIC_ESR);
682 }
637} 683}
638 684
639/** 685/**
@@ -652,8 +698,28 @@ void disable_local_APIC(void)
652 value = apic_read(APIC_SPIV); 698 value = apic_read(APIC_SPIV);
653 value &= ~APIC_SPIV_APIC_ENABLED; 699 value &= ~APIC_SPIV_APIC_ENABLED;
654 apic_write(APIC_SPIV, value); 700 apic_write(APIC_SPIV, value);
701
702#ifdef CONFIG_X86_32
703 /*
704 * When LAPIC was disabled by the BIOS and enabled by the kernel,
705 * restore the disabled state.
706 */
707 if (enabled_via_apicbase) {
708 unsigned int l, h;
709
710 rdmsr(MSR_IA32_APICBASE, l, h);
711 l &= ~MSR_IA32_APICBASE_ENABLE;
712 wrmsr(MSR_IA32_APICBASE, l, h);
713 }
714#endif
655} 715}
656 716
717/*
718 * If Linux enabled the LAPIC against the BIOS default disable it down before
719 * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and
720 * not power-off. Additionally clear all LVT entries before disable_local_APIC
721 * for the case where Linux didn't enable the LAPIC.
722 */
657void lapic_shutdown(void) 723void lapic_shutdown(void)
658{ 724{
659 unsigned long flags; 725 unsigned long flags;
@@ -663,7 +729,13 @@ void lapic_shutdown(void)
663 729
664 local_irq_save(flags); 730 local_irq_save(flags);
665 731
666 disable_local_APIC(); 732#ifdef CONFIG_X86_32
733 if (!enabled_via_apicbase)
734 clear_local_APIC();
735 else
736#endif
737 disable_local_APIC();
738
667 739
668 local_irq_restore(flags); 740 local_irq_restore(flags);
669} 741}
@@ -734,8 +806,11 @@ int __init verify_local_APIC(void)
734 */ 806 */
735void __init sync_Arb_IDs(void) 807void __init sync_Arb_IDs(void)
736{ 808{
737 /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */ 809 /*
738 if (modern_apic()) 810 * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not
811 * needed on AMD.
812 */
813 if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
739 return; 814 return;
740 815
741 /* 816 /*
@@ -744,8 +819,8 @@ void __init sync_Arb_IDs(void)
744 apic_wait_icr_idle(); 819 apic_wait_icr_idle();
745 820
746 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); 821 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
747 apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG 822 apic_write(APIC_ICR, APIC_DEST_ALLINC |
748 | APIC_DM_INIT); 823 APIC_INT_LEVELTRIG | APIC_DM_INIT);
749} 824}
750 825
751/* 826/*
@@ -762,8 +837,6 @@ void __init init_bsp_APIC(void)
762 if (smp_found_config || !cpu_has_apic) 837 if (smp_found_config || !cpu_has_apic)
763 return; 838 return;
764 839
765 value = apic_read(APIC_LVR);
766
767 /* 840 /*
768 * Do not trust the local APIC being empty at bootup. 841 * Do not trust the local APIC being empty at bootup.
769 */ 842 */
@@ -775,7 +848,15 @@ void __init init_bsp_APIC(void)
775 value = apic_read(APIC_SPIV); 848 value = apic_read(APIC_SPIV);
776 value &= ~APIC_VECTOR_MASK; 849 value &= ~APIC_VECTOR_MASK;
777 value |= APIC_SPIV_APIC_ENABLED; 850 value |= APIC_SPIV_APIC_ENABLED;
778 value |= APIC_SPIV_FOCUS_DISABLED; 851
852#ifdef CONFIG_X86_32
853 /* This bit is reserved on P4/Xeon and should be cleared */
854 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
855 (boot_cpu_data.x86 == 15))
856 value &= ~APIC_SPIV_FOCUS_DISABLED;
857 else
858#endif
859 value |= APIC_SPIV_FOCUS_DISABLED;
779 value |= SPURIOUS_APIC_VECTOR; 860 value |= SPURIOUS_APIC_VECTOR;
780 apic_write(APIC_SPIV, value); 861 apic_write(APIC_SPIV, value);
781 862
@@ -784,9 +865,50 @@ void __init init_bsp_APIC(void)
784 */ 865 */
785 apic_write(APIC_LVT0, APIC_DM_EXTINT); 866 apic_write(APIC_LVT0, APIC_DM_EXTINT);
786 value = APIC_DM_NMI; 867 value = APIC_DM_NMI;
868 if (!lapic_is_integrated()) /* 82489DX */
869 value |= APIC_LVT_LEVEL_TRIGGER;
787 apic_write(APIC_LVT1, value); 870 apic_write(APIC_LVT1, value);
788} 871}
789 872
873static void __cpuinit lapic_setup_esr(void)
874{
875 unsigned long oldvalue, value, maxlvt;
876 if (lapic_is_integrated() && !esr_disable) {
877 if (esr_disable) {
878 /*
879 * Something untraceable is creating bad interrupts on
880 * secondary quads ... for the moment, just leave the
881 * ESR disabled - we can't do anything useful with the
882 * errors anyway - mbligh
883 */
884 printk(KERN_INFO "Leaving ESR disabled.\n");
885 return;
886 }
887 /* !82489DX */
888 maxlvt = lapic_get_maxlvt();
889 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
890 apic_write(APIC_ESR, 0);
891 oldvalue = apic_read(APIC_ESR);
892
893 /* enables sending errors */
894 value = ERROR_APIC_VECTOR;
895 apic_write(APIC_LVTERR, value);
896 /*
897 * spec says clear errors after enabling vector.
898 */
899 if (maxlvt > 3)
900 apic_write(APIC_ESR, 0);
901 value = apic_read(APIC_ESR);
902 if (value != oldvalue)
903 apic_printk(APIC_VERBOSE, "ESR value before enabling "
904 "vector: 0x%08lx after: 0x%08lx\n",
905 oldvalue, value);
906 } else {
907 printk(KERN_INFO "No ESR for 82489DX.\n");
908 }
909}
910
911
790/** 912/**
791 * setup_local_APIC - setup the local APIC 913 * setup_local_APIC - setup the local APIC
792 */ 914 */
@@ -892,21 +1014,20 @@ void __cpuinit setup_local_APIC(void)
892 preempt_enable(); 1014 preempt_enable();
893} 1015}
894 1016
895static void __cpuinit lapic_setup_esr(void)
896{
897 unsigned maxlvt = lapic_get_maxlvt();
898
899 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR);
900 /*
901 * spec says clear errors after enabling vector.
902 */
903 if (maxlvt > 3)
904 apic_write(APIC_ESR, 0);
905}
906
907void __cpuinit end_local_APIC_setup(void) 1017void __cpuinit end_local_APIC_setup(void)
908{ 1018{
909 lapic_setup_esr(); 1019 lapic_setup_esr();
1020
1021#ifdef CONFIG_X86_32
1022 {
1023 unsigned int value;
1024 /* Disable the local apic timer */
1025 value = apic_read(APIC_LVTT);
1026 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
1027 apic_write(APIC_LVTT, value);
1028 }
1029#endif
1030
910 setup_apic_nmi_watchdog(NULL); 1031 setup_apic_nmi_watchdog(NULL);
911 apic_pm_activate(); 1032 apic_pm_activate();
912} 1033}
@@ -1108,6 +1229,8 @@ void __init init_apic_mappings(void)
1108 * This initializes the IO-APIC and APIC hardware if this is 1229 * This initializes the IO-APIC and APIC hardware if this is
1109 * a UP kernel. 1230 * a UP kernel.
1110 */ 1231 */
1232int apic_version[MAX_APICS];
1233
1111int __init APIC_init_uniprocessor(void) 1234int __init APIC_init_uniprocessor(void)
1112{ 1235{
1113 if (disable_apic) { 1236 if (disable_apic) {
@@ -1209,17 +1332,57 @@ asmlinkage void smp_error_interrupt(void)
1209} 1332}
1210 1333
1211/** 1334/**
1212 * * connect_bsp_APIC - attach the APIC to the interrupt system 1335 * connect_bsp_APIC - attach the APIC to the interrupt system
1213 * */ 1336 */
1214void __init connect_bsp_APIC(void) 1337void __init connect_bsp_APIC(void)
1215{ 1338{
1339#ifdef CONFIG_X86_32
1340 if (pic_mode) {
1341 /*
1342 * Do not trust the local APIC being empty at bootup.
1343 */
1344 clear_local_APIC();
1345 /*
1346 * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's
1347 * local APIC to INT and NMI lines.
1348 */
1349 apic_printk(APIC_VERBOSE, "leaving PIC mode, "
1350 "enabling APIC mode.\n");
1351 outb(0x70, 0x22);
1352 outb(0x01, 0x23);
1353 }
1354#endif
1216 enable_apic_mode(); 1355 enable_apic_mode();
1217} 1356}
1218 1357
1358/**
1359 * disconnect_bsp_APIC - detach the APIC from the interrupt system
1360 * @virt_wire_setup: indicates, whether virtual wire mode is selected
1361 *
1362 * Virtual wire mode is necessary to deliver legacy interrupts even when the
1363 * APIC is disabled.
1364 */
1219void disconnect_bsp_APIC(int virt_wire_setup) 1365void disconnect_bsp_APIC(int virt_wire_setup)
1220{ 1366{
1367 unsigned int value;
1368
1369#ifdef CONFIG_X86_32
1370 if (pic_mode) {
1371 /*
1372 * Put the board back into PIC mode (has an effect only on
1373 * certain older boards). Note that APIC interrupts, including
1374 * IPIs, won't work beyond this point! The only exception are
1375 * INIT IPIs.
1376 */
1377 apic_printk(APIC_VERBOSE, "disabling APIC mode, "
1378 "entering PIC mode.\n");
1379 outb(0x70, 0x22);
1380 outb(0x00, 0x23);
1381 return;
1382 }
1383#endif
1384
1221 /* Go back to Virtual Wire compatibility mode */ 1385 /* Go back to Virtual Wire compatibility mode */
1222 unsigned long value;
1223 1386
1224 /* For the spurious interrupt use vector F, and enable it */ 1387 /* For the spurious interrupt use vector F, and enable it */
1225 value = apic_read(APIC_SPIV); 1388 value = apic_read(APIC_SPIV);
@@ -1245,7 +1408,10 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1245 apic_write(APIC_LVT0, APIC_LVT_MASKED); 1408 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1246 } 1409 }
1247 1410
1248 /* For LVT1 make it edge triggered, active high, nmi and enabled */ 1411 /*
1412 * For LVT1 make it edge triggered, active high,
1413 * nmi and enabled
1414 */
1249 value = apic_read(APIC_LVT1); 1415 value = apic_read(APIC_LVT1);
1250 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | 1416 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1251 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | 1417 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
@@ -1260,9 +1426,20 @@ void __cpuinit generic_processor_info(int apicid, int version)
1260 int cpu; 1426 int cpu;
1261 cpumask_t tmp_map; 1427 cpumask_t tmp_map;
1262 1428
1429 /*
1430 * Validate version
1431 */
1432 if (version == 0x0) {
1433 printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
1434 "fixing up to 0x10. (tell your hw vendor)\n",
1435 version);
1436 version = 0x10;
1437 }
1438 apic_version[apicid] = version;
1439
1263 if (num_processors >= NR_CPUS) { 1440 if (num_processors >= NR_CPUS) {
1264 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." 1441 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
1265 " Processor ignored.\n", NR_CPUS); 1442 " Processor ignored.\n", NR_CPUS);
1266 return; 1443 return;
1267 } 1444 }
1268 1445
@@ -1282,6 +1459,29 @@ void __cpuinit generic_processor_info(int apicid, int version)
1282 if (apicid > max_physical_apicid) 1459 if (apicid > max_physical_apicid)
1283 max_physical_apicid = apicid; 1460 max_physical_apicid = apicid;
1284 1461
1462#ifdef CONFIG_X86_32
1463 /*
1464 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
1465 * but we need to work other dependencies like SMP_SUSPEND etc
1466 * before this can be done without some confusion.
1467 * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
1468 * - Ashok Raj <ashok.raj@intel.com>
1469 */
1470 if (max_physical_apicid >= 8) {
1471 switch (boot_cpu_data.x86_vendor) {
1472 case X86_VENDOR_INTEL:
1473 if (!APIC_XAPIC(version)) {
1474 def_to_bigsmp = 0;
1475 break;
1476 }
1477 /* If P4 and above fall through */
1478 case X86_VENDOR_AMD:
1479 def_to_bigsmp = 1;
1480 }
1481 }
1482#endif
1483
1484#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
1285 /* are we being called early in kernel startup? */ 1485 /* are we being called early in kernel startup? */
1286 if (early_per_cpu_ptr(x86_cpu_to_apicid)) { 1486 if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
1287 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); 1487 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
@@ -1293,6 +1493,7 @@ void __cpuinit generic_processor_info(int apicid, int version)
1293 per_cpu(x86_cpu_to_apicid, cpu) = apicid; 1493 per_cpu(x86_cpu_to_apicid, cpu) = apicid;
1294 per_cpu(x86_bios_cpu_apicid, cpu) = apicid; 1494 per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1295 } 1495 }
1496#endif
1296 1497
1297 cpu_set(cpu, cpu_possible_map); 1498 cpu_set(cpu, cpu_possible_map);
1298 cpu_set(cpu, cpu_present_map); 1499 cpu_set(cpu, cpu_present_map);
@@ -1309,9 +1510,11 @@ int hard_smp_processor_id(void)
1309#ifdef CONFIG_PM 1510#ifdef CONFIG_PM
1310 1511
1311static struct { 1512static struct {
1312 /* 'active' is true if the local APIC was enabled by us and 1513 /*
1313 not the BIOS; this signifies that we are also responsible 1514 * 'active' is true if the local APIC was enabled by us and
1314 for disabling it before entering apm/acpi suspend */ 1515 * not the BIOS; this signifies that we are also responsible
1516 * for disabling it before entering apm/acpi suspend
1517 */
1315 int active; 1518 int active;
1316 /* r/w apic fields */ 1519 /* r/w apic fields */
1317 unsigned int apic_id; 1520 unsigned int apic_id;
@@ -1352,10 +1555,11 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1352 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); 1555 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
1353 apic_pm_state.apic_tmict = apic_read(APIC_TMICT); 1556 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
1354 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); 1557 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
1355#ifdef CONFIG_X86_MCE_INTEL 1558#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1356 if (maxlvt >= 5) 1559 if (maxlvt >= 5)
1357 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); 1560 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
1358#endif 1561#endif
1562
1359 local_irq_save(flags); 1563 local_irq_save(flags);
1360 disable_local_APIC(); 1564 disable_local_APIC();
1361 local_irq_restore(flags); 1565 local_irq_restore(flags);
@@ -1374,13 +1578,24 @@ static int lapic_resume(struct sys_device *dev)
1374 maxlvt = lapic_get_maxlvt(); 1578 maxlvt = lapic_get_maxlvt();
1375 1579
1376 local_irq_save(flags); 1580 local_irq_save(flags);
1377 if (!x2apic) { 1581
1582#ifdef CONFIG_X86_64
1583 if (x2apic)
1584 enable_x2apic();
1585 else
1586#endif
1587 {
1588 /*
1589 * Make sure the APICBASE points to the right address
1590 *
1591 * FIXME! This will be wrong if we ever support suspend on
1592 * SMP! We'll need to do this as part of the CPU restore!
1593 */
1378 rdmsr(MSR_IA32_APICBASE, l, h); 1594 rdmsr(MSR_IA32_APICBASE, l, h);
1379 l &= ~MSR_IA32_APICBASE_BASE; 1595 l &= ~MSR_IA32_APICBASE_BASE;
1380 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; 1596 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
1381 wrmsr(MSR_IA32_APICBASE, l, h); 1597 wrmsr(MSR_IA32_APICBASE, l, h);
1382 } else 1598 }
1383 enable_x2apic();
1384 1599
1385 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); 1600 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
1386 apic_write(APIC_ID, apic_pm_state.apic_id); 1601 apic_write(APIC_ID, apic_pm_state.apic_id);
@@ -1390,7 +1605,7 @@ static int lapic_resume(struct sys_device *dev)
1390 apic_write(APIC_SPIV, apic_pm_state.apic_spiv); 1605 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
1391 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); 1606 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
1392 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); 1607 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
1393#ifdef CONFIG_X86_MCE_INTEL 1608#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1394 if (maxlvt >= 5) 1609 if (maxlvt >= 5)
1395 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); 1610 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
1396#endif 1611#endif
@@ -1404,10 +1619,17 @@ static int lapic_resume(struct sys_device *dev)
1404 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); 1619 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
1405 apic_write(APIC_ESR, 0); 1620 apic_write(APIC_ESR, 0);
1406 apic_read(APIC_ESR); 1621 apic_read(APIC_ESR);
1622
1407 local_irq_restore(flags); 1623 local_irq_restore(flags);
1624
1408 return 0; 1625 return 0;
1409} 1626}
1410 1627
1628/*
1629 * This device has no shutdown method - fully functioning local APICs
1630 * are needed on every CPU up until machine_halt/restart/poweroff.
1631 */
1632
1411static struct sysdev_class lapic_sysclass = { 1633static struct sysdev_class lapic_sysclass = {
1412 .name = "lapic", 1634 .name = "lapic",
1413 .resume = lapic_resume, 1635 .resume = lapic_resume,
@@ -1533,28 +1755,7 @@ early_param("nox2apic", setup_nox2apic);
1533/* 1755/*
1534 * APIC command line parameters 1756 * APIC command line parameters
1535 */ 1757 */
1536static int __init apic_set_verbosity(char *str) 1758static int __init setup_disableapic(char *arg)
1537{
1538 if (str == NULL) {
1539 skip_ioapic_setup = 0;
1540 ioapic_force = 1;
1541 return 0;
1542 }
1543 if (strcmp("debug", str) == 0)
1544 apic_verbosity = APIC_DEBUG;
1545 else if (strcmp("verbose", str) == 0)
1546 apic_verbosity = APIC_VERBOSE;
1547 else {
1548 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
1549 " use apic=verbose or apic=debug\n", str);
1550 return -EINVAL;
1551 }
1552
1553 return 0;
1554}
1555early_param("apic", apic_set_verbosity);
1556
1557static __init int setup_disableapic(char *str)
1558{ 1759{
1559 disable_apic = 1; 1760 disable_apic = 1;
1560 setup_clear_cpu_cap(X86_FEATURE_APIC); 1761 setup_clear_cpu_cap(X86_FEATURE_APIC);
@@ -1563,9 +1764,9 @@ static __init int setup_disableapic(char *str)
1563early_param("disableapic", setup_disableapic); 1764early_param("disableapic", setup_disableapic);
1564 1765
1565/* same as disableapic, for compatibility */ 1766/* same as disableapic, for compatibility */
1566static __init int setup_nolapic(char *str) 1767static int __init setup_nolapic(char *arg)
1567{ 1768{
1568 return setup_disableapic(str); 1769 return setup_disableapic(arg);
1569} 1770}
1570early_param("nolapic", setup_nolapic); 1771early_param("nolapic", setup_nolapic);
1571 1772
@@ -1576,14 +1777,19 @@ static int __init parse_lapic_timer_c2_ok(char *arg)
1576} 1777}
1577early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); 1778early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1578 1779
1579static __init int setup_noapictimer(char *str) 1780static int __init parse_disable_apic_timer(char *arg)
1580{ 1781{
1581 if (str[0] != ' ' && str[0] != 0)
1582 return 0;
1583 disable_apic_timer = 1; 1782 disable_apic_timer = 1;
1584 return 1; 1783 return 0;
1784}
1785early_param("noapictimer", parse_disable_apic_timer);
1786
1787static int __init parse_nolapic_timer(char *arg)
1788{
1789 disable_apic_timer = 1;
1790 return 0;
1585} 1791}
1586__setup("noapictimer", setup_noapictimer); 1792early_param("nolapic_timer", parse_nolapic_timer);
1587 1793
1588static __init int setup_apicpmtimer(char *s) 1794static __init int setup_apicpmtimer(char *s)
1589{ 1795{
@@ -1593,6 +1799,31 @@ static __init int setup_apicpmtimer(char *s)
1593} 1799}
1594__setup("apicpmtimer", setup_apicpmtimer); 1800__setup("apicpmtimer", setup_apicpmtimer);
1595 1801
1802static int __init apic_set_verbosity(char *arg)
1803{
1804 if (!arg) {
1805#ifdef CONFIG_X86_64
1806 skip_ioapic_setup = 0;
1807 ioapic_force = 1;
1808 return 0;
1809#endif
1810 return -EINVAL;
1811 }
1812
1813 if (strcmp("debug", arg) == 0)
1814 apic_verbosity = APIC_DEBUG;
1815 else if (strcmp("verbose", arg) == 0)
1816 apic_verbosity = APIC_VERBOSE;
1817 else {
1818 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
1819 " use apic=verbose or apic=debug\n", arg);
1820 return -EINVAL;
1821 }
1822
1823 return 0;
1824}
1825early_param("apic", apic_set_verbosity);
1826
1596static int __init lapic_insert_resource(void) 1827static int __init lapic_insert_resource(void)
1597{ 1828{
1598 if (!apic_phys) 1829 if (!apic_phys)
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 9ee24e6bc4b0..5145a6e72bbb 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -228,12 +228,12 @@
228#include <linux/suspend.h> 228#include <linux/suspend.h>
229#include <linux/kthread.h> 229#include <linux/kthread.h>
230#include <linux/jiffies.h> 230#include <linux/jiffies.h>
231#include <linux/smp_lock.h>
232 231
233#include <asm/system.h> 232#include <asm/system.h>
234#include <asm/uaccess.h> 233#include <asm/uaccess.h>
235#include <asm/desc.h> 234#include <asm/desc.h>
236#include <asm/i8253.h> 235#include <asm/i8253.h>
236#include <asm/olpc.h>
237#include <asm/paravirt.h> 237#include <asm/paravirt.h>
238#include <asm/reboot.h> 238#include <asm/reboot.h>
239 239
@@ -2217,7 +2217,7 @@ static int __init apm_init(void)
2217 2217
2218 dmi_check_system(apm_dmi_table); 2218 dmi_check_system(apm_dmi_table);
2219 2219
2220 if (apm_info.bios.version == 0 || paravirt_enabled()) { 2220 if (apm_info.bios.version == 0 || paravirt_enabled() || machine_is_olpc()) {
2221 printk(KERN_INFO "apm: BIOS not found.\n"); 2221 printk(KERN_INFO "apm: BIOS not found.\n");
2222 return -ENODEV; 2222 return -ENODEV;
2223 } 2223 }
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index c639bd55391c..fdd585f9c53d 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -25,11 +25,11 @@ x86_bios_strerror(long status)
25{ 25{
26 const char *str; 26 const char *str;
27 switch (status) { 27 switch (status) {
28 case 0: str = "Call completed without error"; break; 28 case 0: str = "Call completed without error"; break;
29 case -1: str = "Not implemented"; break; 29 case -1: str = "Not implemented"; break;
30 case -2: str = "Invalid argument"; break; 30 case -2: str = "Invalid argument"; break;
31 case -3: str = "Call completed with error"; break; 31 case -3: str = "Call completed with error"; break;
32 default: str = "Unknown BIOS status code"; break; 32 default: str = "Unknown BIOS status code"; break;
33 } 33 }
34 return str; 34 return str;
35} 35}
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 403e689df0b8..7f0b45a5d788 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -3,15 +3,13 @@
3# 3#
4 4
5obj-y := intel_cacheinfo.o addon_cpuid_features.o 5obj-y := intel_cacheinfo.o addon_cpuid_features.o
6obj-y += proc.o capflags.o powerflags.o 6obj-y += proc.o capflags.o powerflags.o common.o
7 7
8obj-$(CONFIG_X86_32) += common.o bugs.o cmpxchg.o 8obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
9obj-$(CONFIG_X86_64) += common_64.o bugs_64.o 9obj-$(CONFIG_X86_64) += bugs_64.o
10 10
11obj-$(CONFIG_CPU_SUP_INTEL_32) += intel.o 11obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
12obj-$(CONFIG_CPU_SUP_INTEL_64) += intel_64.o 12obj-$(CONFIG_CPU_SUP_AMD) += amd.o
13obj-$(CONFIG_CPU_SUP_AMD_32) += amd.o
14obj-$(CONFIG_CPU_SUP_AMD_64) += amd_64.o
15obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o 13obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
16obj-$(CONFIG_CPU_SUP_CENTAUR_32) += centaur.o 14obj-$(CONFIG_CPU_SUP_CENTAUR_32) += centaur.o
17obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o 15obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index d64ea6097ca7..32e73520adf7 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1,13 +1,22 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/bitops.h> 2#include <linux/bitops.h>
3#include <linux/mm.h> 3#include <linux/mm.h>
4
4#include <asm/io.h> 5#include <asm/io.h>
5#include <asm/processor.h> 6#include <asm/processor.h>
6#include <asm/apic.h> 7#include <asm/apic.h>
7 8
9#ifdef CONFIG_X86_64
10# include <asm/numa_64.h>
11# include <asm/mmconfig.h>
12# include <asm/cacheflush.h>
13#endif
14
8#include <mach_apic.h> 15#include <mach_apic.h>
16
9#include "cpu.h" 17#include "cpu.h"
10 18
19#ifdef CONFIG_X86_32
11/* 20/*
12 * B step AMD K6 before B 9730xxxx have hardware bugs that can cause 21 * B step AMD K6 before B 9730xxxx have hardware bugs that can cause
13 * misexecution of code under Linux. Owners of such processors should 22 * misexecution of code under Linux. Owners of such processors should
@@ -24,26 +33,273 @@
24extern void vide(void); 33extern void vide(void);
25__asm__(".align 4\nvide: ret"); 34__asm__(".align 4\nvide: ret");
26 35
27static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) 36static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c)
28{ 37{
29 if (cpuid_eax(0x80000000) >= 0x80000007) { 38/*
30 c->x86_power = cpuid_edx(0x80000007); 39 * General Systems BIOSen alias the cpu frequency registers
31 if (c->x86_power & (1<<8)) 40 * of the Elan at 0x000df000. Unfortuantly, one of the Linux
32 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 41 * drivers subsequently pokes it, and changes the CPU speed.
42 * Workaround : Remove the unneeded alias.
43 */
44#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */
45#define CBAR_ENB (0x80000000)
46#define CBAR_KEY (0X000000CB)
47 if (c->x86_model == 9 || c->x86_model == 10) {
48 if (inl (CBAR) & CBAR_ENB)
49 outl (0 | CBAR_KEY, CBAR);
33 } 50 }
34
35 /* Set MTRR capability flag if appropriate */
36 if (c->x86_model == 13 || c->x86_model == 9 ||
37 (c->x86_model == 8 && c->x86_mask >= 8))
38 set_cpu_cap(c, X86_FEATURE_K6_MTRR);
39} 51}
40 52
41static void __cpuinit init_amd(struct cpuinfo_x86 *c) 53
54static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
42{ 55{
43 u32 l, h; 56 u32 l, h;
44 int mbytes = num_physpages >> (20-PAGE_SHIFT); 57 int mbytes = num_physpages >> (20-PAGE_SHIFT);
45 int r;
46 58
59 if (c->x86_model < 6) {
60 /* Based on AMD doc 20734R - June 2000 */
61 if (c->x86_model == 0) {
62 clear_cpu_cap(c, X86_FEATURE_APIC);
63 set_cpu_cap(c, X86_FEATURE_PGE);
64 }
65 return;
66 }
67
68 if (c->x86_model == 6 && c->x86_mask == 1) {
69 const int K6_BUG_LOOP = 1000000;
70 int n;
71 void (*f_vide)(void);
72 unsigned long d, d2;
73
74 printk(KERN_INFO "AMD K6 stepping B detected - ");
75
76 /*
77 * It looks like AMD fixed the 2.6.2 bug and improved indirect
78 * calls at the same time.
79 */
80
81 n = K6_BUG_LOOP;
82 f_vide = vide;
83 rdtscl(d);
84 while (n--)
85 f_vide();
86 rdtscl(d2);
87 d = d2-d;
88
89 if (d > 20*K6_BUG_LOOP)
90 printk("system stability may be impaired when more than 32 MB are used.\n");
91 else
92 printk("probably OK (after B9730xxxx).\n");
93 printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
94 }
95
96 /* K6 with old style WHCR */
97 if (c->x86_model < 8 ||
98 (c->x86_model == 8 && c->x86_mask < 8)) {
99 /* We can only write allocate on the low 508Mb */
100 if (mbytes > 508)
101 mbytes = 508;
102
103 rdmsr(MSR_K6_WHCR, l, h);
104 if ((l&0x0000FFFF) == 0) {
105 unsigned long flags;
106 l = (1<<0)|((mbytes/4)<<1);
107 local_irq_save(flags);
108 wbinvd();
109 wrmsr(MSR_K6_WHCR, l, h);
110 local_irq_restore(flags);
111 printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
112 mbytes);
113 }
114 return;
115 }
116
117 if ((c->x86_model == 8 && c->x86_mask > 7) ||
118 c->x86_model == 9 || c->x86_model == 13) {
119 /* The more serious chips .. */
120
121 if (mbytes > 4092)
122 mbytes = 4092;
123
124 rdmsr(MSR_K6_WHCR, l, h);
125 if ((l&0xFFFF0000) == 0) {
126 unsigned long flags;
127 l = ((mbytes>>2)<<22)|(1<<16);
128 local_irq_save(flags);
129 wbinvd();
130 wrmsr(MSR_K6_WHCR, l, h);
131 local_irq_restore(flags);
132 printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
133 mbytes);
134 }
135
136 return;
137 }
138
139 if (c->x86_model == 10) {
140 /* AMD Geode LX is model 10 */
141 /* placeholder for any needed mods */
142 return;
143 }
144}
145
146static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
147{
148 u32 l, h;
149
150 /*
151 * Bit 15 of Athlon specific MSR 15, needs to be 0
152 * to enable SSE on Palomino/Morgan/Barton CPU's.
153 * If the BIOS didn't enable it already, enable it here.
154 */
155 if (c->x86_model >= 6 && c->x86_model <= 10) {
156 if (!cpu_has(c, X86_FEATURE_XMM)) {
157 printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
158 rdmsr(MSR_K7_HWCR, l, h);
159 l &= ~0x00008000;
160 wrmsr(MSR_K7_HWCR, l, h);
161 set_cpu_cap(c, X86_FEATURE_XMM);
162 }
163 }
164
165 /*
166 * It's been determined by AMD that Athlons since model 8 stepping 1
167 * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
168 * As per AMD technical note 27212 0.2
169 */
170 if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
171 rdmsr(MSR_K7_CLK_CTL, l, h);
172 if ((l & 0xfff00000) != 0x20000000) {
173 printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l,
174 ((l & 0x000fffff)|0x20000000));
175 wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
176 }
177 }
178
179 set_cpu_cap(c, X86_FEATURE_K7);
180}
181#endif
182
183#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
184static int __cpuinit nearby_node(int apicid)
185{
186 int i, node;
187
188 for (i = apicid - 1; i >= 0; i--) {
189 node = apicid_to_node[i];
190 if (node != NUMA_NO_NODE && node_online(node))
191 return node;
192 }
193 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
194 node = apicid_to_node[i];
195 if (node != NUMA_NO_NODE && node_online(node))
196 return node;
197 }
198 return first_node(node_online_map); /* Shouldn't happen */
199}
200#endif
201
202/*
203 * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
204 * Assumes number of cores is a power of two.
205 */
206static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
207{
208#ifdef CONFIG_X86_HT
209 unsigned bits;
210
211 bits = c->x86_coreid_bits;
212
213 /* Low order bits define the core id (index of core in socket) */
214 c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
215 /* Convert the initial APIC ID into the socket ID */
216 c->phys_proc_id = c->initial_apicid >> bits;
217#endif
218}
219
220static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
221{
222#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
223 int cpu = smp_processor_id();
224 int node;
225 unsigned apicid = hard_smp_processor_id();
226
227 node = c->phys_proc_id;
228 if (apicid_to_node[apicid] != NUMA_NO_NODE)
229 node = apicid_to_node[apicid];
230 if (!node_online(node)) {
231 /* Two possibilities here:
232 - The CPU is missing memory and no node was created.
233 In that case try picking one from a nearby CPU
234 - The APIC IDs differ from the HyperTransport node IDs
235 which the K8 northbridge parsing fills in.
236 Assume they are all increased by a constant offset,
237 but in the same order as the HT nodeids.
238 If that doesn't result in a usable node fall back to the
239 path for the previous case. */
240
241 int ht_nodeid = c->initial_apicid;
242
243 if (ht_nodeid >= 0 &&
244 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
245 node = apicid_to_node[ht_nodeid];
246 /* Pick a nearby node */
247 if (!node_online(node))
248 node = nearby_node(apicid);
249 }
250 numa_set_node(cpu, node);
251
252 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
253#endif
254}
255
256static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
257{
258#ifdef CONFIG_X86_HT
259 unsigned bits, ecx;
260
261 /* Multi core CPU? */
262 if (c->extended_cpuid_level < 0x80000008)
263 return;
264
265 ecx = cpuid_ecx(0x80000008);
266
267 c->x86_max_cores = (ecx & 0xff) + 1;
268
269 /* CPU telling us the core id bits shift? */
270 bits = (ecx >> 12) & 0xF;
271
272 /* Otherwise recompute */
273 if (bits == 0) {
274 while ((1 << bits) < c->x86_max_cores)
275 bits++;
276 }
277
278 c->x86_coreid_bits = bits;
279#endif
280}
281
282static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
283{
284 early_init_amd_mc(c);
285
286 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
287 if (c->x86_power & (1<<8))
288 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
289
290#ifdef CONFIG_X86_64
291 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
292#else
293 /* Set MTRR capability flag if appropriate */
294 if (c->x86 == 5)
295 if (c->x86_model == 13 || c->x86_model == 9 ||
296 (c->x86_model == 8 && c->x86_mask >= 8))
297 set_cpu_cap(c, X86_FEATURE_K6_MTRR);
298#endif
299}
300
301static void __cpuinit init_amd(struct cpuinfo_x86 *c)
302{
47#ifdef CONFIG_SMP 303#ifdef CONFIG_SMP
48 unsigned long long value; 304 unsigned long long value;
49 305
@@ -54,7 +310,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
54 * Errata 63 for SH-B3 steppings 310 * Errata 63 for SH-B3 steppings
55 * Errata 122 for all steppings (F+ have it disabled by default) 311 * Errata 122 for all steppings (F+ have it disabled by default)
56 */ 312 */
57 if (c->x86 == 15) { 313 if (c->x86 == 0xf) {
58 rdmsrl(MSR_K7_HWCR, value); 314 rdmsrl(MSR_K7_HWCR, value);
59 value |= 1 << 6; 315 value |= 1 << 6;
60 wrmsrl(MSR_K7_HWCR, value); 316 wrmsrl(MSR_K7_HWCR, value);
@@ -64,209 +320,119 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
64 early_init_amd(c); 320 early_init_amd(c);
65 321
66 /* 322 /*
67 * FIXME: We should handle the K5 here. Set up the write
68 * range and also turn on MSR 83 bits 4 and 31 (write alloc,
69 * no bus pipeline)
70 */
71
72 /*
73 * Bit 31 in normal CPUID used for nonstandard 3DNow ID; 323 * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
74 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway 324 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
75 */ 325 */
76 clear_cpu_cap(c, 0*32+31); 326 clear_cpu_cap(c, 0*32+31);
77 327
78 r = get_model_name(c); 328#ifdef CONFIG_X86_64
329 /* On C+ stepping K8 rep microcode works well for copy/memset */
330 if (c->x86 == 0xf) {
331 u32 level;
79 332
80 switch (c->x86) { 333 level = cpuid_eax(1);
81 case 4: 334 if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
82 /* 335 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
83 * General Systems BIOSen alias the cpu frequency registers
84 * of the Elan at 0x000df000. Unfortuantly, one of the Linux
85 * drivers subsequently pokes it, and changes the CPU speed.
86 * Workaround : Remove the unneeded alias.
87 */
88#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */
89#define CBAR_ENB (0x80000000)
90#define CBAR_KEY (0X000000CB)
91 if (c->x86_model == 9 || c->x86_model == 10) {
92 if (inl (CBAR) & CBAR_ENB)
93 outl (0 | CBAR_KEY, CBAR);
94 }
95 break;
96 case 5:
97 if (c->x86_model < 6) {
98 /* Based on AMD doc 20734R - June 2000 */
99 if (c->x86_model == 0) {
100 clear_cpu_cap(c, X86_FEATURE_APIC);
101 set_cpu_cap(c, X86_FEATURE_PGE);
102 }
103 break;
104 }
105
106 if (c->x86_model == 6 && c->x86_mask == 1) {
107 const int K6_BUG_LOOP = 1000000;
108 int n;
109 void (*f_vide)(void);
110 unsigned long d, d2;
111
112 printk(KERN_INFO "AMD K6 stepping B detected - ");
113
114 /*
115 * It looks like AMD fixed the 2.6.2 bug and improved indirect
116 * calls at the same time.
117 */
118
119 n = K6_BUG_LOOP;
120 f_vide = vide;
121 rdtscl(d);
122 while (n--)
123 f_vide();
124 rdtscl(d2);
125 d = d2-d;
126
127 if (d > 20*K6_BUG_LOOP)
128 printk("system stability may be impaired when more than 32 MB are used.\n");
129 else
130 printk("probably OK (after B9730xxxx).\n");
131 printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
132 }
133
134 /* K6 with old style WHCR */
135 if (c->x86_model < 8 ||
136 (c->x86_model == 8 && c->x86_mask < 8)) {
137 /* We can only write allocate on the low 508Mb */
138 if (mbytes > 508)
139 mbytes = 508;
140
141 rdmsr(MSR_K6_WHCR, l, h);
142 if ((l&0x0000FFFF) == 0) {
143 unsigned long flags;
144 l = (1<<0)|((mbytes/4)<<1);
145 local_irq_save(flags);
146 wbinvd();
147 wrmsr(MSR_K6_WHCR, l, h);
148 local_irq_restore(flags);
149 printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
150 mbytes);
151 }
152 break;
153 }
154
155 if ((c->x86_model == 8 && c->x86_mask > 7) ||
156 c->x86_model == 9 || c->x86_model == 13) {
157 /* The more serious chips .. */
158
159 if (mbytes > 4092)
160 mbytes = 4092;
161
162 rdmsr(MSR_K6_WHCR, l, h);
163 if ((l&0xFFFF0000) == 0) {
164 unsigned long flags;
165 l = ((mbytes>>2)<<22)|(1<<16);
166 local_irq_save(flags);
167 wbinvd();
168 wrmsr(MSR_K6_WHCR, l, h);
169 local_irq_restore(flags);
170 printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
171 mbytes);
172 }
173
174 break;
175 }
176
177 if (c->x86_model == 10) {
178 /* AMD Geode LX is model 10 */
179 /* placeholder for any needed mods */
180 break;
181 }
182 break;
183 case 6: /* An Athlon/Duron */
184
185 /*
186 * Bit 15 of Athlon specific MSR 15, needs to be 0
187 * to enable SSE on Palomino/Morgan/Barton CPU's.
188 * If the BIOS didn't enable it already, enable it here.
189 */
190 if (c->x86_model >= 6 && c->x86_model <= 10) {
191 if (!cpu_has(c, X86_FEATURE_XMM)) {
192 printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
193 rdmsr(MSR_K7_HWCR, l, h);
194 l &= ~0x00008000;
195 wrmsr(MSR_K7_HWCR, l, h);
196 set_cpu_cap(c, X86_FEATURE_XMM);
197 }
198 }
199
200 /*
201 * It's been determined by AMD that Athlons since model 8 stepping 1
202 * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
203 * As per AMD technical note 27212 0.2
204 */
205 if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
206 rdmsr(MSR_K7_CLK_CTL, l, h);
207 if ((l & 0xfff00000) != 0x20000000) {
208 printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l,
209 ((l & 0x000fffff)|0x20000000));
210 wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
211 }
212 }
213 break;
214 } 336 }
337 if (c->x86 == 0x10 || c->x86 == 0x11)
338 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
339#else
340
341 /*
342 * FIXME: We should handle the K5 here. Set up the write
343 * range and also turn on MSR 83 bits 4 and 31 (write alloc,
344 * no bus pipeline)
345 */
215 346
216 switch (c->x86) { 347 switch (c->x86) {
217 case 15: 348 case 4:
218 /* Use K8 tuning for Fam10h and Fam11h */ 349 init_amd_k5(c);
219 case 0x10:
220 case 0x11:
221 set_cpu_cap(c, X86_FEATURE_K8);
222 break; 350 break;
223 case 6: 351 case 5:
224 set_cpu_cap(c, X86_FEATURE_K7); 352 init_amd_k6(c);
353 break;
354 case 6: /* An Athlon/Duron */
355 init_amd_k7(c);
225 break; 356 break;
226 } 357 }
358
359 /* K6s reports MCEs but don't actually have all the MSRs */
360 if (c->x86 < 6)
361 clear_cpu_cap(c, X86_FEATURE_MCE);
362#endif
363
364 /* Enable workaround for FXSAVE leak */
227 if (c->x86 >= 6) 365 if (c->x86 >= 6)
228 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); 366 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
229 367
230 display_cacheinfo(c); 368 if (!c->x86_model_id[0]) {
231 369 switch (c->x86) {
232 if (cpuid_eax(0x80000000) >= 0x80000008) 370 case 0xf:
233 c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; 371 /* Should distinguish Models here, but this is only
372 a fallback anyways. */
373 strcpy(c->x86_model_id, "Hammer");
374 break;
375 }
376 }
234 377
235#ifdef CONFIG_X86_HT 378 display_cacheinfo(c);
236 /*
237 * On a AMD multi core setup the lower bits of the APIC id
238 * distinguish the cores.
239 */
240 if (c->x86_max_cores > 1) {
241 int cpu = smp_processor_id();
242 unsigned bits = (cpuid_ecx(0x80000008) >> 12) & 0xf;
243 379
244 if (bits == 0) { 380 /* Multi core CPU? */
245 while ((1 << bits) < c->x86_max_cores) 381 if (c->extended_cpuid_level >= 0x80000008) {
246 bits++; 382 amd_detect_cmp(c);
247 } 383 srat_detect_node(c);
248 c->cpu_core_id = c->phys_proc_id & ((1<<bits)-1);
249 c->phys_proc_id >>= bits;
250 printk(KERN_INFO "CPU %d(%d) -> Core %d\n",
251 cpu, c->x86_max_cores, c->cpu_core_id);
252 } 384 }
385
386#ifdef CONFIG_X86_32
387 detect_ht(c);
253#endif 388#endif
254 389
255 if (cpuid_eax(0x80000000) >= 0x80000006) { 390 if (c->extended_cpuid_level >= 0x80000006) {
256 if ((c->x86 == 0x10) && (cpuid_edx(0x80000006) & 0xf000)) 391 if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000))
257 num_cache_leaves = 4; 392 num_cache_leaves = 4;
258 else 393 else
259 num_cache_leaves = 3; 394 num_cache_leaves = 3;
260 } 395 }
261 396
262 /* K6s reports MCEs but don't actually have all the MSRs */ 397 if (c->x86 >= 0xf && c->x86 <= 0x11)
263 if (c->x86 < 6) 398 set_cpu_cap(c, X86_FEATURE_K8);
264 clear_cpu_cap(c, X86_FEATURE_MCE);
265 399
266 if (cpu_has_xmm2) 400 if (cpu_has_xmm2) {
401 /* MFENCE stops RDTSC speculation */
267 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); 402 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
403 }
404
405#ifdef CONFIG_X86_64
406 if (c->x86 == 0x10) {
407 /* do this for boot cpu */
408 if (c == &boot_cpu_data)
409 check_enable_amd_mmconf_dmi();
410
411 fam10h_check_enable_mmcfg();
412 }
413
414 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
415 unsigned long long tseg;
416
417 /*
418 * Split up direct mapping around the TSEG SMM area.
419 * Don't do it for gbpages because there seems very little
420 * benefit in doing so.
421 */
422 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
423 printk(KERN_DEBUG "tseg: %010llx\n", tseg);
424 if ((tseg>>PMD_SHIFT) <
425 (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
426 ((tseg>>PMD_SHIFT) <
427 (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
428 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
429 set_memory_4k((unsigned long)__va(tseg), 1);
430 }
431 }
432#endif
268} 433}
269 434
435#ifdef CONFIG_X86_32
270static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) 436static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
271{ 437{
272 /* AMD errata T13 (order #21922) */ 438 /* AMD errata T13 (order #21922) */
@@ -279,10 +445,12 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int
279 } 445 }
280 return size; 446 return size;
281} 447}
448#endif
282 449
283static struct cpu_dev amd_cpu_dev __cpuinitdata = { 450static struct cpu_dev amd_cpu_dev __cpuinitdata = {
284 .c_vendor = "AMD", 451 .c_vendor = "AMD",
285 .c_ident = { "AuthenticAMD" }, 452 .c_ident = { "AuthenticAMD" },
453#ifdef CONFIG_X86_32
286 .c_models = { 454 .c_models = {
287 { .vendor = X86_VENDOR_AMD, .family = 4, .model_names = 455 { .vendor = X86_VENDOR_AMD, .family = 4, .model_names =
288 { 456 {
@@ -295,9 +463,10 @@ static struct cpu_dev amd_cpu_dev __cpuinitdata = {
295 } 463 }
296 }, 464 },
297 }, 465 },
466 .c_size_cache = amd_size_cache,
467#endif
298 .c_early_init = early_init_amd, 468 .c_early_init = early_init_amd,
299 .c_init = init_amd, 469 .c_init = init_amd,
300 .c_size_cache = amd_size_cache,
301 .c_x86_vendor = X86_VENDOR_AMD, 470 .c_x86_vendor = X86_VENDOR_AMD,
302}; 471};
303 472
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
deleted file mode 100644
index d1c721c0c49f..000000000000
--- a/arch/x86/kernel/cpu/amd_64.c
+++ /dev/null
@@ -1,224 +0,0 @@
1#include <linux/init.h>
2#include <linux/mm.h>
3
4#include <asm/numa_64.h>
5#include <asm/mmconfig.h>
6#include <asm/cacheflush.h>
7
8#include <mach_apic.h>
9
10#include "cpu.h"
11
12int force_mwait __cpuinitdata;
13
14#ifdef CONFIG_NUMA
15static int __cpuinit nearby_node(int apicid)
16{
17 int i, node;
18
19 for (i = apicid - 1; i >= 0; i--) {
20 node = apicid_to_node[i];
21 if (node != NUMA_NO_NODE && node_online(node))
22 return node;
23 }
24 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
25 node = apicid_to_node[i];
26 if (node != NUMA_NO_NODE && node_online(node))
27 return node;
28 }
29 return first_node(node_online_map); /* Shouldn't happen */
30}
31#endif
32
33/*
34 * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
35 * Assumes number of cores is a power of two.
36 */
37static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
38{
39#ifdef CONFIG_SMP
40 unsigned bits;
41#ifdef CONFIG_NUMA
42 int cpu = smp_processor_id();
43 int node = 0;
44 unsigned apicid = hard_smp_processor_id();
45#endif
46 bits = c->x86_coreid_bits;
47
48 /* Low order bits define the core id (index of core in socket) */
49 c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
50 /* Convert the initial APIC ID into the socket ID */
51 c->phys_proc_id = c->initial_apicid >> bits;
52
53#ifdef CONFIG_NUMA
54 node = c->phys_proc_id;
55 if (apicid_to_node[apicid] != NUMA_NO_NODE)
56 node = apicid_to_node[apicid];
57 if (!node_online(node)) {
58 /* Two possibilities here:
59 - The CPU is missing memory and no node was created.
60 In that case try picking one from a nearby CPU
61 - The APIC IDs differ from the HyperTransport node IDs
62 which the K8 northbridge parsing fills in.
63 Assume they are all increased by a constant offset,
64 but in the same order as the HT nodeids.
65 If that doesn't result in a usable node fall back to the
66 path for the previous case. */
67
68 int ht_nodeid = c->initial_apicid;
69
70 if (ht_nodeid >= 0 &&
71 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
72 node = apicid_to_node[ht_nodeid];
73 /* Pick a nearby node */
74 if (!node_online(node))
75 node = nearby_node(apicid);
76 }
77 numa_set_node(cpu, node);
78
79 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
80#endif
81#endif
82}
83
84static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
85{
86#ifdef CONFIG_SMP
87 unsigned bits, ecx;
88
89 /* Multi core CPU? */
90 if (c->extended_cpuid_level < 0x80000008)
91 return;
92
93 ecx = cpuid_ecx(0x80000008);
94
95 c->x86_max_cores = (ecx & 0xff) + 1;
96
97 /* CPU telling us the core id bits shift? */
98 bits = (ecx >> 12) & 0xF;
99
100 /* Otherwise recompute */
101 if (bits == 0) {
102 while ((1 << bits) < c->x86_max_cores)
103 bits++;
104 }
105
106 c->x86_coreid_bits = bits;
107
108#endif
109}
110
111static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
112{
113 early_init_amd_mc(c);
114
115 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
116 if (c->x86_power & (1<<8))
117 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
118
119 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
120}
121
122static void __cpuinit init_amd(struct cpuinfo_x86 *c)
123{
124 unsigned level;
125
126#ifdef CONFIG_SMP
127 unsigned long value;
128
129 /*
130 * Disable TLB flush filter by setting HWCR.FFDIS on K8
131 * bit 6 of msr C001_0015
132 *
133 * Errata 63 for SH-B3 steppings
134 * Errata 122 for all steppings (F+ have it disabled by default)
135 */
136 if (c->x86 == 0xf) {
137 rdmsrl(MSR_K8_HWCR, value);
138 value |= 1 << 6;
139 wrmsrl(MSR_K8_HWCR, value);
140 }
141#endif
142
143 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
144 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
145 clear_cpu_cap(c, 0*32+31);
146
147 /* On C+ stepping K8 rep microcode works well for copy/memset */
148 if (c->x86 == 0xf) {
149 level = cpuid_eax(1);
150 if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
151 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
152 }
153 if (c->x86 == 0x10 || c->x86 == 0x11)
154 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
155
156 /* Enable workaround for FXSAVE leak */
157 if (c->x86 >= 6)
158 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
159
160 level = get_model_name(c);
161 if (!level) {
162 switch (c->x86) {
163 case 0xf:
164 /* Should distinguish Models here, but this is only
165 a fallback anyways. */
166 strcpy(c->x86_model_id, "Hammer");
167 break;
168 }
169 }
170 display_cacheinfo(c);
171
172 /* Multi core CPU? */
173 if (c->extended_cpuid_level >= 0x80000008)
174 amd_detect_cmp(c);
175
176 if (c->extended_cpuid_level >= 0x80000006 &&
177 (cpuid_edx(0x80000006) & 0xf000))
178 num_cache_leaves = 4;
179 else
180 num_cache_leaves = 3;
181
182 if (c->x86 >= 0xf && c->x86 <= 0x11)
183 set_cpu_cap(c, X86_FEATURE_K8);
184
185 /* MFENCE stops RDTSC speculation */
186 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
187
188 if (c->x86 == 0x10) {
189 /* do this for boot cpu */
190 if (c == &boot_cpu_data)
191 check_enable_amd_mmconf_dmi();
192
193 fam10h_check_enable_mmcfg();
194 }
195
196 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
197 unsigned long long tseg;
198
199 /*
200 * Split up direct mapping around the TSEG SMM area.
201 * Don't do it for gbpages because there seems very little
202 * benefit in doing so.
203 */
204 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
205 printk(KERN_DEBUG "tseg: %010llx\n", tseg);
206 if ((tseg>>PMD_SHIFT) <
207 (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
208 ((tseg>>PMD_SHIFT) <
209 (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
210 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
211 set_memory_4k((unsigned long)__va(tseg), 1);
212 }
213 }
214}
215
216static struct cpu_dev amd_cpu_dev __cpuinitdata = {
217 .c_vendor = "AMD",
218 .c_ident = { "AuthenticAMD" },
219 .c_early_init = early_init_amd,
220 .c_init = init_amd,
221 .c_x86_vendor = X86_VENDOR_AMD,
222};
223
224cpu_dev_register(amd_cpu_dev);
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index e5f6d89521bf..89bfdd9cacc6 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -289,7 +289,6 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
289 if (c->x86_model >= 6 && c->x86_model < 9) 289 if (c->x86_model >= 6 && c->x86_model < 9)
290 set_cpu_cap(c, X86_FEATURE_3DNOW); 290 set_cpu_cap(c, X86_FEATURE_3DNOW);
291 291
292 get_model_name(c);
293 display_cacheinfo(c); 292 display_cacheinfo(c);
294} 293}
295 294
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
index 49cfc6d2f2fb..a1625f5a1e78 100644
--- a/arch/x86/kernel/cpu/centaur_64.c
+++ b/arch/x86/kernel/cpu/centaur_64.c
@@ -16,9 +16,10 @@ static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
16 16
17static void __cpuinit init_centaur(struct cpuinfo_x86 *c) 17static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
18{ 18{
19 early_init_centaur(c);
20
19 if (c->x86 == 0x6 && c->x86_model >= 0xf) { 21 if (c->x86 == 0x6 && c->x86_model >= 0xf) {
20 c->x86_cache_alignment = c->x86_clflush_size * 2; 22 c->x86_cache_alignment = c->x86_clflush_size * 2;
21 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
22 set_cpu_cap(c, X86_FEATURE_REP_GOOD); 23 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
23 } 24 }
24 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); 25 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 7d5a07f0fd24..7581b62df184 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1,29 +1,61 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/kernel.h>
3#include <linux/sched.h>
2#include <linux/string.h> 4#include <linux/string.h>
5#include <linux/bootmem.h>
6#include <linux/bitops.h>
7#include <linux/module.h>
8#include <linux/kgdb.h>
9#include <linux/topology.h>
3#include <linux/delay.h> 10#include <linux/delay.h>
4#include <linux/smp.h> 11#include <linux/smp.h>
5#include <linux/module.h>
6#include <linux/percpu.h> 12#include <linux/percpu.h>
7#include <linux/bootmem.h>
8#include <asm/processor.h>
9#include <asm/i387.h> 13#include <asm/i387.h>
10#include <asm/msr.h> 14#include <asm/msr.h>
11#include <asm/io.h> 15#include <asm/io.h>
16#include <asm/linkage.h>
12#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
13#include <asm/mtrr.h> 18#include <asm/mtrr.h>
14#include <asm/mce.h> 19#include <asm/mce.h>
15#include <asm/pat.h> 20#include <asm/pat.h>
16#include <asm/asm.h> 21#include <asm/asm.h>
22#include <asm/numa.h>
17#ifdef CONFIG_X86_LOCAL_APIC 23#ifdef CONFIG_X86_LOCAL_APIC
18#include <asm/mpspec.h> 24#include <asm/mpspec.h>
19#include <asm/apic.h> 25#include <asm/apic.h>
20#include <mach_apic.h> 26#include <mach_apic.h>
27#include <asm/genapic.h>
21#endif 28#endif
22 29
30#include <asm/pda.h>
31#include <asm/pgtable.h>
32#include <asm/processor.h>
33#include <asm/desc.h>
34#include <asm/atomic.h>
35#include <asm/proto.h>
36#include <asm/sections.h>
37#include <asm/setup.h>
38
23#include "cpu.h" 39#include "cpu.h"
24 40
25static struct cpu_dev *this_cpu __cpuinitdata; 41static struct cpu_dev *this_cpu __cpuinitdata;
26 42
43#ifdef CONFIG_X86_64
44/* We need valid kernel segments for data and code in long mode too
45 * IRET will check the segment types kkeil 2000/10/28
46 * Also sysret mandates a special GDT layout
47 */
48/* The TLS descriptors are currently at a different place compared to i386.
49 Hopefully nobody expects them at a fixed place (Wine?) */
50DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
51 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
52 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
53 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
54 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
55 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
56 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
57} };
58#else
27DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { 59DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
28 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, 60 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
29 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, 61 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
@@ -58,8 +90,10 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
58 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, 90 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
59 [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } }, 91 [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
60} }; 92} };
93#endif
61EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); 94EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
62 95
96#ifdef CONFIG_X86_32
63static int cachesize_override __cpuinitdata = -1; 97static int cachesize_override __cpuinitdata = -1;
64static int disable_x86_serial_nr __cpuinitdata = 1; 98static int disable_x86_serial_nr __cpuinitdata = 1;
65 99
@@ -70,34 +104,6 @@ static int __init cachesize_setup(char *str)
70} 104}
71__setup("cachesize=", cachesize_setup); 105__setup("cachesize=", cachesize_setup);
72 106
73/*
74 * Naming convention should be: <Name> [(<Codename>)]
75 * This table only is used unless init_<vendor>() below doesn't set it;
76 * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
77 *
78 */
79
80/* Look up CPU names by table lookup. */
81static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
82{
83 struct cpu_model_info *info;
84
85 if (c->x86_model >= 16)
86 return NULL; /* Range check */
87
88 if (!this_cpu)
89 return NULL;
90
91 info = this_cpu->c_models;
92
93 while (info && info->family) {
94 if (info->family == c->x86)
95 return info->model_names[c->x86_model];
96 info++;
97 }
98 return NULL; /* Not found */
99}
100
101static int __init x86_fxsr_setup(char *s) 107static int __init x86_fxsr_setup(char *s)
102{ 108{
103 setup_clear_cpu_cap(X86_FEATURE_FXSR); 109 setup_clear_cpu_cap(X86_FEATURE_FXSR);
@@ -162,6 +168,48 @@ static int __init x86_serial_nr_setup(char *s)
162 return 1; 168 return 1;
163} 169}
164__setup("serialnumber", x86_serial_nr_setup); 170__setup("serialnumber", x86_serial_nr_setup);
171#else
172static inline int flag_is_changeable_p(u32 flag)
173{
174 return 1;
175}
176/* Probe for the CPUID instruction */
177static inline int have_cpuid_p(void)
178{
179 return 1;
180}
181static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
182{
183}
184#endif
185
186/*
187 * Naming convention should be: <Name> [(<Codename>)]
188 * This table only is used unless init_<vendor>() below doesn't set it;
189 * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
190 *
191 */
192
193/* Look up CPU names by table lookup. */
194static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
195{
196 struct cpu_model_info *info;
197
198 if (c->x86_model >= 16)
199 return NULL; /* Range check */
200
201 if (!this_cpu)
202 return NULL;
203
204 info = this_cpu->c_models;
205
206 while (info && info->family) {
207 if (info->family == c->x86)
208 return info->model_names[c->x86_model];
209 info++;
210 }
211 return NULL; /* Not found */
212}
165 213
166__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; 214__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
167 215
@@ -174,13 +222,18 @@ void switch_to_new_gdt(void)
174 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); 222 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
175 gdt_descr.size = GDT_SIZE - 1; 223 gdt_descr.size = GDT_SIZE - 1;
176 load_gdt(&gdt_descr); 224 load_gdt(&gdt_descr);
225#ifdef CONFIG_X86_32
177 asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); 226 asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
227#endif
178} 228}
179 229
180static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; 230static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
181 231
182static void __cpuinit default_init(struct cpuinfo_x86 *c) 232static void __cpuinit default_init(struct cpuinfo_x86 *c)
183{ 233{
234#ifdef CONFIG_X86_64
235 display_cacheinfo(c);
236#else
184 /* Not much we can do here... */ 237 /* Not much we can do here... */
185 /* Check if at least it has cpuid */ 238 /* Check if at least it has cpuid */
186 if (c->cpuid_level == -1) { 239 if (c->cpuid_level == -1) {
@@ -190,6 +243,7 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c)
190 else if (c->x86 == 3) 243 else if (c->x86 == 3)
191 strcpy(c->x86_model_id, "386"); 244 strcpy(c->x86_model_id, "386");
192 } 245 }
246#endif
193} 247}
194 248
195static struct cpu_dev __cpuinitdata default_cpu = { 249static struct cpu_dev __cpuinitdata default_cpu = {
@@ -198,13 +252,13 @@ static struct cpu_dev __cpuinitdata default_cpu = {
198 .c_x86_vendor = X86_VENDOR_UNKNOWN, 252 .c_x86_vendor = X86_VENDOR_UNKNOWN,
199}; 253};
200 254
201int __cpuinit get_model_name(struct cpuinfo_x86 *c) 255static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
202{ 256{
203 unsigned int *v; 257 unsigned int *v;
204 char *p, *q; 258 char *p, *q;
205 259
206 if (c->extended_cpuid_level < 0x80000004) 260 if (c->extended_cpuid_level < 0x80000004)
207 return 0; 261 return;
208 262
209 v = (unsigned int *) c->x86_model_id; 263 v = (unsigned int *) c->x86_model_id;
210 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); 264 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
@@ -223,8 +277,6 @@ int __cpuinit get_model_name(struct cpuinfo_x86 *c)
223 while (q <= &c->x86_model_id[48]) 277 while (q <= &c->x86_model_id[48])
224 *q++ = '\0'; /* Zero-pad the rest */ 278 *q++ = '\0'; /* Zero-pad the rest */
225 } 279 }
226
227 return 1;
228} 280}
229 281
230void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) 282void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
@@ -238,6 +290,10 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
238 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", 290 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
239 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); 291 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
240 c->x86_cache_size = (ecx>>24) + (edx>>24); 292 c->x86_cache_size = (ecx>>24) + (edx>>24);
293#ifdef CONFIG_X86_64
294 /* On K8 L1 TLB is inclusive, so don't count it */
295 c->x86_tlbsize = 0;
296#endif
241 } 297 }
242 298
243 if (n < 0x80000006) /* Some chips just has a large L1. */ 299 if (n < 0x80000006) /* Some chips just has a large L1. */
@@ -246,6 +302,9 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
246 cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); 302 cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
247 l2size = ecx >> 16; 303 l2size = ecx >> 16;
248 304
305#ifdef CONFIG_X86_64
306 c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
307#else
249 /* do processor-specific cache resizing */ 308 /* do processor-specific cache resizing */
250 if (this_cpu->c_size_cache) 309 if (this_cpu->c_size_cache)
251 l2size = this_cpu->c_size_cache(c, l2size); 310 l2size = this_cpu->c_size_cache(c, l2size);
@@ -256,6 +315,7 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
256 315
257 if (l2size == 0) 316 if (l2size == 0)
258 return; /* Again, no L2 cache is possible */ 317 return; /* Again, no L2 cache is possible */
318#endif
259 319
260 c->x86_cache_size = l2size; 320 c->x86_cache_size = l2size;
261 321
@@ -263,9 +323,9 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
263 l2size, ecx & 0xFF); 323 l2size, ecx & 0xFF);
264} 324}
265 325
266#ifdef CONFIG_X86_HT
267void __cpuinit detect_ht(struct cpuinfo_x86 *c) 326void __cpuinit detect_ht(struct cpuinfo_x86 *c)
268{ 327{
328#ifdef CONFIG_X86_HT
269 u32 eax, ebx, ecx, edx; 329 u32 eax, ebx, ecx, edx;
270 int index_msb, core_bits; 330 int index_msb, core_bits;
271 331
@@ -275,6 +335,9 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
275 if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) 335 if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
276 goto out; 336 goto out;
277 337
338 if (cpu_has(c, X86_FEATURE_XTOPOLOGY))
339 return;
340
278 cpuid(1, &eax, &ebx, &ecx, &edx); 341 cpuid(1, &eax, &ebx, &ecx, &edx);
279 342
280 smp_num_siblings = (ebx & 0xff0000) >> 16; 343 smp_num_siblings = (ebx & 0xff0000) >> 16;
@@ -291,8 +354,11 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
291 } 354 }
292 355
293 index_msb = get_count_order(smp_num_siblings); 356 index_msb = get_count_order(smp_num_siblings);
357#ifdef CONFIG_X86_64
358 c->phys_proc_id = phys_pkg_id(index_msb);
359#else
294 c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb); 360 c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb);
295 361#endif
296 362
297 smp_num_siblings = smp_num_siblings / c->x86_max_cores; 363 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
298 364
@@ -300,8 +366,13 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
300 366
301 core_bits = get_count_order(c->x86_max_cores); 367 core_bits = get_count_order(c->x86_max_cores);
302 368
369#ifdef CONFIG_X86_64
370 c->cpu_core_id = phys_pkg_id(index_msb) &
371 ((1 << core_bits) - 1);
372#else
303 c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) & 373 c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) &
304 ((1 << core_bits) - 1); 374 ((1 << core_bits) - 1);
375#endif
305 } 376 }
306 377
307out: 378out:
@@ -311,8 +382,8 @@ out:
311 printk(KERN_INFO "CPU: Processor Core ID: %d\n", 382 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
312 c->cpu_core_id); 383 c->cpu_core_id);
313 } 384 }
314}
315#endif 385#endif
386}
316 387
317static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) 388static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
318{ 389{
@@ -335,7 +406,7 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
335 406
336 if (!printed) { 407 if (!printed) {
337 printed++; 408 printed++;
338 printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); 409 printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v);
339 printk(KERN_ERR "CPU: Your system may be unstable.\n"); 410 printk(KERN_ERR "CPU: Your system may be unstable.\n");
340 } 411 }
341 412
@@ -392,7 +463,47 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
392 c->x86_capability[6] = cpuid_ecx(0x80000001); 463 c->x86_capability[6] = cpuid_ecx(0x80000001);
393 } 464 }
394 } 465 }
466
467#ifdef CONFIG_X86_64
468 if (c->extended_cpuid_level >= 0x80000008) {
469 u32 eax = cpuid_eax(0x80000008);
470
471 c->x86_virt_bits = (eax >> 8) & 0xff;
472 c->x86_phys_bits = eax & 0xff;
473 }
474#endif
475
476 if (c->extended_cpuid_level >= 0x80000007)
477 c->x86_power = cpuid_edx(0x80000007);
478
395} 479}
480
481static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
482{
483#ifdef CONFIG_X86_32
484 int i;
485
486 /*
487 * First of all, decide if this is a 486 or higher
488 * It's a 486 if we can modify the AC flag
489 */
490 if (flag_is_changeable_p(X86_EFLAGS_AC))
491 c->x86 = 4;
492 else
493 c->x86 = 3;
494
495 for (i = 0; i < X86_VENDOR_NUM; i++)
496 if (cpu_devs[i] && cpu_devs[i]->c_identify) {
497 c->x86_vendor_id[0] = 0;
498 cpu_devs[i]->c_identify(c);
499 if (c->x86_vendor_id[0]) {
500 get_cpu_vendor(c);
501 break;
502 }
503 }
504#endif
505}
506
396/* 507/*
397 * Do minimum CPU detection early. 508 * Do minimum CPU detection early.
398 * Fields really needed: vendor, cpuid_level, family, model, mask, 509 * Fields really needed: vendor, cpuid_level, family, model, mask,
@@ -404,16 +515,23 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
404 */ 515 */
405static void __init early_identify_cpu(struct cpuinfo_x86 *c) 516static void __init early_identify_cpu(struct cpuinfo_x86 *c)
406{ 517{
518#ifdef CONFIG_X86_64
519 c->x86_clflush_size = 64;
520#else
407 c->x86_clflush_size = 32; 521 c->x86_clflush_size = 32;
522#endif
408 c->x86_cache_alignment = c->x86_clflush_size; 523 c->x86_cache_alignment = c->x86_clflush_size;
409 524
410 if (!have_cpuid_p())
411 return;
412
413 memset(&c->x86_capability, 0, sizeof c->x86_capability); 525 memset(&c->x86_capability, 0, sizeof c->x86_capability);
414
415 c->extended_cpuid_level = 0; 526 c->extended_cpuid_level = 0;
416 527
528 if (!have_cpuid_p())
529 identify_cpu_without_cpuid(c);
530
531 /* cyrix could have cpuid enabled via c_identify()*/
532 if (!have_cpuid_p())
533 return;
534
417 cpu_detect(c); 535 cpu_detect(c);
418 536
419 get_cpu_vendor(c); 537 get_cpu_vendor(c);
@@ -454,39 +572,27 @@ void __init early_cpu_init(void)
454 572
455/* 573/*
456 * The NOPL instruction is supposed to exist on all CPUs with 574 * The NOPL instruction is supposed to exist on all CPUs with
457 * family >= 6, unfortunately, that's not true in practice because 575 * family >= 6; unfortunately, that's not true in practice because
458 * of early VIA chips and (more importantly) broken virtualizers that 576 * of early VIA chips and (more importantly) broken virtualizers that
459 * are not easy to detect. Hence, probe for it based on first 577 * are not easy to detect. In the latter case it doesn't even *fail*
460 * principles. 578 * reliably, so probing for it doesn't even work. Disable it completely
579 * unless we can find a reliable way to detect all the broken cases.
461 */ 580 */
462static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) 581static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
463{ 582{
464 const u32 nopl_signature = 0x888c53b1; /* Random number */
465 u32 has_nopl = nopl_signature;
466
467 clear_cpu_cap(c, X86_FEATURE_NOPL); 583 clear_cpu_cap(c, X86_FEATURE_NOPL);
468 if (c->x86 >= 6) {
469 asm volatile("\n"
470 "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */
471 "2:\n"
472 " .section .fixup,\"ax\"\n"
473 "3: xor %0,%0\n"
474 " jmp 2b\n"
475 " .previous\n"
476 _ASM_EXTABLE(1b,3b)
477 : "+a" (has_nopl));
478
479 if (has_nopl == nopl_signature)
480 set_cpu_cap(c, X86_FEATURE_NOPL);
481 }
482} 584}
483 585
484static void __cpuinit generic_identify(struct cpuinfo_x86 *c) 586static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
485{ 587{
588 c->extended_cpuid_level = 0;
589
486 if (!have_cpuid_p()) 590 if (!have_cpuid_p())
487 return; 591 identify_cpu_without_cpuid(c);
488 592
489 c->extended_cpuid_level = 0; 593 /* cyrix could have cpuid enabled via c_identify()*/
594 if (!have_cpuid_p())
595 return;
490 596
491 cpu_detect(c); 597 cpu_detect(c);
492 598
@@ -496,16 +602,20 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
496 602
497 if (c->cpuid_level >= 0x00000001) { 603 if (c->cpuid_level >= 0x00000001) {
498 c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; 604 c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
499#ifdef CONFIG_X86_HT 605#ifdef CONFIG_X86_32
606# ifdef CONFIG_X86_HT
500 c->apicid = phys_pkg_id(c->initial_apicid, 0); 607 c->apicid = phys_pkg_id(c->initial_apicid, 0);
501 c->phys_proc_id = c->initial_apicid; 608# else
502#else
503 c->apicid = c->initial_apicid; 609 c->apicid = c->initial_apicid;
610# endif
611#endif
612
613#ifdef CONFIG_X86_HT
614 c->phys_proc_id = c->initial_apicid;
504#endif 615#endif
505 } 616 }
506 617
507 if (c->extended_cpuid_level >= 0x80000004) 618 get_model_name(c); /* Default name */
508 get_model_name(c); /* Default name */
509 619
510 init_scattered_cpuid_features(c); 620 init_scattered_cpuid_features(c);
511 detect_nopl(c); 621 detect_nopl(c);
@@ -521,30 +631,29 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
521 c->loops_per_jiffy = loops_per_jiffy; 631 c->loops_per_jiffy = loops_per_jiffy;
522 c->x86_cache_size = -1; 632 c->x86_cache_size = -1;
523 c->x86_vendor = X86_VENDOR_UNKNOWN; 633 c->x86_vendor = X86_VENDOR_UNKNOWN;
524 c->cpuid_level = -1; /* CPUID not detected */
525 c->x86_model = c->x86_mask = 0; /* So far unknown... */ 634 c->x86_model = c->x86_mask = 0; /* So far unknown... */
526 c->x86_vendor_id[0] = '\0'; /* Unset */ 635 c->x86_vendor_id[0] = '\0'; /* Unset */
527 c->x86_model_id[0] = '\0'; /* Unset */ 636 c->x86_model_id[0] = '\0'; /* Unset */
528 c->x86_max_cores = 1; 637 c->x86_max_cores = 1;
638 c->x86_coreid_bits = 0;
639#ifdef CONFIG_X86_64
640 c->x86_clflush_size = 64;
641#else
642 c->cpuid_level = -1; /* CPUID not detected */
529 c->x86_clflush_size = 32; 643 c->x86_clflush_size = 32;
644#endif
645 c->x86_cache_alignment = c->x86_clflush_size;
530 memset(&c->x86_capability, 0, sizeof c->x86_capability); 646 memset(&c->x86_capability, 0, sizeof c->x86_capability);
531 647
532 if (!have_cpuid_p()) {
533 /*
534 * First of all, decide if this is a 486 or higher
535 * It's a 486 if we can modify the AC flag
536 */
537 if (flag_is_changeable_p(X86_EFLAGS_AC))
538 c->x86 = 4;
539 else
540 c->x86 = 3;
541 }
542
543 generic_identify(c); 648 generic_identify(c);
544 649
545 if (this_cpu->c_identify) 650 if (this_cpu->c_identify)
546 this_cpu->c_identify(c); 651 this_cpu->c_identify(c);
547 652
653#ifdef CONFIG_X86_64
654 c->apicid = phys_pkg_id(0);
655#endif
656
548 /* 657 /*
549 * Vendor-specific initialization. In this section we 658 * Vendor-specific initialization. In this section we
550 * canonicalize the feature flags, meaning if there are 659 * canonicalize the feature flags, meaning if there are
@@ -578,6 +687,10 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
578 c->x86, c->x86_model); 687 c->x86, c->x86_model);
579 } 688 }
580 689
690#ifdef CONFIG_X86_64
691 detect_ht(c);
692#endif
693
581 /* 694 /*
582 * On SMP, boot_cpu_data holds the common feature set between 695 * On SMP, boot_cpu_data holds the common feature set between
583 * all CPUs; so make sure that we indicate which features are 696 * all CPUs; so make sure that we indicate which features are
@@ -594,24 +707,34 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
594 for (i = 0; i < NCAPINTS; i++) 707 for (i = 0; i < NCAPINTS; i++)
595 c->x86_capability[i] &= ~cleared_cpu_caps[i]; 708 c->x86_capability[i] &= ~cleared_cpu_caps[i];
596 709
710#ifdef CONFIG_X86_MCE
597 /* Init Machine Check Exception if available. */ 711 /* Init Machine Check Exception if available. */
598 mcheck_init(c); 712 mcheck_init(c);
713#endif
599 714
600 select_idle_routine(c); 715 select_idle_routine(c);
716
717#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
718 numa_add_cpu(smp_processor_id());
719#endif
601} 720}
602 721
603void __init identify_boot_cpu(void) 722void __init identify_boot_cpu(void)
604{ 723{
605 identify_cpu(&boot_cpu_data); 724 identify_cpu(&boot_cpu_data);
725#ifdef CONFIG_X86_32
606 sysenter_setup(); 726 sysenter_setup();
607 enable_sep_cpu(); 727 enable_sep_cpu();
728#endif
608} 729}
609 730
610void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 731void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
611{ 732{
612 BUG_ON(c == &boot_cpu_data); 733 BUG_ON(c == &boot_cpu_data);
613 identify_cpu(c); 734 identify_cpu(c);
735#ifdef CONFIG_X86_32
614 enable_sep_cpu(); 736 enable_sep_cpu();
737#endif
615 mtrr_ap_init(); 738 mtrr_ap_init();
616} 739}
617 740
@@ -709,6 +832,89 @@ __setup("clearcpuid=", setup_disablecpuid);
709 832
710cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; 833cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
711 834
835#ifdef CONFIG_X86_64
836struct x8664_pda **_cpu_pda __read_mostly;
837EXPORT_SYMBOL(_cpu_pda);
838
839struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
840
841char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
842
843void __cpuinit pda_init(int cpu)
844{
845 struct x8664_pda *pda = cpu_pda(cpu);
846
847 /* Setup up data that may be needed in __get_free_pages early */
848 loadsegment(fs, 0);
849 loadsegment(gs, 0);
850 /* Memory clobbers used to order PDA accessed */
851 mb();
852 wrmsrl(MSR_GS_BASE, pda);
853 mb();
854
855 pda->cpunumber = cpu;
856 pda->irqcount = -1;
857 pda->kernelstack = (unsigned long)stack_thread_info() -
858 PDA_STACKOFFSET + THREAD_SIZE;
859 pda->active_mm = &init_mm;
860 pda->mmu_state = 0;
861
862 if (cpu == 0) {
863 /* others are initialized in smpboot.c */
864 pda->pcurrent = &init_task;
865 pda->irqstackptr = boot_cpu_stack;
866 pda->irqstackptr += IRQSTACKSIZE - 64;
867 } else {
868 if (!pda->irqstackptr) {
869 pda->irqstackptr = (char *)
870 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
871 if (!pda->irqstackptr)
872 panic("cannot allocate irqstack for cpu %d",
873 cpu);
874 pda->irqstackptr += IRQSTACKSIZE - 64;
875 }
876
877 if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
878 pda->nodenumber = cpu_to_node(cpu);
879 }
880}
881
882char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
883 DEBUG_STKSZ] __page_aligned_bss;
884
885extern asmlinkage void ignore_sysret(void);
886
887/* May not be marked __init: used by software suspend */
888void syscall_init(void)
889{
890 /*
891 * LSTAR and STAR live in a bit strange symbiosis.
892 * They both write to the same internal register. STAR allows to
893 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
894 */
895 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
896 wrmsrl(MSR_LSTAR, system_call);
897 wrmsrl(MSR_CSTAR, ignore_sysret);
898
899#ifdef CONFIG_IA32_EMULATION
900 syscall32_cpu_init();
901#endif
902
903 /* Flags to clear on syscall */
904 wrmsrl(MSR_SYSCALL_MASK,
905 X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
906}
907
908unsigned long kernel_eflags;
909
910/*
911 * Copies of the original ist values from the tss are only accessed during
912 * debugging, no special alignment required.
913 */
914DEFINE_PER_CPU(struct orig_ist, orig_ist);
915
916#else
917
712/* Make sure %fs is initialized properly in idle threads */ 918/* Make sure %fs is initialized properly in idle threads */
713struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) 919struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
714{ 920{
@@ -716,13 +922,136 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
716 regs->fs = __KERNEL_PERCPU; 922 regs->fs = __KERNEL_PERCPU;
717 return regs; 923 return regs;
718} 924}
925#endif
719 926
720/* 927/*
721 * cpu_init() initializes state that is per-CPU. Some data is already 928 * cpu_init() initializes state that is per-CPU. Some data is already
722 * initialized (naturally) in the bootstrap process, such as the GDT 929 * initialized (naturally) in the bootstrap process, such as the GDT
723 * and IDT. We reload them nevertheless, this function acts as a 930 * and IDT. We reload them nevertheless, this function acts as a
724 * 'CPU state barrier', nothing should get across. 931 * 'CPU state barrier', nothing should get across.
932 * A lot of state is already set up in PDA init for 64 bit
725 */ 933 */
934#ifdef CONFIG_X86_64
935void __cpuinit cpu_init(void)
936{
937 int cpu = stack_smp_processor_id();
938 struct tss_struct *t = &per_cpu(init_tss, cpu);
939 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
940 unsigned long v;
941 char *estacks = NULL;
942 struct task_struct *me;
943 int i;
944
945 /* CPU 0 is initialised in head64.c */
946 if (cpu != 0)
947 pda_init(cpu);
948 else
949 estacks = boot_exception_stacks;
950
951 me = current;
952
953 if (cpu_test_and_set(cpu, cpu_initialized))
954 panic("CPU#%d already initialized!\n", cpu);
955
956 printk(KERN_INFO "Initializing CPU#%d\n", cpu);
957
958 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
959
960 /*
961 * Initialize the per-CPU GDT with the boot GDT,
962 * and set up the GDT descriptor:
963 */
964
965 switch_to_new_gdt();
966 load_idt((const struct desc_ptr *)&idt_descr);
967
968 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
969 syscall_init();
970
971 wrmsrl(MSR_FS_BASE, 0);
972 wrmsrl(MSR_KERNEL_GS_BASE, 0);
973 barrier();
974
975 check_efer();
976 if (cpu != 0 && x2apic)
977 enable_x2apic();
978
979 /*
980 * set up and load the per-CPU TSS
981 */
982 if (!orig_ist->ist[0]) {
983 static const unsigned int order[N_EXCEPTION_STACKS] = {
984 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
985 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
986 };
987 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
988 if (cpu) {
989 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
990 if (!estacks)
991 panic("Cannot allocate exception "
992 "stack %ld %d\n", v, cpu);
993 }
994 estacks += PAGE_SIZE << order[v];
995 orig_ist->ist[v] = t->x86_tss.ist[v] =
996 (unsigned long)estacks;
997 }
998 }
999
1000 t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
1001 /*
1002 * <= is required because the CPU will access up to
1003 * 8 bits beyond the end of the IO permission bitmap.
1004 */
1005 for (i = 0; i <= IO_BITMAP_LONGS; i++)
1006 t->io_bitmap[i] = ~0UL;
1007
1008 atomic_inc(&init_mm.mm_count);
1009 me->active_mm = &init_mm;
1010 if (me->mm)
1011 BUG();
1012 enter_lazy_tlb(&init_mm, me);
1013
1014 load_sp0(t, &current->thread);
1015 set_tss_desc(cpu, t);
1016 load_TR_desc();
1017 load_LDT(&init_mm.context);
1018
1019#ifdef CONFIG_KGDB
1020 /*
1021 * If the kgdb is connected no debug regs should be altered. This
1022 * is only applicable when KGDB and a KGDB I/O module are built
1023 * into the kernel and you are using early debugging with
1024 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
1025 */
1026 if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
1027 arch_kgdb_ops.correct_hw_break();
1028 else {
1029#endif
1030 /*
1031 * Clear all 6 debug registers:
1032 */
1033
1034 set_debugreg(0UL, 0);
1035 set_debugreg(0UL, 1);
1036 set_debugreg(0UL, 2);
1037 set_debugreg(0UL, 3);
1038 set_debugreg(0UL, 6);
1039 set_debugreg(0UL, 7);
1040#ifdef CONFIG_KGDB
1041 /* If the kgdb is connected no debug regs should be altered. */
1042 }
1043#endif
1044
1045 fpu_init();
1046
1047 raw_local_save_flags(kernel_eflags);
1048
1049 if (is_uv_system())
1050 uv_cpu_init();
1051}
1052
1053#else
1054
726void __cpuinit cpu_init(void) 1055void __cpuinit cpu_init(void)
727{ 1056{
728 int cpu = smp_processor_id(); 1057 int cpu = smp_processor_id();
@@ -803,3 +1132,5 @@ void __cpuinit cpu_uninit(void)
803 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; 1132 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
804} 1133}
805#endif 1134#endif
1135
1136#endif
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
deleted file mode 100644
index bcb48ce05d23..000000000000
--- a/arch/x86/kernel/cpu/common_64.c
+++ /dev/null
@@ -1,816 +0,0 @@
1#include <linux/init.h>
2#include <linux/kernel.h>
3#include <linux/sched.h>
4#include <linux/string.h>
5#include <linux/bootmem.h>
6#include <linux/bitops.h>
7#include <linux/module.h>
8#include <linux/kgdb.h>
9#include <linux/topology.h>
10#include <linux/delay.h>
11#include <linux/smp.h>
12#include <linux/percpu.h>
13#include <asm/i387.h>
14#include <asm/msr.h>
15#include <asm/io.h>
16#include <asm/linkage.h>
17#include <asm/mmu_context.h>
18#include <asm/mtrr.h>
19#include <asm/mce.h>
20#include <asm/pat.h>
21#include <asm/asm.h>
22#include <asm/numa.h>
23#ifdef CONFIG_X86_LOCAL_APIC
24#include <asm/mpspec.h>
25#include <asm/apic.h>
26#include <mach_apic.h>
27#endif
28#include <asm/pda.h>
29#include <asm/pgtable.h>
30#include <asm/processor.h>
31#include <asm/desc.h>
32#include <asm/atomic.h>
33#include <asm/proto.h>
34#include <asm/sections.h>
35#include <asm/setup.h>
36#include <asm/genapic.h>
37
38#include "cpu.h"
39
40static struct cpu_dev *this_cpu __cpuinitdata;
41
42/* We need valid kernel segments for data and code in long mode too
43 * IRET will check the segment types kkeil 2000/10/28
44 * Also sysret mandates a special GDT layout
45 */
46/* The TLS descriptors are currently at a different place compared to i386.
47 Hopefully nobody expects them at a fixed place (Wine?) */
48DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
49 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
50 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
51 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
52 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
53 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
54 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
55} };
56EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
57
58__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
59
60/* Current gdt points %fs at the "master" per-cpu area: after this,
61 * it's on the real one. */
62void switch_to_new_gdt(void)
63{
64 struct desc_ptr gdt_descr;
65
66 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
67 gdt_descr.size = GDT_SIZE - 1;
68 load_gdt(&gdt_descr);
69}
70
71static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
72
73static void __cpuinit default_init(struct cpuinfo_x86 *c)
74{
75 display_cacheinfo(c);
76}
77
78static struct cpu_dev __cpuinitdata default_cpu = {
79 .c_init = default_init,
80 .c_vendor = "Unknown",
81 .c_x86_vendor = X86_VENDOR_UNKNOWN,
82};
83
84int __cpuinit get_model_name(struct cpuinfo_x86 *c)
85{
86 unsigned int *v;
87 char *p, *q;
88
89 if (c->extended_cpuid_level < 0x80000004)
90 return 0;
91
92 v = (unsigned int *) c->x86_model_id;
93 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
94 cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
95 cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
96 c->x86_model_id[48] = 0;
97
98 /* Intel chips right-justify this string for some dumb reason;
99 undo that brain damage */
100 p = q = &c->x86_model_id[0];
101 while (*p == ' ')
102 p++;
103 if (p != q) {
104 while (*p)
105 *q++ = *p++;
106 while (q <= &c->x86_model_id[48])
107 *q++ = '\0'; /* Zero-pad the rest */
108 }
109
110 return 1;
111}
112
113
114void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
115{
116 unsigned int n, dummy, ebx, ecx, edx, l2size;
117
118 n = c->extended_cpuid_level;
119
120 if (n >= 0x80000005) {
121 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
122 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
123 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
124 c->x86_cache_size = (ecx>>24) + (edx>>24);
125 /* On K8 L1 TLB is inclusive, so don't count it */
126 c->x86_tlbsize = 0;
127 }
128
129 if (n < 0x80000006) /* Some chips just has a large L1. */
130 return;
131
132 cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
133 l2size = ecx >> 16;
134 c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
135
136 c->x86_cache_size = l2size;
137
138 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
139 l2size, ecx & 0xFF);
140}
141
142void __cpuinit detect_ht(struct cpuinfo_x86 *c)
143{
144#ifdef CONFIG_SMP
145 u32 eax, ebx, ecx, edx;
146 int index_msb, core_bits;
147
148 if (!cpu_has(c, X86_FEATURE_HT))
149 return;
150 if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
151 goto out;
152
153 if (cpu_has(c, X86_FEATURE_XTOPOLOGY))
154 return;
155
156 cpuid(1, &eax, &ebx, &ecx, &edx);
157
158 smp_num_siblings = (ebx & 0xff0000) >> 16;
159
160 if (smp_num_siblings == 1) {
161 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
162 } else if (smp_num_siblings > 1) {
163
164 if (smp_num_siblings > NR_CPUS) {
165 printk(KERN_WARNING "CPU: Unsupported number of siblings %d",
166 smp_num_siblings);
167 smp_num_siblings = 1;
168 return;
169 }
170
171 index_msb = get_count_order(smp_num_siblings);
172 c->phys_proc_id = phys_pkg_id(index_msb);
173
174 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
175
176 index_msb = get_count_order(smp_num_siblings);
177
178 core_bits = get_count_order(c->x86_max_cores);
179
180 c->cpu_core_id = phys_pkg_id(index_msb) &
181 ((1 << core_bits) - 1);
182 }
183
184out:
185 if ((c->x86_max_cores * smp_num_siblings) > 1) {
186 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
187 c->phys_proc_id);
188 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
189 c->cpu_core_id);
190 }
191#endif
192}
193
194static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
195{
196 char *v = c->x86_vendor_id;
197 int i;
198 static int printed;
199
200 for (i = 0; i < X86_VENDOR_NUM; i++) {
201 if (!cpu_devs[i])
202 break;
203
204 if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
205 (cpu_devs[i]->c_ident[1] &&
206 !strcmp(v, cpu_devs[i]->c_ident[1]))) {
207 this_cpu = cpu_devs[i];
208 c->x86_vendor = this_cpu->c_x86_vendor;
209 return;
210 }
211 }
212
213 if (!printed) {
214 printed++;
215 printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
216 printk(KERN_ERR "CPU: Your system may be unstable.\n");
217 }
218
219 c->x86_vendor = X86_VENDOR_UNKNOWN;
220 this_cpu = &default_cpu;
221}
222
223void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
224{
225 /* Get vendor name */
226 cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
227 (unsigned int *)&c->x86_vendor_id[0],
228 (unsigned int *)&c->x86_vendor_id[8],
229 (unsigned int *)&c->x86_vendor_id[4]);
230
231 c->x86 = 4;
232 /* Intel-defined flags: level 0x00000001 */
233 if (c->cpuid_level >= 0x00000001) {
234 u32 junk, tfms, cap0, misc;
235 cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
236 c->x86 = (tfms >> 8) & 0xf;
237 c->x86_model = (tfms >> 4) & 0xf;
238 c->x86_mask = tfms & 0xf;
239 if (c->x86 == 0xf)
240 c->x86 += (tfms >> 20) & 0xff;
241 if (c->x86 >= 0x6)
242 c->x86_model += ((tfms >> 16) & 0xf) << 4;
243 if (cap0 & (1<<19)) {
244 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
245 c->x86_cache_alignment = c->x86_clflush_size;
246 }
247 }
248}
249
250
251static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
252{
253 u32 tfms, xlvl;
254 u32 ebx;
255
256 /* Intel-defined flags: level 0x00000001 */
257 if (c->cpuid_level >= 0x00000001) {
258 u32 capability, excap;
259
260 cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
261 c->x86_capability[0] = capability;
262 c->x86_capability[4] = excap;
263 }
264
265 /* AMD-defined flags: level 0x80000001 */
266 xlvl = cpuid_eax(0x80000000);
267 c->extended_cpuid_level = xlvl;
268 if ((xlvl & 0xffff0000) == 0x80000000) {
269 if (xlvl >= 0x80000001) {
270 c->x86_capability[1] = cpuid_edx(0x80000001);
271 c->x86_capability[6] = cpuid_ecx(0x80000001);
272 }
273 }
274
275 /* Transmeta-defined flags: level 0x80860001 */
276 xlvl = cpuid_eax(0x80860000);
277 if ((xlvl & 0xffff0000) == 0x80860000) {
278 /* Don't set x86_cpuid_level here for now to not confuse. */
279 if (xlvl >= 0x80860001)
280 c->x86_capability[2] = cpuid_edx(0x80860001);
281 }
282
283 if (c->extended_cpuid_level >= 0x80000007)
284 c->x86_power = cpuid_edx(0x80000007);
285
286 if (c->extended_cpuid_level >= 0x80000008) {
287 u32 eax = cpuid_eax(0x80000008);
288
289 c->x86_virt_bits = (eax >> 8) & 0xff;
290 c->x86_phys_bits = eax & 0xff;
291 }
292}
293
294/* Do some early cpuid on the boot CPU to get some parameter that are
295 needed before check_bugs. Everything advanced is in identify_cpu
296 below. */
297static void __init early_identify_cpu(struct cpuinfo_x86 *c)
298{
299
300 c->x86_clflush_size = 64;
301 c->x86_cache_alignment = c->x86_clflush_size;
302
303 memset(&c->x86_capability, 0, sizeof c->x86_capability);
304
305 c->extended_cpuid_level = 0;
306
307 cpu_detect(c);
308
309 get_cpu_vendor(c);
310
311 get_cpu_cap(c);
312
313 if (this_cpu->c_early_init)
314 this_cpu->c_early_init(c);
315
316 validate_pat_support(c);
317}
318
319void __init early_cpu_init(void)
320{
321 struct cpu_dev **cdev;
322 int count = 0;
323
324 printk("KERNEL supported cpus:\n");
325 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
326 struct cpu_dev *cpudev = *cdev;
327 unsigned int j;
328
329 if (count >= X86_VENDOR_NUM)
330 break;
331 cpu_devs[count] = cpudev;
332 count++;
333
334 for (j = 0; j < 2; j++) {
335 if (!cpudev->c_ident[j])
336 continue;
337 printk(" %s %s\n", cpudev->c_vendor,
338 cpudev->c_ident[j]);
339 }
340 }
341
342 early_identify_cpu(&boot_cpu_data);
343}
344
345/*
346 * The NOPL instruction is supposed to exist on all CPUs with
347 * family >= 6, unfortunately, that's not true in practice because
348 * of early VIA chips and (more importantly) broken virtualizers that
349 * are not easy to detect. Hence, probe for it based on first
350 * principles.
351 *
352 * Note: no 64-bit chip is known to lack these, but put the code here
353 * for consistency with 32 bits, and to make it utterly trivial to
354 * diagnose the problem should it ever surface.
355 */
356static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
357{
358 const u32 nopl_signature = 0x888c53b1; /* Random number */
359 u32 has_nopl = nopl_signature;
360
361 clear_cpu_cap(c, X86_FEATURE_NOPL);
362 if (c->x86 >= 6) {
363 asm volatile("\n"
364 "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */
365 "2:\n"
366 " .section .fixup,\"ax\"\n"
367 "3: xor %0,%0\n"
368 " jmp 2b\n"
369 " .previous\n"
370 _ASM_EXTABLE(1b,3b)
371 : "+a" (has_nopl));
372
373 if (has_nopl == nopl_signature)
374 set_cpu_cap(c, X86_FEATURE_NOPL);
375 }
376}
377
378static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
379{
380 c->extended_cpuid_level = 0;
381
382 cpu_detect(c);
383
384 get_cpu_vendor(c);
385
386 get_cpu_cap(c);
387
388 c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
389#ifdef CONFIG_SMP
390 c->phys_proc_id = c->initial_apicid;
391#endif
392
393 if (c->extended_cpuid_level >= 0x80000004)
394 get_model_name(c); /* Default name */
395
396 init_scattered_cpuid_features(c);
397 detect_nopl(c);
398}
399
400/*
401 * This does the hard work of actually picking apart the CPU stuff...
402 */
403static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
404{
405 int i;
406
407 c->loops_per_jiffy = loops_per_jiffy;
408 c->x86_cache_size = -1;
409 c->x86_vendor = X86_VENDOR_UNKNOWN;
410 c->x86_model = c->x86_mask = 0; /* So far unknown... */
411 c->x86_vendor_id[0] = '\0'; /* Unset */
412 c->x86_model_id[0] = '\0'; /* Unset */
413 c->x86_max_cores = 1;
414 c->x86_coreid_bits = 0;
415 c->x86_clflush_size = 64;
416 c->x86_cache_alignment = c->x86_clflush_size;
417 memset(&c->x86_capability, 0, sizeof c->x86_capability);
418
419 generic_identify(c);
420
421 c->apicid = phys_pkg_id(0);
422
423 /*
424 * Vendor-specific initialization. In this section we
425 * canonicalize the feature flags, meaning if there are
426 * features a certain CPU supports which CPUID doesn't
427 * tell us, CPUID claiming incorrect flags, or other bugs,
428 * we handle them here.
429 *
430 * At the end of this section, c->x86_capability better
431 * indicate the features this CPU genuinely supports!
432 */
433 if (this_cpu->c_init)
434 this_cpu->c_init(c);
435
436 detect_ht(c);
437
438 /*
439 * On SMP, boot_cpu_data holds the common feature set between
440 * all CPUs; so make sure that we indicate which features are
441 * common between the CPUs. The first time this routine gets
442 * executed, c == &boot_cpu_data.
443 */
444 if (c != &boot_cpu_data) {
445 /* AND the already accumulated flags with these */
446 for (i = 0; i < NCAPINTS; i++)
447 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
448 }
449
450 /* Clear all flags overriden by options */
451 for (i = 0; i < NCAPINTS; i++)
452 c->x86_capability[i] &= ~cleared_cpu_caps[i];
453
454#ifdef CONFIG_X86_MCE
455 mcheck_init(c);
456#endif
457 select_idle_routine(c);
458
459#ifdef CONFIG_NUMA
460 numa_add_cpu(smp_processor_id());
461#endif
462
463}
464
465void __init identify_boot_cpu(void)
466{
467 identify_cpu(&boot_cpu_data);
468}
469
470void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
471{
472 BUG_ON(c == &boot_cpu_data);
473 identify_cpu(c);
474 mtrr_ap_init();
475}
476
477struct msr_range {
478 unsigned min;
479 unsigned max;
480};
481
482static struct msr_range msr_range_array[] __cpuinitdata = {
483 { 0x00000000, 0x00000418},
484 { 0xc0000000, 0xc000040b},
485 { 0xc0010000, 0xc0010142},
486 { 0xc0011000, 0xc001103b},
487};
488
489static void __cpuinit print_cpu_msr(void)
490{
491 unsigned index;
492 u64 val;
493 int i;
494 unsigned index_min, index_max;
495
496 for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
497 index_min = msr_range_array[i].min;
498 index_max = msr_range_array[i].max;
499 for (index = index_min; index < index_max; index++) {
500 if (rdmsrl_amd_safe(index, &val))
501 continue;
502 printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
503 }
504 }
505}
506
507static int show_msr __cpuinitdata;
508static __init int setup_show_msr(char *arg)
509{
510 int num;
511
512 get_option(&arg, &num);
513
514 if (num > 0)
515 show_msr = num;
516 return 1;
517}
518__setup("show_msr=", setup_show_msr);
519
520static __init int setup_noclflush(char *arg)
521{
522 setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
523 return 1;
524}
525__setup("noclflush", setup_noclflush);
526
527void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
528{
529 if (c->x86_model_id[0])
530 printk(KERN_CONT "%s", c->x86_model_id);
531
532 if (c->x86_mask || c->cpuid_level >= 0)
533 printk(KERN_CONT " stepping %02x\n", c->x86_mask);
534 else
535 printk(KERN_CONT "\n");
536
537#ifdef CONFIG_SMP
538 if (c->cpu_index < show_msr)
539 print_cpu_msr();
540#else
541 if (show_msr)
542 print_cpu_msr();
543#endif
544}
545
546static __init int setup_disablecpuid(char *arg)
547{
548 int bit;
549 if (get_option(&arg, &bit) && bit < NCAPINTS*32)
550 setup_clear_cpu_cap(bit);
551 else
552 return 0;
553 return 1;
554}
555__setup("clearcpuid=", setup_disablecpuid);
556
557cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
558
559struct x8664_pda **_cpu_pda __read_mostly;
560EXPORT_SYMBOL(_cpu_pda);
561
562struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
563
564char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
565
566unsigned long __supported_pte_mask __read_mostly = ~0UL;
567EXPORT_SYMBOL_GPL(__supported_pte_mask);
568
569static int do_not_nx __cpuinitdata;
570
571/* noexec=on|off
572Control non executable mappings for 64bit processes.
573
574on Enable(default)
575off Disable
576*/
577static int __init nonx_setup(char *str)
578{
579 if (!str)
580 return -EINVAL;
581 if (!strncmp(str, "on", 2)) {
582 __supported_pte_mask |= _PAGE_NX;
583 do_not_nx = 0;
584 } else if (!strncmp(str, "off", 3)) {
585 do_not_nx = 1;
586 __supported_pte_mask &= ~_PAGE_NX;
587 }
588 return 0;
589}
590early_param("noexec", nonx_setup);
591
592int force_personality32;
593
594/* noexec32=on|off
595Control non executable heap for 32bit processes.
596To control the stack too use noexec=off
597
598on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
599off PROT_READ implies PROT_EXEC
600*/
601static int __init nonx32_setup(char *str)
602{
603 if (!strcmp(str, "on"))
604 force_personality32 &= ~READ_IMPLIES_EXEC;
605 else if (!strcmp(str, "off"))
606 force_personality32 |= READ_IMPLIES_EXEC;
607 return 1;
608}
609__setup("noexec32=", nonx32_setup);
610
611void pda_init(int cpu)
612{
613 struct x8664_pda *pda = cpu_pda(cpu);
614
615 /* Setup up data that may be needed in __get_free_pages early */
616 loadsegment(fs, 0);
617 loadsegment(gs, 0);
618 /* Memory clobbers used to order PDA accessed */
619 mb();
620 wrmsrl(MSR_GS_BASE, pda);
621 mb();
622
623 pda->cpunumber = cpu;
624 pda->irqcount = -1;
625 pda->kernelstack = (unsigned long)stack_thread_info() -
626 PDA_STACKOFFSET + THREAD_SIZE;
627 pda->active_mm = &init_mm;
628 pda->mmu_state = 0;
629
630 if (cpu == 0) {
631 /* others are initialized in smpboot.c */
632 pda->pcurrent = &init_task;
633 pda->irqstackptr = boot_cpu_stack;
634 pda->irqstackptr += IRQSTACKSIZE - 64;
635 } else {
636 if (!pda->irqstackptr) {
637 pda->irqstackptr = (char *)
638 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
639 if (!pda->irqstackptr)
640 panic("cannot allocate irqstack for cpu %d",
641 cpu);
642 pda->irqstackptr += IRQSTACKSIZE - 64;
643 }
644
645 if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
646 pda->nodenumber = cpu_to_node(cpu);
647 }
648}
649
650char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
651 DEBUG_STKSZ] __page_aligned_bss;
652
653extern asmlinkage void ignore_sysret(void);
654
655/* May not be marked __init: used by software suspend */
656void syscall_init(void)
657{
658 /*
659 * LSTAR and STAR live in a bit strange symbiosis.
660 * They both write to the same internal register. STAR allows to
661 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
662 */
663 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
664 wrmsrl(MSR_LSTAR, system_call);
665 wrmsrl(MSR_CSTAR, ignore_sysret);
666
667#ifdef CONFIG_IA32_EMULATION
668 syscall32_cpu_init();
669#endif
670
671 /* Flags to clear on syscall */
672 wrmsrl(MSR_SYSCALL_MASK,
673 X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
674}
675
676void __cpuinit check_efer(void)
677{
678 unsigned long efer;
679
680 rdmsrl(MSR_EFER, efer);
681 if (!(efer & EFER_NX) || do_not_nx)
682 __supported_pte_mask &= ~_PAGE_NX;
683}
684
685unsigned long kernel_eflags;
686
687/*
688 * Copies of the original ist values from the tss are only accessed during
689 * debugging, no special alignment required.
690 */
691DEFINE_PER_CPU(struct orig_ist, orig_ist);
692
693/*
694 * cpu_init() initializes state that is per-CPU. Some data is already
695 * initialized (naturally) in the bootstrap process, such as the GDT
696 * and IDT. We reload them nevertheless, this function acts as a
697 * 'CPU state barrier', nothing should get across.
698 * A lot of state is already set up in PDA init.
699 */
700void __cpuinit cpu_init(void)
701{
702 int cpu = stack_smp_processor_id();
703 struct tss_struct *t = &per_cpu(init_tss, cpu);
704 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
705 unsigned long v;
706 char *estacks = NULL;
707 struct task_struct *me;
708 int i;
709
710 /* CPU 0 is initialised in head64.c */
711 if (cpu != 0)
712 pda_init(cpu);
713 else
714 estacks = boot_exception_stacks;
715
716 me = current;
717
718 if (cpu_test_and_set(cpu, cpu_initialized))
719 panic("CPU#%d already initialized!\n", cpu);
720
721 printk(KERN_INFO "Initializing CPU#%d\n", cpu);
722
723 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
724
725 /*
726 * Initialize the per-CPU GDT with the boot GDT,
727 * and set up the GDT descriptor:
728 */
729
730 switch_to_new_gdt();
731 load_idt((const struct desc_ptr *)&idt_descr);
732
733 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
734 syscall_init();
735
736 wrmsrl(MSR_FS_BASE, 0);
737 wrmsrl(MSR_KERNEL_GS_BASE, 0);
738 barrier();
739
740 check_efer();
741 if (cpu != 0 && x2apic)
742 enable_x2apic();
743
744 /*
745 * set up and load the per-CPU TSS
746 */
747 if (!orig_ist->ist[0]) {
748 static const unsigned int order[N_EXCEPTION_STACKS] = {
749 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
750 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
751 };
752 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
753 if (cpu) {
754 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
755 if (!estacks)
756 panic("Cannot allocate exception "
757 "stack %ld %d\n", v, cpu);
758 }
759 estacks += PAGE_SIZE << order[v];
760 orig_ist->ist[v] = t->x86_tss.ist[v] =
761 (unsigned long)estacks;
762 }
763 }
764
765 t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
766 /*
767 * <= is required because the CPU will access up to
768 * 8 bits beyond the end of the IO permission bitmap.
769 */
770 for (i = 0; i <= IO_BITMAP_LONGS; i++)
771 t->io_bitmap[i] = ~0UL;
772
773 atomic_inc(&init_mm.mm_count);
774 me->active_mm = &init_mm;
775 if (me->mm)
776 BUG();
777 enter_lazy_tlb(&init_mm, me);
778
779 load_sp0(t, &current->thread);
780 set_tss_desc(cpu, t);
781 load_TR_desc();
782 load_LDT(&init_mm.context);
783
784#ifdef CONFIG_KGDB
785 /*
786 * If the kgdb is connected no debug regs should be altered. This
787 * is only applicable when KGDB and a KGDB I/O module are built
788 * into the kernel and you are using early debugging with
789 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
790 */
791 if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
792 arch_kgdb_ops.correct_hw_break();
793 else {
794#endif
795 /*
796 * Clear all 6 debug registers:
797 */
798
799 set_debugreg(0UL, 0);
800 set_debugreg(0UL, 1);
801 set_debugreg(0UL, 2);
802 set_debugreg(0UL, 3);
803 set_debugreg(0UL, 6);
804 set_debugreg(0UL, 7);
805#ifdef CONFIG_KGDB
806 /* If the kgdb is connected no debug regs should be altered. */
807 }
808#endif
809
810 fpu_init();
811
812 raw_local_save_flags(kernel_eflags);
813
814 if (is_uv_system())
815 uv_cpu_init();
816}
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 3cc9d92afd8f..de4094a39210 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -31,7 +31,6 @@ struct cpu_dev {
31 31
32extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[]; 32extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[];
33 33
34extern int get_model_name(struct cpuinfo_x86 *c);
35extern void display_cacheinfo(struct cpuinfo_x86 *c); 34extern void display_cacheinfo(struct cpuinfo_x86 *c);
36 35
37#endif 36#endif
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index dd097b835839..c24c4a487b7c 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -256,7 +256,8 @@ static u32 get_cur_val(const cpumask_t *mask)
256 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and 256 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
257 * no meaning should be associated with absolute values of these MSRs. 257 * no meaning should be associated with absolute values of these MSRs.
258 */ 258 */
259static unsigned int get_measured_perf(unsigned int cpu) 259static unsigned int get_measured_perf(struct cpufreq_policy *policy,
260 unsigned int cpu)
260{ 261{
261 union { 262 union {
262 struct { 263 struct {
@@ -326,7 +327,7 @@ static unsigned int get_measured_perf(unsigned int cpu)
326 327
327#endif 328#endif
328 329
329 retval = per_cpu(drv_data, cpu)->max_freq * perf_percent / 100; 330 retval = per_cpu(drv_data, policy->cpu)->max_freq * perf_percent / 100;
330 331
331 put_cpu(); 332 put_cpu();
332 set_cpus_allowed_ptr(current, &saved_mask); 333 set_cpus_allowed_ptr(current, &saved_mask);
@@ -785,7 +786,11 @@ static int __init acpi_cpufreq_init(void)
785 if (ret) 786 if (ret)
786 return ret; 787 return ret;
787 788
788 return cpufreq_register_driver(&acpi_cpufreq_driver); 789 ret = cpufreq_register_driver(&acpi_cpufreq_driver);
790 if (ret)
791 free_percpu(acpi_perf_data);
792
793 return ret;
789} 794}
790 795
791static void __exit acpi_cpufreq_exit(void) 796static void __exit acpi_cpufreq_exit(void)
@@ -795,8 +800,6 @@ static void __exit acpi_cpufreq_exit(void)
795 cpufreq_unregister_driver(&acpi_cpufreq_driver); 800 cpufreq_unregister_driver(&acpi_cpufreq_driver);
796 801
797 free_percpu(acpi_perf_data); 802 free_percpu(acpi_perf_data);
798
799 return;
800} 803}
801 804
802module_param(acpi_pstate_strict, uint, 0644); 805module_param(acpi_pstate_strict, uint, 0644);
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
index e4a4bf870e94..fe613c93b366 100644
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
@@ -25,8 +25,8 @@
25#include <linux/cpufreq.h> 25#include <linux/cpufreq.h>
26 26
27#include <asm/msr.h> 27#include <asm/msr.h>
28#include <asm/timex.h> 28#include <linux/timex.h>
29#include <asm/io.h> 29#include <linux/io.h>
30 30
31#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */ 31#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */
32#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */ 32#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */
@@ -82,7 +82,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
82 u8 clockspeed_reg; /* Clock Speed Register */ 82 u8 clockspeed_reg; /* Clock Speed Register */
83 83
84 local_irq_disable(); 84 local_irq_disable();
85 outb_p(0x80,REG_CSCIR); 85 outb_p(0x80, REG_CSCIR);
86 clockspeed_reg = inb_p(REG_CSCDR); 86 clockspeed_reg = inb_p(REG_CSCDR);
87 local_irq_enable(); 87 local_irq_enable();
88 88
@@ -98,10 +98,10 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
98 } 98 }
99 99
100 /* 33 MHz is not 32 MHz... */ 100 /* 33 MHz is not 32 MHz... */
101 if ((clockspeed_reg & 0xE0)==0xA0) 101 if ((clockspeed_reg & 0xE0) == 0xA0)
102 return 33000; 102 return 33000;
103 103
104 return ((1<<((clockspeed_reg & 0xE0) >> 5)) * 1000); 104 return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000;
105} 105}
106 106
107 107
@@ -117,7 +117,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
117 * There is no return value. 117 * There is no return value.
118 */ 118 */
119 119
120static void elanfreq_set_cpu_state (unsigned int state) 120static void elanfreq_set_cpu_state(unsigned int state)
121{ 121{
122 struct cpufreq_freqs freqs; 122 struct cpufreq_freqs freqs;
123 123
@@ -144,20 +144,20 @@ static void elanfreq_set_cpu_state (unsigned int state)
144 */ 144 */
145 145
146 local_irq_disable(); 146 local_irq_disable();
147 outb_p(0x40,REG_CSCIR); /* Disable hyperspeed mode */ 147 outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */
148 outb_p(0x00,REG_CSCDR); 148 outb_p(0x00, REG_CSCDR);
149 local_irq_enable(); /* wait till internal pipelines and */ 149 local_irq_enable(); /* wait till internal pipelines and */
150 udelay(1000); /* buffers have cleaned up */ 150 udelay(1000); /* buffers have cleaned up */
151 151
152 local_irq_disable(); 152 local_irq_disable();
153 153
154 /* now, set the CPU clock speed register (0x80) */ 154 /* now, set the CPU clock speed register (0x80) */
155 outb_p(0x80,REG_CSCIR); 155 outb_p(0x80, REG_CSCIR);
156 outb_p(elan_multiplier[state].val80h,REG_CSCDR); 156 outb_p(elan_multiplier[state].val80h, REG_CSCDR);
157 157
158 /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */ 158 /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */
159 outb_p(0x40,REG_CSCIR); 159 outb_p(0x40, REG_CSCIR);
160 outb_p(elan_multiplier[state].val40h,REG_CSCDR); 160 outb_p(elan_multiplier[state].val40h, REG_CSCDR);
161 udelay(10000); 161 udelay(10000);
162 local_irq_enable(); 162 local_irq_enable();
163 163
@@ -173,12 +173,12 @@ static void elanfreq_set_cpu_state (unsigned int state)
173 * for the hardware supported by the driver. 173 * for the hardware supported by the driver.
174 */ 174 */
175 175
176static int elanfreq_verify (struct cpufreq_policy *policy) 176static int elanfreq_verify(struct cpufreq_policy *policy)
177{ 177{
178 return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]); 178 return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]);
179} 179}
180 180
181static int elanfreq_target (struct cpufreq_policy *policy, 181static int elanfreq_target(struct cpufreq_policy *policy,
182 unsigned int target_freq, 182 unsigned int target_freq,
183 unsigned int relation) 183 unsigned int relation)
184{ 184{
@@ -205,7 +205,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
205 205
206 /* capability check */ 206 /* capability check */
207 if ((c->x86_vendor != X86_VENDOR_AMD) || 207 if ((c->x86_vendor != X86_VENDOR_AMD) ||
208 (c->x86 != 4) || (c->x86_model!=10)) 208 (c->x86 != 4) || (c->x86_model != 10))
209 return -ENODEV; 209 return -ENODEV;
210 210
211 /* max freq */ 211 /* max freq */
@@ -213,7 +213,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
213 max_freq = elanfreq_get_cpu_frequency(0); 213 max_freq = elanfreq_get_cpu_frequency(0);
214 214
215 /* table init */ 215 /* table init */
216 for (i=0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) { 216 for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) {
217 if (elanfreq_table[i].frequency > max_freq) 217 if (elanfreq_table[i].frequency > max_freq)
218 elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID; 218 elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID;
219 } 219 }
@@ -224,7 +224,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
224 224
225 result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table); 225 result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table);
226 if (result) 226 if (result)
227 return (result); 227 return result;
228 228
229 cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu); 229 cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu);
230 return 0; 230 return 0;
@@ -260,7 +260,7 @@ __setup("elanfreq=", elanfreq_setup);
260#endif 260#endif
261 261
262 262
263static struct freq_attr* elanfreq_attr[] = { 263static struct freq_attr *elanfreq_attr[] = {
264 &cpufreq_freq_attr_scaling_available_freqs, 264 &cpufreq_freq_attr_scaling_available_freqs,
265 NULL, 265 NULL,
266}; 266};
@@ -284,9 +284,9 @@ static int __init elanfreq_init(void)
284 284
285 /* Test if we have the right hardware */ 285 /* Test if we have the right hardware */
286 if ((c->x86_vendor != X86_VENDOR_AMD) || 286 if ((c->x86_vendor != X86_VENDOR_AMD) ||
287 (c->x86 != 4) || (c->x86_model!=10)) { 287 (c->x86 != 4) || (c->x86_model != 10)) {
288 printk(KERN_INFO "elanfreq: error: no Elan processor found!\n"); 288 printk(KERN_INFO "elanfreq: error: no Elan processor found!\n");
289 return -ENODEV; 289 return -ENODEV;
290 } 290 }
291 return cpufreq_register_driver(&elanfreq_driver); 291 return cpufreq_register_driver(&elanfreq_driver);
292} 292}
@@ -298,7 +298,7 @@ static void __exit elanfreq_exit(void)
298} 298}
299 299
300 300
301module_param (max_freq, int, 0444); 301module_param(max_freq, int, 0444);
302 302
303MODULE_LICENSE("GPL"); 303MODULE_LICENSE("GPL");
304MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, Sven Geggus <sven@geggus.net>"); 304MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, Sven Geggus <sven@geggus.net>");
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index f1685fb91fbd..b8e05ee4f736 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -171,7 +171,7 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
171 } 171 }
172 172
173 if (c->x86 != 0xF) { 173 if (c->x86 != 0xF) {
174 printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@lists.linux.org.uk>\n"); 174 printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@vger.kernel.org>\n");
175 return 0; 175 return 0;
176 } 176 }
177 177
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
index eb9b62b0830c..b5ced806a316 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -15,12 +15,11 @@
15#include <linux/slab.h> 15#include <linux/slab.h>
16 16
17#include <asm/msr.h> 17#include <asm/msr.h>
18#include <asm/timex.h> 18#include <linux/timex.h>
19#include <asm/io.h> 19#include <linux/io.h>
20 20
21 21#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long
22#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long 22 as it is unused */
23 as it is unused */
24 23
25static unsigned int busfreq; /* FSB, in 10 kHz */ 24static unsigned int busfreq; /* FSB, in 10 kHz */
26static unsigned int max_multiplier; 25static unsigned int max_multiplier;
@@ -53,7 +52,7 @@ static int powernow_k6_get_cpu_multiplier(void)
53 52
54 msrval = POWERNOW_IOPORT + 0x1; 53 msrval = POWERNOW_IOPORT + 0x1;
55 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ 54 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
56 invalue=inl(POWERNOW_IOPORT + 0x8); 55 invalue = inl(POWERNOW_IOPORT + 0x8);
57 msrval = POWERNOW_IOPORT + 0x0; 56 msrval = POWERNOW_IOPORT + 0x0;
58 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ 57 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
59 58
@@ -67,9 +66,9 @@ static int powernow_k6_get_cpu_multiplier(void)
67 * 66 *
68 * Tries to change the PowerNow! multiplier 67 * Tries to change the PowerNow! multiplier
69 */ 68 */
70static void powernow_k6_set_state (unsigned int best_i) 69static void powernow_k6_set_state(unsigned int best_i)
71{ 70{
72 unsigned long outvalue=0, invalue=0; 71 unsigned long outvalue = 0, invalue = 0;
73 unsigned long msrval; 72 unsigned long msrval;
74 struct cpufreq_freqs freqs; 73 struct cpufreq_freqs freqs;
75 74
@@ -90,10 +89,10 @@ static void powernow_k6_set_state (unsigned int best_i)
90 89
91 msrval = POWERNOW_IOPORT + 0x1; 90 msrval = POWERNOW_IOPORT + 0x1;
92 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ 91 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
93 invalue=inl(POWERNOW_IOPORT + 0x8); 92 invalue = inl(POWERNOW_IOPORT + 0x8);
94 invalue = invalue & 0xf; 93 invalue = invalue & 0xf;
95 outvalue = outvalue | invalue; 94 outvalue = outvalue | invalue;
96 outl(outvalue ,(POWERNOW_IOPORT + 0x8)); 95 outl(outvalue , (POWERNOW_IOPORT + 0x8));
97 msrval = POWERNOW_IOPORT + 0x0; 96 msrval = POWERNOW_IOPORT + 0x0;
98 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ 97 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
99 98
@@ -124,7 +123,7 @@ static int powernow_k6_verify(struct cpufreq_policy *policy)
124 * 123 *
125 * sets a new CPUFreq policy 124 * sets a new CPUFreq policy
126 */ 125 */
127static int powernow_k6_target (struct cpufreq_policy *policy, 126static int powernow_k6_target(struct cpufreq_policy *policy,
128 unsigned int target_freq, 127 unsigned int target_freq,
129 unsigned int relation) 128 unsigned int relation)
130{ 129{
@@ -152,7 +151,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
152 busfreq = cpu_khz / max_multiplier; 151 busfreq = cpu_khz / max_multiplier;
153 152
154 /* table init */ 153 /* table init */
155 for (i=0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) { 154 for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) {
156 if (clock_ratio[i].index > max_multiplier) 155 if (clock_ratio[i].index > max_multiplier)
157 clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID; 156 clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID;
158 else 157 else
@@ -165,7 +164,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
165 164
166 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); 165 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
167 if (result) 166 if (result)
168 return (result); 167 return result;
169 168
170 cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu); 169 cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu);
171 170
@@ -176,8 +175,8 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
176static int powernow_k6_cpu_exit(struct cpufreq_policy *policy) 175static int powernow_k6_cpu_exit(struct cpufreq_policy *policy)
177{ 176{
178 unsigned int i; 177 unsigned int i;
179 for (i=0; i<8; i++) { 178 for (i = 0; i < 8; i++) {
180 if (i==max_multiplier) 179 if (i == max_multiplier)
181 powernow_k6_set_state(i); 180 powernow_k6_set_state(i);
182 } 181 }
183 cpufreq_frequency_table_put_attr(policy->cpu); 182 cpufreq_frequency_table_put_attr(policy->cpu);
@@ -189,7 +188,7 @@ static unsigned int powernow_k6_get(unsigned int cpu)
189 return busfreq * powernow_k6_get_cpu_multiplier(); 188 return busfreq * powernow_k6_get_cpu_multiplier();
190} 189}
191 190
192static struct freq_attr* powernow_k6_attr[] = { 191static struct freq_attr *powernow_k6_attr[] = {
193 &cpufreq_freq_attr_scaling_available_freqs, 192 &cpufreq_freq_attr_scaling_available_freqs,
194 NULL, 193 NULL,
195}; 194};
@@ -227,7 +226,7 @@ static int __init powernow_k6_init(void)
227 } 226 }
228 227
229 if (cpufreq_register_driver(&powernow_k6_driver)) { 228 if (cpufreq_register_driver(&powernow_k6_driver)) {
230 release_region (POWERNOW_IOPORT, 16); 229 release_region(POWERNOW_IOPORT, 16);
231 return -EINVAL; 230 return -EINVAL;
232 } 231 }
233 232
@@ -243,13 +242,13 @@ static int __init powernow_k6_init(void)
243static void __exit powernow_k6_exit(void) 242static void __exit powernow_k6_exit(void)
244{ 243{
245 cpufreq_unregister_driver(&powernow_k6_driver); 244 cpufreq_unregister_driver(&powernow_k6_driver);
246 release_region (POWERNOW_IOPORT, 16); 245 release_region(POWERNOW_IOPORT, 16);
247} 246}
248 247
249 248
250MODULE_AUTHOR ("Arjan van de Ven <arjanv@redhat.com>, Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>"); 249MODULE_AUTHOR("Arjan van de Ven <arjanv@redhat.com>, Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>");
251MODULE_DESCRIPTION ("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); 250MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
252MODULE_LICENSE ("GPL"); 251MODULE_LICENSE("GPL");
253 252
254module_init(powernow_k6_init); 253module_init(powernow_k6_init);
255module_exit(powernow_k6_exit); 254module_exit(powernow_k6_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index 15e13c01cc36..3b5f06423e77 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -26,7 +26,7 @@
26#include <asm/cpufeature.h> 26#include <asm/cpufeature.h>
27 27
28#define PFX "speedstep-centrino: " 28#define PFX "speedstep-centrino: "
29#define MAINTAINER "cpufreq@lists.linux.org.uk" 29#define MAINTAINER "cpufreq@vger.kernel.org"
30 30
31#define dprintk(msg...) \ 31#define dprintk(msg...) \
32 cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) 32 cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 3f8c7283d816..ffd0f5ed071a 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -301,7 +301,6 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
301 */ 301 */
302 if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f)) 302 if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f))
303 geode_configure(); 303 geode_configure();
304 get_model_name(c); /* get CPU marketing name */
305 return; 304 return;
306 } else { /* MediaGX */ 305 } else { /* MediaGX */
307 Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4'; 306 Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4';
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 959417b8cd64..99468dbd08da 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -15,6 +15,11 @@
15#include <asm/ds.h> 15#include <asm/ds.h>
16#include <asm/bugs.h> 16#include <asm/bugs.h>
17 17
18#ifdef CONFIG_X86_64
19#include <asm/topology.h>
20#include <asm/numa_64.h>
21#endif
22
18#include "cpu.h" 23#include "cpu.h"
19 24
20#ifdef CONFIG_X86_LOCAL_APIC 25#ifdef CONFIG_X86_LOCAL_APIC
@@ -25,14 +30,20 @@
25 30
26static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) 31static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
27{ 32{
28 /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
29 if (c->x86 == 15 && c->x86_cache_alignment == 64)
30 c->x86_cache_alignment = 128;
31 if ((c->x86 == 0xf && c->x86_model >= 0x03) || 33 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
32 (c->x86 == 0x6 && c->x86_model >= 0x0e)) 34 (c->x86 == 0x6 && c->x86_model >= 0x0e))
33 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 35 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
36
37#ifdef CONFIG_X86_64
38 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
39#else
40 /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
41 if (c->x86 == 15 && c->x86_cache_alignment == 64)
42 c->x86_cache_alignment = 128;
43#endif
34} 44}
35 45
46#ifdef CONFIG_X86_32
36/* 47/*
37 * Early probe support logic for ppro memory erratum #50 48 * Early probe support logic for ppro memory erratum #50
38 * 49 *
@@ -52,15 +63,54 @@ int __cpuinit ppro_with_ram_bug(void)
52 return 0; 63 return 0;
53} 64}
54 65
66#ifdef CONFIG_X86_F00F_BUG
67static void __cpuinit trap_init_f00f_bug(void)
68{
69 __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
55 70
56/* 71 /*
57 * P4 Xeon errata 037 workaround. 72 * Update the IDT descriptor and reload the IDT so that
58 * Hardware prefetcher may cause stale data to be loaded into the cache. 73 * it uses the read-only mapped virtual address.
59 */ 74 */
60static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c) 75 idt_descr.address = fix_to_virt(FIX_F00F_IDT);
76 load_idt(&idt_descr);
77}
78#endif
79
80static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
61{ 81{
62 unsigned long lo, hi; 82 unsigned long lo, hi;
63 83
84#ifdef CONFIG_X86_F00F_BUG
85 /*
86 * All current models of Pentium and Pentium with MMX technology CPUs
87 * have the F0 0F bug, which lets nonprivileged users lock up the system.
88 * Note that the workaround only should be initialized once...
89 */
90 c->f00f_bug = 0;
91 if (!paravirt_enabled() && c->x86 == 5) {
92 static int f00f_workaround_enabled;
93
94 c->f00f_bug = 1;
95 if (!f00f_workaround_enabled) {
96 trap_init_f00f_bug();
97 printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
98 f00f_workaround_enabled = 1;
99 }
100 }
101#endif
102
103 /*
104 * SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until
105 * model 3 mask 3
106 */
107 if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633)
108 clear_cpu_cap(c, X86_FEATURE_SEP);
109
110 /*
111 * P4 Xeon errata 037 workaround.
112 * Hardware prefetcher may cause stale data to be loaded into the cache.
113 */
64 if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { 114 if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
65 rdmsr(MSR_IA32_MISC_ENABLE, lo, hi); 115 rdmsr(MSR_IA32_MISC_ENABLE, lo, hi);
66 if ((lo & (1<<9)) == 0) { 116 if ((lo & (1<<9)) == 0) {
@@ -70,13 +120,68 @@ static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c)
70 wrmsr (MSR_IA32_MISC_ENABLE, lo, hi); 120 wrmsr (MSR_IA32_MISC_ENABLE, lo, hi);
71 } 121 }
72 } 122 }
123
124 /*
125 * See if we have a good local APIC by checking for buggy Pentia,
126 * i.e. all B steppings and the C2 stepping of P54C when using their
127 * integrated APIC (see 11AP erratum in "Pentium Processor
128 * Specification Update").
129 */
130 if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
131 (c->x86_mask < 0x6 || c->x86_mask == 0xb))
132 set_cpu_cap(c, X86_FEATURE_11AP);
133
134
135#ifdef CONFIG_X86_INTEL_USERCOPY
136 /*
137 * Set up the preferred alignment for movsl bulk memory moves
138 */
139 switch (c->x86) {
140 case 4: /* 486: untested */
141 break;
142 case 5: /* Old Pentia: untested */
143 break;
144 case 6: /* PII/PIII only like movsl with 8-byte alignment */
145 movsl_mask.mask = 7;
146 break;
147 case 15: /* P4 is OK down to 8-byte alignment */
148 movsl_mask.mask = 7;
149 break;
150 }
151#endif
152
153#ifdef CONFIG_X86_NUMAQ
154 numaq_tsc_disable();
155#endif
156}
157#else
158static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
159{
73} 160}
161#endif
74 162
163static void __cpuinit srat_detect_node(void)
164{
165#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
166 unsigned node;
167 int cpu = smp_processor_id();
168 int apicid = hard_smp_processor_id();
169
170 /* Don't do the funky fallback heuristics the AMD version employs
171 for now. */
172 node = apicid_to_node[apicid];
173 if (node == NUMA_NO_NODE || !node_online(node))
174 node = first_node(node_online_map);
175 numa_set_node(cpu, node);
176
177 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
178#endif
179}
75 180
76/* 181/*
77 * find out the number of processor cores on the die 182 * find out the number of processor cores on the die
78 */ 183 */
79static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c) 184static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
80{ 185{
81 unsigned int eax, ebx, ecx, edx; 186 unsigned int eax, ebx, ecx, edx;
82 187
@@ -91,45 +196,51 @@ static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c)
91 return 1; 196 return 1;
92} 197}
93 198
94#ifdef CONFIG_X86_F00F_BUG 199static void __cpuinit detect_vmx_virtcap(struct cpuinfo_x86 *c)
95static void __cpuinit trap_init_f00f_bug(void)
96{ 200{
97 __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); 201 /* Intel VMX MSR indicated features */
98 202#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
99 /* 203#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
100 * Update the IDT descriptor and reload the IDT so that 204#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
101 * it uses the read-only mapped virtual address. 205#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
102 */ 206#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
103 idt_descr.address = fix_to_virt(FIX_F00F_IDT); 207#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
104 load_idt(&idt_descr); 208
209 u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
210
211 clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
212 clear_cpu_cap(c, X86_FEATURE_VNMI);
213 clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
214 clear_cpu_cap(c, X86_FEATURE_EPT);
215 clear_cpu_cap(c, X86_FEATURE_VPID);
216
217 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
218 msr_ctl = vmx_msr_high | vmx_msr_low;
219 if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
220 set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
221 if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
222 set_cpu_cap(c, X86_FEATURE_VNMI);
223 if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
224 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
225 vmx_msr_low, vmx_msr_high);
226 msr_ctl2 = vmx_msr_high | vmx_msr_low;
227 if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
228 (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
229 set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
230 if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
231 set_cpu_cap(c, X86_FEATURE_EPT);
232 if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
233 set_cpu_cap(c, X86_FEATURE_VPID);
234 }
105} 235}
106#endif
107 236
108static void __cpuinit init_intel(struct cpuinfo_x86 *c) 237static void __cpuinit init_intel(struct cpuinfo_x86 *c)
109{ 238{
110 unsigned int l2 = 0; 239 unsigned int l2 = 0;
111 char *p = NULL;
112 240
113 early_init_intel(c); 241 early_init_intel(c);
114 242
115#ifdef CONFIG_X86_F00F_BUG 243 intel_workarounds(c);
116 /*
117 * All current models of Pentium and Pentium with MMX technology CPUs
118 * have the F0 0F bug, which lets nonprivileged users lock up the system.
119 * Note that the workaround only should be initialized once...
120 */
121 c->f00f_bug = 0;
122 if (!paravirt_enabled() && c->x86 == 5) {
123 static int f00f_workaround_enabled;
124
125 c->f00f_bug = 1;
126 if (!f00f_workaround_enabled) {
127 trap_init_f00f_bug();
128 printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
129 f00f_workaround_enabled = 1;
130 }
131 }
132#endif
133 244
134 l2 = init_intel_cacheinfo(c); 245 l2 = init_intel_cacheinfo(c);
135 if (c->cpuid_level > 9) { 246 if (c->cpuid_level > 9) {
@@ -139,16 +250,32 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
139 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); 250 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
140 } 251 }
141 252
142 /* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */ 253 if (cpu_has_xmm2)
143 if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633) 254 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
144 clear_cpu_cap(c, X86_FEATURE_SEP); 255 if (cpu_has_ds) {
256 unsigned int l1;
257 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
258 if (!(l1 & (1<<11)))
259 set_cpu_cap(c, X86_FEATURE_BTS);
260 if (!(l1 & (1<<12)))
261 set_cpu_cap(c, X86_FEATURE_PEBS);
262 ds_init_intel(c);
263 }
145 264
265#ifdef CONFIG_X86_64
266 if (c->x86 == 15)
267 c->x86_cache_alignment = c->x86_clflush_size * 2;
268 if (c->x86 == 6)
269 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
270#else
146 /* 271 /*
147 * Names for the Pentium II/Celeron processors 272 * Names for the Pentium II/Celeron processors
148 * detectable only by also checking the cache size. 273 * detectable only by also checking the cache size.
149 * Dixon is NOT a Celeron. 274 * Dixon is NOT a Celeron.
150 */ 275 */
151 if (c->x86 == 6) { 276 if (c->x86 == 6) {
277 char *p = NULL;
278
152 switch (c->x86_model) { 279 switch (c->x86_model) {
153 case 5: 280 case 5:
154 if (c->x86_mask == 0) { 281 if (c->x86_mask == 0) {
@@ -171,77 +298,41 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
171 p = "Celeron (Coppermine)"; 298 p = "Celeron (Coppermine)";
172 break; 299 break;
173 } 300 }
301
302 if (p)
303 strcpy(c->x86_model_id, p);
174 } 304 }
175 305
176 if (p) 306 if (c->x86 == 15)
177 strcpy(c->x86_model_id, p); 307 set_cpu_cap(c, X86_FEATURE_P4);
308 if (c->x86 == 6)
309 set_cpu_cap(c, X86_FEATURE_P3);
178 310
179 detect_extended_topology(c); 311 if (cpu_has_bts)
312 ptrace_bts_init_intel(c);
180 313
314#endif
315
316 detect_extended_topology(c);
181 if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { 317 if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
182 /* 318 /*
183 * let's use the legacy cpuid vector 0x1 and 0x4 for topology 319 * let's use the legacy cpuid vector 0x1 and 0x4 for topology
184 * detection. 320 * detection.
185 */ 321 */
186 c->x86_max_cores = num_cpu_cores(c); 322 c->x86_max_cores = intel_num_cpu_cores(c);
323#ifdef CONFIG_X86_32
187 detect_ht(c); 324 detect_ht(c);
188 }
189
190 /* Work around errata */
191 Intel_errata_workarounds(c);
192
193#ifdef CONFIG_X86_INTEL_USERCOPY
194 /*
195 * Set up the preferred alignment for movsl bulk memory moves
196 */
197 switch (c->x86) {
198 case 4: /* 486: untested */
199 break;
200 case 5: /* Old Pentia: untested */
201 break;
202 case 6: /* PII/PIII only like movsl with 8-byte alignment */
203 movsl_mask.mask = 7;
204 break;
205 case 15: /* P4 is OK down to 8-byte alignment */
206 movsl_mask.mask = 7;
207 break;
208 }
209#endif 325#endif
210
211 if (cpu_has_xmm2)
212 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
213 if (c->x86 == 15) {
214 set_cpu_cap(c, X86_FEATURE_P4);
215 } 326 }
216 if (c->x86 == 6)
217 set_cpu_cap(c, X86_FEATURE_P3);
218 if (cpu_has_ds) {
219 unsigned int l1;
220 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
221 if (!(l1 & (1<<11)))
222 set_cpu_cap(c, X86_FEATURE_BTS);
223 if (!(l1 & (1<<12)))
224 set_cpu_cap(c, X86_FEATURE_PEBS);
225 }
226
227 if (cpu_has_bts)
228 ds_init_intel(c);
229 327
230 /* 328 /* Work around errata */
231 * See if we have a good local APIC by checking for buggy Pentia, 329 srat_detect_node();
232 * i.e. all B steppings and the C2 stepping of P54C when using their
233 * integrated APIC (see 11AP erratum in "Pentium Processor
234 * Specification Update").
235 */
236 if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
237 (c->x86_mask < 0x6 || c->x86_mask == 0xb))
238 set_cpu_cap(c, X86_FEATURE_11AP);
239 330
240#ifdef CONFIG_X86_NUMAQ 331 if (cpu_has(c, X86_FEATURE_VMX))
241 numaq_tsc_disable(); 332 detect_vmx_virtcap(c);
242#endif
243} 333}
244 334
335#ifdef CONFIG_X86_32
245static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) 336static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
246{ 337{
247 /* 338 /*
@@ -254,10 +345,12 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i
254 size = 256; 345 size = 256;
255 return size; 346 return size;
256} 347}
348#endif
257 349
258static struct cpu_dev intel_cpu_dev __cpuinitdata = { 350static struct cpu_dev intel_cpu_dev __cpuinitdata = {
259 .c_vendor = "Intel", 351 .c_vendor = "Intel",
260 .c_ident = { "GenuineIntel" }, 352 .c_ident = { "GenuineIntel" },
353#ifdef CONFIG_X86_32
261 .c_models = { 354 .c_models = {
262 { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names = 355 { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names =
263 { 356 {
@@ -307,13 +400,12 @@ static struct cpu_dev intel_cpu_dev __cpuinitdata = {
307 } 400 }
308 }, 401 },
309 }, 402 },
403 .c_size_cache = intel_size_cache,
404#endif
310 .c_early_init = early_init_intel, 405 .c_early_init = early_init_intel,
311 .c_init = init_intel, 406 .c_init = init_intel,
312 .c_size_cache = intel_size_cache,
313 .c_x86_vendor = X86_VENDOR_INTEL, 407 .c_x86_vendor = X86_VENDOR_INTEL,
314}; 408};
315 409
316cpu_dev_register(intel_cpu_dev); 410cpu_dev_register(intel_cpu_dev);
317 411
318/* arch_initcall(intel_cpu_init); */
319
diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c
deleted file mode 100644
index 0c0a58dfe099..000000000000
--- a/arch/x86/kernel/cpu/intel_64.c
+++ /dev/null
@@ -1,99 +0,0 @@
1#include <linux/init.h>
2#include <linux/smp.h>
3#include <asm/processor.h>
4#include <asm/ptrace.h>
5#include <asm/topology.h>
6#include <asm/numa_64.h>
7
8#include "cpu.h"
9
10static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
11{
12 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
13 (c->x86 == 0x6 && c->x86_model >= 0x0e))
14 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15
16 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
17}
18
19/*
20 * find out the number of processor cores on the die
21 */
22static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
23{
24 unsigned int eax, t;
25
26 if (c->cpuid_level < 4)
27 return 1;
28
29 cpuid_count(4, 0, &eax, &t, &t, &t);
30
31 if (eax & 0x1f)
32 return ((eax >> 26) + 1);
33 else
34 return 1;
35}
36
37static void __cpuinit srat_detect_node(void)
38{
39#ifdef CONFIG_NUMA
40 unsigned node;
41 int cpu = smp_processor_id();
42 int apicid = hard_smp_processor_id();
43
44 /* Don't do the funky fallback heuristics the AMD version employs
45 for now. */
46 node = apicid_to_node[apicid];
47 if (node == NUMA_NO_NODE || !node_online(node))
48 node = first_node(node_online_map);
49 numa_set_node(cpu, node);
50
51 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
52#endif
53}
54
55static void __cpuinit init_intel(struct cpuinfo_x86 *c)
56{
57 init_intel_cacheinfo(c);
58 if (c->cpuid_level > 9) {
59 unsigned eax = cpuid_eax(10);
60 /* Check for version and the number of counters */
61 if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
62 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
63 }
64
65 if (cpu_has_ds) {
66 unsigned int l1, l2;
67 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
68 if (!(l1 & (1<<11)))
69 set_cpu_cap(c, X86_FEATURE_BTS);
70 if (!(l1 & (1<<12)))
71 set_cpu_cap(c, X86_FEATURE_PEBS);
72 }
73
74
75 if (cpu_has_bts)
76 ds_init_intel(c);
77
78 if (c->x86 == 15)
79 c->x86_cache_alignment = c->x86_clflush_size * 2;
80 if (c->x86 == 6)
81 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
82 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
83
84 detect_extended_topology(c);
85 if (!cpu_has(c, X86_FEATURE_XTOPOLOGY))
86 c->x86_max_cores = intel_num_cpu_cores(c);
87
88 srat_detect_node();
89}
90
91static struct cpu_dev intel_cpu_dev __cpuinitdata = {
92 .c_vendor = "Intel",
93 .c_ident = { "GenuineIntel" },
94 .c_early_init = early_init_intel,
95 .c_init = init_intel,
96 .c_x86_vendor = X86_VENDOR_INTEL,
97};
98
99cpu_dev_register(intel_cpu_dev);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 726a5fcdf341..4b031a4ac856 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -860,7 +860,7 @@ error:
860 return err; 860 return err;
861} 861}
862 862
863static void mce_remove_device(unsigned int cpu) 863static __cpuinit void mce_remove_device(unsigned int cpu)
864{ 864{
865 int i; 865 int i;
866 866
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index cb7d3b6a80eb..4e8d77f01eeb 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -401,12 +401,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
401 tmp |= ~((1<<(hi - 1)) - 1); 401 tmp |= ~((1<<(hi - 1)) - 1);
402 402
403 if (tmp != mask_lo) { 403 if (tmp != mask_lo) {
404 static int once = 1; 404 WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");
405
406 if (once) {
407 printk(KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");
408 once = 0;
409 }
410 mask_lo = tmp; 405 mask_lo = tmp;
411 } 406 }
412 } 407 }
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 84c480bb3715..4c4214690dd1 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -405,9 +405,9 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
405 } 405 }
406 /* RED-PEN: base can be > 32bit */ 406 /* RED-PEN: base can be > 32bit */
407 len += seq_printf(seq, 407 len += seq_printf(seq,
408 "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n", 408 "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n",
409 i, base, base >> (20 - PAGE_SHIFT), size, factor, 409 i, base, base >> (20 - PAGE_SHIFT), size, factor,
410 mtrr_attrib_to_str(type), mtrr_usage_table[i]); 410 mtrr_usage_table[i], mtrr_attrib_to_str(type));
411 } 411 }
412 } 412 }
413 return 0; 413 return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 58ac5d3d4361..c78c04821ea1 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -759,7 +759,8 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
759 /* take out UC ranges */ 759 /* take out UC ranges */
760 for (i = 0; i < num_var_ranges; i++) { 760 for (i = 0; i < num_var_ranges; i++) {
761 type = range_state[i].type; 761 type = range_state[i].type;
762 if (type != MTRR_TYPE_UNCACHABLE) 762 if (type != MTRR_TYPE_UNCACHABLE &&
763 type != MTRR_TYPE_WRPROT)
763 continue; 764 continue;
764 size = range_state[i].size_pfn; 765 size = range_state[i].size_pfn;
765 if (!size) 766 if (!size)
@@ -834,7 +835,14 @@ static int __init enable_mtrr_cleanup_setup(char *str)
834 enable_mtrr_cleanup = 1; 835 enable_mtrr_cleanup = 1;
835 return 0; 836 return 0;
836} 837}
837early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup); 838early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
839
840static int __init mtrr_cleanup_debug_setup(char *str)
841{
842 debug_print = 1;
843 return 0;
844}
845early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
838 846
839struct var_mtrr_state { 847struct var_mtrr_state {
840 unsigned long range_startk; 848 unsigned long range_startk;
@@ -898,6 +906,27 @@ set_var_mtrr_all(unsigned int address_bits)
898 } 906 }
899} 907}
900 908
909static unsigned long to_size_factor(unsigned long sizek, char *factorp)
910{
911 char factor;
912 unsigned long base = sizek;
913
914 if (base & ((1<<10) - 1)) {
915 /* not MB alignment */
916 factor = 'K';
917 } else if (base & ((1<<20) - 1)){
918 factor = 'M';
919 base >>= 10;
920 } else {
921 factor = 'G';
922 base >>= 20;
923 }
924
925 *factorp = factor;
926
927 return base;
928}
929
901static unsigned int __init 930static unsigned int __init
902range_to_mtrr(unsigned int reg, unsigned long range_startk, 931range_to_mtrr(unsigned int reg, unsigned long range_startk,
903 unsigned long range_sizek, unsigned char type) 932 unsigned long range_sizek, unsigned char type)
@@ -919,13 +948,21 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
919 align = max_align; 948 align = max_align;
920 949
921 sizek = 1 << align; 950 sizek = 1 << align;
922 if (debug_print) 951 if (debug_print) {
952 char start_factor = 'K', size_factor = 'K';
953 unsigned long start_base, size_base;
954
955 start_base = to_size_factor(range_startk, &start_factor),
956 size_base = to_size_factor(sizek, &size_factor),
957
923 printk(KERN_DEBUG "Setting variable MTRR %d, " 958 printk(KERN_DEBUG "Setting variable MTRR %d, "
924 "base: %ldMB, range: %ldMB, type %s\n", 959 "base: %ld%cB, range: %ld%cB, type %s\n",
925 reg, range_startk >> 10, sizek >> 10, 960 reg, start_base, start_factor,
961 size_base, size_factor,
926 (type == MTRR_TYPE_UNCACHABLE)?"UC": 962 (type == MTRR_TYPE_UNCACHABLE)?"UC":
927 ((type == MTRR_TYPE_WRBACK)?"WB":"Other") 963 ((type == MTRR_TYPE_WRBACK)?"WB":"Other")
928 ); 964 );
965 }
929 save_var_mtrr(reg++, range_startk, sizek, type); 966 save_var_mtrr(reg++, range_startk, sizek, type);
930 range_startk += sizek; 967 range_startk += sizek;
931 range_sizek -= sizek; 968 range_sizek -= sizek;
@@ -970,6 +1007,8 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
970 /* try to append some small hole */ 1007 /* try to append some small hole */
971 range0_basek = state->range_startk; 1008 range0_basek = state->range_startk;
972 range0_sizek = ALIGN(state->range_sizek, chunk_sizek); 1009 range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
1010
1011 /* no increase */
973 if (range0_sizek == state->range_sizek) { 1012 if (range0_sizek == state->range_sizek) {
974 if (debug_print) 1013 if (debug_print)
975 printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", 1014 printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
@@ -980,13 +1019,40 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
980 return 0; 1019 return 0;
981 } 1020 }
982 1021
983 range0_sizek -= chunk_sizek; 1022 /* only cut back, when it is not the last */
984 if (range0_sizek && sizek) { 1023 if (sizek) {
985 while (range0_basek + range0_sizek > (basek + sizek)) { 1024 while (range0_basek + range0_sizek > (basek + sizek)) {
986 range0_sizek -= chunk_sizek; 1025 if (range0_sizek >= chunk_sizek)
987 if (!range0_sizek) 1026 range0_sizek -= chunk_sizek;
988 break; 1027 else
989 } 1028 range0_sizek = 0;
1029
1030 if (!range0_sizek)
1031 break;
1032 }
1033 }
1034
1035second_try:
1036 range_basek = range0_basek + range0_sizek;
1037
1038 /* one hole in the middle */
1039 if (range_basek > basek && range_basek <= (basek + sizek))
1040 second_sizek = range_basek - basek;
1041
1042 if (range0_sizek > state->range_sizek) {
1043
1044 /* one hole in middle or at end */
1045 hole_sizek = range0_sizek - state->range_sizek - second_sizek;
1046
1047 /* hole size should be less than half of range0 size */
1048 if (hole_sizek >= (range0_sizek >> 1) &&
1049 range0_sizek >= chunk_sizek) {
1050 range0_sizek -= chunk_sizek;
1051 second_sizek = 0;
1052 hole_sizek = 0;
1053
1054 goto second_try;
1055 }
990 } 1056 }
991 1057
992 if (range0_sizek) { 1058 if (range0_sizek) {
@@ -996,50 +1062,28 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
996 (range0_basek + range0_sizek)<<10); 1062 (range0_basek + range0_sizek)<<10);
997 state->reg = range_to_mtrr(state->reg, range0_basek, 1063 state->reg = range_to_mtrr(state->reg, range0_basek,
998 range0_sizek, MTRR_TYPE_WRBACK); 1064 range0_sizek, MTRR_TYPE_WRBACK);
999
1000 }
1001
1002 range_basek = range0_basek + range0_sizek;
1003 range_sizek = chunk_sizek;
1004
1005 if (range_basek + range_sizek > basek &&
1006 range_basek + range_sizek <= (basek + sizek)) {
1007 /* one hole */
1008 second_basek = basek;
1009 second_sizek = range_basek + range_sizek - basek;
1010 } 1065 }
1011 1066
1012 /* if last piece, only could one hole near end */ 1067 if (range0_sizek < state->range_sizek) {
1013 if ((second_basek || !basek) && 1068 /* need to handle left over */
1014 range_sizek - (state->range_sizek - range0_sizek) - second_sizek <
1015 (chunk_sizek >> 1)) {
1016 /*
1017 * one hole in middle (second_sizek is 0) or at end
1018 * (second_sizek is 0 )
1019 */
1020 hole_sizek = range_sizek - (state->range_sizek - range0_sizek)
1021 - second_sizek;
1022 hole_basek = range_basek + range_sizek - hole_sizek
1023 - second_sizek;
1024 } else {
1025 /* fallback for big hole, or several holes */
1026 range_sizek = state->range_sizek - range0_sizek; 1069 range_sizek = state->range_sizek - range0_sizek;
1027 second_basek = 0; 1070
1028 second_sizek = 0; 1071 if (debug_print)
1072 printk(KERN_DEBUG "range: %016lx - %016lx\n",
1073 range_basek<<10,
1074 (range_basek + range_sizek)<<10);
1075 state->reg = range_to_mtrr(state->reg, range_basek,
1076 range_sizek, MTRR_TYPE_WRBACK);
1029 } 1077 }
1030 1078
1031 if (debug_print)
1032 printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10,
1033 (range_basek + range_sizek)<<10);
1034 state->reg = range_to_mtrr(state->reg, range_basek, range_sizek,
1035 MTRR_TYPE_WRBACK);
1036 if (hole_sizek) { 1079 if (hole_sizek) {
1080 hole_basek = range_basek - hole_sizek - second_sizek;
1037 if (debug_print) 1081 if (debug_print)
1038 printk(KERN_DEBUG "hole: %016lx - %016lx\n", 1082 printk(KERN_DEBUG "hole: %016lx - %016lx\n",
1039 hole_basek<<10, (hole_basek + hole_sizek)<<10); 1083 hole_basek<<10,
1040 state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek, 1084 (hole_basek + hole_sizek)<<10);
1041 MTRR_TYPE_UNCACHABLE); 1085 state->reg = range_to_mtrr(state->reg, hole_basek,
1042 1086 hole_sizek, MTRR_TYPE_UNCACHABLE);
1043 } 1087 }
1044 1088
1045 return second_sizek; 1089 return second_sizek;
@@ -1154,11 +1198,11 @@ struct mtrr_cleanup_result {
1154}; 1198};
1155 1199
1156/* 1200/*
1157 * gran_size: 1M, 2M, ..., 2G 1201 * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
1158 * chunk size: gran_size, ..., 4G 1202 * chunk size: gran_size, ..., 2G
1159 * so we need (2+13)*6 1203 * so we need (1+16)*8
1160 */ 1204 */
1161#define NUM_RESULT 90 1205#define NUM_RESULT 136
1162#define PSHIFT (PAGE_SHIFT - 10) 1206#define PSHIFT (PAGE_SHIFT - 10)
1163 1207
1164static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; 1208static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
@@ -1168,13 +1212,14 @@ static unsigned long __initdata min_loss_pfn[RANGE_NUM];
1168static int __init mtrr_cleanup(unsigned address_bits) 1212static int __init mtrr_cleanup(unsigned address_bits)
1169{ 1213{
1170 unsigned long extra_remove_base, extra_remove_size; 1214 unsigned long extra_remove_base, extra_remove_size;
1171 unsigned long i, base, size, def, dummy; 1215 unsigned long base, size, def, dummy;
1172 mtrr_type type; 1216 mtrr_type type;
1173 int nr_range, nr_range_new; 1217 int nr_range, nr_range_new;
1174 u64 chunk_size, gran_size; 1218 u64 chunk_size, gran_size;
1175 unsigned long range_sums, range_sums_new; 1219 unsigned long range_sums, range_sums_new;
1176 int index_good; 1220 int index_good;
1177 int num_reg_good; 1221 int num_reg_good;
1222 int i;
1178 1223
1179 /* extra one for all 0 */ 1224 /* extra one for all 0 */
1180 int num[MTRR_NUM_TYPES + 1]; 1225 int num[MTRR_NUM_TYPES + 1];
@@ -1204,6 +1249,8 @@ static int __init mtrr_cleanup(unsigned address_bits)
1204 continue; 1249 continue;
1205 if (!size) 1250 if (!size)
1206 type = MTRR_NUM_TYPES; 1251 type = MTRR_NUM_TYPES;
1252 if (type == MTRR_TYPE_WRPROT)
1253 type = MTRR_TYPE_UNCACHABLE;
1207 num[type]++; 1254 num[type]++;
1208 } 1255 }
1209 1256
@@ -1216,23 +1263,57 @@ static int __init mtrr_cleanup(unsigned address_bits)
1216 num_var_ranges - num[MTRR_NUM_TYPES]) 1263 num_var_ranges - num[MTRR_NUM_TYPES])
1217 return 0; 1264 return 0;
1218 1265
1266 /* print original var MTRRs at first, for debugging: */
1267 printk(KERN_DEBUG "original variable MTRRs\n");
1268 for (i = 0; i < num_var_ranges; i++) {
1269 char start_factor = 'K', size_factor = 'K';
1270 unsigned long start_base, size_base;
1271
1272 size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
1273 if (!size_base)
1274 continue;
1275
1276 size_base = to_size_factor(size_base, &size_factor),
1277 start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
1278 start_base = to_size_factor(start_base, &start_factor),
1279 type = range_state[i].type;
1280
1281 printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
1282 i, start_base, start_factor,
1283 size_base, size_factor,
1284 (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
1285 ((type == MTRR_TYPE_WRPROT) ? "WP" :
1286 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
1287 );
1288 }
1289
1219 memset(range, 0, sizeof(range)); 1290 memset(range, 0, sizeof(range));
1220 extra_remove_size = 0; 1291 extra_remove_size = 0;
1221 if (mtrr_tom2) { 1292 extra_remove_base = 1 << (32 - PAGE_SHIFT);
1222 extra_remove_base = 1 << (32 - PAGE_SHIFT); 1293 if (mtrr_tom2)
1223 extra_remove_size = 1294 extra_remove_size =
1224 (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base; 1295 (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
1225 }
1226 nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, 1296 nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
1227 extra_remove_size); 1297 extra_remove_size);
1298 /*
1299 * [0, 1M) should always be coverred by var mtrr with WB
1300 * and fixed mtrrs should take effective before var mtrr for it
1301 */
1302 nr_range = add_range_with_merge(range, nr_range, 0,
1303 (1ULL<<(20 - PAGE_SHIFT)) - 1);
1304 /* sort the ranges */
1305 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
1306
1228 range_sums = sum_ranges(range, nr_range); 1307 range_sums = sum_ranges(range, nr_range);
1229 printk(KERN_INFO "total RAM coverred: %ldM\n", 1308 printk(KERN_INFO "total RAM coverred: %ldM\n",
1230 range_sums >> (20 - PAGE_SHIFT)); 1309 range_sums >> (20 - PAGE_SHIFT));
1231 1310
1232 if (mtrr_chunk_size && mtrr_gran_size) { 1311 if (mtrr_chunk_size && mtrr_gran_size) {
1233 int num_reg; 1312 int num_reg;
1313 char gran_factor, chunk_factor, lose_factor;
1314 unsigned long gran_base, chunk_base, lose_base;
1234 1315
1235 debug_print = 1; 1316 debug_print++;
1236 /* convert ranges to var ranges state */ 1317 /* convert ranges to var ranges state */
1237 num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size, 1318 num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size,
1238 mtrr_gran_size); 1319 mtrr_gran_size);
@@ -1256,34 +1337,48 @@ static int __init mtrr_cleanup(unsigned address_bits)
1256 result[i].lose_cover_sizek = 1337 result[i].lose_cover_sizek =
1257 (range_sums - range_sums_new) << PSHIFT; 1338 (range_sums - range_sums_new) << PSHIFT;
1258 1339
1259 printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", 1340 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1260 result[i].bad?"*BAD*":" ", result[i].gran_sizek >> 10, 1341 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1261 result[i].chunk_sizek >> 10); 1342 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1262 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ldM \n", 1343 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1344 result[i].bad?"*BAD*":" ",
1345 gran_base, gran_factor, chunk_base, chunk_factor);
1346 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1263 result[i].num_reg, result[i].bad?"-":"", 1347 result[i].num_reg, result[i].bad?"-":"",
1264 result[i].lose_cover_sizek >> 10); 1348 lose_base, lose_factor);
1265 if (!result[i].bad) { 1349 if (!result[i].bad) {
1266 set_var_mtrr_all(address_bits); 1350 set_var_mtrr_all(address_bits);
1267 return 1; 1351 return 1;
1268 } 1352 }
1269 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " 1353 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
1270 "will find optimal one\n"); 1354 "will find optimal one\n");
1271 debug_print = 0; 1355 debug_print--;
1272 memset(result, 0, sizeof(result[0])); 1356 memset(result, 0, sizeof(result[0]));
1273 } 1357 }
1274 1358
1275 i = 0; 1359 i = 0;
1276 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); 1360 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
1277 memset(result, 0, sizeof(result)); 1361 memset(result, 0, sizeof(result));
1278 for (gran_size = (1ULL<<20); gran_size < (1ULL<<32); gran_size <<= 1) { 1362 for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
1279 for (chunk_size = gran_size; chunk_size < (1ULL<<33); 1363 char gran_factor;
1364 unsigned long gran_base;
1365
1366 if (debug_print)
1367 gran_base = to_size_factor(gran_size >> 10, &gran_factor);
1368
1369 for (chunk_size = gran_size; chunk_size < (1ULL<<32);
1280 chunk_size <<= 1) { 1370 chunk_size <<= 1) {
1281 int num_reg; 1371 int num_reg;
1282 1372
1283 if (debug_print) 1373 if (debug_print) {
1284 printk(KERN_INFO 1374 char chunk_factor;
1285 "\ngran_size: %lldM chunk_size_size: %lldM\n", 1375 unsigned long chunk_base;
1286 gran_size >> 20, chunk_size >> 20); 1376
1377 chunk_base = to_size_factor(chunk_size>>10, &chunk_factor),
1378 printk(KERN_INFO "\n");
1379 printk(KERN_INFO "gran_size: %ld%c chunk_size: %ld%c \n",
1380 gran_base, gran_factor, chunk_base, chunk_factor);
1381 }
1287 if (i >= NUM_RESULT) 1382 if (i >= NUM_RESULT)
1288 continue; 1383 continue;
1289 1384
@@ -1326,12 +1421,18 @@ static int __init mtrr_cleanup(unsigned address_bits)
1326 1421
1327 /* print out all */ 1422 /* print out all */
1328 for (i = 0; i < NUM_RESULT; i++) { 1423 for (i = 0; i < NUM_RESULT; i++) {
1329 printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", 1424 char gran_factor, chunk_factor, lose_factor;
1330 result[i].bad?"*BAD* ":" ", result[i].gran_sizek >> 10, 1425 unsigned long gran_base, chunk_base, lose_base;
1331 result[i].chunk_sizek >> 10); 1426
1332 printk(KERN_CONT "num_reg: %d \tlose RAM: %s%ldM\n", 1427 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1333 result[i].num_reg, result[i].bad?"-":"", 1428 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1334 result[i].lose_cover_sizek >> 10); 1429 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1430 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1431 result[i].bad?"*BAD*":" ",
1432 gran_base, gran_factor, chunk_base, chunk_factor);
1433 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1434 result[i].num_reg, result[i].bad?"-":"",
1435 lose_base, lose_factor);
1335 } 1436 }
1336 1437
1337 /* try to find the optimal index */ 1438 /* try to find the optimal index */
@@ -1339,10 +1440,8 @@ static int __init mtrr_cleanup(unsigned address_bits)
1339 nr_mtrr_spare_reg = num_var_ranges - 1; 1440 nr_mtrr_spare_reg = num_var_ranges - 1;
1340 num_reg_good = -1; 1441 num_reg_good = -1;
1341 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { 1442 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
1342 if (!min_loss_pfn[i]) { 1443 if (!min_loss_pfn[i])
1343 num_reg_good = i; 1444 num_reg_good = i;
1344 break;
1345 }
1346 } 1445 }
1347 1446
1348 index_good = -1; 1447 index_good = -1;
@@ -1358,21 +1457,26 @@ static int __init mtrr_cleanup(unsigned address_bits)
1358 } 1457 }
1359 1458
1360 if (index_good != -1) { 1459 if (index_good != -1) {
1460 char gran_factor, chunk_factor, lose_factor;
1461 unsigned long gran_base, chunk_base, lose_base;
1462
1361 printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); 1463 printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
1362 i = index_good; 1464 i = index_good;
1363 printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t", 1465 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1364 result[i].gran_sizek >> 10, 1466 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1365 result[i].chunk_sizek >> 10); 1467 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1366 printk(KERN_CONT "num_reg: %d \tlose RAM: %ldM\n", 1468 printk(KERN_INFO "gran_size: %ld%c \tchunk_size: %ld%c \t",
1367 result[i].num_reg, 1469 gran_base, gran_factor, chunk_base, chunk_factor);
1368 result[i].lose_cover_sizek >> 10); 1470 printk(KERN_CONT "num_reg: %d \tlose RAM: %ld%c\n",
1471 result[i].num_reg, lose_base, lose_factor);
1369 /* convert ranges to var ranges state */ 1472 /* convert ranges to var ranges state */
1370 chunk_size = result[i].chunk_sizek; 1473 chunk_size = result[i].chunk_sizek;
1371 chunk_size <<= 10; 1474 chunk_size <<= 10;
1372 gran_size = result[i].gran_sizek; 1475 gran_size = result[i].gran_sizek;
1373 gran_size <<= 10; 1476 gran_size <<= 10;
1374 debug_print = 1; 1477 debug_print++;
1375 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); 1478 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
1479 debug_print--;
1376 set_var_mtrr_all(address_bits); 1480 set_var_mtrr_all(address_bits);
1377 return 1; 1481 return 1;
1378 } 1482 }
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 05cc22dbd4ff..6bff382094f5 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -295,13 +295,19 @@ static int setup_k7_watchdog(unsigned nmi_hz)
295 /* setup the timer */ 295 /* setup the timer */
296 wrmsr(evntsel_msr, evntsel, 0); 296 wrmsr(evntsel_msr, evntsel, 0);
297 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz); 297 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz);
298 apic_write(APIC_LVTPC, APIC_DM_NMI);
299 evntsel |= K7_EVNTSEL_ENABLE;
300 wrmsr(evntsel_msr, evntsel, 0);
301 298
299 /* initialize the wd struct before enabling */
302 wd->perfctr_msr = perfctr_msr; 300 wd->perfctr_msr = perfctr_msr;
303 wd->evntsel_msr = evntsel_msr; 301 wd->evntsel_msr = evntsel_msr;
304 wd->cccr_msr = 0; /* unused */ 302 wd->cccr_msr = 0; /* unused */
303
304 /* ok, everything is initialized, announce that we're set */
305 cpu_nmi_set_wd_enabled();
306
307 apic_write(APIC_LVTPC, APIC_DM_NMI);
308 evntsel |= K7_EVNTSEL_ENABLE;
309 wrmsr(evntsel_msr, evntsel, 0);
310
305 return 1; 311 return 1;
306} 312}
307 313
@@ -379,13 +385,19 @@ static int setup_p6_watchdog(unsigned nmi_hz)
379 wrmsr(evntsel_msr, evntsel, 0); 385 wrmsr(evntsel_msr, evntsel, 0);
380 nmi_hz = adjust_for_32bit_ctr(nmi_hz); 386 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
381 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz); 387 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz);
382 apic_write(APIC_LVTPC, APIC_DM_NMI);
383 evntsel |= P6_EVNTSEL0_ENABLE;
384 wrmsr(evntsel_msr, evntsel, 0);
385 388
389 /* initialize the wd struct before enabling */
386 wd->perfctr_msr = perfctr_msr; 390 wd->perfctr_msr = perfctr_msr;
387 wd->evntsel_msr = evntsel_msr; 391 wd->evntsel_msr = evntsel_msr;
388 wd->cccr_msr = 0; /* unused */ 392 wd->cccr_msr = 0; /* unused */
393
394 /* ok, everything is initialized, announce that we're set */
395 cpu_nmi_set_wd_enabled();
396
397 apic_write(APIC_LVTPC, APIC_DM_NMI);
398 evntsel |= P6_EVNTSEL0_ENABLE;
399 wrmsr(evntsel_msr, evntsel, 0);
400
389 return 1; 401 return 1;
390} 402}
391 403
@@ -432,6 +444,27 @@ static const struct wd_ops p6_wd_ops = {
432#define P4_CCCR_ENABLE (1 << 12) 444#define P4_CCCR_ENABLE (1 << 12)
433#define P4_CCCR_OVF (1 << 31) 445#define P4_CCCR_OVF (1 << 31)
434 446
447#define P4_CONTROLS 18
448static unsigned int p4_controls[18] = {
449 MSR_P4_BPU_CCCR0,
450 MSR_P4_BPU_CCCR1,
451 MSR_P4_BPU_CCCR2,
452 MSR_P4_BPU_CCCR3,
453 MSR_P4_MS_CCCR0,
454 MSR_P4_MS_CCCR1,
455 MSR_P4_MS_CCCR2,
456 MSR_P4_MS_CCCR3,
457 MSR_P4_FLAME_CCCR0,
458 MSR_P4_FLAME_CCCR1,
459 MSR_P4_FLAME_CCCR2,
460 MSR_P4_FLAME_CCCR3,
461 MSR_P4_IQ_CCCR0,
462 MSR_P4_IQ_CCCR1,
463 MSR_P4_IQ_CCCR2,
464 MSR_P4_IQ_CCCR3,
465 MSR_P4_IQ_CCCR4,
466 MSR_P4_IQ_CCCR5,
467};
435/* 468/*
436 * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter 469 * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
437 * CRU_ESCR0 (with any non-null event selector) through a complemented 470 * CRU_ESCR0 (with any non-null event selector) through a complemented
@@ -473,6 +506,26 @@ static int setup_p4_watchdog(unsigned nmi_hz)
473 evntsel_msr = MSR_P4_CRU_ESCR0; 506 evntsel_msr = MSR_P4_CRU_ESCR0;
474 cccr_msr = MSR_P4_IQ_CCCR0; 507 cccr_msr = MSR_P4_IQ_CCCR0;
475 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); 508 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
509
510 /*
511 * If we're on the kdump kernel or other situation, we may
512 * still have other performance counter registers set to
513 * interrupt and they'll keep interrupting forever because
514 * of the P4_CCCR_OVF quirk. So we need to ACK all the
515 * pending interrupts and disable all the registers here,
516 * before reenabling the NMI delivery. Refer to p4_rearm()
517 * about the P4_CCCR_OVF quirk.
518 */
519 if (reset_devices) {
520 unsigned int low, high;
521 int i;
522
523 for (i = 0; i < P4_CONTROLS; i++) {
524 rdmsr(p4_controls[i], low, high);
525 low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF);
526 wrmsr(p4_controls[i], low, high);
527 }
528 }
476 } else { 529 } else {
477 /* logical cpu 1 */ 530 /* logical cpu 1 */
478 perfctr_msr = MSR_P4_IQ_PERFCTR1; 531 perfctr_msr = MSR_P4_IQ_PERFCTR1;
@@ -499,12 +552,17 @@ static int setup_p4_watchdog(unsigned nmi_hz)
499 wrmsr(evntsel_msr, evntsel, 0); 552 wrmsr(evntsel_msr, evntsel, 0);
500 wrmsr(cccr_msr, cccr_val, 0); 553 wrmsr(cccr_msr, cccr_val, 0);
501 write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz); 554 write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
502 apic_write(APIC_LVTPC, APIC_DM_NMI); 555
503 cccr_val |= P4_CCCR_ENABLE;
504 wrmsr(cccr_msr, cccr_val, 0);
505 wd->perfctr_msr = perfctr_msr; 556 wd->perfctr_msr = perfctr_msr;
506 wd->evntsel_msr = evntsel_msr; 557 wd->evntsel_msr = evntsel_msr;
507 wd->cccr_msr = cccr_msr; 558 wd->cccr_msr = cccr_msr;
559
560 /* ok, everything is initialized, announce that we're set */
561 cpu_nmi_set_wd_enabled();
562
563 apic_write(APIC_LVTPC, APIC_DM_NMI);
564 cccr_val |= P4_CCCR_ENABLE;
565 wrmsr(cccr_msr, cccr_val, 0);
508 return 1; 566 return 1;
509} 567}
510 568
@@ -620,13 +678,17 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz)
620 wrmsr(evntsel_msr, evntsel, 0); 678 wrmsr(evntsel_msr, evntsel, 0);
621 nmi_hz = adjust_for_32bit_ctr(nmi_hz); 679 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
622 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz); 680 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
623 apic_write(APIC_LVTPC, APIC_DM_NMI);
624 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
625 wrmsr(evntsel_msr, evntsel, 0);
626 681
627 wd->perfctr_msr = perfctr_msr; 682 wd->perfctr_msr = perfctr_msr;
628 wd->evntsel_msr = evntsel_msr; 683 wd->evntsel_msr = evntsel_msr;
629 wd->cccr_msr = 0; /* unused */ 684 wd->cccr_msr = 0; /* unused */
685
686 /* ok, everything is initialized, announce that we're set */
687 cpu_nmi_set_wd_enabled();
688
689 apic_write(APIC_LVTPC, APIC_DM_NMI);
690 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
691 wrmsr(evntsel_msr, evntsel, 0);
630 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); 692 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
631 return 1; 693 return 1;
632} 694}
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index 7c46e6ecedca..52b3fefbd5af 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -5,6 +5,18 @@
5#include <asm/msr.h> 5#include <asm/msr.h>
6#include "cpu.h" 6#include "cpu.h"
7 7
8static void __cpuinit early_init_transmeta(struct cpuinfo_x86 *c)
9{
10 u32 xlvl;
11
12 /* Transmeta-defined flags: level 0x80860001 */
13 xlvl = cpuid_eax(0x80860000);
14 if ((xlvl & 0xffff0000) == 0x80860000) {
15 if (xlvl >= 0x80860001)
16 c->x86_capability[2] = cpuid_edx(0x80860001);
17 }
18}
19
8static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) 20static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
9{ 21{
10 unsigned int cap_mask, uk, max, dummy; 22 unsigned int cap_mask, uk, max, dummy;
@@ -12,7 +24,8 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
12 unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev; 24 unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev;
13 char cpu_info[65]; 25 char cpu_info[65];
14 26
15 get_model_name(c); /* Same as AMD/Cyrix */ 27 early_init_transmeta(c);
28
16 display_cacheinfo(c); 29 display_cacheinfo(c);
17 30
18 /* Print CMS and CPU revision */ 31 /* Print CMS and CPU revision */
@@ -85,23 +98,11 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
85#endif 98#endif
86} 99}
87 100
88static void __cpuinit transmeta_identify(struct cpuinfo_x86 *c)
89{
90 u32 xlvl;
91
92 /* Transmeta-defined flags: level 0x80860001 */
93 xlvl = cpuid_eax(0x80860000);
94 if ((xlvl & 0xffff0000) == 0x80860000) {
95 if (xlvl >= 0x80860001)
96 c->x86_capability[2] = cpuid_edx(0x80860001);
97 }
98}
99
100static struct cpu_dev transmeta_cpu_dev __cpuinitdata = { 101static struct cpu_dev transmeta_cpu_dev __cpuinitdata = {
101 .c_vendor = "Transmeta", 102 .c_vendor = "Transmeta",
102 .c_ident = { "GenuineTMx86", "TransmetaCPU" }, 103 .c_ident = { "GenuineTMx86", "TransmetaCPU" },
104 .c_early_init = early_init_transmeta,
103 .c_init = init_transmeta, 105 .c_init = init_transmeta,
104 .c_identify = transmeta_identify,
105 .c_x86_vendor = X86_VENDOR_TRANSMETA, 106 .c_x86_vendor = X86_VENDOR_TRANSMETA,
106}; 107};
107 108
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 8e9cd6a8ec12..6a44d6465991 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -36,7 +36,6 @@
36#include <linux/smp_lock.h> 36#include <linux/smp_lock.h>
37#include <linux/major.h> 37#include <linux/major.h>
38#include <linux/fs.h> 38#include <linux/fs.h>
39#include <linux/smp_lock.h>
40#include <linux/device.h> 39#include <linux/device.h>
41#include <linux/cpu.h> 40#include <linux/cpu.h>
42#include <linux/notifier.h> 41#include <linux/notifier.h>
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 15e6c6bc4a46..e90a60ef10c2 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -7,9 +7,8 @@
7 7
8#include <linux/errno.h> 8#include <linux/errno.h>
9#include <linux/crash_dump.h> 9#include <linux/crash_dump.h>
10 10#include <linux/uaccess.h>
11#include <asm/uaccess.h> 11#include <linux/io.h>
12#include <asm/io.h>
13 12
14/** 13/**
15 * copy_oldmem_page - copy one page from "oldmem" 14 * copy_oldmem_page - copy one page from "oldmem"
@@ -25,7 +24,7 @@
25 * in the current kernel. We stitch up a pte, similar to kmap_atomic. 24 * in the current kernel. We stitch up a pte, similar to kmap_atomic.
26 */ 25 */
27ssize_t copy_oldmem_page(unsigned long pfn, char *buf, 26ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
28 size_t csize, unsigned long offset, int userbuf) 27 size_t csize, unsigned long offset, int userbuf)
29{ 28{
30 void *vaddr; 29 void *vaddr;
31 30
@@ -33,14 +32,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
33 return 0; 32 return 0;
34 33
35 vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); 34 vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
35 if (!vaddr)
36 return -ENOMEM;
36 37
37 if (userbuf) { 38 if (userbuf) {
38 if (copy_to_user(buf, (vaddr + offset), csize)) { 39 if (copy_to_user(buf, vaddr + offset, csize)) {
39 iounmap(vaddr); 40 iounmap(vaddr);
40 return -EFAULT; 41 return -EFAULT;
41 } 42 }
42 } else 43 } else
43 memcpy(buf, (vaddr + offset), csize); 44 memcpy(buf, vaddr + offset, csize);
44 45
45 iounmap(vaddr); 46 iounmap(vaddr);
46 return csize; 47 return csize;
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 11c11b8ec48d..2b69994fd3a8 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -2,26 +2,49 @@
2 * Debug Store support 2 * Debug Store support
3 * 3 *
4 * This provides a low-level interface to the hardware's Debug Store 4 * This provides a low-level interface to the hardware's Debug Store
5 * feature that is used for last branch recording (LBR) and 5 * feature that is used for branch trace store (BTS) and
6 * precise-event based sampling (PEBS). 6 * precise-event based sampling (PEBS).
7 * 7 *
8 * Different architectures use a different DS layout/pointer size. 8 * It manages:
9 * The below functions therefore work on a void*. 9 * - per-thread and per-cpu allocation of BTS and PEBS
10 * - buffer memory allocation (optional)
11 * - buffer overflow handling
12 * - buffer access
10 * 13 *
14 * It assumes:
15 * - get_task_struct on all parameter tasks
16 * - current is allowed to trace parameter tasks
11 * 17 *
12 * Since there is no user for PEBS, yet, only LBR (or branch
13 * trace store, BTS) is supported.
14 * 18 *
15 * 19 * Copyright (C) 2007-2008 Intel Corporation.
16 * Copyright (C) 2007 Intel Corporation. 20 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008
17 * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
18 */ 21 */
19 22
23
24#ifdef CONFIG_X86_DS
25
20#include <asm/ds.h> 26#include <asm/ds.h>
21 27
22#include <linux/errno.h> 28#include <linux/errno.h>
23#include <linux/string.h> 29#include <linux/string.h>
24#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/sched.h>
32#include <linux/mm.h>
33
34
35/*
36 * The configuration for a particular DS hardware implementation.
37 */
38struct ds_configuration {
39 /* the size of the DS structure in bytes */
40 unsigned char sizeof_ds;
41 /* the size of one pointer-typed field in the DS structure in bytes;
42 this covers the first 8 fields related to buffer management. */
43 unsigned char sizeof_field;
44 /* the size of a BTS/PEBS record in bytes */
45 unsigned char sizeof_rec[2];
46};
47static struct ds_configuration ds_cfg;
25 48
26 49
27/* 50/*
@@ -44,378 +67,747 @@
44 * (interrupt occurs when write pointer passes interrupt pointer) 67 * (interrupt occurs when write pointer passes interrupt pointer)
45 * - value to which counter is reset following counter overflow 68 * - value to which counter is reset following counter overflow
46 * 69 *
47 * On later architectures, the last branch recording hardware uses 70 * Later architectures use 64bit pointers throughout, whereas earlier
48 * 64bit pointers even in 32bit mode. 71 * architectures use 32bit pointers in 32bit mode.
49 *
50 *
51 * Branch Trace Store (BTS) records store information about control
52 * flow changes. They at least provide the following information:
53 * - source linear address
54 * - destination linear address
55 * 72 *
56 * Netburst supported a predicated bit that had been dropped in later
57 * architectures. We do not suppor it.
58 * 73 *
74 * We compute the base address for the first 8 fields based on:
75 * - the field size stored in the DS configuration
76 * - the relative field position
77 * - an offset giving the start of the respective region
59 * 78 *
60 * In order to abstract from the actual DS and BTS layout, we describe 79 * This offset is further used to index various arrays holding
61 * the access to the relevant fields. 80 * information for BTS and PEBS at the respective index.
62 * Thanks to Andi Kleen for proposing this design.
63 * 81 *
64 * The implementation, however, is not as general as it might seem. In 82 * On later 32bit processors, we only access the lower 32bit of the
65 * order to stay somewhat simple and efficient, we assume an 83 * 64bit pointer fields. The upper halves will be zeroed out.
66 * underlying unsigned type (mostly a pointer type) and we expect the
67 * field to be at least as big as that type.
68 */ 84 */
69 85
70/* 86enum ds_field {
71 * A special from_ip address to indicate that the BTS record is an 87 ds_buffer_base = 0,
72 * info record that needs to be interpreted or skipped. 88 ds_index,
73 */ 89 ds_absolute_maximum,
74#define BTS_ESCAPE_ADDRESS (-1) 90 ds_interrupt_threshold,
91};
75 92
76/* 93enum ds_qualifier {
77 * A field access descriptor 94 ds_bts = 0,
78 */ 95 ds_pebs
79struct access_desc {
80 unsigned char offset;
81 unsigned char size;
82}; 96};
83 97
98static inline unsigned long ds_get(const unsigned char *base,
99 enum ds_qualifier qual, enum ds_field field)
100{
101 base += (ds_cfg.sizeof_field * (field + (4 * qual)));
102 return *(unsigned long *)base;
103}
104
105static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
106 enum ds_field field, unsigned long value)
107{
108 base += (ds_cfg.sizeof_field * (field + (4 * qual)));
109 (*(unsigned long *)base) = value;
110}
111
112
84/* 113/*
85 * The configuration for a particular DS/BTS hardware implementation. 114 * Locking is done only for allocating BTS or PEBS resources and for
115 * guarding context and buffer memory allocation.
116 *
117 * Most functions require the current task to own the ds context part
118 * they are going to access. All the locking is done when validating
119 * access to the context.
86 */ 120 */
87struct ds_configuration { 121static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
88 /* the DS configuration */
89 unsigned char sizeof_ds;
90 struct access_desc bts_buffer_base;
91 struct access_desc bts_index;
92 struct access_desc bts_absolute_maximum;
93 struct access_desc bts_interrupt_threshold;
94 /* the BTS configuration */
95 unsigned char sizeof_bts;
96 struct access_desc from_ip;
97 struct access_desc to_ip;
98 /* BTS variants used to store additional information like
99 timestamps */
100 struct access_desc info_type;
101 struct access_desc info_data;
102 unsigned long debugctl_mask;
103};
104 122
105/* 123/*
106 * The global configuration used by the below accessor functions 124 * Validate that the current task is allowed to access the BTS/PEBS
125 * buffer of the parameter task.
126 *
127 * Returns 0, if access is granted; -Eerrno, otherwise.
107 */ 128 */
108static struct ds_configuration ds_cfg; 129static inline int ds_validate_access(struct ds_context *context,
130 enum ds_qualifier qual)
131{
132 if (!context)
133 return -EPERM;
134
135 if (context->owner[qual] == current)
136 return 0;
137
138 return -EPERM;
139}
140
109 141
110/* 142/*
111 * Accessor functions for some DS and BTS fields using the above 143 * We either support (system-wide) per-cpu or per-thread allocation.
112 * global ptrace_bts_cfg. 144 * We distinguish the two based on the task_struct pointer, where a
145 * NULL pointer indicates per-cpu allocation for the current cpu.
146 *
147 * Allocations are use-counted. As soon as resources are allocated,
148 * further allocations must be of the same type (per-cpu or
149 * per-thread). We model this by counting allocations (i.e. the number
150 * of tracers of a certain type) for one type negatively:
151 * =0 no tracers
152 * >0 number of per-thread tracers
153 * <0 number of per-cpu tracers
154 *
155 * The below functions to get and put tracers and to check the
156 * allocation type require the ds_lock to be held by the caller.
157 *
158 * Tracers essentially gives the number of ds contexts for a certain
159 * type of allocation.
113 */ 160 */
114static inline unsigned long get_bts_buffer_base(char *base) 161static long tracers;
162
163static inline void get_tracer(struct task_struct *task)
115{ 164{
116 return *(unsigned long *)(base + ds_cfg.bts_buffer_base.offset); 165 tracers += (task ? 1 : -1);
117} 166}
118static inline void set_bts_buffer_base(char *base, unsigned long value) 167
168static inline void put_tracer(struct task_struct *task)
119{ 169{
120 (*(unsigned long *)(base + ds_cfg.bts_buffer_base.offset)) = value; 170 tracers -= (task ? 1 : -1);
121} 171}
122static inline unsigned long get_bts_index(char *base) 172
173static inline int check_tracer(struct task_struct *task)
123{ 174{
124 return *(unsigned long *)(base + ds_cfg.bts_index.offset); 175 return (task ? (tracers >= 0) : (tracers <= 0));
125} 176}
126static inline void set_bts_index(char *base, unsigned long value) 177
178
179/*
180 * The DS context is either attached to a thread or to a cpu:
181 * - in the former case, the thread_struct contains a pointer to the
182 * attached context.
183 * - in the latter case, we use a static array of per-cpu context
184 * pointers.
185 *
186 * Contexts are use-counted. They are allocated on first access and
187 * deallocated when the last user puts the context.
188 *
189 * We distinguish between an allocating and a non-allocating get of a
190 * context:
191 * - the allocating get is used for requesting BTS/PEBS resources. It
192 * requires the caller to hold the global ds_lock.
193 * - the non-allocating get is used for all other cases. A
194 * non-existing context indicates an error. It acquires and releases
195 * the ds_lock itself for obtaining the context.
196 *
197 * A context and its DS configuration are allocated and deallocated
198 * together. A context always has a DS configuration of the
199 * appropriate size.
200 */
201static DEFINE_PER_CPU(struct ds_context *, system_context);
202
203#define this_system_context per_cpu(system_context, smp_processor_id())
204
205/*
206 * Returns the pointer to the parameter task's context or to the
207 * system-wide context, if task is NULL.
208 *
209 * Increases the use count of the returned context, if not NULL.
210 */
211static inline struct ds_context *ds_get_context(struct task_struct *task)
127{ 212{
128 (*(unsigned long *)(base + ds_cfg.bts_index.offset)) = value; 213 struct ds_context *context;
214
215 spin_lock(&ds_lock);
216
217 context = (task ? task->thread.ds_ctx : this_system_context);
218 if (context)
219 context->count++;
220
221 spin_unlock(&ds_lock);
222
223 return context;
129} 224}
130static inline unsigned long get_bts_absolute_maximum(char *base) 225
226/*
227 * Same as ds_get_context, but allocates the context and it's DS
228 * structure, if necessary; returns NULL; if out of memory.
229 *
230 * pre: requires ds_lock to be held
231 */
232static inline struct ds_context *ds_alloc_context(struct task_struct *task)
131{ 233{
132 return *(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset); 234 struct ds_context **p_context =
235 (task ? &task->thread.ds_ctx : &this_system_context);
236 struct ds_context *context = *p_context;
237
238 if (!context) {
239 context = kzalloc(sizeof(*context), GFP_KERNEL);
240
241 if (!context)
242 return NULL;
243
244 context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
245 if (!context->ds) {
246 kfree(context);
247 return NULL;
248 }
249
250 *p_context = context;
251
252 context->this = p_context;
253 context->task = task;
254
255 if (task)
256 set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
257
258 if (!task || (task == current))
259 wrmsr(MSR_IA32_DS_AREA, (unsigned long)context->ds, 0);
260
261 get_tracer(task);
262 }
263
264 context->count++;
265
266 return context;
133} 267}
134static inline void set_bts_absolute_maximum(char *base, unsigned long value) 268
269/*
270 * Decreases the use count of the parameter context, if not NULL.
271 * Deallocates the context, if the use count reaches zero.
272 */
273static inline void ds_put_context(struct ds_context *context)
135{ 274{
136 (*(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset)) = value; 275 if (!context)
276 return;
277
278 spin_lock(&ds_lock);
279
280 if (--context->count)
281 goto out;
282
283 *(context->this) = NULL;
284
285 if (context->task)
286 clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
287
288 if (!context->task || (context->task == current))
289 wrmsrl(MSR_IA32_DS_AREA, 0);
290
291 put_tracer(context->task);
292
293 /* free any leftover buffers from tracers that did not
294 * deallocate them properly. */
295 kfree(context->buffer[ds_bts]);
296 kfree(context->buffer[ds_pebs]);
297 kfree(context->ds);
298 kfree(context);
299 out:
300 spin_unlock(&ds_lock);
137} 301}
138static inline unsigned long get_bts_interrupt_threshold(char *base) 302
303
304/*
305 * Handle a buffer overflow
306 *
307 * task: the task whose buffers are overflowing;
308 * NULL for a buffer overflow on the current cpu
309 * context: the ds context
310 * qual: the buffer type
311 */
312static void ds_overflow(struct task_struct *task, struct ds_context *context,
313 enum ds_qualifier qual)
139{ 314{
140 return *(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset); 315 if (!context)
316 return;
317
318 if (context->callback[qual])
319 (*context->callback[qual])(task);
320
321 /* todo: do some more overflow handling */
141} 322}
142static inline void set_bts_interrupt_threshold(char *base, unsigned long value) 323
324
325/*
326 * Allocate a non-pageable buffer of the parameter size.
327 * Checks the memory and the locked memory rlimit.
328 *
329 * Returns the buffer, if successful;
330 * NULL, if out of memory or rlimit exceeded.
331 *
332 * size: the requested buffer size in bytes
333 * pages (out): if not NULL, contains the number of pages reserved
334 */
335static inline void *ds_allocate_buffer(size_t size, unsigned int *pages)
143{ 336{
144 (*(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset)) = value; 337 unsigned long rlim, vm, pgsz;
338 void *buffer;
339
340 pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
341
342 rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
343 vm = current->mm->total_vm + pgsz;
344 if (rlim < vm)
345 return NULL;
346
347 rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
348 vm = current->mm->locked_vm + pgsz;
349 if (rlim < vm)
350 return NULL;
351
352 buffer = kzalloc(size, GFP_KERNEL);
353 if (!buffer)
354 return NULL;
355
356 current->mm->total_vm += pgsz;
357 current->mm->locked_vm += pgsz;
358
359 if (pages)
360 *pages = pgsz;
361
362 return buffer;
145} 363}
146static inline unsigned long get_from_ip(char *base) 364
365static int ds_request(struct task_struct *task, void *base, size_t size,
366 ds_ovfl_callback_t ovfl, enum ds_qualifier qual)
147{ 367{
148 return *(unsigned long *)(base + ds_cfg.from_ip.offset); 368 struct ds_context *context;
369 unsigned long buffer, adj;
370 const unsigned long alignment = (1 << 3);
371 int error = 0;
372
373 if (!ds_cfg.sizeof_ds)
374 return -EOPNOTSUPP;
375
376 /* we require some space to do alignment adjustments below */
377 if (size < (alignment + ds_cfg.sizeof_rec[qual]))
378 return -EINVAL;
379
380 /* buffer overflow notification is not yet implemented */
381 if (ovfl)
382 return -EOPNOTSUPP;
383
384
385 spin_lock(&ds_lock);
386
387 if (!check_tracer(task))
388 return -EPERM;
389
390 error = -ENOMEM;
391 context = ds_alloc_context(task);
392 if (!context)
393 goto out_unlock;
394
395 error = -EALREADY;
396 if (context->owner[qual] == current)
397 goto out_unlock;
398 error = -EPERM;
399 if (context->owner[qual] != NULL)
400 goto out_unlock;
401 context->owner[qual] = current;
402
403 spin_unlock(&ds_lock);
404
405
406 error = -ENOMEM;
407 if (!base) {
408 base = ds_allocate_buffer(size, &context->pages[qual]);
409 if (!base)
410 goto out_release;
411
412 context->buffer[qual] = base;
413 }
414 error = 0;
415
416 context->callback[qual] = ovfl;
417
418 /* adjust the buffer address and size to meet alignment
419 * constraints:
420 * - buffer is double-word aligned
421 * - size is multiple of record size
422 *
423 * We checked the size at the very beginning; we have enough
424 * space to do the adjustment.
425 */
426 buffer = (unsigned long)base;
427
428 adj = ALIGN(buffer, alignment) - buffer;
429 buffer += adj;
430 size -= adj;
431
432 size /= ds_cfg.sizeof_rec[qual];
433 size *= ds_cfg.sizeof_rec[qual];
434
435 ds_set(context->ds, qual, ds_buffer_base, buffer);
436 ds_set(context->ds, qual, ds_index, buffer);
437 ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
438
439 if (ovfl) {
440 /* todo: select a suitable interrupt threshold */
441 } else
442 ds_set(context->ds, qual,
443 ds_interrupt_threshold, buffer + size + 1);
444
445 /* we keep the context until ds_release */
446 return error;
447
448 out_release:
449 context->owner[qual] = NULL;
450 ds_put_context(context);
451 return error;
452
453 out_unlock:
454 spin_unlock(&ds_lock);
455 ds_put_context(context);
456 return error;
149} 457}
150static inline void set_from_ip(char *base, unsigned long value) 458
459int ds_request_bts(struct task_struct *task, void *base, size_t size,
460 ds_ovfl_callback_t ovfl)
151{ 461{
152 (*(unsigned long *)(base + ds_cfg.from_ip.offset)) = value; 462 return ds_request(task, base, size, ovfl, ds_bts);
153} 463}
154static inline unsigned long get_to_ip(char *base) 464
465int ds_request_pebs(struct task_struct *task, void *base, size_t size,
466 ds_ovfl_callback_t ovfl)
155{ 467{
156 return *(unsigned long *)(base + ds_cfg.to_ip.offset); 468 return ds_request(task, base, size, ovfl, ds_pebs);
157} 469}
158static inline void set_to_ip(char *base, unsigned long value) 470
471static int ds_release(struct task_struct *task, enum ds_qualifier qual)
159{ 472{
160 (*(unsigned long *)(base + ds_cfg.to_ip.offset)) = value; 473 struct ds_context *context;
474 int error;
475
476 context = ds_get_context(task);
477 error = ds_validate_access(context, qual);
478 if (error < 0)
479 goto out;
480
481 kfree(context->buffer[qual]);
482 context->buffer[qual] = NULL;
483
484 current->mm->total_vm -= context->pages[qual];
485 current->mm->locked_vm -= context->pages[qual];
486 context->pages[qual] = 0;
487 context->owner[qual] = NULL;
488
489 /*
490 * we put the context twice:
491 * once for the ds_get_context
492 * once for the corresponding ds_request
493 */
494 ds_put_context(context);
495 out:
496 ds_put_context(context);
497 return error;
161} 498}
162static inline unsigned char get_info_type(char *base) 499
500int ds_release_bts(struct task_struct *task)
163{ 501{
164 return *(unsigned char *)(base + ds_cfg.info_type.offset); 502 return ds_release(task, ds_bts);
165} 503}
166static inline void set_info_type(char *base, unsigned char value) 504
505int ds_release_pebs(struct task_struct *task)
167{ 506{
168 (*(unsigned char *)(base + ds_cfg.info_type.offset)) = value; 507 return ds_release(task, ds_pebs);
169} 508}
170static inline unsigned long get_info_data(char *base) 509
510static int ds_get_index(struct task_struct *task, size_t *pos,
511 enum ds_qualifier qual)
171{ 512{
172 return *(unsigned long *)(base + ds_cfg.info_data.offset); 513 struct ds_context *context;
514 unsigned long base, index;
515 int error;
516
517 context = ds_get_context(task);
518 error = ds_validate_access(context, qual);
519 if (error < 0)
520 goto out;
521
522 base = ds_get(context->ds, qual, ds_buffer_base);
523 index = ds_get(context->ds, qual, ds_index);
524
525 error = ((index - base) / ds_cfg.sizeof_rec[qual]);
526 if (pos)
527 *pos = error;
528 out:
529 ds_put_context(context);
530 return error;
173} 531}
174static inline void set_info_data(char *base, unsigned long value) 532
533int ds_get_bts_index(struct task_struct *task, size_t *pos)
175{ 534{
176 (*(unsigned long *)(base + ds_cfg.info_data.offset)) = value; 535 return ds_get_index(task, pos, ds_bts);
177} 536}
178 537
538int ds_get_pebs_index(struct task_struct *task, size_t *pos)
539{
540 return ds_get_index(task, pos, ds_pebs);
541}
179 542
180int ds_allocate(void **dsp, size_t bts_size_in_bytes) 543static int ds_get_end(struct task_struct *task, size_t *pos,
544 enum ds_qualifier qual)
181{ 545{
182 size_t bts_size_in_records; 546 struct ds_context *context;
183 unsigned long bts; 547 unsigned long base, end;
184 void *ds; 548 int error;
549
550 context = ds_get_context(task);
551 error = ds_validate_access(context, qual);
552 if (error < 0)
553 goto out;
554
555 base = ds_get(context->ds, qual, ds_buffer_base);
556 end = ds_get(context->ds, qual, ds_absolute_maximum);
557
558 error = ((end - base) / ds_cfg.sizeof_rec[qual]);
559 if (pos)
560 *pos = error;
561 out:
562 ds_put_context(context);
563 return error;
564}
185 565
186 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) 566int ds_get_bts_end(struct task_struct *task, size_t *pos)
187 return -EOPNOTSUPP; 567{
568 return ds_get_end(task, pos, ds_bts);
569}
188 570
189 if (bts_size_in_bytes < 0) 571int ds_get_pebs_end(struct task_struct *task, size_t *pos)
190 return -EINVAL; 572{
573 return ds_get_end(task, pos, ds_pebs);
574}
191 575
192 bts_size_in_records = 576static int ds_access(struct task_struct *task, size_t index,
193 bts_size_in_bytes / ds_cfg.sizeof_bts; 577 const void **record, enum ds_qualifier qual)
194 bts_size_in_bytes = 578{
195 bts_size_in_records * ds_cfg.sizeof_bts; 579 struct ds_context *context;
580 unsigned long base, idx;
581 int error;
196 582
197 if (bts_size_in_bytes <= 0) 583 if (!record)
198 return -EINVAL; 584 return -EINVAL;
199 585
200 bts = (unsigned long)kzalloc(bts_size_in_bytes, GFP_KERNEL); 586 context = ds_get_context(task);
201 587 error = ds_validate_access(context, qual);
202 if (!bts) 588 if (error < 0)
203 return -ENOMEM; 589 goto out;
204 590
205 ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); 591 base = ds_get(context->ds, qual, ds_buffer_base);
592 idx = base + (index * ds_cfg.sizeof_rec[qual]);
206 593
207 if (!ds) { 594 error = -EINVAL;
208 kfree((void *)bts); 595 if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
209 return -ENOMEM; 596 goto out;
210 }
211
212 set_bts_buffer_base(ds, bts);
213 set_bts_index(ds, bts);
214 set_bts_absolute_maximum(ds, bts + bts_size_in_bytes);
215 set_bts_interrupt_threshold(ds, bts + bts_size_in_bytes + 1);
216 597
217 *dsp = ds; 598 *record = (const void *)idx;
218 return 0; 599 error = ds_cfg.sizeof_rec[qual];
600 out:
601 ds_put_context(context);
602 return error;
219} 603}
220 604
221int ds_free(void **dsp) 605int ds_access_bts(struct task_struct *task, size_t index, const void **record)
222{ 606{
223 if (*dsp) { 607 return ds_access(task, index, record, ds_bts);
224 kfree((void *)get_bts_buffer_base(*dsp));
225 kfree(*dsp);
226 *dsp = NULL;
227 }
228 return 0;
229} 608}
230 609
231int ds_get_bts_size(void *ds) 610int ds_access_pebs(struct task_struct *task, size_t index, const void **record)
232{ 611{
233 int size_in_bytes; 612 return ds_access(task, index, record, ds_pebs);
234
235 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
236 return -EOPNOTSUPP;
237
238 if (!ds)
239 return 0;
240
241 size_in_bytes =
242 get_bts_absolute_maximum(ds) -
243 get_bts_buffer_base(ds);
244 return size_in_bytes;
245} 613}
246 614
247int ds_get_bts_end(void *ds) 615static int ds_write(struct task_struct *task, const void *record, size_t size,
616 enum ds_qualifier qual, int force)
248{ 617{
249 int size_in_bytes = ds_get_bts_size(ds); 618 struct ds_context *context;
250 619 int error;
251 if (size_in_bytes <= 0)
252 return size_in_bytes;
253 620
254 return size_in_bytes / ds_cfg.sizeof_bts; 621 if (!record)
255} 622 return -EINVAL;
256 623
257int ds_get_bts_index(void *ds) 624 error = -EPERM;
258{ 625 context = ds_get_context(task);
259 int index_offset_in_bytes; 626 if (!context)
627 goto out;
260 628
261 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) 629 if (!force) {
262 return -EOPNOTSUPP; 630 error = ds_validate_access(context, qual);
631 if (error < 0)
632 goto out;
633 }
263 634
264 index_offset_in_bytes = 635 error = 0;
265 get_bts_index(ds) - 636 while (size) {
266 get_bts_buffer_base(ds); 637 unsigned long base, index, end, write_end, int_th;
638 unsigned long write_size, adj_write_size;
639
640 /*
641 * write as much as possible without producing an
642 * overflow interrupt.
643 *
644 * interrupt_threshold must either be
645 * - bigger than absolute_maximum or
646 * - point to a record between buffer_base and absolute_maximum
647 *
648 * index points to a valid record.
649 */
650 base = ds_get(context->ds, qual, ds_buffer_base);
651 index = ds_get(context->ds, qual, ds_index);
652 end = ds_get(context->ds, qual, ds_absolute_maximum);
653 int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
654
655 write_end = min(end, int_th);
656
657 /* if we are already beyond the interrupt threshold,
658 * we fill the entire buffer */
659 if (write_end <= index)
660 write_end = end;
661
662 if (write_end <= index)
663 goto out;
664
665 write_size = min((unsigned long) size, write_end - index);
666 memcpy((void *)index, record, write_size);
667
668 record = (const char *)record + write_size;
669 size -= write_size;
670 error += write_size;
671
672 adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
673 adj_write_size *= ds_cfg.sizeof_rec[qual];
674
675 /* zero out trailing bytes */
676 memset((char *)index + write_size, 0,
677 adj_write_size - write_size);
678 index += adj_write_size;
679
680 if (index >= end)
681 index = base;
682 ds_set(context->ds, qual, ds_index, index);
683
684 if (index >= int_th)
685 ds_overflow(task, context, qual);
686 }
267 687
268 return index_offset_in_bytes / ds_cfg.sizeof_bts; 688 out:
689 ds_put_context(context);
690 return error;
269} 691}
270 692
271int ds_set_overflow(void *ds, int method) 693int ds_write_bts(struct task_struct *task, const void *record, size_t size)
272{ 694{
273 switch (method) { 695 return ds_write(task, record, size, ds_bts, /* force = */ 0);
274 case DS_O_SIGNAL:
275 return -EOPNOTSUPP;
276 case DS_O_WRAP:
277 return 0;
278 default:
279 return -EINVAL;
280 }
281} 696}
282 697
283int ds_get_overflow(void *ds) 698int ds_write_pebs(struct task_struct *task, const void *record, size_t size)
284{ 699{
285 return DS_O_WRAP; 700 return ds_write(task, record, size, ds_pebs, /* force = */ 0);
286} 701}
287 702
288int ds_clear(void *ds) 703int ds_unchecked_write_bts(struct task_struct *task,
704 const void *record, size_t size)
289{ 705{
290 int bts_size = ds_get_bts_size(ds); 706 return ds_write(task, record, size, ds_bts, /* force = */ 1);
291 unsigned long bts_base;
292
293 if (bts_size <= 0)
294 return bts_size;
295
296 bts_base = get_bts_buffer_base(ds);
297 memset((void *)bts_base, 0, bts_size);
298
299 set_bts_index(ds, bts_base);
300 return 0;
301} 707}
302 708
303int ds_read_bts(void *ds, int index, struct bts_struct *out) 709int ds_unchecked_write_pebs(struct task_struct *task,
710 const void *record, size_t size)
304{ 711{
305 void *bts; 712 return ds_write(task, record, size, ds_pebs, /* force = */ 1);
713}
306 714
307 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) 715static int ds_reset_or_clear(struct task_struct *task,
308 return -EOPNOTSUPP; 716 enum ds_qualifier qual, int clear)
717{
718 struct ds_context *context;
719 unsigned long base, end;
720 int error;
309 721
310 if (index < 0) 722 context = ds_get_context(task);
311 return -EINVAL; 723 error = ds_validate_access(context, qual);
724 if (error < 0)
725 goto out;
312 726
313 if (index >= ds_get_bts_size(ds)) 727 base = ds_get(context->ds, qual, ds_buffer_base);
314 return -EINVAL; 728 end = ds_get(context->ds, qual, ds_absolute_maximum);
315 729
316 bts = (void *)(get_bts_buffer_base(ds) + (index * ds_cfg.sizeof_bts)); 730 if (clear)
731 memset((void *)base, 0, end - base);
317 732
318 memset(out, 0, sizeof(*out)); 733 ds_set(context->ds, qual, ds_index, base);
319 if (get_from_ip(bts) == BTS_ESCAPE_ADDRESS) {
320 out->qualifier = get_info_type(bts);
321 out->variant.jiffies = get_info_data(bts);
322 } else {
323 out->qualifier = BTS_BRANCH;
324 out->variant.lbr.from_ip = get_from_ip(bts);
325 out->variant.lbr.to_ip = get_to_ip(bts);
326 }
327 734
328 return sizeof(*out);; 735 error = 0;
736 out:
737 ds_put_context(context);
738 return error;
329} 739}
330 740
331int ds_write_bts(void *ds, const struct bts_struct *in) 741int ds_reset_bts(struct task_struct *task)
332{ 742{
333 unsigned long bts; 743 return ds_reset_or_clear(task, ds_bts, /* clear = */ 0);
334 744}
335 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
336 return -EOPNOTSUPP;
337
338 if (ds_get_bts_size(ds) <= 0)
339 return -ENXIO;
340 745
341 bts = get_bts_index(ds); 746int ds_reset_pebs(struct task_struct *task)
747{
748 return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0);
749}
342 750
343 memset((void *)bts, 0, ds_cfg.sizeof_bts); 751int ds_clear_bts(struct task_struct *task)
344 switch (in->qualifier) { 752{
345 case BTS_INVALID: 753 return ds_reset_or_clear(task, ds_bts, /* clear = */ 1);
346 break; 754}
347 755
348 case BTS_BRANCH: 756int ds_clear_pebs(struct task_struct *task)
349 set_from_ip((void *)bts, in->variant.lbr.from_ip); 757{
350 set_to_ip((void *)bts, in->variant.lbr.to_ip); 758 return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1);
351 break; 759}
352 760
353 case BTS_TASK_ARRIVES: 761int ds_get_pebs_reset(struct task_struct *task, u64 *value)
354 case BTS_TASK_DEPARTS: 762{
355 set_from_ip((void *)bts, BTS_ESCAPE_ADDRESS); 763 struct ds_context *context;
356 set_info_type((void *)bts, in->qualifier); 764 int error;
357 set_info_data((void *)bts, in->variant.jiffies);
358 break;
359 765
360 default: 766 if (!value)
361 return -EINVAL; 767 return -EINVAL;
362 }
363 768
364 bts = bts + ds_cfg.sizeof_bts; 769 context = ds_get_context(task);
365 if (bts >= get_bts_absolute_maximum(ds)) 770 error = ds_validate_access(context, ds_pebs);
366 bts = get_bts_buffer_base(ds); 771 if (error < 0)
367 set_bts_index(ds, bts); 772 goto out;
368 773
369 return ds_cfg.sizeof_bts; 774 *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8));
775
776 error = 0;
777 out:
778 ds_put_context(context);
779 return error;
370} 780}
371 781
372unsigned long ds_debugctl_mask(void) 782int ds_set_pebs_reset(struct task_struct *task, u64 value)
373{ 783{
374 return ds_cfg.debugctl_mask; 784 struct ds_context *context;
375} 785 int error;
376 786
377#ifdef __i386__ 787 context = ds_get_context(task);
378static const struct ds_configuration ds_cfg_netburst = { 788 error = ds_validate_access(context, ds_pebs);
379 .sizeof_ds = 9 * 4, 789 if (error < 0)
380 .bts_buffer_base = { 0, 4 }, 790 goto out;
381 .bts_index = { 4, 4 },
382 .bts_absolute_maximum = { 8, 4 },
383 .bts_interrupt_threshold = { 12, 4 },
384 .sizeof_bts = 3 * 4,
385 .from_ip = { 0, 4 },
386 .to_ip = { 4, 4 },
387 .info_type = { 4, 1 },
388 .info_data = { 8, 4 },
389 .debugctl_mask = (1<<2)|(1<<3)
390};
391 791
392static const struct ds_configuration ds_cfg_pentium_m = { 792 *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value;
393 .sizeof_ds = 9 * 4, 793
394 .bts_buffer_base = { 0, 4 }, 794 error = 0;
395 .bts_index = { 4, 4 }, 795 out:
396 .bts_absolute_maximum = { 8, 4 }, 796 ds_put_context(context);
397 .bts_interrupt_threshold = { 12, 4 }, 797 return error;
398 .sizeof_bts = 3 * 4, 798}
399 .from_ip = { 0, 4 }, 799
400 .to_ip = { 4, 4 }, 800static const struct ds_configuration ds_cfg_var = {
401 .info_type = { 4, 1 }, 801 .sizeof_ds = sizeof(long) * 12,
402 .info_data = { 8, 4 }, 802 .sizeof_field = sizeof(long),
403 .debugctl_mask = (1<<6)|(1<<7) 803 .sizeof_rec[ds_bts] = sizeof(long) * 3,
804 .sizeof_rec[ds_pebs] = sizeof(long) * 10
404}; 805};
405#endif /* _i386_ */ 806static const struct ds_configuration ds_cfg_64 = {
406 807 .sizeof_ds = 8 * 12,
407static const struct ds_configuration ds_cfg_core2 = { 808 .sizeof_field = 8,
408 .sizeof_ds = 9 * 8, 809 .sizeof_rec[ds_bts] = 8 * 3,
409 .bts_buffer_base = { 0, 8 }, 810 .sizeof_rec[ds_pebs] = 8 * 10
410 .bts_index = { 8, 8 },
411 .bts_absolute_maximum = { 16, 8 },
412 .bts_interrupt_threshold = { 24, 8 },
413 .sizeof_bts = 3 * 8,
414 .from_ip = { 0, 8 },
415 .to_ip = { 8, 8 },
416 .info_type = { 8, 1 },
417 .info_data = { 16, 8 },
418 .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
419}; 811};
420 812
421static inline void 813static inline void
@@ -429,14 +821,13 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
429 switch (c->x86) { 821 switch (c->x86) {
430 case 0x6: 822 case 0x6:
431 switch (c->x86_model) { 823 switch (c->x86_model) {
432#ifdef __i386__
433 case 0xD: 824 case 0xD:
434 case 0xE: /* Pentium M */ 825 case 0xE: /* Pentium M */
435 ds_configure(&ds_cfg_pentium_m); 826 ds_configure(&ds_cfg_var);
436 break; 827 break;
437#endif /* _i386_ */
438 case 0xF: /* Core2 */ 828 case 0xF: /* Core2 */
439 ds_configure(&ds_cfg_core2); 829 case 0x1C: /* Atom */
830 ds_configure(&ds_cfg_64);
440 break; 831 break;
441 default: 832 default:
442 /* sorry, don't know about them */ 833 /* sorry, don't know about them */
@@ -445,13 +836,11 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
445 break; 836 break;
446 case 0xF: 837 case 0xF:
447 switch (c->x86_model) { 838 switch (c->x86_model) {
448#ifdef __i386__
449 case 0x0: 839 case 0x0:
450 case 0x1: 840 case 0x1:
451 case 0x2: /* Netburst */ 841 case 0x2: /* Netburst */
452 ds_configure(&ds_cfg_netburst); 842 ds_configure(&ds_cfg_var);
453 break; 843 break;
454#endif /* _i386_ */
455 default: 844 default:
456 /* sorry, don't know about them */ 845 /* sorry, don't know about them */
457 break; 846 break;
@@ -462,3 +851,14 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
462 break; 851 break;
463 } 852 }
464} 853}
854
855void ds_free(struct ds_context *context)
856{
857 /* This is called when the task owning the parameter context
858 * is dying. There should not be any user of that context left
859 * to disturb us, anymore. */
860 unsigned long leftovers = context->count;
861 while (leftovers--)
862 ds_put_context(context);
863}
864#endif /* CONFIG_X86_DS */
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index e24d1bc47b46..78e642feac30 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1206,7 +1206,7 @@ static int __init parse_memmap_opt(char *p)
1206 if (!p) 1206 if (!p)
1207 return -EINVAL; 1207 return -EINVAL;
1208 1208
1209 if (!strcmp(p, "exactmap")) { 1209 if (!strncmp(p, "exactmap", 8)) {
1210#ifdef CONFIG_CRASH_DUMP 1210#ifdef CONFIG_CRASH_DUMP
1211 /* 1211 /*
1212 * If we are doing a crash dump, we still need to know 1212 * If we are doing a crash dump, we still need to know
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 4353cf5e6fac..24bb5faf5efa 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -95,6 +95,20 @@ static void __init nvidia_bugs(int num, int slot, int func)
95 95
96} 96}
97 97
98#ifdef CONFIG_DMAR
99static void __init intel_g33_dmar(int num, int slot, int func)
100{
101 struct acpi_table_header *dmar_tbl;
102 acpi_status status;
103
104 status = acpi_get_table(ACPI_SIG_DMAR, 0, &dmar_tbl);
105 if (ACPI_SUCCESS(status)) {
106 printk(KERN_INFO "BIOS BUG: DMAR advertised on Intel G31/G33 chipset -- ignoring\n");
107 dmar_disabled = 1;
108 }
109}
110#endif
111
98#define QFLAG_APPLY_ONCE 0x1 112#define QFLAG_APPLY_ONCE 0x1
99#define QFLAG_APPLIED 0x2 113#define QFLAG_APPLIED 0x2
100#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) 114#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -114,6 +128,10 @@ static struct chipset early_qrk[] __initdata = {
114 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs }, 128 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs },
115 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, 129 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
116 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, 130 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config },
131#ifdef CONFIG_DMAR
132 { PCI_VENDOR_ID_INTEL, 0x29c0,
133 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, intel_g33_dmar },
134#endif
117 {} 135 {}
118}; 136};
119 137
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 06cc8d4254b1..945a31cdd81f 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -414,9 +414,11 @@ void __init efi_init(void)
414 if (memmap.map == NULL) 414 if (memmap.map == NULL)
415 printk(KERN_ERR "Could not map the EFI memory map!\n"); 415 printk(KERN_ERR "Could not map the EFI memory map!\n");
416 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); 416 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
417
417 if (memmap.desc_size != sizeof(efi_memory_desc_t)) 418 if (memmap.desc_size != sizeof(efi_memory_desc_t))
418 printk(KERN_WARNING "Kernel-defined memdesc" 419 printk(KERN_WARNING
419 "doesn't match the one from EFI!\n"); 420 "Kernel-defined memdesc doesn't match the one from EFI!\n");
421
420 if (add_efi_memmap) 422 if (add_efi_memmap)
421 do_add_efi_memmap(); 423 do_add_efi_memmap();
422 424
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 9bfc4d72fb2e..d16084f90649 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -108,12 +108,11 @@ void __init x86_64_start_kernel(char * real_mode_data)
108 } 108 }
109 load_idt((const struct desc_ptr *)&idt_descr); 109 load_idt((const struct desc_ptr *)&idt_descr);
110 110
111 early_printk("Kernel alive\n"); 111 if (console_loglevel == 10)
112 early_printk("Kernel alive\n");
112 113
113 x86_64_init_pda(); 114 x86_64_init_pda();
114 115
115 early_printk("Kernel really alive\n");
116
117 x86_64_start_reservations(real_mode_data); 116 x86_64_start_reservations(real_mode_data);
118} 117}
119 118
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index a7010c3a377a..e835b4eea70b 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -172,10 +172,6 @@ num_subarch_entries = (. - subarch_entries) / 4
172 * 172 *
173 * Note that the stack is not yet set up! 173 * Note that the stack is not yet set up!
174 */ 174 */
175#define PTE_ATTR 0x007 /* PRESENT+RW+USER */
176#define PDE_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */
177#define PGD_ATTR 0x001 /* PRESENT (no other attributes) */
178
179default_entry: 175default_entry:
180#ifdef CONFIG_X86_PAE 176#ifdef CONFIG_X86_PAE
181 177
@@ -196,9 +192,9 @@ default_entry:
196 movl $pa(pg0), %edi 192 movl $pa(pg0), %edi
197 movl %edi, pa(init_pg_tables_start) 193 movl %edi, pa(init_pg_tables_start)
198 movl $pa(swapper_pg_pmd), %edx 194 movl $pa(swapper_pg_pmd), %edx
199 movl $PTE_ATTR, %eax 195 movl $PTE_IDENT_ATTR, %eax
20010: 19610:
201 leal PDE_ATTR(%edi),%ecx /* Create PMD entry */ 197 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */
202 movl %ecx,(%edx) /* Store PMD entry */ 198 movl %ecx,(%edx) /* Store PMD entry */
203 /* Upper half already zero */ 199 /* Upper half already zero */
204 addl $8,%edx 200 addl $8,%edx
@@ -215,7 +211,7 @@ default_entry:
215 * End condition: we must map up to and including INIT_MAP_BEYOND_END 211 * End condition: we must map up to and including INIT_MAP_BEYOND_END
216 * bytes beyond the end of our own page tables. 212 * bytes beyond the end of our own page tables.
217 */ 213 */
218 leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp 214 leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
219 cmpl %ebp,%eax 215 cmpl %ebp,%eax
220 jb 10b 216 jb 10b
2211: 2171:
@@ -224,7 +220,7 @@ default_entry:
224 movl %eax, pa(max_pfn_mapped) 220 movl %eax, pa(max_pfn_mapped)
225 221
226 /* Do early initialization of the fixmap area */ 222 /* Do early initialization of the fixmap area */
227 movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax 223 movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
228 movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8) 224 movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8)
229#else /* Not PAE */ 225#else /* Not PAE */
230 226
@@ -233,9 +229,9 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
233 movl $pa(pg0), %edi 229 movl $pa(pg0), %edi
234 movl %edi, pa(init_pg_tables_start) 230 movl %edi, pa(init_pg_tables_start)
235 movl $pa(swapper_pg_dir), %edx 231 movl $pa(swapper_pg_dir), %edx
236 movl $PTE_ATTR, %eax 232 movl $PTE_IDENT_ATTR, %eax
23710: 23310:
238 leal PDE_ATTR(%edi),%ecx /* Create PDE entry */ 234 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */
239 movl %ecx,(%edx) /* Store identity PDE entry */ 235 movl %ecx,(%edx) /* Store identity PDE entry */
240 movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ 236 movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
241 addl $4,%edx 237 addl $4,%edx
@@ -249,7 +245,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
249 * bytes beyond the end of our own page tables; the +0x007 is 245 * bytes beyond the end of our own page tables; the +0x007 is
250 * the attribute bits 246 * the attribute bits
251 */ 247 */
252 leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp 248 leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
253 cmpl %ebp,%eax 249 cmpl %ebp,%eax
254 jb 10b 250 jb 10b
255 movl %edi,pa(init_pg_tables_end) 251 movl %edi,pa(init_pg_tables_end)
@@ -257,7 +253,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
257 movl %eax, pa(max_pfn_mapped) 253 movl %eax, pa(max_pfn_mapped)
258 254
259 /* Do early initialization of the fixmap area */ 255 /* Do early initialization of the fixmap area */
260 movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax 256 movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
261 movl %eax,pa(swapper_pg_dir+0xffc) 257 movl %eax,pa(swapper_pg_dir+0xffc)
262#endif 258#endif
263 jmp 3f 259 jmp 3f
@@ -634,19 +630,19 @@ ENTRY(empty_zero_page)
634 /* Page-aligned for the benefit of paravirt? */ 630 /* Page-aligned for the benefit of paravirt? */
635 .align PAGE_SIZE_asm 631 .align PAGE_SIZE_asm
636ENTRY(swapper_pg_dir) 632ENTRY(swapper_pg_dir)
637 .long pa(swapper_pg_pmd+PGD_ATTR),0 /* low identity map */ 633 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
638# if KPMDS == 3 634# if KPMDS == 3
639 .long pa(swapper_pg_pmd+PGD_ATTR),0 635 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
640 .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 636 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
641 .long pa(swapper_pg_pmd+PGD_ATTR+0x2000),0 637 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0
642# elif KPMDS == 2 638# elif KPMDS == 2
643 .long 0,0 639 .long 0,0
644 .long pa(swapper_pg_pmd+PGD_ATTR),0 640 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
645 .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 641 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
646# elif KPMDS == 1 642# elif KPMDS == 1
647 .long 0,0 643 .long 0,0
648 .long 0,0 644 .long 0,0
649 .long pa(swapper_pg_pmd+PGD_ATTR),0 645 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
650# else 646# else
651# error "Kernel PMDs should be 1, 2 or 3" 647# error "Kernel PMDs should be 1, 2 or 3"
652# endif 648# endif
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index db3280afe886..26cfdc1d7c7f 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -110,7 +110,7 @@ startup_64:
110 movq %rdi, %rax 110 movq %rdi, %rax
111 shrq $PMD_SHIFT, %rax 111 shrq $PMD_SHIFT, %rax
112 andq $(PTRS_PER_PMD - 1), %rax 112 andq $(PTRS_PER_PMD - 1), %rax
113 leaq __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx 113 leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
114 leaq level2_spare_pgt(%rip), %rbx 114 leaq level2_spare_pgt(%rip), %rbx
115 movq %rdx, 0(%rbx, %rax, 8) 115 movq %rdx, 0(%rbx, %rax, 8)
116ident_complete: 116ident_complete:
@@ -374,7 +374,7 @@ NEXT_PAGE(level2_ident_pgt)
374 /* Since I easily can, map the first 1G. 374 /* Since I easily can, map the first 1G.
375 * Don't set NX because code runs from these pages. 375 * Don't set NX because code runs from these pages.
376 */ 376 */
377 PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) 377 PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
378 378
379NEXT_PAGE(level2_kernel_pgt) 379NEXT_PAGE(level2_kernel_pgt)
380 /* 380 /*
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 59fd3b6b1303..73deaffadd03 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -210,8 +210,8 @@ static void hpet_legacy_clockevent_register(void)
210 /* Calculate the min / max delta */ 210 /* Calculate the min / max delta */
211 hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, 211 hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
212 &hpet_clockevent); 212 &hpet_clockevent);
213 hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30, 213 /* 5 usec minimum reprogramming delta. */
214 &hpet_clockevent); 214 hpet_clockevent.min_delta_ns = 5000;
215 215
216 /* 216 /*
217 * Start hpet with the boot cpu mask and make it 217 * Start hpet with the boot cpu mask and make it
@@ -270,15 +270,22 @@ static void hpet_legacy_set_mode(enum clock_event_mode mode,
270} 270}
271 271
272static int hpet_legacy_next_event(unsigned long delta, 272static int hpet_legacy_next_event(unsigned long delta,
273 struct clock_event_device *evt) 273 struct clock_event_device *evt)
274{ 274{
275 unsigned long cnt; 275 u32 cnt;
276 276
277 cnt = hpet_readl(HPET_COUNTER); 277 cnt = hpet_readl(HPET_COUNTER);
278 cnt += delta; 278 cnt += (u32) delta;
279 hpet_writel(cnt, HPET_T0_CMP); 279 hpet_writel(cnt, HPET_T0_CMP);
280 280
281 return ((long)(hpet_readl(HPET_COUNTER) - cnt ) > 0) ? -ETIME : 0; 281 /*
282 * We need to read back the CMP register to make sure that
283 * what we wrote hit the chip before we compare it to the
284 * counter.
285 */
286 WARN_ON((u32)hpet_readl(HPET_T0_CMP) != cnt);
287
288 return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
282} 289}
283 290
284/* 291/*
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
index 1c3a66a67f83..720d2607aacb 100644
--- a/arch/x86/kernel/io_delay.c
+++ b/arch/x86/kernel/io_delay.c
@@ -92,6 +92,14 @@ static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = {
92 DMI_MATCH(DMI_BOARD_NAME, "30BF") 92 DMI_MATCH(DMI_BOARD_NAME, "30BF")
93 } 93 }
94 }, 94 },
95 {
96 .callback = dmi_io_delay_0xed_port,
97 .ident = "Presario F700",
98 .matches = {
99 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
100 DMI_MATCH(DMI_BOARD_NAME, "30D3")
101 }
102 },
95 { } 103 { }
96}; 104};
97 105
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 1cf8c1fcc088..b71e02d42f4f 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -325,7 +325,7 @@ skip:
325 for_each_online_cpu(j) 325 for_each_online_cpu(j)
326 seq_printf(p, "%10u ", 326 seq_printf(p, "%10u ",
327 per_cpu(irq_stat,j).irq_call_count); 327 per_cpu(irq_stat,j).irq_call_count);
328 seq_printf(p, " function call interrupts\n"); 328 seq_printf(p, " Function call interrupts\n");
329 seq_printf(p, "TLB: "); 329 seq_printf(p, "TLB: ");
330 for_each_online_cpu(j) 330 for_each_online_cpu(j)
331 seq_printf(p, "%10u ", 331 seq_printf(p, "%10u ",
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 1f78b238d8d2..f065fe9071b9 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -129,7 +129,7 @@ skip:
129 seq_printf(p, "CAL: "); 129 seq_printf(p, "CAL: ");
130 for_each_online_cpu(j) 130 for_each_online_cpu(j)
131 seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count); 131 seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count);
132 seq_printf(p, " function call interrupts\n"); 132 seq_printf(p, " Function call interrupts\n");
133 seq_printf(p, "TLB: "); 133 seq_printf(p, "TLB: ");
134 for_each_online_cpu(j) 134 for_each_online_cpu(j)
135 seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count); 135 seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count);
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c
index 7377ccb21335..304d8bad6559 100644
--- a/arch/x86/kernel/k8.c
+++ b/arch/x86/kernel/k8.c
@@ -16,8 +16,9 @@ EXPORT_SYMBOL(num_k8_northbridges);
16static u32 *flush_words; 16static u32 *flush_words;
17 17
18struct pci_device_id k8_nb_ids[] = { 18struct pci_device_id k8_nb_ids[] = {
19 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, 19 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
20 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, 20 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
21 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_11H_NB_MISC) },
21 {} 22 {}
22}; 23};
23EXPORT_SYMBOL(k8_nb_ids); 24EXPORT_SYMBOL(k8_nb_ids);
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index f2d43bc75514..ff7d3b0124f1 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -139,6 +139,7 @@ static int __init create_setup_data_nodes(struct dentry *parent)
139 if (PageHighMem(pg)) { 139 if (PageHighMem(pg)) {
140 data = ioremap_cache(pa_data, sizeof(*data)); 140 data = ioremap_cache(pa_data, sizeof(*data));
141 if (!data) { 141 if (!data) {
142 kfree(node);
142 error = -ENXIO; 143 error = -ENXIO;
143 goto err_dir; 144 goto err_dir;
144 } 145 }
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index f47f0eb886b8..10435a120d22 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -69,6 +69,9 @@ static int gdb_x86vector = -1;
69 */ 69 */
70void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) 70void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
71{ 71{
72#ifndef CONFIG_X86_32
73 u32 *gdb_regs32 = (u32 *)gdb_regs;
74#endif
72 gdb_regs[GDB_AX] = regs->ax; 75 gdb_regs[GDB_AX] = regs->ax;
73 gdb_regs[GDB_BX] = regs->bx; 76 gdb_regs[GDB_BX] = regs->bx;
74 gdb_regs[GDB_CX] = regs->cx; 77 gdb_regs[GDB_CX] = regs->cx;
@@ -76,9 +79,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
76 gdb_regs[GDB_SI] = regs->si; 79 gdb_regs[GDB_SI] = regs->si;
77 gdb_regs[GDB_DI] = regs->di; 80 gdb_regs[GDB_DI] = regs->di;
78 gdb_regs[GDB_BP] = regs->bp; 81 gdb_regs[GDB_BP] = regs->bp;
79 gdb_regs[GDB_PS] = regs->flags;
80 gdb_regs[GDB_PC] = regs->ip; 82 gdb_regs[GDB_PC] = regs->ip;
81#ifdef CONFIG_X86_32 83#ifdef CONFIG_X86_32
84 gdb_regs[GDB_PS] = regs->flags;
82 gdb_regs[GDB_DS] = regs->ds; 85 gdb_regs[GDB_DS] = regs->ds;
83 gdb_regs[GDB_ES] = regs->es; 86 gdb_regs[GDB_ES] = regs->es;
84 gdb_regs[GDB_CS] = regs->cs; 87 gdb_regs[GDB_CS] = regs->cs;
@@ -94,6 +97,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
94 gdb_regs[GDB_R13] = regs->r13; 97 gdb_regs[GDB_R13] = regs->r13;
95 gdb_regs[GDB_R14] = regs->r14; 98 gdb_regs[GDB_R14] = regs->r14;
96 gdb_regs[GDB_R15] = regs->r15; 99 gdb_regs[GDB_R15] = regs->r15;
100 gdb_regs32[GDB_PS] = regs->flags;
101 gdb_regs32[GDB_CS] = regs->cs;
102 gdb_regs32[GDB_SS] = regs->ss;
97#endif 103#endif
98 gdb_regs[GDB_SP] = regs->sp; 104 gdb_regs[GDB_SP] = regs->sp;
99} 105}
@@ -112,6 +118,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
112 */ 118 */
113void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) 119void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
114{ 120{
121#ifndef CONFIG_X86_32
122 u32 *gdb_regs32 = (u32 *)gdb_regs;
123#endif
115 gdb_regs[GDB_AX] = 0; 124 gdb_regs[GDB_AX] = 0;
116 gdb_regs[GDB_BX] = 0; 125 gdb_regs[GDB_BX] = 0;
117 gdb_regs[GDB_CX] = 0; 126 gdb_regs[GDB_CX] = 0;
@@ -129,8 +138,10 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
129 gdb_regs[GDB_FS] = 0xFFFF; 138 gdb_regs[GDB_FS] = 0xFFFF;
130 gdb_regs[GDB_GS] = 0xFFFF; 139 gdb_regs[GDB_GS] = 0xFFFF;
131#else 140#else
132 gdb_regs[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); 141 gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8);
133 gdb_regs[GDB_PC] = 0; 142 gdb_regs32[GDB_CS] = __KERNEL_CS;
143 gdb_regs32[GDB_SS] = __KERNEL_DS;
144 gdb_regs[GDB_PC] = p->thread.ip;
134 gdb_regs[GDB_R8] = 0; 145 gdb_regs[GDB_R8] = 0;
135 gdb_regs[GDB_R9] = 0; 146 gdb_regs[GDB_R9] = 0;
136 gdb_regs[GDB_R10] = 0; 147 gdb_regs[GDB_R10] = 0;
@@ -153,6 +164,9 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
153 */ 164 */
154void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) 165void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
155{ 166{
167#ifndef CONFIG_X86_32
168 u32 *gdb_regs32 = (u32 *)gdb_regs;
169#endif
156 regs->ax = gdb_regs[GDB_AX]; 170 regs->ax = gdb_regs[GDB_AX];
157 regs->bx = gdb_regs[GDB_BX]; 171 regs->bx = gdb_regs[GDB_BX];
158 regs->cx = gdb_regs[GDB_CX]; 172 regs->cx = gdb_regs[GDB_CX];
@@ -160,9 +174,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
160 regs->si = gdb_regs[GDB_SI]; 174 regs->si = gdb_regs[GDB_SI];
161 regs->di = gdb_regs[GDB_DI]; 175 regs->di = gdb_regs[GDB_DI];
162 regs->bp = gdb_regs[GDB_BP]; 176 regs->bp = gdb_regs[GDB_BP];
163 regs->flags = gdb_regs[GDB_PS];
164 regs->ip = gdb_regs[GDB_PC]; 177 regs->ip = gdb_regs[GDB_PC];
165#ifdef CONFIG_X86_32 178#ifdef CONFIG_X86_32
179 regs->flags = gdb_regs[GDB_PS];
166 regs->ds = gdb_regs[GDB_DS]; 180 regs->ds = gdb_regs[GDB_DS];
167 regs->es = gdb_regs[GDB_ES]; 181 regs->es = gdb_regs[GDB_ES];
168 regs->cs = gdb_regs[GDB_CS]; 182 regs->cs = gdb_regs[GDB_CS];
@@ -175,6 +189,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
175 regs->r13 = gdb_regs[GDB_R13]; 189 regs->r13 = gdb_regs[GDB_R13];
176 regs->r14 = gdb_regs[GDB_R14]; 190 regs->r14 = gdb_regs[GDB_R14];
177 regs->r15 = gdb_regs[GDB_R15]; 191 regs->r15 = gdb_regs[GDB_R15];
192 regs->flags = gdb_regs32[GDB_PS];
193 regs->cs = gdb_regs32[GDB_CS];
194 regs->ss = gdb_regs32[GDB_SS];
178#endif 195#endif
179} 196}
180 197
@@ -378,10 +395,8 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
378 if (remcomInBuffer[0] == 's') { 395 if (remcomInBuffer[0] == 's') {
379 linux_regs->flags |= X86_EFLAGS_TF; 396 linux_regs->flags |= X86_EFLAGS_TF;
380 kgdb_single_step = 1; 397 kgdb_single_step = 1;
381 if (kgdb_contthread) { 398 atomic_set(&kgdb_cpu_doing_single_step,
382 atomic_set(&kgdb_cpu_doing_single_step, 399 raw_smp_processor_id());
383 raw_smp_processor_id());
384 }
385 } 400 }
386 401
387 get_debugreg(dr6, 6); 402 get_debugreg(dr6, 6);
@@ -440,12 +455,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
440 return NOTIFY_DONE; 455 return NOTIFY_DONE;
441 456
442 case DIE_NMI_IPI: 457 case DIE_NMI_IPI:
443 if (atomic_read(&kgdb_active) != -1) { 458 /* Just ignore, we will handle the roundup on DIE_NMI. */
444 /* KGDB CPU roundup */
445 kgdb_nmicallback(raw_smp_processor_id(), regs);
446 was_in_debug_nmi[raw_smp_processor_id()] = 1;
447 touch_nmi_watchdog();
448 }
449 return NOTIFY_DONE; 459 return NOTIFY_DONE;
450 460
451 case DIE_NMIUNKNOWN: 461 case DIE_NMIUNKNOWN:
@@ -466,9 +476,15 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
466 476
467 case DIE_DEBUG: 477 case DIE_DEBUG:
468 if (atomic_read(&kgdb_cpu_doing_single_step) == 478 if (atomic_read(&kgdb_cpu_doing_single_step) ==
469 raw_smp_processor_id() && 479 raw_smp_processor_id()) {
470 user_mode(regs)) 480 if (user_mode(regs))
471 return single_step_cont(regs, args); 481 return single_step_cont(regs, args);
482 break;
483 } else if (test_thread_flag(TIF_SINGLESTEP))
484 /* This means a user thread is single stepping
485 * a system call which should be ignored
486 */
487 return NOTIFY_DONE;
472 /* fall through */ 488 /* fall through */
473 default: 489 default:
474 if (user_mode(regs)) 490 if (user_mode(regs))
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 8b7a3cf37d2b..478bca986eca 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -178,7 +178,7 @@ static void kvm_flush_tlb(void)
178 kvm_deferred_mmu_op(&ftlb, sizeof ftlb); 178 kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
179} 179}
180 180
181static void kvm_release_pt(u32 pfn) 181static void kvm_release_pt(unsigned long pfn)
182{ 182{
183 struct kvm_mmu_op_release_pt rpt = { 183 struct kvm_mmu_op_release_pt rpt = {
184 .header.op = KVM_MMU_OP_RELEASE_PT, 184 .header.op = KVM_MMU_OP_RELEASE_PT,
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index abb78a2cc4ad..2c97f07f1c2c 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -299,6 +299,15 @@ void acpi_nmi_disable(void)
299 on_each_cpu(__acpi_nmi_disable, NULL, 1); 299 on_each_cpu(__acpi_nmi_disable, NULL, 1);
300} 300}
301 301
302/*
303 * This function is called as soon the LAPIC NMI watchdog driver has everything
304 * in place and it's ready to check if the NMIs belong to the NMI watchdog
305 */
306void cpu_nmi_set_wd_enabled(void)
307{
308 __get_cpu_var(wd_enabled) = 1;
309}
310
302void setup_apic_nmi_watchdog(void *unused) 311void setup_apic_nmi_watchdog(void *unused)
303{ 312{
304 if (__get_cpu_var(wd_enabled)) 313 if (__get_cpu_var(wd_enabled))
@@ -311,8 +320,6 @@ void setup_apic_nmi_watchdog(void *unused)
311 320
312 switch (nmi_watchdog) { 321 switch (nmi_watchdog) {
313 case NMI_LOCAL_APIC: 322 case NMI_LOCAL_APIC:
314 /* enable it before to avoid race with handler */
315 __get_cpu_var(wd_enabled) = 1;
316 if (lapic_watchdog_init(nmi_hz) < 0) { 323 if (lapic_watchdog_init(nmi_hz) < 0) {
317 __get_cpu_var(wd_enabled) = 0; 324 __get_cpu_var(wd_enabled) = 0;
318 return; 325 return;
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 3e6672274807..7a13fac63a1f 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -190,12 +190,12 @@ EXPORT_SYMBOL_GPL(olpc_ec_cmd);
190static void __init platform_detect(void) 190static void __init platform_detect(void)
191{ 191{
192 size_t propsize; 192 size_t propsize;
193 u32 rev; 193 __be32 rev;
194 194
195 if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4, 195 if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4,
196 &propsize) || propsize != 4) { 196 &propsize) || propsize != 4) {
197 printk(KERN_ERR "ofw: getprop call failed!\n"); 197 printk(KERN_ERR "ofw: getprop call failed!\n");
198 rev = 0; 198 rev = cpu_to_be32(0);
199 } 199 }
200 olpc_platform_info.boardrev = be32_to_cpu(rev); 200 olpc_platform_info.boardrev = be32_to_cpu(rev);
201} 201}
@@ -203,7 +203,7 @@ static void __init platform_detect(void)
203static void __init platform_detect(void) 203static void __init platform_detect(void)
204{ 204{
205 /* stopgap until OFW support is added to the kernel */ 205 /* stopgap until OFW support is added to the kernel */
206 olpc_platform_info.boardrev = be32_to_cpu(0xc2); 206 olpc_platform_info.boardrev = 0xc2;
207} 207}
208#endif 208#endif
209 209
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 58262218781b..9fe644f4861d 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -23,7 +23,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
23 start = start_##ops##_##x; \ 23 start = start_##ops##_##x; \
24 end = end_##ops##_##x; \ 24 end = end_##ops##_##x; \
25 goto patch_site 25 goto patch_site
26 switch(type) { 26 switch (type) {
27 PATCH_SITE(pv_irq_ops, irq_disable); 27 PATCH_SITE(pv_irq_ops, irq_disable);
28 PATCH_SITE(pv_irq_ops, irq_enable); 28 PATCH_SITE(pv_irq_ops, irq_enable);
29 PATCH_SITE(pv_irq_ops, restore_fl); 29 PATCH_SITE(pv_irq_ops, restore_fl);
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index dcdac6c826e9..080d1d27f37a 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -261,7 +261,7 @@ static void iommu_range_reserve(struct iommu_table *tbl,
261 badbit, tbl, start_addr, npages); 261 badbit, tbl, start_addr, npages);
262 } 262 }
263 263
264 set_bit_string(tbl->it_map, index, npages); 264 iommu_area_reserve(tbl->it_map, index, npages);
265 265
266 spin_unlock_irqrestore(&tbl->it_lock, flags); 266 spin_unlock_irqrestore(&tbl->it_lock, flags);
267} 267}
@@ -491,6 +491,8 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
491 npages = size >> PAGE_SHIFT; 491 npages = size >> PAGE_SHIFT;
492 order = get_order(size); 492 order = get_order(size);
493 493
494 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
495
494 /* alloc enough pages (and possibly more) */ 496 /* alloc enough pages (and possibly more) */
495 ret = (void *)__get_free_pages(flag, order); 497 ret = (void *)__get_free_pages(flag, order);
496 if (!ret) 498 if (!ret)
@@ -510,8 +512,22 @@ error:
510 return ret; 512 return ret;
511} 513}
512 514
515static void calgary_free_coherent(struct device *dev, size_t size,
516 void *vaddr, dma_addr_t dma_handle)
517{
518 unsigned int npages;
519 struct iommu_table *tbl = find_iommu_table(dev);
520
521 size = PAGE_ALIGN(size);
522 npages = size >> PAGE_SHIFT;
523
524 iommu_free(tbl, dma_handle, npages);
525 free_pages((unsigned long)vaddr, get_order(size));
526}
527
513static struct dma_mapping_ops calgary_dma_ops = { 528static struct dma_mapping_ops calgary_dma_ops = {
514 .alloc_coherent = calgary_alloc_coherent, 529 .alloc_coherent = calgary_alloc_coherent,
530 .free_coherent = calgary_free_coherent,
515 .map_single = calgary_map_single, 531 .map_single = calgary_map_single,
516 .unmap_single = calgary_unmap_single, 532 .unmap_single = calgary_unmap_single,
517 .map_sg = calgary_map_sg, 533 .map_sg = calgary_map_sg,
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 87d4d6964ec2..0a3824e837b4 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -41,11 +41,12 @@ EXPORT_SYMBOL(bad_dma_address);
41/* Dummy device used for NULL arguments (normally ISA). Better would 41/* Dummy device used for NULL arguments (normally ISA). Better would
42 be probably a smaller DMA mask, but this is bug-to-bug compatible 42 be probably a smaller DMA mask, but this is bug-to-bug compatible
43 to older i386. */ 43 to older i386. */
44struct device fallback_dev = { 44struct device x86_dma_fallback_dev = {
45 .bus_id = "fallback device", 45 .bus_id = "fallback device",
46 .coherent_dma_mask = DMA_32BIT_MASK, 46 .coherent_dma_mask = DMA_32BIT_MASK,
47 .dma_mask = &fallback_dev.coherent_dma_mask, 47 .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask,
48}; 48};
49EXPORT_SYMBOL(x86_dma_fallback_dev);
49 50
50int dma_set_mask(struct device *dev, u64 mask) 51int dma_set_mask(struct device *dev, u64 mask)
51{ 52{
@@ -82,7 +83,7 @@ void __init dma32_reserve_bootmem(void)
82 * using 512M as goal 83 * using 512M as goal
83 */ 84 */
84 align = 64ULL<<20; 85 align = 64ULL<<20;
85 size = round_up(dma32_bootmem_size, align); 86 size = roundup(dma32_bootmem_size, align);
86 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, 87 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
87 512ULL<<20); 88 512ULL<<20);
88 if (dma32_bootmem_ptr) 89 if (dma32_bootmem_ptr)
@@ -133,6 +134,37 @@ unsigned long iommu_num_pages(unsigned long addr, unsigned long len)
133EXPORT_SYMBOL(iommu_num_pages); 134EXPORT_SYMBOL(iommu_num_pages);
134#endif 135#endif
135 136
137void *dma_generic_alloc_coherent(struct device *dev, size_t size,
138 dma_addr_t *dma_addr, gfp_t flag)
139{
140 unsigned long dma_mask;
141 struct page *page;
142 dma_addr_t addr;
143
144 dma_mask = dma_alloc_coherent_mask(dev, flag);
145
146 flag |= __GFP_ZERO;
147again:
148 page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
149 if (!page)
150 return NULL;
151
152 addr = page_to_phys(page);
153 if (!is_buffer_dma_capable(dma_mask, addr, size)) {
154 __free_pages(page, get_order(size));
155
156 if (dma_mask < DMA_32BIT_MASK && !(flag & GFP_DMA)) {
157 flag = (flag & ~GFP_DMA32) | GFP_DMA;
158 goto again;
159 }
160
161 return NULL;
162 }
163
164 *dma_addr = addr;
165 return page_address(page);
166}
167
136/* 168/*
137 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter 169 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
138 * documentation. 170 * documentation.
@@ -241,147 +273,6 @@ int dma_supported(struct device *dev, u64 mask)
241} 273}
242EXPORT_SYMBOL(dma_supported); 274EXPORT_SYMBOL(dma_supported);
243 275
244/* Allocate DMA memory on node near device */
245static noinline struct page *
246dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
247{
248 int node;
249
250 node = dev_to_node(dev);
251
252 return alloc_pages_node(node, gfp, order);
253}
254
255/*
256 * Allocate memory for a coherent mapping.
257 */
258void *
259dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
260 gfp_t gfp)
261{
262 struct dma_mapping_ops *ops = get_dma_ops(dev);
263 void *memory = NULL;
264 struct page *page;
265 unsigned long dma_mask = 0;
266 dma_addr_t bus;
267 int noretry = 0;
268
269 /* ignore region specifiers */
270 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
271
272 if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
273 return memory;
274
275 if (!dev) {
276 dev = &fallback_dev;
277 gfp |= GFP_DMA;
278 }
279 dma_mask = dev->coherent_dma_mask;
280 if (dma_mask == 0)
281 dma_mask = (gfp & GFP_DMA) ? DMA_24BIT_MASK : DMA_32BIT_MASK;
282
283 /* Device not DMA able */
284 if (dev->dma_mask == NULL)
285 return NULL;
286
287 /* Don't invoke OOM killer or retry in lower 16MB DMA zone */
288 if (gfp & __GFP_DMA)
289 noretry = 1;
290
291#ifdef CONFIG_X86_64
292 /* Why <=? Even when the mask is smaller than 4GB it is often
293 larger than 16MB and in this case we have a chance of
294 finding fitting memory in the next higher zone first. If
295 not retry with true GFP_DMA. -AK */
296 if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
297 gfp |= GFP_DMA32;
298 if (dma_mask < DMA_32BIT_MASK)
299 noretry = 1;
300 }
301#endif
302
303 again:
304 page = dma_alloc_pages(dev,
305 noretry ? gfp | __GFP_NORETRY : gfp, get_order(size));
306 if (page == NULL)
307 return NULL;
308
309 {
310 int high, mmu;
311 bus = page_to_phys(page);
312 memory = page_address(page);
313 high = (bus + size) >= dma_mask;
314 mmu = high;
315 if (force_iommu && !(gfp & GFP_DMA))
316 mmu = 1;
317 else if (high) {
318 free_pages((unsigned long)memory,
319 get_order(size));
320
321 /* Don't use the 16MB ZONE_DMA unless absolutely
322 needed. It's better to use remapping first. */
323 if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
324 gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
325 goto again;
326 }
327
328 /* Let low level make its own zone decisions */
329 gfp &= ~(GFP_DMA32|GFP_DMA);
330
331 if (ops->alloc_coherent)
332 return ops->alloc_coherent(dev, size,
333 dma_handle, gfp);
334 return NULL;
335 }
336
337 memset(memory, 0, size);
338 if (!mmu) {
339 *dma_handle = bus;
340 return memory;
341 }
342 }
343
344 if (ops->alloc_coherent) {
345 free_pages((unsigned long)memory, get_order(size));
346 gfp &= ~(GFP_DMA|GFP_DMA32);
347 return ops->alloc_coherent(dev, size, dma_handle, gfp);
348 }
349
350 if (ops->map_simple) {
351 *dma_handle = ops->map_simple(dev, virt_to_phys(memory),
352 size,
353 PCI_DMA_BIDIRECTIONAL);
354 if (*dma_handle != bad_dma_address)
355 return memory;
356 }
357
358 if (panic_on_overflow)
359 panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",
360 (unsigned long)size);
361 free_pages((unsigned long)memory, get_order(size));
362 return NULL;
363}
364EXPORT_SYMBOL(dma_alloc_coherent);
365
366/*
367 * Unmap coherent memory.
368 * The caller must ensure that the device has finished accessing the mapping.
369 */
370void dma_free_coherent(struct device *dev, size_t size,
371 void *vaddr, dma_addr_t bus)
372{
373 struct dma_mapping_ops *ops = get_dma_ops(dev);
374
375 int order = get_order(size);
376 WARN_ON(irqs_disabled()); /* for portability */
377 if (dma_release_from_coherent(dev, order, vaddr))
378 return;
379 if (ops->unmap_single)
380 ops->unmap_single(dev, bus, size, 0);
381 free_pages((unsigned long)vaddr, order);
382}
383EXPORT_SYMBOL(dma_free_coherent);
384
385static int __init pci_iommu_init(void) 276static int __init pci_iommu_init(void)
386{ 277{
387 calgary_iommu_init(); 278 calgary_iommu_init();
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 49285f8fd4d5..145f1c83369f 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -27,8 +27,8 @@
27#include <linux/scatterlist.h> 27#include <linux/scatterlist.h>
28#include <linux/iommu-helper.h> 28#include <linux/iommu-helper.h>
29#include <linux/sysdev.h> 29#include <linux/sysdev.h>
30#include <linux/io.h>
30#include <asm/atomic.h> 31#include <asm/atomic.h>
31#include <asm/io.h>
32#include <asm/mtrr.h> 32#include <asm/mtrr.h>
33#include <asm/pgtable.h> 33#include <asm/pgtable.h>
34#include <asm/proto.h> 34#include <asm/proto.h>
@@ -80,9 +80,10 @@ AGPEXTERN int agp_memory_reserved;
80AGPEXTERN __u32 *agp_gatt_table; 80AGPEXTERN __u32 *agp_gatt_table;
81 81
82static unsigned long next_bit; /* protected by iommu_bitmap_lock */ 82static unsigned long next_bit; /* protected by iommu_bitmap_lock */
83static int need_flush; /* global flush state. set for each gart wrap */ 83static bool need_flush; /* global flush state. set for each gart wrap */
84 84
85static unsigned long alloc_iommu(struct device *dev, int size) 85static unsigned long alloc_iommu(struct device *dev, int size,
86 unsigned long align_mask)
86{ 87{
87 unsigned long offset, flags; 88 unsigned long offset, flags;
88 unsigned long boundary_size; 89 unsigned long boundary_size;
@@ -90,26 +91,27 @@ static unsigned long alloc_iommu(struct device *dev, int size)
90 91
91 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), 92 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
92 PAGE_SIZE) >> PAGE_SHIFT; 93 PAGE_SIZE) >> PAGE_SHIFT;
93 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, 94 boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1,
94 PAGE_SIZE) >> PAGE_SHIFT; 95 PAGE_SIZE) >> PAGE_SHIFT;
95 96
96 spin_lock_irqsave(&iommu_bitmap_lock, flags); 97 spin_lock_irqsave(&iommu_bitmap_lock, flags);
97 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit, 98 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit,
98 size, base_index, boundary_size, 0); 99 size, base_index, boundary_size, align_mask);
99 if (offset == -1) { 100 if (offset == -1) {
100 need_flush = 1; 101 need_flush = true;
101 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0, 102 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0,
102 size, base_index, boundary_size, 0); 103 size, base_index, boundary_size,
104 align_mask);
103 } 105 }
104 if (offset != -1) { 106 if (offset != -1) {
105 next_bit = offset+size; 107 next_bit = offset+size;
106 if (next_bit >= iommu_pages) { 108 if (next_bit >= iommu_pages) {
107 next_bit = 0; 109 next_bit = 0;
108 need_flush = 1; 110 need_flush = true;
109 } 111 }
110 } 112 }
111 if (iommu_fullflush) 113 if (iommu_fullflush)
112 need_flush = 1; 114 need_flush = true;
113 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 115 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
114 116
115 return offset; 117 return offset;
@@ -134,7 +136,7 @@ static void flush_gart(void)
134 spin_lock_irqsave(&iommu_bitmap_lock, flags); 136 spin_lock_irqsave(&iommu_bitmap_lock, flags);
135 if (need_flush) { 137 if (need_flush) {
136 k8_flush_garts(); 138 k8_flush_garts();
137 need_flush = 0; 139 need_flush = false;
138 } 140 }
139 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 141 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
140} 142}
@@ -173,7 +175,8 @@ static void dump_leak(void)
173 iommu_leak_pages); 175 iommu_leak_pages);
174 for (i = 0; i < iommu_leak_pages; i += 2) { 176 for (i = 0; i < iommu_leak_pages; i += 2) {
175 printk(KERN_DEBUG "%lu: ", iommu_pages-i); 177 printk(KERN_DEBUG "%lu: ", iommu_pages-i);
176 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], 0); 178 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i],
179 0);
177 printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' '); 180 printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' ');
178 } 181 }
179 printk(KERN_DEBUG "\n"); 182 printk(KERN_DEBUG "\n");
@@ -212,34 +215,24 @@ static void iommu_full(struct device *dev, size_t size, int dir)
212static inline int 215static inline int
213need_iommu(struct device *dev, unsigned long addr, size_t size) 216need_iommu(struct device *dev, unsigned long addr, size_t size)
214{ 217{
215 u64 mask = *dev->dma_mask; 218 return force_iommu ||
216 int high = addr + size > mask; 219 !is_buffer_dma_capable(*dev->dma_mask, addr, size);
217 int mmu = high;
218
219 if (force_iommu)
220 mmu = 1;
221
222 return mmu;
223} 220}
224 221
225static inline int 222static inline int
226nonforced_iommu(struct device *dev, unsigned long addr, size_t size) 223nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
227{ 224{
228 u64 mask = *dev->dma_mask; 225 return !is_buffer_dma_capable(*dev->dma_mask, addr, size);
229 int high = addr + size > mask;
230 int mmu = high;
231
232 return mmu;
233} 226}
234 227
235/* Map a single continuous physical area into the IOMMU. 228/* Map a single continuous physical area into the IOMMU.
236 * Caller needs to check if the iommu is needed and flush. 229 * Caller needs to check if the iommu is needed and flush.
237 */ 230 */
238static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, 231static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
239 size_t size, int dir) 232 size_t size, int dir, unsigned long align_mask)
240{ 233{
241 unsigned long npages = iommu_num_pages(phys_mem, size); 234 unsigned long npages = iommu_num_pages(phys_mem, size);
242 unsigned long iommu_page = alloc_iommu(dev, npages); 235 unsigned long iommu_page = alloc_iommu(dev, npages, align_mask);
243 int i; 236 int i;
244 237
245 if (iommu_page == -1) { 238 if (iommu_page == -1) {
@@ -259,16 +252,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
259 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); 252 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
260} 253}
261 254
262static dma_addr_t
263gart_map_simple(struct device *dev, phys_addr_t paddr, size_t size, int dir)
264{
265 dma_addr_t map = dma_map_area(dev, paddr, size, dir);
266
267 flush_gart();
268
269 return map;
270}
271
272/* Map a single area into the IOMMU */ 255/* Map a single area into the IOMMU */
273static dma_addr_t 256static dma_addr_t
274gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir) 257gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
@@ -276,12 +259,13 @@ gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
276 unsigned long bus; 259 unsigned long bus;
277 260
278 if (!dev) 261 if (!dev)
279 dev = &fallback_dev; 262 dev = &x86_dma_fallback_dev;
280 263
281 if (!need_iommu(dev, paddr, size)) 264 if (!need_iommu(dev, paddr, size))
282 return paddr; 265 return paddr;
283 266
284 bus = gart_map_simple(dev, paddr, size, dir); 267 bus = dma_map_area(dev, paddr, size, dir, 0);
268 flush_gart();
285 269
286 return bus; 270 return bus;
287} 271}
@@ -340,7 +324,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
340 unsigned long addr = sg_phys(s); 324 unsigned long addr = sg_phys(s);
341 325
342 if (nonforced_iommu(dev, addr, s->length)) { 326 if (nonforced_iommu(dev, addr, s->length)) {
343 addr = dma_map_area(dev, addr, s->length, dir); 327 addr = dma_map_area(dev, addr, s->length, dir, 0);
344 if (addr == bad_dma_address) { 328 if (addr == bad_dma_address) {
345 if (i > 0) 329 if (i > 0)
346 gart_unmap_sg(dev, sg, i, dir); 330 gart_unmap_sg(dev, sg, i, dir);
@@ -362,7 +346,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
362 int nelems, struct scatterlist *sout, 346 int nelems, struct scatterlist *sout,
363 unsigned long pages) 347 unsigned long pages)
364{ 348{
365 unsigned long iommu_start = alloc_iommu(dev, pages); 349 unsigned long iommu_start = alloc_iommu(dev, pages, 0);
366 unsigned long iommu_page = iommu_start; 350 unsigned long iommu_page = iommu_start;
367 struct scatterlist *s; 351 struct scatterlist *s;
368 int i; 352 int i;
@@ -427,7 +411,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
427 return 0; 411 return 0;
428 412
429 if (!dev) 413 if (!dev)
430 dev = &fallback_dev; 414 dev = &x86_dma_fallback_dev;
431 415
432 out = 0; 416 out = 0;
433 start = 0; 417 start = 0;
@@ -499,6 +483,46 @@ error:
499 return 0; 483 return 0;
500} 484}
501 485
486/* allocate and map a coherent mapping */
487static void *
488gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
489 gfp_t flag)
490{
491 dma_addr_t paddr;
492 unsigned long align_mask;
493 struct page *page;
494
495 if (force_iommu && !(flag & GFP_DMA)) {
496 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
497 page = alloc_pages(flag | __GFP_ZERO, get_order(size));
498 if (!page)
499 return NULL;
500
501 align_mask = (1UL << get_order(size)) - 1;
502 paddr = dma_map_area(dev, page_to_phys(page), size,
503 DMA_BIDIRECTIONAL, align_mask);
504
505 flush_gart();
506 if (paddr != bad_dma_address) {
507 *dma_addr = paddr;
508 return page_address(page);
509 }
510 __free_pages(page, get_order(size));
511 } else
512 return dma_generic_alloc_coherent(dev, size, dma_addr, flag);
513
514 return NULL;
515}
516
517/* free a coherent mapping */
518static void
519gart_free_coherent(struct device *dev, size_t size, void *vaddr,
520 dma_addr_t dma_addr)
521{
522 gart_unmap_single(dev, dma_addr, size, DMA_BIDIRECTIONAL);
523 free_pages((unsigned long)vaddr, get_order(size));
524}
525
502static int no_agp; 526static int no_agp;
503 527
504static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) 528static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
@@ -626,7 +650,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
626 struct pci_dev *dev; 650 struct pci_dev *dev;
627 void *gatt; 651 void *gatt;
628 int i, error; 652 int i, error;
629 unsigned long start_pfn, end_pfn;
630 653
631 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); 654 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
632 aper_size = aper_base = info->aper_size = 0; 655 aper_size = aper_base = info->aper_size = 0;
@@ -650,13 +673,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
650 info->aper_size = aper_size >> 20; 673 info->aper_size = aper_size >> 20;
651 674
652 gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); 675 gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32);
653 gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); 676 gatt = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
677 get_order(gatt_size));
654 if (!gatt) 678 if (!gatt)
655 panic("Cannot allocate GATT table"); 679 panic("Cannot allocate GATT table");
656 if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT)) 680 if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT))
657 panic("Could not set GART PTEs to uncacheable pages"); 681 panic("Could not set GART PTEs to uncacheable pages");
658 682
659 memset(gatt, 0, gatt_size);
660 agp_gatt_table = gatt; 683 agp_gatt_table = gatt;
661 684
662 enable_gart_translations(); 685 enable_gart_translations();
@@ -665,19 +688,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
665 if (!error) 688 if (!error)
666 error = sysdev_register(&device_gart); 689 error = sysdev_register(&device_gart);
667 if (error) 690 if (error)
668 panic("Could not register gart_sysdev -- would corrupt data on next suspend"); 691 panic("Could not register gart_sysdev -- "
692 "would corrupt data on next suspend");
669 693
670 flush_gart(); 694 flush_gart();
671 695
672 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", 696 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n",
673 aper_base, aper_size>>10); 697 aper_base, aper_size>>10);
674 698
675 /* need to map that range */
676 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
677 if (end_pfn > max_low_pfn_mapped) {
678 start_pfn = (aper_base>>PAGE_SHIFT);
679 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
680 }
681 return 0; 699 return 0;
682 700
683 nommu: 701 nommu:
@@ -687,20 +705,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
687 return -1; 705 return -1;
688} 706}
689 707
690extern int agp_amd64_init(void);
691
692static struct dma_mapping_ops gart_dma_ops = { 708static struct dma_mapping_ops gart_dma_ops = {
693 .map_single = gart_map_single, 709 .map_single = gart_map_single,
694 .map_simple = gart_map_simple,
695 .unmap_single = gart_unmap_single, 710 .unmap_single = gart_unmap_single,
696 .sync_single_for_cpu = NULL,
697 .sync_single_for_device = NULL,
698 .sync_single_range_for_cpu = NULL,
699 .sync_single_range_for_device = NULL,
700 .sync_sg_for_cpu = NULL,
701 .sync_sg_for_device = NULL,
702 .map_sg = gart_map_sg, 711 .map_sg = gart_map_sg,
703 .unmap_sg = gart_unmap_sg, 712 .unmap_sg = gart_unmap_sg,
713 .alloc_coherent = gart_alloc_coherent,
714 .free_coherent = gart_free_coherent,
704}; 715};
705 716
706void gart_iommu_shutdown(void) 717void gart_iommu_shutdown(void)
@@ -727,7 +738,8 @@ void __init gart_iommu_init(void)
727{ 738{
728 struct agp_kern_info info; 739 struct agp_kern_info info;
729 unsigned long iommu_start; 740 unsigned long iommu_start;
730 unsigned long aper_size; 741 unsigned long aper_base, aper_size;
742 unsigned long start_pfn, end_pfn;
731 unsigned long scratch; 743 unsigned long scratch;
732 long i; 744 long i;
733 745
@@ -759,30 +771,35 @@ void __init gart_iommu_init(void)
759 (no_agp && init_k8_gatt(&info) < 0)) { 771 (no_agp && init_k8_gatt(&info) < 0)) {
760 if (max_pfn > MAX_DMA32_PFN) { 772 if (max_pfn > MAX_DMA32_PFN) {
761 printk(KERN_WARNING "More than 4GB of memory " 773 printk(KERN_WARNING "More than 4GB of memory "
762 "but GART IOMMU not available.\n" 774 "but GART IOMMU not available.\n");
763 KERN_WARNING "falling back to iommu=soft.\n"); 775 printk(KERN_WARNING "falling back to iommu=soft.\n");
764 } 776 }
765 return; 777 return;
766 } 778 }
767 779
780 /* need to map that range */
781 aper_size = info.aper_size << 20;
782 aper_base = info.aper_base;
783 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
784 if (end_pfn > max_low_pfn_mapped) {
785 start_pfn = (aper_base>>PAGE_SHIFT);
786 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
787 }
788
768 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); 789 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
769 aper_size = info.aper_size * 1024 * 1024;
770 iommu_size = check_iommu_size(info.aper_base, aper_size); 790 iommu_size = check_iommu_size(info.aper_base, aper_size);
771 iommu_pages = iommu_size >> PAGE_SHIFT; 791 iommu_pages = iommu_size >> PAGE_SHIFT;
772 792
773 iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL, 793 iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO,
774 get_order(iommu_pages/8)); 794 get_order(iommu_pages/8));
775 if (!iommu_gart_bitmap) 795 if (!iommu_gart_bitmap)
776 panic("Cannot allocate iommu bitmap\n"); 796 panic("Cannot allocate iommu bitmap\n");
777 memset(iommu_gart_bitmap, 0, iommu_pages/8);
778 797
779#ifdef CONFIG_IOMMU_LEAK 798#ifdef CONFIG_IOMMU_LEAK
780 if (leak_trace) { 799 if (leak_trace) {
781 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, 800 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
782 get_order(iommu_pages*sizeof(void *))); 801 get_order(iommu_pages*sizeof(void *)));
783 if (iommu_leak_tab) 802 if (!iommu_leak_tab)
784 memset(iommu_leak_tab, 0, iommu_pages * 8);
785 else
786 printk(KERN_DEBUG 803 printk(KERN_DEBUG
787 "PCI-DMA: Cannot allocate leak trace area\n"); 804 "PCI-DMA: Cannot allocate leak trace area\n");
788 } 805 }
@@ -792,7 +809,7 @@ void __init gart_iommu_init(void)
792 * Out of IOMMU space handling. 809 * Out of IOMMU space handling.
793 * Reserve some invalid pages at the beginning of the GART. 810 * Reserve some invalid pages at the beginning of the GART.
794 */ 811 */
795 set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); 812 iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
796 813
797 agp_memory_reserved = iommu_size; 814 agp_memory_reserved = iommu_size;
798 printk(KERN_INFO 815 printk(KERN_INFO
@@ -850,7 +867,8 @@ void __init gart_parse_options(char *p)
850 if (!strncmp(p, "leak", 4)) { 867 if (!strncmp(p, "leak", 4)) {
851 leak_trace = 1; 868 leak_trace = 1;
852 p += 4; 869 p += 4;
853 if (*p == '=') ++p; 870 if (*p == '=')
871 ++p;
854 if (isdigit(*p) && get_option(&p, &arg)) 872 if (isdigit(*p) && get_option(&p, &arg))
855 iommu_leak_pages = arg; 873 iommu_leak_pages = arg;
856 } 874 }
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 3f91f71cdc3e..c70ab5a5d4c8 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -14,7 +14,7 @@
14static int 14static int
15check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) 15check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
16{ 16{
17 if (hwdev && bus + size > *hwdev->dma_mask) { 17 if (hwdev && !is_buffer_dma_capable(*hwdev->dma_mask, bus, size)) {
18 if (*hwdev->dma_mask >= DMA_32BIT_MASK) 18 if (*hwdev->dma_mask >= DMA_32BIT_MASK)
19 printk(KERN_ERR 19 printk(KERN_ERR
20 "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", 20 "nommu_%s: overflow %Lx+%zu of device mask %Lx\n",
@@ -72,7 +72,15 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
72 return nents; 72 return nents;
73} 73}
74 74
75static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
76 dma_addr_t dma_addr)
77{
78 free_pages((unsigned long)vaddr, get_order(size));
79}
80
75struct dma_mapping_ops nommu_dma_ops = { 81struct dma_mapping_ops nommu_dma_ops = {
82 .alloc_coherent = dma_generic_alloc_coherent,
83 .free_coherent = nommu_free_coherent,
76 .map_single = nommu_map_single, 84 .map_single = nommu_map_single,
77 .map_sg = nommu_map_sg, 85 .map_sg = nommu_map_sg,
78 .is_phys = 1, 86 .is_phys = 1,
diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c
index bc1f2d3ea277..a311ffcaad16 100644
--- a/arch/x86/kernel/pcspeaker.c
+++ b/arch/x86/kernel/pcspeaker.c
@@ -1,20 +1,13 @@
1#include <linux/platform_device.h> 1#include <linux/platform_device.h>
2#include <linux/errno.h> 2#include <linux/err.h>
3#include <linux/init.h> 3#include <linux/init.h>
4 4
5static __init int add_pcspkr(void) 5static __init int add_pcspkr(void)
6{ 6{
7 struct platform_device *pd; 7 struct platform_device *pd;
8 int ret;
9 8
10 pd = platform_device_alloc("pcspkr", -1); 9 pd = platform_device_register_simple("pcspkr", -1, NULL, 0);
11 if (!pd)
12 return -ENOMEM;
13 10
14 ret = platform_device_add(pd); 11 return IS_ERR(pd) ? PTR_ERR(pd) : 0;
15 if (ret)
16 platform_device_put(pd);
17
18 return ret;
19} 12}
20device_initcall(add_pcspkr); 13device_initcall(add_pcspkr);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 7fc4d5b0a6a0..c622772744d8 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -15,7 +15,6 @@ unsigned long idle_nomwait;
15EXPORT_SYMBOL(idle_nomwait); 15EXPORT_SYMBOL(idle_nomwait);
16 16
17struct kmem_cache *task_xstate_cachep; 17struct kmem_cache *task_xstate_cachep;
18static int force_mwait __cpuinitdata;
19 18
20int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 19int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
21{ 20{
@@ -185,7 +184,8 @@ static void mwait_idle(void)
185static void poll_idle(void) 184static void poll_idle(void)
186{ 185{
187 local_irq_enable(); 186 local_irq_enable();
188 cpu_relax(); 187 while (!need_resched())
188 cpu_relax();
189} 189}
190 190
191/* 191/*
@@ -246,6 +246,14 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
246 return 1; 246 return 1;
247} 247}
248 248
249static cpumask_t c1e_mask = CPU_MASK_NONE;
250static int c1e_detected;
251
252void c1e_remove_cpu(int cpu)
253{
254 cpu_clear(cpu, c1e_mask);
255}
256
249/* 257/*
250 * C1E aware idle routine. We check for C1E active in the interrupt 258 * C1E aware idle routine. We check for C1E active in the interrupt
251 * pending message MSR. If we detect C1E, then we handle it the same 259 * pending message MSR. If we detect C1E, then we handle it the same
@@ -253,9 +261,6 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
253 */ 261 */
254static void c1e_idle(void) 262static void c1e_idle(void)
255{ 263{
256 static cpumask_t c1e_mask = CPU_MASK_NONE;
257 static int c1e_detected;
258
259 if (need_resched()) 264 if (need_resched())
260 return; 265 return;
261 266
@@ -265,8 +270,10 @@ static void c1e_idle(void)
265 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); 270 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
266 if (lo & K8_INTP_C1E_ACTIVE_MASK) { 271 if (lo & K8_INTP_C1E_ACTIVE_MASK) {
267 c1e_detected = 1; 272 c1e_detected = 1;
268 mark_tsc_unstable("TSC halt in C1E"); 273 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
269 printk(KERN_INFO "System has C1E enabled\n"); 274 mark_tsc_unstable("TSC halt in AMD C1E");
275 printk(KERN_INFO "System has AMD C1E enabled\n");
276 set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
270 } 277 }
271 } 278 }
272 279
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 2c9abc95e026..205188db9626 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -37,6 +37,7 @@
37#include <linux/tick.h> 37#include <linux/tick.h>
38#include <linux/percpu.h> 38#include <linux/percpu.h>
39#include <linux/prctl.h> 39#include <linux/prctl.h>
40#include <linux/dmi.h>
40 41
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42#include <asm/pgtable.h> 43#include <asm/pgtable.h>
@@ -55,6 +56,7 @@
55#include <asm/tlbflush.h> 56#include <asm/tlbflush.h>
56#include <asm/cpu.h> 57#include <asm/cpu.h>
57#include <asm/kdebug.h> 58#include <asm/kdebug.h>
59#include <asm/idle.h>
58#include <asm/syscalls.h> 60#include <asm/syscalls.h>
59#include <asm/smp.h> 61#include <asm/smp.h>
60 62
@@ -90,6 +92,7 @@ static void cpu_exit_clear(void)
90 cpu_clear(cpu, cpu_callin_map); 92 cpu_clear(cpu, cpu_callin_map);
91 93
92 numa_remove_cpu(cpu); 94 numa_remove_cpu(cpu);
95 c1e_remove_cpu(cpu);
93} 96}
94 97
95/* We don't actually take CPU down, just spin without interrupts. */ 98/* We don't actually take CPU down, just spin without interrupts. */
@@ -161,6 +164,7 @@ void __show_registers(struct pt_regs *regs, int all)
161 unsigned long d0, d1, d2, d3, d6, d7; 164 unsigned long d0, d1, d2, d3, d6, d7;
162 unsigned long sp; 165 unsigned long sp;
163 unsigned short ss, gs; 166 unsigned short ss, gs;
167 const char *board;
164 168
165 if (user_mode_vm(regs)) { 169 if (user_mode_vm(regs)) {
166 sp = regs->sp; 170 sp = regs->sp;
@@ -173,11 +177,15 @@ void __show_registers(struct pt_regs *regs, int all)
173 } 177 }
174 178
175 printk("\n"); 179 printk("\n");
176 printk("Pid: %d, comm: %s %s (%s %.*s)\n", 180
181 board = dmi_get_system_info(DMI_PRODUCT_NAME);
182 if (!board)
183 board = "";
184 printk("Pid: %d, comm: %s %s (%s %.*s) %s\n",
177 task_pid_nr(current), current->comm, 185 task_pid_nr(current), current->comm,
178 print_tainted(), init_utsname()->release, 186 print_tainted(), init_utsname()->release,
179 (int)strcspn(init_utsname()->version, " "), 187 (int)strcspn(init_utsname()->version, " "),
180 init_utsname()->version); 188 init_utsname()->version, board);
181 189
182 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", 190 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
183 (u16)regs->cs, regs->ip, regs->flags, 191 (u16)regs->cs, regs->ip, regs->flags,
@@ -277,6 +285,14 @@ void exit_thread(void)
277 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; 285 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
278 put_cpu(); 286 put_cpu();
279 } 287 }
288#ifdef CONFIG_X86_DS
289 /* Free any DS contexts that have not been properly released. */
290 if (unlikely(current->thread.ds_ctx)) {
291 /* we clear debugctl to make sure DS is not used. */
292 update_debugctlmsr(0);
293 ds_free(current->thread.ds_ctx);
294 }
295#endif /* CONFIG_X86_DS */
280} 296}
281 297
282void flush_thread(void) 298void flush_thread(void)
@@ -438,6 +454,35 @@ int set_tsc_mode(unsigned int val)
438 return 0; 454 return 0;
439} 455}
440 456
457#ifdef CONFIG_X86_DS
458static int update_debugctl(struct thread_struct *prev,
459 struct thread_struct *next, unsigned long debugctl)
460{
461 unsigned long ds_prev = 0;
462 unsigned long ds_next = 0;
463
464 if (prev->ds_ctx)
465 ds_prev = (unsigned long)prev->ds_ctx->ds;
466 if (next->ds_ctx)
467 ds_next = (unsigned long)next->ds_ctx->ds;
468
469 if (ds_next != ds_prev) {
470 /* we clear debugctl to make sure DS
471 * is not in use when we change it */
472 debugctl = 0;
473 update_debugctlmsr(0);
474 wrmsr(MSR_IA32_DS_AREA, ds_next, 0);
475 }
476 return debugctl;
477}
478#else
479static int update_debugctl(struct thread_struct *prev,
480 struct thread_struct *next, unsigned long debugctl)
481{
482 return debugctl;
483}
484#endif /* CONFIG_X86_DS */
485
441static noinline void 486static noinline void
442__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, 487__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
443 struct tss_struct *tss) 488 struct tss_struct *tss)
@@ -448,14 +493,7 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
448 prev = &prev_p->thread; 493 prev = &prev_p->thread;
449 next = &next_p->thread; 494 next = &next_p->thread;
450 495
451 debugctl = prev->debugctlmsr; 496 debugctl = update_debugctl(prev, next, prev->debugctlmsr);
452 if (next->ds_area_msr != prev->ds_area_msr) {
453 /* we clear debugctl to make sure DS
454 * is not in use when we change it */
455 debugctl = 0;
456 update_debugctlmsr(0);
457 wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
458 }
459 497
460 if (next->debugctlmsr != debugctl) 498 if (next->debugctlmsr != debugctl)
461 update_debugctlmsr(next->debugctlmsr); 499 update_debugctlmsr(next->debugctlmsr);
@@ -479,13 +517,13 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
479 hard_enable_TSC(); 517 hard_enable_TSC();
480 } 518 }
481 519
482#ifdef X86_BTS 520#ifdef CONFIG_X86_PTRACE_BTS
483 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) 521 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
484 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); 522 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
485 523
486 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) 524 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
487 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); 525 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
488#endif 526#endif /* CONFIG_X86_PTRACE_BTS */
489 527
490 528
491 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { 529 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 00263c9e6500..2a8ccb9238b4 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -37,11 +37,11 @@
37#include <linux/kdebug.h> 37#include <linux/kdebug.h>
38#include <linux/tick.h> 38#include <linux/tick.h>
39#include <linux/prctl.h> 39#include <linux/prctl.h>
40#include <linux/uaccess.h>
41#include <linux/io.h>
40 42
41#include <asm/uaccess.h>
42#include <asm/pgtable.h> 43#include <asm/pgtable.h>
43#include <asm/system.h> 44#include <asm/system.h>
44#include <asm/io.h>
45#include <asm/processor.h> 45#include <asm/processor.h>
46#include <asm/i387.h> 46#include <asm/i387.h>
47#include <asm/mmu_context.h> 47#include <asm/mmu_context.h>
@@ -89,11 +89,13 @@ void exit_idle(void)
89#ifdef CONFIG_HOTPLUG_CPU 89#ifdef CONFIG_HOTPLUG_CPU
90DECLARE_PER_CPU(int, cpu_state); 90DECLARE_PER_CPU(int, cpu_state);
91 91
92#include <asm/nmi.h> 92#include <linux/nmi.h>
93/* We halt the CPU with physical CPU hotplug */ 93/* We halt the CPU with physical CPU hotplug */
94static inline void play_dead(void) 94static inline void play_dead(void)
95{ 95{
96 idle_task_exit(); 96 idle_task_exit();
97 c1e_remove_cpu(raw_smp_processor_id());
98
97 mb(); 99 mb();
98 /* Ack it */ 100 /* Ack it */
99 __get_cpu_var(cpu_state) = CPU_DEAD; 101 __get_cpu_var(cpu_state) = CPU_DEAD;
@@ -152,7 +154,7 @@ void cpu_idle(void)
152} 154}
153 155
154/* Prints also some state that isn't saved in the pt_regs */ 156/* Prints also some state that isn't saved in the pt_regs */
155void __show_regs(struct pt_regs * regs) 157void __show_regs(struct pt_regs *regs)
156{ 158{
157 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; 159 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
158 unsigned long d0, d1, d2, d3, d6, d7; 160 unsigned long d0, d1, d2, d3, d6, d7;
@@ -161,59 +163,61 @@ void __show_regs(struct pt_regs * regs)
161 163
162 printk("\n"); 164 printk("\n");
163 print_modules(); 165 print_modules();
164 printk("Pid: %d, comm: %.20s %s %s %.*s\n", 166 printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n",
165 current->pid, current->comm, print_tainted(), 167 current->pid, current->comm, print_tainted(),
166 init_utsname()->release, 168 init_utsname()->release,
167 (int)strcspn(init_utsname()->version, " "), 169 (int)strcspn(init_utsname()->version, " "),
168 init_utsname()->version); 170 init_utsname()->version);
169 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); 171 printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
170 printk_address(regs->ip, 1); 172 printk_address(regs->ip, 1);
171 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, 173 printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
172 regs->flags); 174 regs->sp, regs->flags);
173 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", 175 printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
174 regs->ax, regs->bx, regs->cx); 176 regs->ax, regs->bx, regs->cx);
175 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", 177 printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
176 regs->dx, regs->si, regs->di); 178 regs->dx, regs->si, regs->di);
177 printk("RBP: %016lx R08: %016lx R09: %016lx\n", 179 printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
178 regs->bp, regs->r8, regs->r9); 180 regs->bp, regs->r8, regs->r9);
179 printk("R10: %016lx R11: %016lx R12: %016lx\n", 181 printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
180 regs->r10, regs->r11, regs->r12); 182 regs->r10, regs->r11, regs->r12);
181 printk("R13: %016lx R14: %016lx R15: %016lx\n", 183 printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
182 regs->r13, regs->r14, regs->r15); 184 regs->r13, regs->r14, regs->r15);
183 185
184 asm("movl %%ds,%0" : "=r" (ds)); 186 asm("movl %%ds,%0" : "=r" (ds));
185 asm("movl %%cs,%0" : "=r" (cs)); 187 asm("movl %%cs,%0" : "=r" (cs));
186 asm("movl %%es,%0" : "=r" (es)); 188 asm("movl %%es,%0" : "=r" (es));
187 asm("movl %%fs,%0" : "=r" (fsindex)); 189 asm("movl %%fs,%0" : "=r" (fsindex));
188 asm("movl %%gs,%0" : "=r" (gsindex)); 190 asm("movl %%gs,%0" : "=r" (gsindex));
189 191
190 rdmsrl(MSR_FS_BASE, fs); 192 rdmsrl(MSR_FS_BASE, fs);
191 rdmsrl(MSR_GS_BASE, gs); 193 rdmsrl(MSR_GS_BASE, gs);
192 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); 194 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
193 195
194 cr0 = read_cr0(); 196 cr0 = read_cr0();
195 cr2 = read_cr2(); 197 cr2 = read_cr2();
196 cr3 = read_cr3(); 198 cr3 = read_cr3();
197 cr4 = read_cr4(); 199 cr4 = read_cr4();
198 200
199 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 201 printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
200 fs,fsindex,gs,gsindex,shadowgs); 202 fs, fsindex, gs, gsindex, shadowgs);
201 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); 203 printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
202 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); 204 es, cr0);
205 printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
206 cr4);
203 207
204 get_debugreg(d0, 0); 208 get_debugreg(d0, 0);
205 get_debugreg(d1, 1); 209 get_debugreg(d1, 1);
206 get_debugreg(d2, 2); 210 get_debugreg(d2, 2);
207 printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); 211 printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
208 get_debugreg(d3, 3); 212 get_debugreg(d3, 3);
209 get_debugreg(d6, 6); 213 get_debugreg(d6, 6);
210 get_debugreg(d7, 7); 214 get_debugreg(d7, 7);
211 printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); 215 printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
212} 216}
213 217
214void show_regs(struct pt_regs *regs) 218void show_regs(struct pt_regs *regs)
215{ 219{
216 printk("CPU %d:", smp_processor_id()); 220 printk(KERN_INFO "CPU %d:", smp_processor_id());
217 __show_regs(regs); 221 __show_regs(regs);
218 show_trace(NULL, regs, (void *)(regs + 1), regs->bp); 222 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
219} 223}
@@ -239,6 +243,14 @@ void exit_thread(void)
239 t->io_bitmap_max = 0; 243 t->io_bitmap_max = 0;
240 put_cpu(); 244 put_cpu();
241 } 245 }
246#ifdef CONFIG_X86_DS
247 /* Free any DS contexts that have not been properly released. */
248 if (unlikely(t->ds_ctx)) {
249 /* we clear debugctl to make sure DS is not used. */
250 update_debugctlmsr(0);
251 ds_free(t->ds_ctx);
252 }
253#endif /* CONFIG_X86_DS */
242} 254}
243 255
244void flush_thread(void) 256void flush_thread(void)
@@ -314,10 +326,10 @@ void prepare_to_copy(struct task_struct *tsk)
314 326
315int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, 327int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
316 unsigned long unused, 328 unsigned long unused,
317 struct task_struct * p, struct pt_regs * regs) 329 struct task_struct *p, struct pt_regs *regs)
318{ 330{
319 int err; 331 int err;
320 struct pt_regs * childregs; 332 struct pt_regs *childregs;
321 struct task_struct *me = current; 333 struct task_struct *me = current;
322 334
323 childregs = ((struct pt_regs *) 335 childregs = ((struct pt_regs *)
@@ -362,10 +374,10 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
362 if (test_thread_flag(TIF_IA32)) 374 if (test_thread_flag(TIF_IA32))
363 err = do_set_thread_area(p, -1, 375 err = do_set_thread_area(p, -1,
364 (struct user_desc __user *)childregs->si, 0); 376 (struct user_desc __user *)childregs->si, 0);
365 else 377 else
366#endif 378#endif
367 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); 379 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
368 if (err) 380 if (err)
369 goto out; 381 goto out;
370 } 382 }
371 err = 0; 383 err = 0;
@@ -472,13 +484,27 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
472 next = &next_p->thread; 484 next = &next_p->thread;
473 485
474 debugctl = prev->debugctlmsr; 486 debugctl = prev->debugctlmsr;
475 if (next->ds_area_msr != prev->ds_area_msr) { 487
476 /* we clear debugctl to make sure DS 488#ifdef CONFIG_X86_DS
477 * is not in use when we change it */ 489 {
478 debugctl = 0; 490 unsigned long ds_prev = 0, ds_next = 0;
479 update_debugctlmsr(0); 491
480 wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr); 492 if (prev->ds_ctx)
493 ds_prev = (unsigned long)prev->ds_ctx->ds;
494 if (next->ds_ctx)
495 ds_next = (unsigned long)next->ds_ctx->ds;
496
497 if (ds_next != ds_prev) {
498 /*
499 * We clear debugctl to make sure DS
500 * is not in use when we change it:
501 */
502 debugctl = 0;
503 update_debugctlmsr(0);
504 wrmsrl(MSR_IA32_DS_AREA, ds_next);
505 }
481 } 506 }
507#endif /* CONFIG_X86_DS */
482 508
483 if (next->debugctlmsr != debugctl) 509 if (next->debugctlmsr != debugctl)
484 update_debugctlmsr(next->debugctlmsr); 510 update_debugctlmsr(next->debugctlmsr);
@@ -516,13 +542,13 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
516 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); 542 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
517 } 543 }
518 544
519#ifdef X86_BTS 545#ifdef CONFIG_X86_PTRACE_BTS
520 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) 546 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
521 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); 547 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
522 548
523 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) 549 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
524 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); 550 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
525#endif 551#endif /* CONFIG_X86_PTRACE_BTS */
526} 552}
527 553
528/* 554/*
@@ -544,7 +570,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
544 unsigned fsindex, gsindex; 570 unsigned fsindex, gsindex;
545 571
546 /* we're going to use this soon, after a few expensive things */ 572 /* we're going to use this soon, after a few expensive things */
547 if (next_p->fpu_counter>5) 573 if (next_p->fpu_counter > 5)
548 prefetch(next->xstate); 574 prefetch(next->xstate);
549 575
550 /* 576 /*
@@ -552,13 +578,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
552 */ 578 */
553 load_sp0(tss, next); 579 load_sp0(tss, next);
554 580
555 /* 581 /*
556 * Switch DS and ES. 582 * Switch DS and ES.
557 * This won't pick up thread selector changes, but I guess that is ok. 583 * This won't pick up thread selector changes, but I guess that is ok.
558 */ 584 */
559 savesegment(es, prev->es); 585 savesegment(es, prev->es);
560 if (unlikely(next->es | prev->es)) 586 if (unlikely(next->es | prev->es))
561 loadsegment(es, next->es); 587 loadsegment(es, next->es);
562 588
563 savesegment(ds, prev->ds); 589 savesegment(ds, prev->ds);
564 if (unlikely(next->ds | prev->ds)) 590 if (unlikely(next->ds | prev->ds))
@@ -584,7 +610,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
584 */ 610 */
585 arch_leave_lazy_cpu_mode(); 611 arch_leave_lazy_cpu_mode();
586 612
587 /* 613 /*
588 * Switch FS and GS. 614 * Switch FS and GS.
589 * 615 *
590 * Segment register != 0 always requires a reload. Also 616 * Segment register != 0 always requires a reload. Also
@@ -593,13 +619,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
593 */ 619 */
594 if (unlikely(fsindex | next->fsindex | prev->fs)) { 620 if (unlikely(fsindex | next->fsindex | prev->fs)) {
595 loadsegment(fs, next->fsindex); 621 loadsegment(fs, next->fsindex);
596 /* 622 /*
597 * Check if the user used a selector != 0; if yes 623 * Check if the user used a selector != 0; if yes
598 * clear 64bit base, since overloaded base is always 624 * clear 64bit base, since overloaded base is always
599 * mapped to the Null selector 625 * mapped to the Null selector
600 */ 626 */
601 if (fsindex) 627 if (fsindex)
602 prev->fs = 0; 628 prev->fs = 0;
603 } 629 }
604 /* when next process has a 64bit base use it */ 630 /* when next process has a 64bit base use it */
605 if (next->fs) 631 if (next->fs)
@@ -609,7 +635,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
609 if (unlikely(gsindex | next->gsindex | prev->gs)) { 635 if (unlikely(gsindex | next->gsindex | prev->gs)) {
610 load_gs_index(next->gsindex); 636 load_gs_index(next->gsindex);
611 if (gsindex) 637 if (gsindex)
612 prev->gs = 0; 638 prev->gs = 0;
613 } 639 }
614 if (next->gs) 640 if (next->gs)
615 wrmsrl(MSR_KERNEL_GS_BASE, next->gs); 641 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
@@ -618,12 +644,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
618 /* Must be after DS reload */ 644 /* Must be after DS reload */
619 unlazy_fpu(prev_p); 645 unlazy_fpu(prev_p);
620 646
621 /* 647 /*
622 * Switch the PDA and FPU contexts. 648 * Switch the PDA and FPU contexts.
623 */ 649 */
624 prev->usersp = read_pda(oldrsp); 650 prev->usersp = read_pda(oldrsp);
625 write_pda(oldrsp, next->usersp); 651 write_pda(oldrsp, next->usersp);
626 write_pda(pcurrent, next_p); 652 write_pda(pcurrent, next_p);
627 653
628 write_pda(kernelstack, 654 write_pda(kernelstack,
629 (unsigned long)task_stack_page(next_p) + 655 (unsigned long)task_stack_page(next_p) +
@@ -664,7 +690,7 @@ long sys_execve(char __user *name, char __user * __user *argv,
664 char __user * __user *envp, struct pt_regs *regs) 690 char __user * __user *envp, struct pt_regs *regs)
665{ 691{
666 long error; 692 long error;
667 char * filename; 693 char *filename;
668 694
669 filename = getname(name); 695 filename = getname(name);
670 error = PTR_ERR(filename); 696 error = PTR_ERR(filename);
@@ -722,55 +748,55 @@ asmlinkage long sys_vfork(struct pt_regs *regs)
722unsigned long get_wchan(struct task_struct *p) 748unsigned long get_wchan(struct task_struct *p)
723{ 749{
724 unsigned long stack; 750 unsigned long stack;
725 u64 fp,ip; 751 u64 fp, ip;
726 int count = 0; 752 int count = 0;
727 753
728 if (!p || p == current || p->state==TASK_RUNNING) 754 if (!p || p == current || p->state == TASK_RUNNING)
729 return 0; 755 return 0;
730 stack = (unsigned long)task_stack_page(p); 756 stack = (unsigned long)task_stack_page(p);
731 if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE) 757 if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
732 return 0; 758 return 0;
733 fp = *(u64 *)(p->thread.sp); 759 fp = *(u64 *)(p->thread.sp);
734 do { 760 do {
735 if (fp < (unsigned long)stack || 761 if (fp < (unsigned long)stack ||
736 fp > (unsigned long)stack+THREAD_SIZE) 762 fp > (unsigned long)stack+THREAD_SIZE)
737 return 0; 763 return 0;
738 ip = *(u64 *)(fp+8); 764 ip = *(u64 *)(fp+8);
739 if (!in_sched_functions(ip)) 765 if (!in_sched_functions(ip))
740 return ip; 766 return ip;
741 fp = *(u64 *)fp; 767 fp = *(u64 *)fp;
742 } while (count++ < 16); 768 } while (count++ < 16);
743 return 0; 769 return 0;
744} 770}
745 771
746long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) 772long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
747{ 773{
748 int ret = 0; 774 int ret = 0;
749 int doit = task == current; 775 int doit = task == current;
750 int cpu; 776 int cpu;
751 777
752 switch (code) { 778 switch (code) {
753 case ARCH_SET_GS: 779 case ARCH_SET_GS:
754 if (addr >= TASK_SIZE_OF(task)) 780 if (addr >= TASK_SIZE_OF(task))
755 return -EPERM; 781 return -EPERM;
756 cpu = get_cpu(); 782 cpu = get_cpu();
757 /* handle small bases via the GDT because that's faster to 783 /* handle small bases via the GDT because that's faster to
758 switch. */ 784 switch. */
759 if (addr <= 0xffffffff) { 785 if (addr <= 0xffffffff) {
760 set_32bit_tls(task, GS_TLS, addr); 786 set_32bit_tls(task, GS_TLS, addr);
761 if (doit) { 787 if (doit) {
762 load_TLS(&task->thread, cpu); 788 load_TLS(&task->thread, cpu);
763 load_gs_index(GS_TLS_SEL); 789 load_gs_index(GS_TLS_SEL);
764 } 790 }
765 task->thread.gsindex = GS_TLS_SEL; 791 task->thread.gsindex = GS_TLS_SEL;
766 task->thread.gs = 0; 792 task->thread.gs = 0;
767 } else { 793 } else {
768 task->thread.gsindex = 0; 794 task->thread.gsindex = 0;
769 task->thread.gs = addr; 795 task->thread.gs = addr;
770 if (doit) { 796 if (doit) {
771 load_gs_index(0); 797 load_gs_index(0);
772 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); 798 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
773 } 799 }
774 } 800 }
775 put_cpu(); 801 put_cpu();
776 break; 802 break;
@@ -824,8 +850,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
824 rdmsrl(MSR_KERNEL_GS_BASE, base); 850 rdmsrl(MSR_KERNEL_GS_BASE, base);
825 else 851 else
826 base = task->thread.gs; 852 base = task->thread.gs;
827 } 853 } else
828 else
829 base = task->thread.gs; 854 base = task->thread.gs;
830 ret = put_user(base, (unsigned long __user *)addr); 855 ret = put_user(base, (unsigned long __user *)addr);
831 break; 856 break;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index fc3e8dcd9da6..e375b658efc3 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -14,6 +14,7 @@
14#include <linux/errno.h> 14#include <linux/errno.h>
15#include <linux/ptrace.h> 15#include <linux/ptrace.h>
16#include <linux/regset.h> 16#include <linux/regset.h>
17#include <linux/tracehook.h>
17#include <linux/user.h> 18#include <linux/user.h>
18#include <linux/elf.h> 19#include <linux/elf.h>
19#include <linux/security.h> 20#include <linux/security.h>
@@ -554,45 +555,115 @@ static int ptrace_set_debugreg(struct task_struct *child,
554 return 0; 555 return 0;
555} 556}
556 557
557#ifdef X86_BTS 558#ifdef CONFIG_X86_PTRACE_BTS
559/*
560 * The configuration for a particular BTS hardware implementation.
561 */
562struct bts_configuration {
563 /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */
564 unsigned char sizeof_bts;
565 /* the size of a field in the BTS record in bytes */
566 unsigned char sizeof_field;
567 /* a bitmask to enable/disable BTS in DEBUGCTL MSR */
568 unsigned long debugctl_mask;
569};
570static struct bts_configuration bts_cfg;
571
572#define BTS_MAX_RECORD_SIZE (8 * 3)
573
574
575/*
576 * Branch Trace Store (BTS) uses the following format. Different
577 * architectures vary in the size of those fields.
578 * - source linear address
579 * - destination linear address
580 * - flags
581 *
582 * Later architectures use 64bit pointers throughout, whereas earlier
583 * architectures use 32bit pointers in 32bit mode.
584 *
585 * We compute the base address for the first 8 fields based on:
586 * - the field size stored in the DS configuration
587 * - the relative field position
588 *
589 * In order to store additional information in the BTS buffer, we use
590 * a special source address to indicate that the record requires
591 * special interpretation.
592 *
593 * Netburst indicated via a bit in the flags field whether the branch
594 * was predicted; this is ignored.
595 */
596
597enum bts_field {
598 bts_from = 0,
599 bts_to,
600 bts_flags,
601
602 bts_escape = (unsigned long)-1,
603 bts_qual = bts_to,
604 bts_jiffies = bts_flags
605};
606
607static inline unsigned long bts_get(const char *base, enum bts_field field)
608{
609 base += (bts_cfg.sizeof_field * field);
610 return *(unsigned long *)base;
611}
558 612
559static int ptrace_bts_get_size(struct task_struct *child) 613static inline void bts_set(char *base, enum bts_field field, unsigned long val)
560{ 614{
561 if (!child->thread.ds_area_msr) 615 base += (bts_cfg.sizeof_field * field);;
562 return -ENXIO; 616 (*(unsigned long *)base) = val;
617}
563 618
564 return ds_get_bts_index((void *)child->thread.ds_area_msr); 619/*
620 * Translate a BTS record from the raw format into the bts_struct format
621 *
622 * out (out): bts_struct interpretation
623 * raw: raw BTS record
624 */
625static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw)
626{
627 memset(out, 0, sizeof(*out));
628 if (bts_get(raw, bts_from) == bts_escape) {
629 out->qualifier = bts_get(raw, bts_qual);
630 out->variant.jiffies = bts_get(raw, bts_jiffies);
631 } else {
632 out->qualifier = BTS_BRANCH;
633 out->variant.lbr.from_ip = bts_get(raw, bts_from);
634 out->variant.lbr.to_ip = bts_get(raw, bts_to);
635 }
565} 636}
566 637
567static int ptrace_bts_read_record(struct task_struct *child, 638static int ptrace_bts_read_record(struct task_struct *child, size_t index,
568 long index,
569 struct bts_struct __user *out) 639 struct bts_struct __user *out)
570{ 640{
571 struct bts_struct ret; 641 struct bts_struct ret;
572 int retval; 642 const void *bts_record;
573 int bts_end; 643 size_t bts_index, bts_end;
574 int bts_index; 644 int error;
575
576 if (!child->thread.ds_area_msr)
577 return -ENXIO;
578 645
579 if (index < 0) 646 error = ds_get_bts_end(child, &bts_end);
580 return -EINVAL; 647 if (error < 0)
648 return error;
581 649
582 bts_end = ds_get_bts_end((void *)child->thread.ds_area_msr);
583 if (bts_end <= index) 650 if (bts_end <= index)
584 return -EINVAL; 651 return -EINVAL;
585 652
653 error = ds_get_bts_index(child, &bts_index);
654 if (error < 0)
655 return error;
656
586 /* translate the ptrace bts index into the ds bts index */ 657 /* translate the ptrace bts index into the ds bts index */
587 bts_index = ds_get_bts_index((void *)child->thread.ds_area_msr); 658 bts_index += bts_end - (index + 1);
588 bts_index -= (index + 1); 659 if (bts_end <= bts_index)
589 if (bts_index < 0) 660 bts_index -= bts_end;
590 bts_index += bts_end;
591 661
592 retval = ds_read_bts((void *)child->thread.ds_area_msr, 662 error = ds_access_bts(child, bts_index, &bts_record);
593 bts_index, &ret); 663 if (error < 0)
594 if (retval < 0) 664 return error;
595 return retval; 665
666 ptrace_bts_translate_record(&ret, bts_record);
596 667
597 if (copy_to_user(out, &ret, sizeof(ret))) 668 if (copy_to_user(out, &ret, sizeof(ret)))
598 return -EFAULT; 669 return -EFAULT;
@@ -600,101 +671,106 @@ static int ptrace_bts_read_record(struct task_struct *child,
600 return sizeof(ret); 671 return sizeof(ret);
601} 672}
602 673
603static int ptrace_bts_clear(struct task_struct *child)
604{
605 if (!child->thread.ds_area_msr)
606 return -ENXIO;
607
608 return ds_clear((void *)child->thread.ds_area_msr);
609}
610
611static int ptrace_bts_drain(struct task_struct *child, 674static int ptrace_bts_drain(struct task_struct *child,
612 long size, 675 long size,
613 struct bts_struct __user *out) 676 struct bts_struct __user *out)
614{ 677{
615 int end, i; 678 struct bts_struct ret;
616 void *ds = (void *)child->thread.ds_area_msr; 679 const unsigned char *raw;
617 680 size_t end, i;
618 if (!ds) 681 int error;
619 return -ENXIO;
620 682
621 end = ds_get_bts_index(ds); 683 error = ds_get_bts_index(child, &end);
622 if (end <= 0) 684 if (error < 0)
623 return end; 685 return error;
624 686
625 if (size < (end * sizeof(struct bts_struct))) 687 if (size < (end * sizeof(struct bts_struct)))
626 return -EIO; 688 return -EIO;
627 689
628 for (i = 0; i < end; i++, out++) { 690 error = ds_access_bts(child, 0, (const void **)&raw);
629 struct bts_struct ret; 691 if (error < 0)
630 int retval; 692 return error;
631 693
632 retval = ds_read_bts(ds, i, &ret); 694 for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) {
633 if (retval < 0) 695 ptrace_bts_translate_record(&ret, raw);
634 return retval;
635 696
636 if (copy_to_user(out, &ret, sizeof(ret))) 697 if (copy_to_user(out, &ret, sizeof(ret)))
637 return -EFAULT; 698 return -EFAULT;
638 } 699 }
639 700
640 ds_clear(ds); 701 error = ds_clear_bts(child);
702 if (error < 0)
703 return error;
641 704
642 return end; 705 return end;
643} 706}
644 707
708static void ptrace_bts_ovfl(struct task_struct *child)
709{
710 send_sig(child->thread.bts_ovfl_signal, child, 0);
711}
712
645static int ptrace_bts_config(struct task_struct *child, 713static int ptrace_bts_config(struct task_struct *child,
646 long cfg_size, 714 long cfg_size,
647 const struct ptrace_bts_config __user *ucfg) 715 const struct ptrace_bts_config __user *ucfg)
648{ 716{
649 struct ptrace_bts_config cfg; 717 struct ptrace_bts_config cfg;
650 int bts_size, ret = 0; 718 int error = 0;
651 void *ds; 719
720 error = -EOPNOTSUPP;
721 if (!bts_cfg.sizeof_bts)
722 goto errout;
652 723
724 error = -EIO;
653 if (cfg_size < sizeof(cfg)) 725 if (cfg_size < sizeof(cfg))
654 return -EIO; 726 goto errout;
655 727
728 error = -EFAULT;
656 if (copy_from_user(&cfg, ucfg, sizeof(cfg))) 729 if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
657 return -EFAULT; 730 goto errout;
658 731
659 if ((int)cfg.size < 0) 732 error = -EINVAL;
660 return -EINVAL; 733 if ((cfg.flags & PTRACE_BTS_O_SIGNAL) &&
734 !(cfg.flags & PTRACE_BTS_O_ALLOC))
735 goto errout;
661 736
662 bts_size = 0; 737 if (cfg.flags & PTRACE_BTS_O_ALLOC) {
663 ds = (void *)child->thread.ds_area_msr; 738 ds_ovfl_callback_t ovfl = NULL;
664 if (ds) { 739 unsigned int sig = 0;
665 bts_size = ds_get_bts_size(ds); 740
666 if (bts_size < 0) 741 /* we ignore the error in case we were not tracing child */
667 return bts_size; 742 (void)ds_release_bts(child);
668 }
669 cfg.size = PAGE_ALIGN(cfg.size);
670 743
671 if (bts_size != cfg.size) { 744 if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
672 ret = ptrace_bts_realloc(child, cfg.size, 745 if (!cfg.signal)
673 cfg.flags & PTRACE_BTS_O_CUT_SIZE); 746 goto errout;
674 if (ret < 0) 747
748 sig = cfg.signal;
749 ovfl = ptrace_bts_ovfl;
750 }
751
752 error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl);
753 if (error < 0)
675 goto errout; 754 goto errout;
676 755
677 ds = (void *)child->thread.ds_area_msr; 756 child->thread.bts_ovfl_signal = sig;
678 } 757 }
679 758
680 if (cfg.flags & PTRACE_BTS_O_SIGNAL) 759 error = -EINVAL;
681 ret = ds_set_overflow(ds, DS_O_SIGNAL); 760 if (!child->thread.ds_ctx && cfg.flags)
682 else
683 ret = ds_set_overflow(ds, DS_O_WRAP);
684 if (ret < 0)
685 goto errout; 761 goto errout;
686 762
687 if (cfg.flags & PTRACE_BTS_O_TRACE) 763 if (cfg.flags & PTRACE_BTS_O_TRACE)
688 child->thread.debugctlmsr |= ds_debugctl_mask(); 764 child->thread.debugctlmsr |= bts_cfg.debugctl_mask;
689 else 765 else
690 child->thread.debugctlmsr &= ~ds_debugctl_mask(); 766 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
691 767
692 if (cfg.flags & PTRACE_BTS_O_SCHED) 768 if (cfg.flags & PTRACE_BTS_O_SCHED)
693 set_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 769 set_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
694 else 770 else
695 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 771 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
696 772
697 ret = sizeof(cfg); 773 error = sizeof(cfg);
698 774
699out: 775out:
700 if (child->thread.debugctlmsr) 776 if (child->thread.debugctlmsr)
@@ -702,10 +778,10 @@ out:
702 else 778 else
703 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 779 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
704 780
705 return ret; 781 return error;
706 782
707errout: 783errout:
708 child->thread.debugctlmsr &= ~ds_debugctl_mask(); 784 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
709 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 785 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
710 goto out; 786 goto out;
711} 787}
@@ -714,29 +790,40 @@ static int ptrace_bts_status(struct task_struct *child,
714 long cfg_size, 790 long cfg_size,
715 struct ptrace_bts_config __user *ucfg) 791 struct ptrace_bts_config __user *ucfg)
716{ 792{
717 void *ds = (void *)child->thread.ds_area_msr;
718 struct ptrace_bts_config cfg; 793 struct ptrace_bts_config cfg;
794 size_t end;
795 const void *base, *max;
796 int error;
719 797
720 if (cfg_size < sizeof(cfg)) 798 if (cfg_size < sizeof(cfg))
721 return -EIO; 799 return -EIO;
722 800
723 memset(&cfg, 0, sizeof(cfg)); 801 error = ds_get_bts_end(child, &end);
802 if (error < 0)
803 return error;
724 804
725 if (ds) { 805 error = ds_access_bts(child, /* index = */ 0, &base);
726 cfg.size = ds_get_bts_size(ds); 806 if (error < 0)
807 return error;
727 808
728 if (ds_get_overflow(ds) == DS_O_SIGNAL) 809 error = ds_access_bts(child, /* index = */ end, &max);
729 cfg.flags |= PTRACE_BTS_O_SIGNAL; 810 if (error < 0)
811 return error;
730 812
731 if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && 813 memset(&cfg, 0, sizeof(cfg));
732 child->thread.debugctlmsr & ds_debugctl_mask()) 814 cfg.size = (max - base);
733 cfg.flags |= PTRACE_BTS_O_TRACE; 815 cfg.signal = child->thread.bts_ovfl_signal;
816 cfg.bts_size = sizeof(struct bts_struct);
734 817
735 if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) 818 if (cfg.signal)
736 cfg.flags |= PTRACE_BTS_O_SCHED; 819 cfg.flags |= PTRACE_BTS_O_SIGNAL;
737 }
738 820
739 cfg.bts_size = sizeof(struct bts_struct); 821 if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) &&
822 child->thread.debugctlmsr & bts_cfg.debugctl_mask)
823 cfg.flags |= PTRACE_BTS_O_TRACE;
824
825 if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS))
826 cfg.flags |= PTRACE_BTS_O_SCHED;
740 827
741 if (copy_to_user(ucfg, &cfg, sizeof(cfg))) 828 if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
742 return -EFAULT; 829 return -EFAULT;
@@ -744,89 +831,38 @@ static int ptrace_bts_status(struct task_struct *child,
744 return sizeof(cfg); 831 return sizeof(cfg);
745} 832}
746 833
747
748static int ptrace_bts_write_record(struct task_struct *child, 834static int ptrace_bts_write_record(struct task_struct *child,
749 const struct bts_struct *in) 835 const struct bts_struct *in)
750{ 836{
751 int retval; 837 unsigned char bts_record[BTS_MAX_RECORD_SIZE];
752 838
753 if (!child->thread.ds_area_msr) 839 BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts);
754 return -ENXIO;
755 840
756 retval = ds_write_bts((void *)child->thread.ds_area_msr, in); 841 memset(bts_record, 0, bts_cfg.sizeof_bts);
757 if (retval) 842 switch (in->qualifier) {
758 return retval; 843 case BTS_INVALID:
844 break;
759 845
760 return sizeof(*in); 846 case BTS_BRANCH:
761} 847 bts_set(bts_record, bts_from, in->variant.lbr.from_ip);
848 bts_set(bts_record, bts_to, in->variant.lbr.to_ip);
849 break;
762 850
763static int ptrace_bts_realloc(struct task_struct *child, 851 case BTS_TASK_ARRIVES:
764 int size, int reduce_size) 852 case BTS_TASK_DEPARTS:
765{ 853 bts_set(bts_record, bts_from, bts_escape);
766 unsigned long rlim, vm; 854 bts_set(bts_record, bts_qual, in->qualifier);
767 int ret, old_size; 855 bts_set(bts_record, bts_jiffies, in->variant.jiffies);
856 break;
768 857
769 if (size < 0) 858 default:
770 return -EINVAL; 859 return -EINVAL;
771
772 old_size = ds_get_bts_size((void *)child->thread.ds_area_msr);
773 if (old_size < 0)
774 return old_size;
775
776 ret = ds_free((void **)&child->thread.ds_area_msr);
777 if (ret < 0)
778 goto out;
779
780 size >>= PAGE_SHIFT;
781 old_size >>= PAGE_SHIFT;
782
783 current->mm->total_vm -= old_size;
784 current->mm->locked_vm -= old_size;
785
786 if (size == 0)
787 goto out;
788
789 rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
790 vm = current->mm->total_vm + size;
791 if (rlim < vm) {
792 ret = -ENOMEM;
793
794 if (!reduce_size)
795 goto out;
796
797 size = rlim - current->mm->total_vm;
798 if (size <= 0)
799 goto out;
800 }
801
802 rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
803 vm = current->mm->locked_vm + size;
804 if (rlim < vm) {
805 ret = -ENOMEM;
806
807 if (!reduce_size)
808 goto out;
809
810 size = rlim - current->mm->locked_vm;
811 if (size <= 0)
812 goto out;
813 } 860 }
814 861
815 ret = ds_allocate((void **)&child->thread.ds_area_msr, 862 /* The writing task will be the switched-to task on a context
816 size << PAGE_SHIFT); 863 * switch. It needs to write into the switched-from task's BTS
817 if (ret < 0) 864 * buffer. */
818 goto out; 865 return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts);
819
820 current->mm->total_vm += size;
821 current->mm->locked_vm += size;
822
823out:
824 if (child->thread.ds_area_msr)
825 set_tsk_thread_flag(child, TIF_DS_AREA_MSR);
826 else
827 clear_tsk_thread_flag(child, TIF_DS_AREA_MSR);
828
829 return ret;
830} 866}
831 867
832void ptrace_bts_take_timestamp(struct task_struct *tsk, 868void ptrace_bts_take_timestamp(struct task_struct *tsk,
@@ -839,7 +875,66 @@ void ptrace_bts_take_timestamp(struct task_struct *tsk,
839 875
840 ptrace_bts_write_record(tsk, &rec); 876 ptrace_bts_write_record(tsk, &rec);
841} 877}
842#endif /* X86_BTS */ 878
879static const struct bts_configuration bts_cfg_netburst = {
880 .sizeof_bts = sizeof(long) * 3,
881 .sizeof_field = sizeof(long),
882 .debugctl_mask = (1<<2)|(1<<3)|(1<<5)
883};
884
885static const struct bts_configuration bts_cfg_pentium_m = {
886 .sizeof_bts = sizeof(long) * 3,
887 .sizeof_field = sizeof(long),
888 .debugctl_mask = (1<<6)|(1<<7)
889};
890
891static const struct bts_configuration bts_cfg_core2 = {
892 .sizeof_bts = 8 * 3,
893 .sizeof_field = 8,
894 .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
895};
896
897static inline void bts_configure(const struct bts_configuration *cfg)
898{
899 bts_cfg = *cfg;
900}
901
902void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c)
903{
904 switch (c->x86) {
905 case 0x6:
906 switch (c->x86_model) {
907 case 0xD:
908 case 0xE: /* Pentium M */
909 bts_configure(&bts_cfg_pentium_m);
910 break;
911 case 0xF: /* Core2 */
912 case 0x1C: /* Atom */
913 bts_configure(&bts_cfg_core2);
914 break;
915 default:
916 /* sorry, don't know about them */
917 break;
918 }
919 break;
920 case 0xF:
921 switch (c->x86_model) {
922 case 0x0:
923 case 0x1:
924 case 0x2: /* Netburst */
925 bts_configure(&bts_cfg_netburst);
926 break;
927 default:
928 /* sorry, don't know about them */
929 break;
930 }
931 break;
932 default:
933 /* sorry, don't know about them */
934 break;
935 }
936}
937#endif /* CONFIG_X86_PTRACE_BTS */
843 938
844/* 939/*
845 * Called by kernel/ptrace.c when detaching.. 940 * Called by kernel/ptrace.c when detaching..
@@ -852,15 +947,15 @@ void ptrace_disable(struct task_struct *child)
852#ifdef TIF_SYSCALL_EMU 947#ifdef TIF_SYSCALL_EMU
853 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); 948 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
854#endif 949#endif
855 if (child->thread.ds_area_msr) { 950#ifdef CONFIG_X86_PTRACE_BTS
856#ifdef X86_BTS 951 (void)ds_release_bts(child);
857 ptrace_bts_realloc(child, 0, 0); 952
858#endif 953 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
859 child->thread.debugctlmsr &= ~ds_debugctl_mask(); 954 if (!child->thread.debugctlmsr)
860 if (!child->thread.debugctlmsr) 955 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
861 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 956
862 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 957 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
863 } 958#endif /* CONFIG_X86_PTRACE_BTS */
864} 959}
865 960
866#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 961#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
@@ -980,7 +1075,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
980 /* 1075 /*
981 * These bits need more cooking - not enabled yet: 1076 * These bits need more cooking - not enabled yet:
982 */ 1077 */
983#ifdef X86_BTS 1078#ifdef CONFIG_X86_PTRACE_BTS
984 case PTRACE_BTS_CONFIG: 1079 case PTRACE_BTS_CONFIG:
985 ret = ptrace_bts_config 1080 ret = ptrace_bts_config
986 (child, data, (struct ptrace_bts_config __user *)addr); 1081 (child, data, (struct ptrace_bts_config __user *)addr);
@@ -992,7 +1087,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
992 break; 1087 break;
993 1088
994 case PTRACE_BTS_SIZE: 1089 case PTRACE_BTS_SIZE:
995 ret = ptrace_bts_get_size(child); 1090 ret = ds_get_bts_index(child, /* pos = */ NULL);
996 break; 1091 break;
997 1092
998 case PTRACE_BTS_GET: 1093 case PTRACE_BTS_GET:
@@ -1001,14 +1096,14 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
1001 break; 1096 break;
1002 1097
1003 case PTRACE_BTS_CLEAR: 1098 case PTRACE_BTS_CLEAR:
1004 ret = ptrace_bts_clear(child); 1099 ret = ds_clear_bts(child);
1005 break; 1100 break;
1006 1101
1007 case PTRACE_BTS_DRAIN: 1102 case PTRACE_BTS_DRAIN:
1008 ret = ptrace_bts_drain 1103 ret = ptrace_bts_drain
1009 (child, data, (struct bts_struct __user *) addr); 1104 (child, data, (struct bts_struct __user *) addr);
1010 break; 1105 break;
1011#endif 1106#endif /* CONFIG_X86_PTRACE_BTS */
1012 1107
1013 default: 1108 default:
1014 ret = ptrace_request(child, request, addr, data); 1109 ret = ptrace_request(child, request, addr, data);
@@ -1375,30 +1470,6 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1375 force_sig_info(SIGTRAP, &info, tsk); 1470 force_sig_info(SIGTRAP, &info, tsk);
1376} 1471}
1377 1472
1378static void syscall_trace(struct pt_regs *regs)
1379{
1380 if (!(current->ptrace & PT_PTRACED))
1381 return;
1382
1383#if 0
1384 printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n",
1385 current->comm,
1386 regs->ip, regs->sp, regs->ax, regs->orig_ax, __builtin_return_address(0),
1387 current_thread_info()->flags, current->ptrace);
1388#endif
1389
1390 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
1391 ? 0x80 : 0));
1392 /*
1393 * this isn't the same as continuing with a signal, but it will do
1394 * for normal use. strace only continues with a signal if the
1395 * stopping signal is not SIGTRAP. -brl
1396 */
1397 if (current->exit_code) {
1398 send_sig(current->exit_code, current, 1);
1399 current->exit_code = 0;
1400 }
1401}
1402 1473
1403#ifdef CONFIG_X86_32 1474#ifdef CONFIG_X86_32
1404# define IS_IA32 1 1475# define IS_IA32 1
@@ -1432,8 +1503,9 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
1432 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) 1503 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
1433 ret = -1L; 1504 ret = -1L;
1434 1505
1435 if (ret || test_thread_flag(TIF_SYSCALL_TRACE)) 1506 if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
1436 syscall_trace(regs); 1507 tracehook_report_syscall_entry(regs))
1508 ret = -1L;
1437 1509
1438 if (unlikely(current->audit_context)) { 1510 if (unlikely(current->audit_context)) {
1439 if (IS_IA32) 1511 if (IS_IA32)
@@ -1459,7 +1531,7 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
1459 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); 1531 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1460 1532
1461 if (test_thread_flag(TIF_SYSCALL_TRACE)) 1533 if (test_thread_flag(TIF_SYSCALL_TRACE))
1462 syscall_trace(regs); 1534 tracehook_report_syscall_exit(regs, 0);
1463 1535
1464 /* 1536 /*
1465 * If TIF_SYSCALL_EMU is set, we only get here because of 1537 * If TIF_SYSCALL_EMU is set, we only get here because of
@@ -1475,6 +1547,6 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
1475 * system call instruction. 1547 * system call instruction.
1476 */ 1548 */
1477 if (test_thread_flag(TIF_SINGLESTEP) && 1549 if (test_thread_flag(TIF_SINGLESTEP) &&
1478 (current->ptrace & PT_PTRACED)) 1550 tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL))
1479 send_sigtrap(current, regs, 0); 1551 send_sigtrap(current, regs, 0);
1480} 1552}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 724adfc63cb9..f4c93f1cfc19 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -29,7 +29,11 @@ EXPORT_SYMBOL(pm_power_off);
29 29
30static const struct desc_ptr no_idt = {}; 30static const struct desc_ptr no_idt = {};
31static int reboot_mode; 31static int reboot_mode;
32enum reboot_type reboot_type = BOOT_KBD; 32/*
33 * Keyboard reset and triple fault may result in INIT, not RESET, which
34 * doesn't work when we're in vmx root mode. Try ACPI first.
35 */
36enum reboot_type reboot_type = BOOT_ACPI;
33int reboot_force; 37int reboot_force;
34 38
35#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) 39#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 673f12cf6eb0..46c98efbbf8d 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -223,6 +223,9 @@ unsigned long saved_video_mode;
223#define RAMDISK_LOAD_FLAG 0x4000 223#define RAMDISK_LOAD_FLAG 0x4000
224 224
225static char __initdata command_line[COMMAND_LINE_SIZE]; 225static char __initdata command_line[COMMAND_LINE_SIZE];
226#ifdef CONFIG_CMDLINE_BOOL
227static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
228#endif
226 229
227#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) 230#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
228struct edd edd; 231struct edd edd;
@@ -665,11 +668,28 @@ void __init setup_arch(char **cmdline_p)
665 bss_resource.start = virt_to_phys(&__bss_start); 668 bss_resource.start = virt_to_phys(&__bss_start);
666 bss_resource.end = virt_to_phys(&__bss_stop)-1; 669 bss_resource.end = virt_to_phys(&__bss_stop)-1;
667 670
671#ifdef CONFIG_CMDLINE_BOOL
672#ifdef CONFIG_CMDLINE_OVERRIDE
673 strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
674#else
675 if (builtin_cmdline[0]) {
676 /* append boot loader cmdline to builtin */
677 strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
678 strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
679 strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
680 }
681#endif
682#endif
683
668 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); 684 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
669 *cmdline_p = command_line; 685 *cmdline_p = command_line;
670 686
671 parse_early_param(); 687 parse_early_param();
672 688
689#ifdef CONFIG_X86_64
690 check_efer();
691#endif
692
673#if defined(CONFIG_VMI) && defined(CONFIG_X86_32) 693#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
674 /* 694 /*
675 * Must be before kernel pagetables are setup 695 * Must be before kernel pagetables are setup
@@ -738,7 +758,6 @@ void __init setup_arch(char **cmdline_p)
738#else 758#else
739 num_physpages = max_pfn; 759 num_physpages = max_pfn;
740 760
741 check_efer();
742 if (cpu_has_x2apic) 761 if (cpu_has_x2apic)
743 check_x2apic(); 762 check_x2apic();
744 763
diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h
index 6dd7e2b70a4b..cc673aa55ce4 100644
--- a/arch/x86/kernel/sigframe.h
+++ b/arch/x86/kernel/sigframe.h
@@ -34,4 +34,9 @@ struct rt_sigframe {
34 struct siginfo info; 34 struct siginfo info;
35 /* fp state follows here */ 35 /* fp state follows here */
36}; 36};
37
38int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
39 sigset_t *set, struct pt_regs *regs);
40int ia32_setup_frame(int sig, struct k_sigaction *ka,
41 sigset_t *set, struct pt_regs *regs);
37#endif 42#endif
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index 8d380b699c0c..b21070ea33a4 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -17,6 +17,7 @@
17#include <linux/errno.h> 17#include <linux/errno.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/wait.h> 19#include <linux/wait.h>
20#include <linux/tracehook.h>
20#include <linux/elf.h> 21#include <linux/elf.h>
21#include <linux/smp.h> 22#include <linux/smp.h>
22#include <linux/mm.h> 23#include <linux/mm.h>
@@ -556,8 +557,6 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
556 * handler too. 557 * handler too.
557 */ 558 */
558 regs->flags &= ~X86_EFLAGS_TF; 559 regs->flags &= ~X86_EFLAGS_TF;
559 if (test_thread_flag(TIF_SINGLESTEP))
560 ptrace_notify(SIGTRAP);
561 560
562 spin_lock_irq(&current->sighand->siglock); 561 spin_lock_irq(&current->sighand->siglock);
563 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask); 562 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
@@ -566,6 +565,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
566 recalc_sigpending(); 565 recalc_sigpending();
567 spin_unlock_irq(&current->sighand->siglock); 566 spin_unlock_irq(&current->sighand->siglock);
568 567
568 tracehook_signal_handler(sig, info, ka, regs,
569 test_thread_flag(TIF_SINGLESTEP));
570
569 return 0; 571 return 0;
570} 572}
571 573
@@ -659,5 +661,10 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
659 if (thread_info_flags & _TIF_SIGPENDING) 661 if (thread_info_flags & _TIF_SIGPENDING)
660 do_signal(regs); 662 do_signal(regs);
661 663
664 if (thread_info_flags & _TIF_NOTIFY_RESUME) {
665 clear_thread_flag(TIF_NOTIFY_RESUME);
666 tracehook_notify_resume(regs);
667 }
668
662 clear_thread_flag(TIF_IRET); 669 clear_thread_flag(TIF_IRET);
663} 670}
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index 4665b598a376..823a55bf8c39 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -15,17 +15,20 @@
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/wait.h> 16#include <linux/wait.h>
17#include <linux/ptrace.h> 17#include <linux/ptrace.h>
18#include <linux/tracehook.h>
18#include <linux/unistd.h> 19#include <linux/unistd.h>
19#include <linux/stddef.h> 20#include <linux/stddef.h>
20#include <linux/personality.h> 21#include <linux/personality.h>
21#include <linux/compiler.h> 22#include <linux/compiler.h>
23#include <linux/uaccess.h>
24
22#include <asm/processor.h> 25#include <asm/processor.h>
23#include <asm/ucontext.h> 26#include <asm/ucontext.h>
24#include <asm/uaccess.h>
25#include <asm/i387.h> 27#include <asm/i387.h>
26#include <asm/proto.h> 28#include <asm/proto.h>
27#include <asm/ia32_unistd.h> 29#include <asm/ia32_unistd.h>
28#include <asm/mce.h> 30#include <asm/mce.h>
31#include <asm/syscall.h>
29#include <asm/syscalls.h> 32#include <asm/syscalls.h>
30#include "sigframe.h" 33#include "sigframe.h"
31 34
@@ -42,11 +45,6 @@
42# define FIX_EFLAGS __FIX_EFLAGS 45# define FIX_EFLAGS __FIX_EFLAGS
43#endif 46#endif
44 47
45int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
46 sigset_t *set, struct pt_regs * regs);
47int ia32_setup_frame(int sig, struct k_sigaction *ka,
48 sigset_t *set, struct pt_regs * regs);
49
50asmlinkage long 48asmlinkage long
51sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, 49sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
52 struct pt_regs *regs) 50 struct pt_regs *regs)
@@ -66,7 +64,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
66 /* Always make any pending restarted system calls return -EINTR */ 64 /* Always make any pending restarted system calls return -EINTR */
67 current_thread_info()->restart_block.fn = do_no_restart_syscall; 65 current_thread_info()->restart_block.fn = do_no_restart_syscall;
68 66
69#define COPY(x) err |= __get_user(regs->x, &sc->x) 67#define COPY(x) (err |= __get_user(regs->x, &sc->x))
70 68
71 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); 69 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
72 COPY(dx); COPY(cx); COPY(ip); 70 COPY(dx); COPY(cx); COPY(ip);
@@ -96,7 +94,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
96 } 94 }
97 95
98 { 96 {
99 struct _fpstate __user * buf; 97 struct _fpstate __user *buf;
100 err |= __get_user(buf, &sc->fpstate); 98 err |= __get_user(buf, &sc->fpstate);
101 err |= restore_i387_xstate(buf); 99 err |= restore_i387_xstate(buf);
102 } 100 }
@@ -122,7 +120,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
122 current->blocked = set; 120 current->blocked = set;
123 recalc_sigpending(); 121 recalc_sigpending();
124 spin_unlock_irq(&current->sighand->siglock); 122 spin_unlock_irq(&current->sighand->siglock);
125 123
126 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 124 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
127 goto badframe; 125 goto badframe;
128 126
@@ -132,16 +130,17 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
132 return ax; 130 return ax;
133 131
134badframe: 132badframe:
135 signal_fault(regs,frame,"sigreturn"); 133 signal_fault(regs, frame, "sigreturn");
136 return 0; 134 return 0;
137} 135}
138 136
139/* 137/*
140 * Set up a signal frame. 138 * Set up a signal frame.
141 */ 139 */
142 140
143static inline int 141static inline int
144setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me) 142setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
143 unsigned long mask, struct task_struct *me)
145{ 144{
146 int err = 0; 145 int err = 0;
147 146
@@ -197,7 +196,7 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
197} 196}
198 197
199static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 198static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
200 sigset_t *set, struct pt_regs * regs) 199 sigset_t *set, struct pt_regs *regs)
201{ 200{
202 struct rt_sigframe __user *frame; 201 struct rt_sigframe __user *frame;
203 void __user *fp = NULL; 202 void __user *fp = NULL;
@@ -210,19 +209,19 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
210 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; 209 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
211 210
212 if (save_i387_xstate(fp) < 0) 211 if (save_i387_xstate(fp) < 0)
213 err |= -1; 212 err |= -1;
214 } else 213 } else
215 frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; 214 frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
216 215
217 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 216 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
218 goto give_sigsegv; 217 goto give_sigsegv;
219 218
220 if (ka->sa.sa_flags & SA_SIGINFO) { 219 if (ka->sa.sa_flags & SA_SIGINFO) {
221 err |= copy_siginfo_to_user(&frame->info, info); 220 err |= copy_siginfo_to_user(&frame->info, info);
222 if (err) 221 if (err)
223 goto give_sigsegv; 222 goto give_sigsegv;
224 } 223 }
225 224
226 /* Create the ucontext. */ 225 /* Create the ucontext. */
227 if (cpu_has_xsave) 226 if (cpu_has_xsave)
228 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); 227 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
@@ -235,9 +234,9 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
235 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); 234 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
236 err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me); 235 err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
237 err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate); 236 err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
238 if (sizeof(*set) == 16) { 237 if (sizeof(*set) == 16) {
239 __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); 238 __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
240 __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); 239 __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
241 } else 240 } else
242 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); 241 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
243 242
@@ -248,7 +247,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
248 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); 247 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
249 } else { 248 } else {
250 /* could use a vstub here */ 249 /* could use a vstub here */
251 goto give_sigsegv; 250 goto give_sigsegv;
252 } 251 }
253 252
254 if (err) 253 if (err)
@@ -256,7 +255,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
256 255
257 /* Set up registers for signal handler */ 256 /* Set up registers for signal handler */
258 regs->di = sig; 257 regs->di = sig;
259 /* In case the signal handler was declared without prototypes */ 258 /* In case the signal handler was declared without prototypes */
260 regs->ax = 0; 259 regs->ax = 0;
261 260
262 /* This also works for non SA_SIGINFO handlers because they expect the 261 /* This also works for non SA_SIGINFO handlers because they expect the
@@ -279,37 +278,8 @@ give_sigsegv:
279} 278}
280 279
281/* 280/*
282 * Return -1L or the syscall number that @regs is executing.
283 */
284static long current_syscall(struct pt_regs *regs)
285{
286 /*
287 * We always sign-extend a -1 value being set here,
288 * so this is always either -1L or a syscall number.
289 */
290 return regs->orig_ax;
291}
292
293/*
294 * Return a value that is -EFOO if the system call in @regs->orig_ax
295 * returned an error. This only works for @regs from @current.
296 */
297static long current_syscall_ret(struct pt_regs *regs)
298{
299#ifdef CONFIG_IA32_EMULATION
300 if (test_thread_flag(TIF_IA32))
301 /*
302 * Sign-extend the value so (int)-EFOO becomes (long)-EFOO
303 * and will match correctly in comparisons.
304 */
305 return (int) regs->ax;
306#endif
307 return regs->ax;
308}
309
310/*
311 * OK, we're invoking a handler 281 * OK, we're invoking a handler
312 */ 282 */
313 283
314static int 284static int
315handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, 285handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
@@ -318,9 +288,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
318 int ret; 288 int ret;
319 289
320 /* Are we from a system call? */ 290 /* Are we from a system call? */
321 if (current_syscall(regs) >= 0) { 291 if (syscall_get_nr(current, regs) >= 0) {
322 /* If so, check system call restarting.. */ 292 /* If so, check system call restarting.. */
323 switch (current_syscall_ret(regs)) { 293 switch (syscall_get_error(current, regs)) {
324 case -ERESTART_RESTARTBLOCK: 294 case -ERESTART_RESTARTBLOCK:
325 case -ERESTARTNOHAND: 295 case -ERESTARTNOHAND:
326 regs->ax = -EINTR; 296 regs->ax = -EINTR;
@@ -353,7 +323,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
353 ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs); 323 ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs);
354 else 324 else
355 ret = ia32_setup_frame(sig, ka, oldset, regs); 325 ret = ia32_setup_frame(sig, ka, oldset, regs);
356 } else 326 } else
357#endif 327#endif
358 ret = setup_rt_frame(sig, ka, info, oldset, regs); 328 ret = setup_rt_frame(sig, ka, info, oldset, regs);
359 329
@@ -377,15 +347,16 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
377 * handler too. 347 * handler too.
378 */ 348 */
379 regs->flags &= ~X86_EFLAGS_TF; 349 regs->flags &= ~X86_EFLAGS_TF;
380 if (test_thread_flag(TIF_SINGLESTEP))
381 ptrace_notify(SIGTRAP);
382 350
383 spin_lock_irq(&current->sighand->siglock); 351 spin_lock_irq(&current->sighand->siglock);
384 sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask); 352 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
385 if (!(ka->sa.sa_flags & SA_NODEFER)) 353 if (!(ka->sa.sa_flags & SA_NODEFER))
386 sigaddset(&current->blocked,sig); 354 sigaddset(&current->blocked, sig);
387 recalc_sigpending(); 355 recalc_sigpending();
388 spin_unlock_irq(&current->sighand->siglock); 356 spin_unlock_irq(&current->sighand->siglock);
357
358 tracehook_signal_handler(sig, info, ka, regs,
359 test_thread_flag(TIF_SINGLESTEP));
389 } 360 }
390 361
391 return ret; 362 return ret;
@@ -442,9 +413,9 @@ static void do_signal(struct pt_regs *regs)
442 } 413 }
443 414
444 /* Did we come from a system call? */ 415 /* Did we come from a system call? */
445 if (current_syscall(regs) >= 0) { 416 if (syscall_get_nr(current, regs) >= 0) {
446 /* Restart the system call - no handlers present */ 417 /* Restart the system call - no handlers present */
447 switch (current_syscall_ret(regs)) { 418 switch (syscall_get_error(current, regs)) {
448 case -ERESTARTNOHAND: 419 case -ERESTARTNOHAND:
449 case -ERESTARTSYS: 420 case -ERESTARTSYS:
450 case -ERESTARTNOINTR: 421 case -ERESTARTNOINTR:
@@ -482,17 +453,23 @@ void do_notify_resume(struct pt_regs *regs, void *unused,
482 /* deal with pending signal delivery */ 453 /* deal with pending signal delivery */
483 if (thread_info_flags & _TIF_SIGPENDING) 454 if (thread_info_flags & _TIF_SIGPENDING)
484 do_signal(regs); 455 do_signal(regs);
456
457 if (thread_info_flags & _TIF_NOTIFY_RESUME) {
458 clear_thread_flag(TIF_NOTIFY_RESUME);
459 tracehook_notify_resume(regs);
460 }
485} 461}
486 462
487void signal_fault(struct pt_regs *regs, void __user *frame, char *where) 463void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
488{ 464{
489 struct task_struct *me = current; 465 struct task_struct *me = current;
490 if (show_unhandled_signals && printk_ratelimit()) { 466 if (show_unhandled_signals && printk_ratelimit()) {
491 printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", 467 printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
492 me->comm,me->pid,where,frame,regs->ip,regs->sp,regs->orig_ax); 468 me->comm, me->pid, where, frame, regs->ip,
469 regs->sp, regs->orig_ax);
493 print_vma_addr(" in ", regs->ip); 470 print_vma_addr(" in ", regs->ip);
494 printk("\n"); 471 printk("\n");
495 } 472 }
496 473
497 force_sig(SIGSEGV, me); 474 force_sig(SIGSEGV, me);
498} 475}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index aa804c64b167..9056f7e272c0 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -258,6 +258,7 @@ static void __cpuinit smp_callin(void)
258 end_local_APIC_setup(); 258 end_local_APIC_setup();
259 map_cpu_to_logical_apicid(); 259 map_cpu_to_logical_apicid();
260 260
261 notify_cpu_starting(cpuid);
261 /* 262 /*
262 * Get our bogomips. 263 * Get our bogomips.
263 * 264 *
@@ -1313,16 +1314,13 @@ __init void prefill_possible_map(void)
1313 if (!num_processors) 1314 if (!num_processors)
1314 num_processors = 1; 1315 num_processors = 1;
1315 1316
1316#ifdef CONFIG_HOTPLUG_CPU
1317 if (additional_cpus == -1) { 1317 if (additional_cpus == -1) {
1318 if (disabled_cpus > 0) 1318 if (disabled_cpus > 0)
1319 additional_cpus = disabled_cpus; 1319 additional_cpus = disabled_cpus;
1320 else 1320 else
1321 additional_cpus = 0; 1321 additional_cpus = 0;
1322 } 1322 }
1323#else 1323
1324 additional_cpus = 0;
1325#endif
1326 possible = num_processors + additional_cpus; 1324 possible = num_processors + additional_cpus;
1327 if (possible > NR_CPUS) 1325 if (possible > NR_CPUS)
1328 possible = NR_CPUS; 1326 possible = NR_CPUS;
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index c9288c883e20..6bc211accf08 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -13,16 +13,17 @@
13#include <linux/utsname.h> 13#include <linux/utsname.h>
14#include <linux/personality.h> 14#include <linux/personality.h>
15#include <linux/random.h> 15#include <linux/random.h>
16#include <linux/uaccess.h>
16 17
17#include <asm/uaccess.h>
18#include <asm/ia32.h> 18#include <asm/ia32.h>
19#include <asm/syscalls.h> 19#include <asm/syscalls.h>
20 20
21asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, 21asmlinkage long sys_mmap(unsigned long addr, unsigned long len,
22 unsigned long fd, unsigned long off) 22 unsigned long prot, unsigned long flags,
23 unsigned long fd, unsigned long off)
23{ 24{
24 long error; 25 long error;
25 struct file * file; 26 struct file *file;
26 27
27 error = -EINVAL; 28 error = -EINVAL;
28 if (off & ~PAGE_MASK) 29 if (off & ~PAGE_MASK)
@@ -57,9 +58,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
57 unmapped base down for this case. This can give 58 unmapped base down for this case. This can give
58 conflicts with the heap, but we assume that glibc 59 conflicts with the heap, but we assume that glibc
59 malloc knows how to fall back to mmap. Give it 1GB 60 malloc knows how to fall back to mmap. Give it 1GB
60 of playground for now. -AK */ 61 of playground for now. -AK */
61 *begin = 0x40000000; 62 *begin = 0x40000000;
62 *end = 0x80000000; 63 *end = 0x80000000;
63 if (current->flags & PF_RANDOMIZE) { 64 if (current->flags & PF_RANDOMIZE) {
64 new_begin = randomize_range(*begin, *begin + 0x02000000, 0); 65 new_begin = randomize_range(*begin, *begin + 0x02000000, 0);
65 if (new_begin) 66 if (new_begin)
@@ -67,9 +68,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
67 } 68 }
68 } else { 69 } else {
69 *begin = TASK_UNMAPPED_BASE; 70 *begin = TASK_UNMAPPED_BASE;
70 *end = TASK_SIZE; 71 *end = TASK_SIZE;
71 } 72 }
72} 73}
73 74
74unsigned long 75unsigned long
75arch_get_unmapped_area(struct file *filp, unsigned long addr, 76arch_get_unmapped_area(struct file *filp, unsigned long addr,
@@ -79,11 +80,11 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
79 struct vm_area_struct *vma; 80 struct vm_area_struct *vma;
80 unsigned long start_addr; 81 unsigned long start_addr;
81 unsigned long begin, end; 82 unsigned long begin, end;
82 83
83 if (flags & MAP_FIXED) 84 if (flags & MAP_FIXED)
84 return addr; 85 return addr;
85 86
86 find_start_end(flags, &begin, &end); 87 find_start_end(flags, &begin, &end);
87 88
88 if (len > end) 89 if (len > end)
89 return -ENOMEM; 90 return -ENOMEM;
@@ -97,12 +98,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
97 } 98 }
98 if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32)) 99 if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32))
99 && len <= mm->cached_hole_size) { 100 && len <= mm->cached_hole_size) {
100 mm->cached_hole_size = 0; 101 mm->cached_hole_size = 0;
101 mm->free_area_cache = begin; 102 mm->free_area_cache = begin;
102 } 103 }
103 addr = mm->free_area_cache; 104 addr = mm->free_area_cache;
104 if (addr < begin) 105 if (addr < begin)
105 addr = begin; 106 addr = begin;
106 start_addr = addr; 107 start_addr = addr;
107 108
108full_search: 109full_search:
@@ -128,7 +129,7 @@ full_search:
128 return addr; 129 return addr;
129 } 130 }
130 if (addr + mm->cached_hole_size < vma->vm_start) 131 if (addr + mm->cached_hole_size < vma->vm_start)
131 mm->cached_hole_size = vma->vm_start - addr; 132 mm->cached_hole_size = vma->vm_start - addr;
132 133
133 addr = vma->vm_end; 134 addr = vma->vm_end;
134 } 135 }
@@ -178,7 +179,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
178 vma = find_vma(mm, addr-len); 179 vma = find_vma(mm, addr-len);
179 if (!vma || addr <= vma->vm_start) 180 if (!vma || addr <= vma->vm_start)
180 /* remember the address as a hint for next time */ 181 /* remember the address as a hint for next time */
181 return (mm->free_area_cache = addr-len); 182 return mm->free_area_cache = addr-len;
182 } 183 }
183 184
184 if (mm->mmap_base < len) 185 if (mm->mmap_base < len)
@@ -195,7 +196,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
195 vma = find_vma(mm, addr); 196 vma = find_vma(mm, addr);
196 if (!vma || addr+len <= vma->vm_start) 197 if (!vma || addr+len <= vma->vm_start)
197 /* remember the address as a hint for next time */ 198 /* remember the address as a hint for next time */
198 return (mm->free_area_cache = addr); 199 return mm->free_area_cache = addr;
199 200
200 /* remember the largest hole we saw so far */ 201 /* remember the largest hole we saw so far */
201 if (addr + mm->cached_hole_size < vma->vm_start) 202 if (addr + mm->cached_hole_size < vma->vm_start)
@@ -225,13 +226,13 @@ bottomup:
225} 226}
226 227
227 228
228asmlinkage long sys_uname(struct new_utsname __user * name) 229asmlinkage long sys_uname(struct new_utsname __user *name)
229{ 230{
230 int err; 231 int err;
231 down_read(&uts_sem); 232 down_read(&uts_sem);
232 err = copy_to_user(name, utsname(), sizeof (*name)); 233 err = copy_to_user(name, utsname(), sizeof(*name));
233 up_read(&uts_sem); 234 up_read(&uts_sem);
234 if (personality(current->personality) == PER_LINUX32) 235 if (personality(current->personality) == PER_LINUX32)
235 err |= copy_to_user(&name->machine, "i686", 5); 236 err |= copy_to_user(&name->machine, "i686", 5);
236 return err ? -EFAULT : 0; 237 return err ? -EFAULT : 0;
237} 238}
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index b42068fb7b76..2887a789e38f 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -32,6 +32,8 @@
32#include <linux/bug.h> 32#include <linux/bug.h>
33#include <linux/nmi.h> 33#include <linux/nmi.h>
34#include <linux/mm.h> 34#include <linux/mm.h>
35#include <linux/smp.h>
36#include <linux/io.h>
35 37
36#if defined(CONFIG_EDAC) 38#if defined(CONFIG_EDAC)
37#include <linux/edac.h> 39#include <linux/edac.h>
@@ -45,9 +47,6 @@
45#include <asm/unwind.h> 47#include <asm/unwind.h>
46#include <asm/desc.h> 48#include <asm/desc.h>
47#include <asm/i387.h> 49#include <asm/i387.h>
48#include <asm/nmi.h>
49#include <asm/smp.h>
50#include <asm/io.h>
51#include <asm/pgalloc.h> 50#include <asm/pgalloc.h>
52#include <asm/proto.h> 51#include <asm/proto.h>
53#include <asm/pda.h> 52#include <asm/pda.h>
@@ -85,7 +84,8 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
85 84
86void printk_address(unsigned long address, int reliable) 85void printk_address(unsigned long address, int reliable)
87{ 86{
88 printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address); 87 printk(" [<%016lx>] %s%pS\n",
88 address, reliable ? "" : "? ", (void *) address);
89} 89}
90 90
91static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 91static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
@@ -98,7 +98,8 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
98 [STACKFAULT_STACK - 1] = "#SS", 98 [STACKFAULT_STACK - 1] = "#SS",
99 [MCE_STACK - 1] = "#MC", 99 [MCE_STACK - 1] = "#MC",
100#if DEBUG_STKSZ > EXCEPTION_STKSZ 100#if DEBUG_STKSZ > EXCEPTION_STKSZ
101 [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" 101 [N_EXCEPTION_STACKS ...
102 N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
102#endif 103#endif
103 }; 104 };
104 unsigned k; 105 unsigned k;
@@ -163,7 +164,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
163} 164}
164 165
165/* 166/*
166 * x86-64 can have up to three kernel stacks: 167 * x86-64 can have up to three kernel stacks:
167 * process stack 168 * process stack
168 * interrupt stack 169 * interrupt stack
169 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack 170 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
@@ -219,7 +220,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
219 const struct stacktrace_ops *ops, void *data) 220 const struct stacktrace_ops *ops, void *data)
220{ 221{
221 const unsigned cpu = get_cpu(); 222 const unsigned cpu = get_cpu();
222 unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr; 223 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
223 unsigned used = 0; 224 unsigned used = 0;
224 struct thread_info *tinfo; 225 struct thread_info *tinfo;
225 226
@@ -237,7 +238,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
237 if (!bp) { 238 if (!bp) {
238 if (task == current) { 239 if (task == current) {
239 /* Grab bp right from our regs */ 240 /* Grab bp right from our regs */
240 asm("movq %%rbp, %0" : "=r" (bp) :); 241 asm("movq %%rbp, %0" : "=r" (bp) : );
241 } else { 242 } else {
242 /* bp is the last reg pushed by switch_to */ 243 /* bp is the last reg pushed by switch_to */
243 bp = *(unsigned long *) task->thread.sp; 244 bp = *(unsigned long *) task->thread.sp;
@@ -356,11 +357,15 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
356 unsigned long *stack; 357 unsigned long *stack;
357 int i; 358 int i;
358 const int cpu = smp_processor_id(); 359 const int cpu = smp_processor_id();
359 unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr); 360 unsigned long *irqstack_end =
360 unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); 361 (unsigned long *) (cpu_pda(cpu)->irqstackptr);
362 unsigned long *irqstack =
363 (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
361 364
362 // debugging aid: "show_stack(NULL, NULL);" prints the 365 /*
363 // back trace for this cpu. 366 * debugging aid: "show_stack(NULL, NULL);" prints the
367 * back trace for this cpu.
368 */
364 369
365 if (sp == NULL) { 370 if (sp == NULL) {
366 if (task) 371 if (task)
@@ -404,7 +409,7 @@ void dump_stack(void)
404 409
405#ifdef CONFIG_FRAME_POINTER 410#ifdef CONFIG_FRAME_POINTER
406 if (!bp) 411 if (!bp)
407 asm("movq %%rbp, %0" : "=r" (bp):); 412 asm("movq %%rbp, %0" : "=r" (bp) : );
408#endif 413#endif
409 414
410 printk("Pid: %d, comm: %.20s %s %s %.*s\n", 415 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
@@ -414,7 +419,6 @@ void dump_stack(void)
414 init_utsname()->version); 419 init_utsname()->version);
415 show_trace(NULL, NULL, &stack, bp); 420 show_trace(NULL, NULL, &stack, bp);
416} 421}
417
418EXPORT_SYMBOL(dump_stack); 422EXPORT_SYMBOL(dump_stack);
419 423
420void show_registers(struct pt_regs *regs) 424void show_registers(struct pt_regs *regs)
@@ -492,7 +496,7 @@ unsigned __kprobes long oops_begin(void)
492 raw_local_irq_save(flags); 496 raw_local_irq_save(flags);
493 cpu = smp_processor_id(); 497 cpu = smp_processor_id();
494 if (!__raw_spin_trylock(&die_lock)) { 498 if (!__raw_spin_trylock(&die_lock)) {
495 if (cpu == die_owner) 499 if (cpu == die_owner)
496 /* nested oops. should stop eventually */; 500 /* nested oops. should stop eventually */;
497 else 501 else
498 __raw_spin_lock(&die_lock); 502 __raw_spin_lock(&die_lock);
@@ -637,7 +641,7 @@ kernel_trap:
637} 641}
638 642
639#define DO_ERROR(trapnr, signr, str, name) \ 643#define DO_ERROR(trapnr, signr, str, name) \
640asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ 644asmlinkage void do_##name(struct pt_regs *regs, long error_code) \
641{ \ 645{ \
642 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 646 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
643 == NOTIFY_STOP) \ 647 == NOTIFY_STOP) \
@@ -647,7 +651,7 @@ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
647} 651}
648 652
649#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ 653#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
650asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ 654asmlinkage void do_##name(struct pt_regs *regs, long error_code) \
651{ \ 655{ \
652 siginfo_t info; \ 656 siginfo_t info; \
653 info.si_signo = signr; \ 657 info.si_signo = signr; \
@@ -682,7 +686,7 @@ asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
682 preempt_conditional_cli(regs); 686 preempt_conditional_cli(regs);
683} 687}
684 688
685asmlinkage void do_double_fault(struct pt_regs * regs, long error_code) 689asmlinkage void do_double_fault(struct pt_regs *regs, long error_code)
686{ 690{
687 static const char str[] = "double fault"; 691 static const char str[] = "double fault";
688 struct task_struct *tsk = current; 692 struct task_struct *tsk = current;
@@ -777,9 +781,10 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
777} 781}
778 782
779static notrace __kprobes void 783static notrace __kprobes void
780unknown_nmi_error(unsigned char reason, struct pt_regs * regs) 784unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
781{ 785{
782 if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) 786 if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) ==
787 NOTIFY_STOP)
783 return; 788 return;
784 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", 789 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
785 reason); 790 reason);
@@ -881,7 +886,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
881 else if (user_mode(eregs)) 886 else if (user_mode(eregs))
882 regs = task_pt_regs(current); 887 regs = task_pt_regs(current);
883 /* Exception from kernel and interrupts are enabled. Move to 888 /* Exception from kernel and interrupts are enabled. Move to
884 kernel process stack. */ 889 kernel process stack. */
885 else if (eregs->flags & X86_EFLAGS_IF) 890 else if (eregs->flags & X86_EFLAGS_IF)
886 regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); 891 regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
887 if (eregs != regs) 892 if (eregs != regs)
@@ -890,7 +895,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
890} 895}
891 896
892/* runs on IST stack. */ 897/* runs on IST stack. */
893asmlinkage void __kprobes do_debug(struct pt_regs * regs, 898asmlinkage void __kprobes do_debug(struct pt_regs *regs,
894 unsigned long error_code) 899 unsigned long error_code)
895{ 900{
896 struct task_struct *tsk = current; 901 struct task_struct *tsk = current;
@@ -1034,7 +1039,7 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs)
1034 1039
1035asmlinkage void bad_intr(void) 1040asmlinkage void bad_intr(void)
1036{ 1041{
1037 printk("bad interrupt"); 1042 printk("bad interrupt");
1038} 1043}
1039 1044
1040asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) 1045asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
@@ -1046,7 +1051,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
1046 1051
1047 conditional_sti(regs); 1052 conditional_sti(regs);
1048 if (!user_mode(regs) && 1053 if (!user_mode(regs) &&
1049 kernel_math_error(regs, "kernel simd math error", 19)) 1054 kernel_math_error(regs, "kernel simd math error", 19))
1050 return; 1055 return;
1051 1056
1052 /* 1057 /*
@@ -1091,7 +1096,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
1091 force_sig_info(SIGFPE, &info, task); 1096 force_sig_info(SIGFPE, &info, task);
1092} 1097}
1093 1098
1094asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs) 1099asmlinkage void do_spurious_interrupt_bug(struct pt_regs *regs)
1095{ 1100{
1096} 1101}
1097 1102
@@ -1148,8 +1153,10 @@ void __init trap_init(void)
1148 set_intr_gate(0, &divide_error); 1153 set_intr_gate(0, &divide_error);
1149 set_intr_gate_ist(1, &debug, DEBUG_STACK); 1154 set_intr_gate_ist(1, &debug, DEBUG_STACK);
1150 set_intr_gate_ist(2, &nmi, NMI_STACK); 1155 set_intr_gate_ist(2, &nmi, NMI_STACK);
1151 set_system_gate_ist(3, &int3, DEBUG_STACK); /* int3 can be called from all */ 1156 /* int3 can be called from all */
1152 set_system_gate(4, &overflow); /* int4 can be called from all */ 1157 set_system_gate_ist(3, &int3, DEBUG_STACK);
1158 /* int4 can be called from all */
1159 set_system_gate(4, &overflow);
1153 set_intr_gate(5, &bounds); 1160 set_intr_gate(5, &bounds);
1154 set_intr_gate(6, &invalid_op); 1161 set_intr_gate(6, &invalid_op);
1155 set_intr_gate(7, &device_not_available); 1162 set_intr_gate(7, &device_not_available);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 346cae5ac423..161bb850fc47 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -104,7 +104,7 @@ __setup("notsc", notsc_setup);
104/* 104/*
105 * Read TSC and the reference counters. Take care of SMI disturbance 105 * Read TSC and the reference counters. Take care of SMI disturbance
106 */ 106 */
107static u64 tsc_read_refs(u64 *pm, u64 *hpet) 107static u64 tsc_read_refs(u64 *p, int hpet)
108{ 108{
109 u64 t1, t2; 109 u64 t1, t2;
110 int i; 110 int i;
@@ -112,9 +112,9 @@ static u64 tsc_read_refs(u64 *pm, u64 *hpet)
112 for (i = 0; i < MAX_RETRIES; i++) { 112 for (i = 0; i < MAX_RETRIES; i++) {
113 t1 = get_cycles(); 113 t1 = get_cycles();
114 if (hpet) 114 if (hpet)
115 *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; 115 *p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
116 else 116 else
117 *pm = acpi_pm_read_early(); 117 *p = acpi_pm_read_early();
118 t2 = get_cycles(); 118 t2 = get_cycles();
119 if ((t2 - t1) < SMI_TRESHOLD) 119 if ((t2 - t1) < SMI_TRESHOLD)
120 return t2; 120 return t2;
@@ -123,13 +123,59 @@ static u64 tsc_read_refs(u64 *pm, u64 *hpet)
123} 123}
124 124
125/* 125/*
126 * Calculate the TSC frequency from HPET reference
127 */
128static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2)
129{
130 u64 tmp;
131
132 if (hpet2 < hpet1)
133 hpet2 += 0x100000000ULL;
134 hpet2 -= hpet1;
135 tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
136 do_div(tmp, 1000000);
137 do_div(deltatsc, tmp);
138
139 return (unsigned long) deltatsc;
140}
141
142/*
143 * Calculate the TSC frequency from PMTimer reference
144 */
145static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
146{
147 u64 tmp;
148
149 if (!pm1 && !pm2)
150 return ULONG_MAX;
151
152 if (pm2 < pm1)
153 pm2 += (u64)ACPI_PM_OVRRUN;
154 pm2 -= pm1;
155 tmp = pm2 * 1000000000LL;
156 do_div(tmp, PMTMR_TICKS_PER_SEC);
157 do_div(deltatsc, tmp);
158
159 return (unsigned long) deltatsc;
160}
161
162#define CAL_MS 10
163#define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS))
164#define CAL_PIT_LOOPS 1000
165
166#define CAL2_MS 50
167#define CAL2_LATCH (CLOCK_TICK_RATE / (1000 / CAL2_MS))
168#define CAL2_PIT_LOOPS 5000
169
170
171/*
126 * Try to calibrate the TSC against the Programmable 172 * Try to calibrate the TSC against the Programmable
127 * Interrupt Timer and return the frequency of the TSC 173 * Interrupt Timer and return the frequency of the TSC
128 * in kHz. 174 * in kHz.
129 * 175 *
130 * Return ULONG_MAX on failure to calibrate. 176 * Return ULONG_MAX on failure to calibrate.
131 */ 177 */
132static unsigned long pit_calibrate_tsc(void) 178static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
133{ 179{
134 u64 tsc, t1, t2, delta; 180 u64 tsc, t1, t2, delta;
135 unsigned long tscmin, tscmax; 181 unsigned long tscmin, tscmax;
@@ -144,8 +190,8 @@ static unsigned long pit_calibrate_tsc(void)
144 * (LSB then MSB) to begin countdown. 190 * (LSB then MSB) to begin countdown.
145 */ 191 */
146 outb(0xb0, 0x43); 192 outb(0xb0, 0x43);
147 outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); 193 outb(latch & 0xff, 0x42);
148 outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); 194 outb(latch >> 8, 0x42);
149 195
150 tsc = t1 = t2 = get_cycles(); 196 tsc = t1 = t2 = get_cycles();
151 197
@@ -166,31 +212,154 @@ static unsigned long pit_calibrate_tsc(void)
166 /* 212 /*
167 * Sanity checks: 213 * Sanity checks:
168 * 214 *
169 * If we were not able to read the PIT more than 5000 215 * If we were not able to read the PIT more than loopmin
170 * times, then we have been hit by a massive SMI 216 * times, then we have been hit by a massive SMI
171 * 217 *
172 * If the maximum is 10 times larger than the minimum, 218 * If the maximum is 10 times larger than the minimum,
173 * then we got hit by an SMI as well. 219 * then we got hit by an SMI as well.
174 */ 220 */
175 if (pitcnt < 5000 || tscmax > 10 * tscmin) 221 if (pitcnt < loopmin || tscmax > 10 * tscmin)
176 return ULONG_MAX; 222 return ULONG_MAX;
177 223
178 /* Calculate the PIT value */ 224 /* Calculate the PIT value */
179 delta = t2 - t1; 225 delta = t2 - t1;
180 do_div(delta, 50); 226 do_div(delta, ms);
181 return delta; 227 return delta;
182} 228}
183 229
230/*
231 * This reads the current MSB of the PIT counter, and
232 * checks if we are running on sufficiently fast and
233 * non-virtualized hardware.
234 *
235 * Our expectations are:
236 *
237 * - the PIT is running at roughly 1.19MHz
238 *
239 * - each IO is going to take about 1us on real hardware,
240 * but we allow it to be much faster (by a factor of 10) or
241 * _slightly_ slower (ie we allow up to a 2us read+counter
242 * update - anything else implies a unacceptably slow CPU
243 * or PIT for the fast calibration to work.
244 *
245 * - with 256 PIT ticks to read the value, we have 214us to
246 * see the same MSB (and overhead like doing a single TSC
247 * read per MSB value etc).
248 *
249 * - We're doing 2 reads per loop (LSB, MSB), and we expect
250 * them each to take about a microsecond on real hardware.
251 * So we expect a count value of around 100. But we'll be
252 * generous, and accept anything over 50.
253 *
254 * - if the PIT is stuck, and we see *many* more reads, we
255 * return early (and the next caller of pit_expect_msb()
256 * then consider it a failure when they don't see the
257 * next expected value).
258 *
259 * These expectations mean that we know that we have seen the
260 * transition from one expected value to another with a fairly
261 * high accuracy, and we didn't miss any events. We can thus
262 * use the TSC value at the transitions to calculate a pretty
263 * good value for the TSC frequencty.
264 */
265static inline int pit_expect_msb(unsigned char val)
266{
267 int count = 0;
268
269 for (count = 0; count < 50000; count++) {
270 /* Ignore LSB */
271 inb(0x42);
272 if (inb(0x42) != val)
273 break;
274 }
275 return count > 50;
276}
277
278/*
279 * How many MSB values do we want to see? We aim for a
280 * 15ms calibration, which assuming a 2us counter read
281 * error should give us roughly 150 ppm precision for
282 * the calibration.
283 */
284#define QUICK_PIT_MS 15
285#define QUICK_PIT_ITERATIONS (QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
286
287static unsigned long quick_pit_calibrate(void)
288{
289 /* Set the Gate high, disable speaker */
290 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
291
292 /*
293 * Counter 2, mode 0 (one-shot), binary count
294 *
295 * NOTE! Mode 2 decrements by two (and then the
296 * output is flipped each time, giving the same
297 * final output frequency as a decrement-by-one),
298 * so mode 0 is much better when looking at the
299 * individual counts.
300 */
301 outb(0xb0, 0x43);
302
303 /* Start at 0xffff */
304 outb(0xff, 0x42);
305 outb(0xff, 0x42);
306
307 if (pit_expect_msb(0xff)) {
308 int i;
309 u64 t1, t2, delta;
310 unsigned char expect = 0xfe;
311
312 t1 = get_cycles();
313 for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) {
314 if (!pit_expect_msb(expect))
315 goto failed;
316 }
317 t2 = get_cycles();
318
319 /*
320 * Make sure we can rely on the second TSC timestamp:
321 */
322 if (!pit_expect_msb(expect))
323 goto failed;
324
325 /*
326 * Ok, if we get here, then we've seen the
327 * MSB of the PIT decrement QUICK_PIT_ITERATIONS
328 * times, and each MSB had many hits, so we never
329 * had any sudden jumps.
330 *
331 * As a result, we can depend on there not being
332 * any odd delays anywhere, and the TSC reads are
333 * reliable.
334 *
335 * kHz = ticks / time-in-seconds / 1000;
336 * kHz = (t2 - t1) / (QPI * 256 / PIT_TICK_RATE) / 1000
337 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (QPI * 256 * 1000)
338 */
339 delta = (t2 - t1)*PIT_TICK_RATE;
340 do_div(delta, QUICK_PIT_ITERATIONS*256*1000);
341 printk("Fast TSC calibration using PIT\n");
342 return delta;
343 }
344failed:
345 return 0;
346}
184 347
185/** 348/**
186 * native_calibrate_tsc - calibrate the tsc on boot 349 * native_calibrate_tsc - calibrate the tsc on boot
187 */ 350 */
188unsigned long native_calibrate_tsc(void) 351unsigned long native_calibrate_tsc(void)
189{ 352{
190 u64 tsc1, tsc2, delta, pm1, pm2, hpet1, hpet2; 353 u64 tsc1, tsc2, delta, ref1, ref2;
191 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; 354 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
192 unsigned long flags; 355 unsigned long flags, latch, ms, fast_calibrate;
193 int hpet = is_hpet_enabled(), i; 356 int hpet = is_hpet_enabled(), i, loopmin;
357
358 local_irq_save(flags);
359 fast_calibrate = quick_pit_calibrate();
360 local_irq_restore(flags);
361 if (fast_calibrate)
362 return fast_calibrate;
194 363
195 /* 364 /*
196 * Run 5 calibration loops to get the lowest frequency value 365 * Run 5 calibration loops to get the lowest frequency value
@@ -216,7 +385,13 @@ unsigned long native_calibrate_tsc(void)
216 * calibration delay loop as we have to wait for a certain 385 * calibration delay loop as we have to wait for a certain
217 * amount of time anyway. 386 * amount of time anyway.
218 */ 387 */
219 for (i = 0; i < 5; i++) { 388
389 /* Preset PIT loop values */
390 latch = CAL_LATCH;
391 ms = CAL_MS;
392 loopmin = CAL_PIT_LOOPS;
393
394 for (i = 0; i < 3; i++) {
220 unsigned long tsc_pit_khz; 395 unsigned long tsc_pit_khz;
221 396
222 /* 397 /*
@@ -226,16 +401,16 @@ unsigned long native_calibrate_tsc(void)
226 * read the end value. 401 * read the end value.
227 */ 402 */
228 local_irq_save(flags); 403 local_irq_save(flags);
229 tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL); 404 tsc1 = tsc_read_refs(&ref1, hpet);
230 tsc_pit_khz = pit_calibrate_tsc(); 405 tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin);
231 tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); 406 tsc2 = tsc_read_refs(&ref2, hpet);
232 local_irq_restore(flags); 407 local_irq_restore(flags);
233 408
234 /* Pick the lowest PIT TSC calibration so far */ 409 /* Pick the lowest PIT TSC calibration so far */
235 tsc_pit_min = min(tsc_pit_min, tsc_pit_khz); 410 tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
236 411
237 /* hpet or pmtimer available ? */ 412 /* hpet or pmtimer available ? */
238 if (!hpet && !pm1 && !pm2) 413 if (!hpet && !ref1 && !ref2)
239 continue; 414 continue;
240 415
241 /* Check, whether the sampling was disturbed by an SMI */ 416 /* Check, whether the sampling was disturbed by an SMI */
@@ -243,23 +418,41 @@ unsigned long native_calibrate_tsc(void)
243 continue; 418 continue;
244 419
245 tsc2 = (tsc2 - tsc1) * 1000000LL; 420 tsc2 = (tsc2 - tsc1) * 1000000LL;
421 if (hpet)
422 tsc2 = calc_hpet_ref(tsc2, ref1, ref2);
423 else
424 tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2);
246 425
247 if (hpet) { 426 tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);
248 if (hpet2 < hpet1) 427
249 hpet2 += 0x100000000ULL; 428 /* Check the reference deviation */
250 hpet2 -= hpet1; 429 delta = ((u64) tsc_pit_min) * 100;
251 tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); 430 do_div(delta, tsc_ref_min);
252 do_div(tsc1, 1000000); 431
253 } else { 432 /*
254 if (pm2 < pm1) 433 * If both calibration results are inside a 10% window
255 pm2 += (u64)ACPI_PM_OVRRUN; 434 * then we can be sure, that the calibration
256 pm2 -= pm1; 435 * succeeded. We break out of the loop right away. We
257 tsc1 = pm2 * 1000000000LL; 436 * use the reference value, as it is more precise.
258 do_div(tsc1, PMTMR_TICKS_PER_SEC); 437 */
438 if (delta >= 90 && delta <= 110) {
439 printk(KERN_INFO
440 "TSC: PIT calibration matches %s. %d loops\n",
441 hpet ? "HPET" : "PMTIMER", i + 1);
442 return tsc_ref_min;
259 } 443 }
260 444
261 do_div(tsc2, tsc1); 445 /*
262 tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2); 446 * Check whether PIT failed more than once. This
447 * happens in virtualized environments. We need to
448 * give the virtual PC a slightly longer timeframe for
449 * the HPET/PMTIMER to make the result precise.
450 */
451 if (i == 1 && tsc_pit_min == ULONG_MAX) {
452 latch = CAL2_LATCH;
453 ms = CAL2_MS;
454 loopmin = CAL2_PIT_LOOPS;
455 }
263 } 456 }
264 457
265 /* 458 /*
@@ -267,11 +460,10 @@ unsigned long native_calibrate_tsc(void)
267 */ 460 */
268 if (tsc_pit_min == ULONG_MAX) { 461 if (tsc_pit_min == ULONG_MAX) {
269 /* PIT gave no useful value */ 462 /* PIT gave no useful value */
270 printk(KERN_WARNING "TSC: PIT calibration failed due to " 463 printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n");
271 "SMI disturbance.\n");
272 464
273 /* We don't have an alternative source, disable TSC */ 465 /* We don't have an alternative source, disable TSC */
274 if (!hpet && !pm1 && !pm2) { 466 if (!hpet && !ref1 && !ref2) {
275 printk("TSC: No reference (HPET/PMTIMER) available\n"); 467 printk("TSC: No reference (HPET/PMTIMER) available\n");
276 return 0; 468 return 0;
277 } 469 }
@@ -279,7 +471,7 @@ unsigned long native_calibrate_tsc(void)
279 /* The alternative source failed as well, disable TSC */ 471 /* The alternative source failed as well, disable TSC */
280 if (tsc_ref_min == ULONG_MAX) { 472 if (tsc_ref_min == ULONG_MAX) {
281 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration " 473 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration "
282 "failed due to SMI disturbance.\n"); 474 "failed.\n");
283 return 0; 475 return 0;
284 } 476 }
285 477
@@ -291,44 +483,25 @@ unsigned long native_calibrate_tsc(void)
291 } 483 }
292 484
293 /* We don't have an alternative source, use the PIT calibration value */ 485 /* We don't have an alternative source, use the PIT calibration value */
294 if (!hpet && !pm1 && !pm2) { 486 if (!hpet && !ref1 && !ref2) {
295 printk(KERN_INFO "TSC: Using PIT calibration value\n"); 487 printk(KERN_INFO "TSC: Using PIT calibration value\n");
296 return tsc_pit_min; 488 return tsc_pit_min;
297 } 489 }
298 490
299 /* The alternative source failed, use the PIT calibration value */ 491 /* The alternative source failed, use the PIT calibration value */
300 if (tsc_ref_min == ULONG_MAX) { 492 if (tsc_ref_min == ULONG_MAX) {
301 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed due " 493 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. "
302 "to SMI disturbance. Using PIT calibration\n"); 494 "Using PIT calibration\n");
303 return tsc_pit_min; 495 return tsc_pit_min;
304 } 496 }
305 497
306 /* Check the reference deviation */
307 delta = ((u64) tsc_pit_min) * 100;
308 do_div(delta, tsc_ref_min);
309
310 /*
311 * If both calibration results are inside a 5% window, the we
312 * use the lower frequency of those as it is probably the
313 * closest estimate.
314 */
315 if (delta >= 95 && delta <= 105) {
316 printk(KERN_INFO "TSC: PIT calibration confirmed by %s.\n",
317 hpet ? "HPET" : "PMTIMER");
318 printk(KERN_INFO "TSC: using %s calibration value\n",
319 tsc_pit_min <= tsc_ref_min ? "PIT" :
320 hpet ? "HPET" : "PMTIMER");
321 return tsc_pit_min <= tsc_ref_min ? tsc_pit_min : tsc_ref_min;
322 }
323
324 printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
325 hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
326
327 /* 498 /*
328 * The calibration values differ too much. In doubt, we use 499 * The calibration values differ too much. In doubt, we use
329 * the PIT value as we know that there are PMTIMERs around 500 * the PIT value as we know that there are PMTIMERs around
330 * running at double speed. 501 * running at double speed. At least we let the user know:
331 */ 502 */
503 printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
504 hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
332 printk(KERN_INFO "TSC: Using PIT calibration value\n"); 505 printk(KERN_INFO "TSC: Using PIT calibration value\n");
333 return tsc_pit_min; 506 return tsc_pit_min;
334} 507}
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 594ef47f0a63..61a97e616f70 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -25,45 +25,31 @@
25#include <asm/visws/cobalt.h> 25#include <asm/visws/cobalt.h>
26#include <asm/visws/piix4.h> 26#include <asm/visws/piix4.h>
27#include <asm/arch_hooks.h> 27#include <asm/arch_hooks.h>
28#include <asm/io_apic.h>
28#include <asm/fixmap.h> 29#include <asm/fixmap.h>
29#include <asm/reboot.h> 30#include <asm/reboot.h>
30#include <asm/setup.h> 31#include <asm/setup.h>
31#include <asm/e820.h> 32#include <asm/e820.h>
32#include <asm/smp.h>
33#include <asm/io.h> 33#include <asm/io.h>
34 34
35#include <mach_ipi.h> 35#include <mach_ipi.h>
36 36
37#include "mach_apic.h" 37#include "mach_apic.h"
38 38
39#include <linux/init.h>
40#include <linux/smp.h>
41
42#include <linux/kernel_stat.h> 39#include <linux/kernel_stat.h>
43#include <linux/interrupt.h>
44#include <linux/init.h>
45 40
46#include <asm/io.h>
47#include <asm/apic.h>
48#include <asm/i8259.h> 41#include <asm/i8259.h>
49#include <asm/irq_vectors.h> 42#include <asm/irq_vectors.h>
50#include <asm/visws/cobalt.h>
51#include <asm/visws/lithium.h> 43#include <asm/visws/lithium.h>
52#include <asm/visws/piix4.h>
53 44
54#include <linux/sched.h> 45#include <linux/sched.h>
55#include <linux/kernel.h> 46#include <linux/kernel.h>
56#include <linux/init.h>
57#include <linux/pci.h> 47#include <linux/pci.h>
58#include <linux/pci_ids.h> 48#include <linux/pci_ids.h>
59 49
60extern int no_broadcast; 50extern int no_broadcast;
61 51
62#include <asm/io.h>
63#include <asm/apic.h> 52#include <asm/apic.h>
64#include <asm/arch_hooks.h>
65#include <asm/visws/cobalt.h>
66#include <asm/visws/lithium.h>
67 53
68char visws_board_type = -1; 54char visws_board_type = -1;
69char visws_board_rev = -1; 55char visws_board_rev = -1;
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 61531d5c9507..8b6c393ab9fd 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -235,7 +235,7 @@ static void vmi_write_ldt_entry(struct desc_struct *dt, int entry,
235 const void *desc) 235 const void *desc)
236{ 236{
237 u32 *ldt_entry = (u32 *)desc; 237 u32 *ldt_entry = (u32 *)desc;
238 vmi_ops.write_idt_entry(dt, entry, ldt_entry[0], ldt_entry[1]); 238 vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
239} 239}
240 240
241static void vmi_load_sp0(struct tss_struct *tss, 241static void vmi_load_sp0(struct tss_struct *tss,
@@ -393,13 +393,13 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
393} 393}
394#endif 394#endif
395 395
396static void vmi_allocate_pte(struct mm_struct *mm, u32 pfn) 396static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
397{ 397{
398 vmi_set_page_type(pfn, VMI_PAGE_L1); 398 vmi_set_page_type(pfn, VMI_PAGE_L1);
399 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); 399 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
400} 400}
401 401
402static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn) 402static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
403{ 403{
404 /* 404 /*
405 * This call comes in very early, before mem_map is setup. 405 * This call comes in very early, before mem_map is setup.
@@ -410,20 +410,20 @@ static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn)
410 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); 410 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
411} 411}
412 412
413static void vmi_allocate_pmd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count) 413static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
414{ 414{
415 vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE); 415 vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
416 vmi_check_page_type(clonepfn, VMI_PAGE_L2); 416 vmi_check_page_type(clonepfn, VMI_PAGE_L2);
417 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); 417 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
418} 418}
419 419
420static void vmi_release_pte(u32 pfn) 420static void vmi_release_pte(unsigned long pfn)
421{ 421{
422 vmi_ops.release_page(pfn, VMI_PAGE_L1); 422 vmi_ops.release_page(pfn, VMI_PAGE_L1);
423 vmi_set_page_type(pfn, VMI_PAGE_NORMAL); 423 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
424} 424}
425 425
426static void vmi_release_pmd(u32 pfn) 426static void vmi_release_pmd(unsigned long pfn)
427{ 427{
428 vmi_ops.release_page(pfn, VMI_PAGE_L2); 428 vmi_ops.release_page(pfn, VMI_PAGE_L2);
429 vmi_set_page_type(pfn, VMI_PAGE_NORMAL); 429 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 0c029e8959c7..7766d36983fc 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -61,7 +61,7 @@ static void vsmp_irq_enable(void)
61 native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC)); 61 native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
62} 62}
63 63
64static unsigned __init vsmp_patch(u8 type, u16 clobbers, void *ibuf, 64static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf,
65 unsigned long addr, unsigned len) 65 unsigned long addr, unsigned len)
66{ 66{
67 switch (type) { 67 switch (type) {