Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile | 3
-rw-r--r--  arch/x86/kernel/acpi/cstate.c | 2
-rw-r--r--  arch/x86/kernel/acpi/realmode/wakeup.S | 2
-rw-r--r--  arch/x86/kernel/alternative.c | 1
-rw-r--r--  arch/x86/kernel/amd_iommu.c | 12
-rw-r--r--  arch/x86/kernel/amd_iommu_init.c | 67
-rw-r--r--  arch/x86/kernel/apb_timer.c | 37
-rw-r--r--  arch/x86/kernel/aperture_64.c | 4
-rw-r--r--  arch/x86/kernel/apic/Makefile | 7
-rw-r--r--  arch/x86/kernel/apic/apic.c | 2
-rw-r--r--  arch/x86/kernel/apic/es7000_32.c | 1
-rw-r--r--  arch/x86/kernel/apic/hw_nmi.c | 107
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 15
-rw-r--r--  arch/x86/kernel/apic/nmi.c | 7
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 10
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 4
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 77
-rw-r--r--  arch/x86/kernel/cpu/cmpxchg.c | 72
-rw-r--r--  arch/x86/kernel/cpu/common.c | 30
-rw-r--r--  arch/x86/kernel/cpu/cpu.h | 1
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 2
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 20
-rw-r--r--  arch/x86/kernel/cpu/hypervisor.c | 3
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 1
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 108
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-apei.c | 4
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 34
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c | 4
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c | 9
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c | 213
-rw-r--r--  arch/x86/kernel/cpu/mshyperv.c | 1
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cleanup.c | 6
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 3
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c | 56
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 133
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 96
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p4.c | 171
-rw-r--r--  arch/x86/kernel/cpu/scattered.c | 64
-rw-r--r--  arch/x86/kernel/cpu/topology.c (renamed from arch/x86/kernel/cpu/addon_cpuid_features.c) | 58
-rw-r--r--  arch/x86/kernel/cpu/vmware.c | 9
-rw-r--r--  arch/x86/kernel/crash.c | 3
-rw-r--r--  arch/x86/kernel/dumpstack.c | 1
-rw-r--r--  arch/x86/kernel/dumpstack.h | 56
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 2
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 1
-rw-r--r--  arch/x86/kernel/early-quirks.c | 18
-rw-r--r--  arch/x86/kernel/entry_32.S | 16
-rw-r--r--  arch/x86/kernel/entry_64.S | 13
-rw-r--r--  arch/x86/kernel/head_32.S | 14
-rw-r--r--  arch/x86/kernel/head_64.S | 5
-rw-r--r--  arch/x86/kernel/hpet.c | 48
-rw-r--r--  arch/x86/kernel/hw_breakpoint.c | 79
-rw-r--r--  arch/x86/kernel/i387.c | 40
-rw-r--r--  arch/x86/kernel/kgdb.c | 191
-rw-r--r--  arch/x86/kernel/kprobes.c | 58
-rw-r--r--  arch/x86/kernel/module.c | 3
-rw-r--r--  arch/x86/kernel/mpparse.c | 16
-rw-r--r--  arch/x86/kernel/mrst.c | 105
-rw-r--r--  arch/x86/kernel/olpc.c | 20
-rw-r--r--  arch/x86/kernel/olpc_ofw.c | 106
-rw-r--r--  arch/x86/kernel/pci-dma.c | 7
-rw-r--r--  arch/x86/kernel/process.c | 50
-rw-r--r--  arch/x86/kernel/process_32.c | 4
-rw-r--r--  arch/x86/kernel/process_64.c | 5
-rw-r--r--  arch/x86/kernel/setup.c | 8
-rw-r--r--  arch/x86/kernel/smpboot.c | 66
-rw-r--r--  arch/x86/kernel/stacktrace.c | 31
-rw-r--r--  arch/x86/kernel/sys_i386_32.c | 4
-rw-r--r--  arch/x86/kernel/syscall_table_32.S | 3
-rw-r--r--  arch/x86/kernel/tlb_uv.c | 760
-rw-r--r--  arch/x86/kernel/trampoline.c | 17
-rw-r--r--  arch/x86/kernel/traps.c | 7
-rw-r--r--  arch/x86/kernel/tsc.c | 43
-rw-r--r--  arch/x86/kernel/verify_cpu_64.S | 3
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 17
-rw-r--r--  arch/x86/kernel/xsave.c | 195
76 files changed, 2329 insertions(+), 1142 deletions(-)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index e77b22083721..fedf32a8c3ec 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -11,6 +11,8 @@ ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_tsc.o = -pg
 CFLAGS_REMOVE_rtc.o = -pg
 CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
+CFLAGS_REMOVE_pvclock.o = -pg
+CFLAGS_REMOVE_kvmclock.o = -pg
 CFLAGS_REMOVE_ftrace.o = -pg
 CFLAGS_REMOVE_early_printk.o = -pg
 endif
@@ -104,6 +106,7 @@ obj-$(CONFIG_SCx200) += scx200.o
 scx200-y += scx200_32.o
 
 obj-$(CONFIG_OLPC) += olpc.o
+obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o
 obj-$(CONFIG_X86_MRST) += mrst.o
 
 microcode-y := microcode_core.o
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index fb7a5f052e2b..fb16f17e59be 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -61,7 +61,7 @@ struct cstate_entry {
         unsigned int ecx;
 } states[ACPI_PROCESSOR_MAX_POWER];
 };
-static struct cstate_entry *cpu_cstate_entry;   /* per CPU ptr */
+static struct cstate_entry __percpu *cpu_cstate_entry;  /* per CPU ptr */
 
 static short mwait_supported[ACPI_PROCESSOR_MAX_POWER];
 
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index 580b4e296010..28595d6df47c 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -104,7 +104,7 @@ _start:
         movl    %eax, %ecx
         orl     %edx, %ecx
         jz      1f
-        movl    $0xc0000080, %ecx
+        movl    $MSR_EFER, %ecx
         wrmsr
 1:
 
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 70237732a6c7..f65ab8b014c4 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -214,6 +214,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
         u8 *instr = a->instr;
         BUG_ON(a->replacementlen > a->instrlen);
         BUG_ON(a->instrlen > sizeof(insnbuf));
+        BUG_ON(a->cpuid >= NCAPINTS*32);
         if (!boot_cpu_has(a->cpuid))
             continue;
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 0d20286d78c6..679b6450382b 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1953,6 +1953,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom,
                            size_t size,
                            int dir)
 {
+        dma_addr_t flush_addr;
         dma_addr_t i, start;
         unsigned int pages;
 
@@ -1960,6 +1961,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom,
             (dma_addr + size > dma_dom->aperture_size))
                 return;
 
+        flush_addr = dma_addr;
         pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
         dma_addr &= PAGE_MASK;
         start = dma_addr;
@@ -1974,7 +1976,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom,
         dma_ops_free_addresses(dma_dom, dma_addr, pages);
 
         if (amd_iommu_unmap_flush || dma_dom->need_flush) {
-                iommu_flush_pages(&dma_dom->domain, dma_addr, size);
+                iommu_flush_pages(&dma_dom->domain, flush_addr, size);
                 dma_dom->need_flush = false;
         }
 }
@@ -2572,6 +2574,11 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
 static int amd_iommu_domain_has_cap(struct iommu_domain *domain,
                                     unsigned long cap)
 {
+        switch (cap) {
+        case IOMMU_CAP_CACHE_COHERENCY:
+                return 1;
+        }
+
         return 0;
 }
 
@@ -2609,8 +2616,7 @@ int __init amd_iommu_init_passthrough(void)
 
         pt_domain->mode |= PAGE_MODE_NONE;
 
-        while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
-
+        for_each_pci_dev(dev) {
                 if (!check_device(&dev->dev))
                         continue;
 
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 3cc63e2b8dd4..5a170cbbbed8 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -632,6 +632,13 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
         iommu->last_device = calc_devid(MMIO_GET_BUS(range),
                                         MMIO_GET_LD(range));
         iommu->evt_msi_num = MMIO_MSI_NUM(misc);
+
+        if (is_rd890_iommu(iommu->dev)) {
+                pci_read_config_dword(iommu->dev, 0xf0, &iommu->cache_cfg[0]);
+                pci_read_config_dword(iommu->dev, 0xf4, &iommu->cache_cfg[1]);
+                pci_read_config_dword(iommu->dev, 0xf8, &iommu->cache_cfg[2]);
+                pci_read_config_dword(iommu->dev, 0xfc, &iommu->cache_cfg[3]);
+        }
 }
 
 /*
@@ -649,29 +656,9 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
         struct ivhd_entry *e;
 
         /*
-         * First set the recommended feature enable bits from ACPI
-         * into the IOMMU control registers
+         * First save the recommended feature enable bits from ACPI
          */
-        h->flags & IVHD_FLAG_HT_TUN_EN_MASK ?
-                iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
-                iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
-
-        h->flags & IVHD_FLAG_PASSPW_EN_MASK ?
-                iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
-                iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
-
-        h->flags & IVHD_FLAG_RESPASSPW_EN_MASK ?
-                iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
-                iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
-
-        h->flags & IVHD_FLAG_ISOC_EN_MASK ?
-                iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
-                iommu_feature_disable(iommu, CONTROL_ISOC_EN);
-
-        /*
-         * make IOMMU memory accesses cache coherent
-         */
-        iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
+        iommu->acpi_flags = h->flags;
 
         /*
          * Done. Now parse the device entries
@@ -1116,6 +1103,40 @@ static void init_device_table(void)
         }
 }
 
+static void iommu_init_flags(struct amd_iommu *iommu)
+{
+        iommu->acpi_flags & IVHD_FLAG_HT_TUN_EN_MASK ?
+                iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
+                iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
+
+        iommu->acpi_flags & IVHD_FLAG_PASSPW_EN_MASK ?
+                iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
+                iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
+
+        iommu->acpi_flags & IVHD_FLAG_RESPASSPW_EN_MASK ?
+                iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
+                iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
+
+        iommu->acpi_flags & IVHD_FLAG_ISOC_EN_MASK ?
+                iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
+                iommu_feature_disable(iommu, CONTROL_ISOC_EN);
+
+        /*
+         * make IOMMU memory accesses cache coherent
+         */
+        iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
+}
+
+static void iommu_apply_quirks(struct amd_iommu *iommu)
+{
+        if (is_rd890_iommu(iommu->dev)) {
+                pci_write_config_dword(iommu->dev, 0xf0, iommu->cache_cfg[0]);
+                pci_write_config_dword(iommu->dev, 0xf4, iommu->cache_cfg[1]);
+                pci_write_config_dword(iommu->dev, 0xf8, iommu->cache_cfg[2]);
+                pci_write_config_dword(iommu->dev, 0xfc, iommu->cache_cfg[3]);
+        }
+}
+
 /*
  * This function finally enables all IOMMUs found in the system after
  * they have been initialized
@@ -1126,6 +1147,8 @@ static void enable_iommus(void)
 
         for_each_iommu(iommu) {
                 iommu_disable(iommu);
+                iommu_apply_quirks(iommu);
+                iommu_init_flags(iommu);
                 iommu_set_device_table(iommu);
                 iommu_enable_command_buffer(iommu);
                 iommu_enable_event_buffer(iommu);
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index a35347501d36..8dd77800ff5d 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -43,10 +43,11 @@
 
 #include <asm/fixmap.h>
 #include <asm/apb_timer.h>
+#include <asm/mrst.h>
 
 #define APBT_MASK                       CLOCKSOURCE_MASK(32)
 #define APBT_SHIFT                      22
-#define APBT_CLOCKEVENT_RATING          150
+#define APBT_CLOCKEVENT_RATING          110
 #define APBT_CLOCKSOURCE_RATING         250
 #define APBT_MIN_DELTA_USEC             200
 
@@ -83,8 +84,6 @@ struct apbt_dev {
         char name[10];
 };
 
-int disable_apbt_percpu __cpuinitdata;
-
 static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
 
 #ifdef CONFIG_SMP
@@ -195,29 +194,6 @@ static struct clock_event_device apbt_clockevent = {
 };
 
 /*
- * if user does not want to use per CPU apb timer, just give it a lower rating
- * than local apic timer and skip the late per cpu timer init.
- */
-static inline int __init setup_x86_mrst_timer(char *arg)
-{
-        if (!arg)
-                return -EINVAL;
-
-        if (strcmp("apbt_only", arg) == 0)
-                disable_apbt_percpu = 0;
-        else if (strcmp("lapic_and_apbt", arg) == 0)
-                disable_apbt_percpu = 1;
-        else {
-                pr_warning("X86 MRST timer option %s not recognised"
-                           " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
-                           arg);
-                return -EINVAL;
-        }
-        return 0;
-}
-__setup("x86_mrst_timer=", setup_x86_mrst_timer);
-
-/*
  * start count down from 0xffff_ffff. this is done by toggling the enable bit
  * then load initial load count to ~0.
  */
@@ -335,7 +311,7 @@ static int __init apbt_clockevent_register(void)
         adev->num = smp_processor_id();
         memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));
 
-        if (disable_apbt_percpu) {
+        if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
                 apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100;
                 global_clock_event = &adev->evt;
                 printk(KERN_DEBUG "%s clockevent registered as global\n",
@@ -429,7 +405,8 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
 
 static __init int apbt_late_init(void)
 {
-        if (disable_apbt_percpu || !apb_timer_block_enabled)
+        if (mrst_timer_options == MRST_TIMER_LAPIC_APBT ||
+                !apb_timer_block_enabled)
                 return 0;
         /* This notifier should be called after workqueue is ready */
         hotcpu_notifier(apbt_cpuhp_notify, -20);
@@ -450,6 +427,8 @@ static void apbt_set_mode(enum clock_event_mode mode,
         int timer_num;
         struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
 
+        BUG_ON(!apbt_virt_address);
+
         timer_num = adev->num;
         pr_debug("%s CPU %d timer %d mode=%d\n",
                  __func__, first_cpu(*evt->cpumask), timer_num, mode);
@@ -676,7 +655,7 @@ void __init apbt_time_init(void)
         }
 #ifdef CONFIG_SMP
         /* kernel cmdline disable apb timer, so we will use lapic timers */
-        if (disable_apbt_percpu) {
+        if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
                 printk(KERN_INFO "apbt: disabled per cpu timer\n");
                 return;
         }
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index b5d8b0bcf235..a2e0caf26e17 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -280,7 +280,7 @@ void __init early_gart_iommu_check(void)
          * or BIOS forget to put that in reserved.
          * try to update e820 to make that region as reserved.
          */
-        u32 agp_aper_base = 0, agp_aper_order = 0;
+        u32 agp_aper_order = 0;
         int i, fix, slot, valid_agp = 0;
         u32 ctl;
         u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
@@ -291,7 +291,7 @@ void __init early_gart_iommu_check(void)
                 return;
 
         /* This is mostly duplicate of iommu_hole_init */
-        agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
+        search_agp_bridge(&agp_aper_order, &valid_agp);
 
         fix = 0;
         for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 565c1bfc507d..910f20b457c4 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,7 +2,12 @@
 # Makefile for local APIC drivers and for the IO-APIC code
 #
 
-obj-$(CONFIG_X86_LOCAL_APIC)    += apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o
+obj-$(CONFIG_X86_LOCAL_APIC)    += apic.o apic_noop.o probe_$(BITS).o ipi.o
+ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y)
+obj-$(CONFIG_X86_LOCAL_APIC)    += nmi.o
+endif
+obj-$(CONFIG_HARDLOCKUP_DETECTOR) += hw_nmi.o
+
 obj-$(CONFIG_X86_IO_APIC)       += io_apic.o
 obj-$(CONFIG_SMP)               += ipi.o
 
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 980508c79082..e3b534cda49a 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1606,7 +1606,7 @@ void __init init_apic_mappings(void)
          * acpi lapic path already maps that address in
          * acpi_register_lapic_address()
          */
-        if (!acpi_lapic)
+        if (!acpi_lapic && !smp_found_config)
                 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
 
         apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 425e53a87feb..8593582d8022 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -129,7 +129,6 @@ int es7000_plat;
  * GSI override for ES7000 platforms.
  */
 
-static unsigned int base;
 
 static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
 {
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
new file mode 100644
index 000000000000..cefd6942f0e9
--- /dev/null
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -0,0 +1,107 @@
+/*
+ * HW NMI watchdog support
+ *
+ * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
+ *
+ * Arch specific calls to support NMI watchdog
+ *
+ * Bits copied from original nmi.c file
+ *
+ */
+#include <asm/apic.h>
+
+#include <linux/cpumask.h>
+#include <linux/kdebug.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+#include <linux/nmi.h>
+#include <linux/module.h>
+
+/* For reliability, we're prepared to waste bits here. */
+static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
+
+u64 hw_nmi_get_sample_period(void)
+{
+        return (u64)(cpu_khz) * 1000 * 60;
+}
+
+#ifdef ARCH_HAS_NMI_WATCHDOG
+void arch_trigger_all_cpu_backtrace(void)
+{
+        int i;
+
+        cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
+
+        printk(KERN_INFO "sending NMI to all CPUs:\n");
+        apic->send_IPI_all(NMI_VECTOR);
+
+        /* Wait for up to 10 seconds for all CPUs to do the backtrace */
+        for (i = 0; i < 10 * 1000; i++) {
+                if (cpumask_empty(to_cpumask(backtrace_mask)))
+                        break;
+                mdelay(1);
+        }
+}
+
+static int __kprobes
+arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
+                         unsigned long cmd, void *__args)
+{
+        struct die_args *args = __args;
+        struct pt_regs *regs;
+        int cpu = smp_processor_id();
+
+        switch (cmd) {
+        case DIE_NMI:
+        case DIE_NMI_IPI:
+                break;
+
+        default:
+                return NOTIFY_DONE;
+        }
+
+        regs = args->regs;
+
+        if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
+                static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
+
+                arch_spin_lock(&lock);
+                printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
+                show_regs(regs);
+                dump_stack();
+                arch_spin_unlock(&lock);
+                cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
+                return NOTIFY_STOP;
+        }
+
+        return NOTIFY_DONE;
+}
+
+static __read_mostly struct notifier_block backtrace_notifier = {
+        .notifier_call          = arch_trigger_all_cpu_backtrace_handler,
+        .next                   = NULL,
+        .priority               = 1
+};
+
+static int __init register_trigger_all_cpu_backtrace(void)
+{
+        register_die_notifier(&backtrace_notifier);
+        return 0;
+}
+early_initcall(register_trigger_all_cpu_backtrace);
+#endif
+
+/* STUB calls to mimic old nmi_watchdog behaviour */
+#if defined(CONFIG_X86_LOCAL_APIC)
+unsigned int nmi_watchdog = NMI_NONE;
+EXPORT_SYMBOL(nmi_watchdog);
+void acpi_nmi_enable(void) { return; }
+void acpi_nmi_disable(void) { return; }
+#endif
+atomic_t nmi_active = ATOMIC_INIT(0);           /* oprofile uses this */
+EXPORT_SYMBOL(nmi_active);
+int unknown_nmi_panic;
+void cpu_nmi_set_wd_enabled(void) { return; }
+void stop_apic_nmi_watchdog(void *unused) { return; }
+void setup_apic_nmi_watchdog(void *unused) { return; }
+int __init check_nmi_watchdog(void) { return 0; }
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e41ed24ab26d..5c5b8f3dddb5 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -306,14 +306,19 @@ void arch_init_copy_chip_data(struct irq_desc *old_desc,
 
         old_cfg = old_desc->chip_data;
 
-        memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
+        cfg->vector = old_cfg->vector;
+        cfg->move_in_progress = old_cfg->move_in_progress;
+        cpumask_copy(cfg->domain, old_cfg->domain);
+        cpumask_copy(cfg->old_domain, old_cfg->old_domain);
 
         init_copy_irq_2_pin(old_cfg, cfg, node);
 }
 
-static void free_irq_cfg(struct irq_cfg *old_cfg)
+static void free_irq_cfg(struct irq_cfg *cfg)
 {
-        kfree(old_cfg);
+        free_cpumask_var(cfg->domain);
+        free_cpumask_var(cfg->old_domain);
+        kfree(cfg);
 }
 
 void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
@@ -1728,6 +1733,8 @@ __apicdebuginit(void) print_IO_APIC(void)
                 struct irq_pin_list *entry;
 
                 cfg = desc->chip_data;
+                if (!cfg)
+                        continue;
                 entry = cfg->irq_2_pin;
                 if (!entry)
                         continue;
@@ -3397,7 +3404,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
 
         cfg = desc->chip_data;
 
-        read_msi_msg_desc(desc, &msg);
+        get_cached_msi_msg_desc(desc, &msg);
 
         msg.data &= ~MSI_DATA_VECTOR_MASK;
         msg.data |= MSI_DATA_VECTOR(cfg->vector);
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 1edaf15c0b8e..a43f71cb30f8 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -401,13 +401,6 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
         int cpu = smp_processor_id();
         int rc = 0;
 
-        /* check for other users first */
-        if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
-                        == NOTIFY_STOP) {
-                rc = 1;
-                touched = 1;
-        }
-
         sum = get_timer_irqs(cpu);
 
         if (__get_cpu_var(nmi_touch)) {
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index e46f98f36e31..f744f54cb248 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -604,6 +604,10 @@ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
 {
         if (reason != DIE_NMI_IPI)
                 return NOTIFY_OK;
+
+        if (in_crash_kexec)
+                /* do nothing if entering the crash kernel */
+                return NOTIFY_OK;
         /*
          * Use a lock so only one cpu prints at a time
          * to prevent intermixed output.
@@ -694,9 +698,11 @@ void __init uv_system_init(void)
                 for (j = 0; j < 64; j++) {
                         if (!test_bit(j, &present))
                                 continue;
-                        uv_blade_info[blade].pnode = (i * 64 + j);
+                        pnode = (i * 64 + j);
+                        uv_blade_info[blade].pnode = pnode;
                         uv_blade_info[blade].nr_possible_cpus = 0;
                         uv_blade_info[blade].nr_online_cpus = 0;
+                        max_pnode = max(pnode, max_pnode);
                         blade++;
                 }
         }
@@ -734,7 +740,6 @@ void __init uv_system_init(void)
                 uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid);
                 uv_node_to_blade[nid] = blade;
                 uv_cpu_to_blade[cpu] = blade;
-                max_pnode = max(pnode, max_pnode);
         }
 
         /* Add blade/pnode info for nodes without cpus */
@@ -746,7 +751,6 @@ void __init uv_system_init(void)
                 pnode = (paddr >> m_val) & pnode_mask;
                 blade = boot_pnode_to_blade(pnode);
                 uv_node_to_blade[nid] = blade;
-                max_pnode = max(pnode, max_pnode);
         }
 
         map_gru_high(max_pnode);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 3a785da34b6f..3f0ebe429a01 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -12,11 +12,11 @@ endif
 nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_common.o         := $(nostackp)
 
-obj-y                   := intel_cacheinfo.o addon_cpuid_features.o
+obj-y                   := intel_cacheinfo.o scattered.o topology.o
 obj-y                   += proc.o capflags.o powerflags.o common.o
 obj-y                   += vmware.o hypervisor.o sched.o mshyperv.o
 
-obj-$(CONFIG_X86_32)    += bugs.o cmpxchg.o
+obj-$(CONFIG_X86_32)    += bugs.o
 obj-$(CONFIG_X86_64)    += bugs_64.o
 
 obj-$(CONFIG_CPU_SUP_INTEL)     += intel.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index e485825130d2..ba5f62f45f01 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -466,7 +466,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
         }
 
         }
-        if (c->x86 == 0x10 || c->x86 == 0x11)
+        if (c->x86 >= 0x10)
                 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
 
         /* get apicid instead of initial apic id from cpuid */
@@ -529,7 +529,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
                         num_cache_leaves = 3;
         }
 
-        if (c->x86 >= 0xf && c->x86 <= 0x11)
+        if (c->x86 >= 0xf)
                 set_cpu_cap(c, X86_FEATURE_K8);
 
         if (cpu_has_xmm2) {
@@ -546,7 +546,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
                 fam10h_check_enable_mmcfg();
         }
 
-        if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
+        if (c == &boot_cpu_data && c->x86 >= 0xf) {
                 unsigned long long tseg;
 
                 /*
@@ -609,3 +609,74 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
 };
 
 cpu_dev_register(amd_cpu_dev);
+
+/*
+ * AMD errata checking
+ *
+ * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or
+ * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that
+ * have an OSVW id assigned, which it takes as first argument. Both take a
+ * variable number of family-specific model-stepping ranges created by
+ * AMD_MODEL_RANGE(). Each erratum also has to be declared as extern const
+ * int[] in arch/x86/include/asm/processor.h.
+ *
+ * Example:
+ *
+ * const int amd_erratum_319[] =
+ *      AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2),
+ *                         AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0),
+ *                         AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0));
+ */
+
+const int amd_erratum_400[] =
+        AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
+                            AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
+EXPORT_SYMBOL_GPL(amd_erratum_400);
+
+const int amd_erratum_383[] =
+        AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
+EXPORT_SYMBOL_GPL(amd_erratum_383);
+
+bool cpu_has_amd_erratum(const int *erratum)
+{
+        struct cpuinfo_x86 *cpu = &current_cpu_data;
+        int osvw_id = *erratum++;
+        u32 range;
+        u32 ms;
+
+        /*
+         * If called early enough that current_cpu_data hasn't been initialized
+         * yet, fall back to boot_cpu_data.
+         */
+        if (cpu->x86 == 0)
+                cpu = &boot_cpu_data;
+
+        if (cpu->x86_vendor != X86_VENDOR_AMD)
+                return false;
+
+        if (osvw_id >= 0 && osvw_id < 65536 &&
+            cpu_has(cpu, X86_FEATURE_OSVW)) {
+                u64 osvw_len;
+
+                rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len);
+                if (osvw_id < osvw_len) {
+                        u64 osvw_bits;
+
+                        rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6),
+                            osvw_bits);
+                        return osvw_bits & (1ULL << (osvw_id & 0x3f));
+                }
+        }
+
+        /* OSVW unavailable or ID unknown, match family-model-stepping range */
+        ms = (cpu->x86_model << 4) | cpu->x86_mask;
+        while ((range = *erratum++))
+                if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) &&
+                    (ms >= AMD_MODEL_RANGE_START(range)) &&
+                    (ms <= AMD_MODEL_RANGE_END(range)))
+                        return true;
+
+        return false;
+}
+
+EXPORT_SYMBOL_GPL(cpu_has_amd_erratum);
diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/kernel/cpu/cmpxchg.c
deleted file mode 100644
index 2056ccf572cc..000000000000
--- a/arch/x86/kernel/cpu/cmpxchg.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * cmpxchg*() fallbacks for CPU not supporting these instructions
- */
-
-#include <linux/kernel.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-
-#ifndef CONFIG_X86_CMPXCHG
-unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
-{
-        u8 prev;
-        unsigned long flags;
-
-        /* Poor man's cmpxchg for 386. Unsuitable for SMP */
-        local_irq_save(flags);
-        prev = *(u8 *)ptr;
-        if (prev == old)
-                *(u8 *)ptr = new;
-        local_irq_restore(flags);
-        return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u8);
-
-unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
-{
-        u16 prev;
-        unsigned long flags;
-
-        /* Poor man's cmpxchg for 386. Unsuitable for SMP */
-        local_irq_save(flags);
-        prev = *(u16 *)ptr;
-        if (prev == old)
-                *(u16 *)ptr = new;
-        local_irq_restore(flags);
-        return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u16);
-
-unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
-{
-        u32 prev;
-        unsigned long flags;
-
-        /* Poor man's cmpxchg for 386. Unsuitable for SMP */
-        local_irq_save(flags);
-        prev = *(u32 *)ptr;
-        if (prev == old)
-                *(u32 *)ptr = new;
-        local_irq_restore(flags);
-        return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u32);
-#endif
-
-#ifndef CONFIG_X86_CMPXCHG64
-unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
-{
-        u64 prev;
-        unsigned long flags;
-
-        /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
-        local_irq_save(flags);
-        prev = *(u64 *)ptr;
-        if (prev == old)
-                *(u64 *)ptr = new;
-        local_irq_restore(flags);
-        return prev;
-}
-EXPORT_SYMBOL(cmpxchg_486_u64);
-#endif
-
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 68e4a6f2211e..f2f9ac7da25c 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -140,10 +140,18 @@ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
 static int __init x86_xsave_setup(char *s)
 {
         setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+        setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
         return 1;
 }
 __setup("noxsave", x86_xsave_setup);
 
+static int __init x86_xsaveopt_setup(char *s)
+{
+        setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+        return 1;
+}
+__setup("noxsaveopt", x86_xsaveopt_setup);
+
 #ifdef CONFIG_X86_32
 static int cachesize_override __cpuinitdata = -1;
 static int disable_x86_serial_nr __cpuinitdata = 1;
@@ -537,7 +545,7 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
         }
 }
 
-static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
+void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
 {
         u32 tfms, xlvl;
         u32 ebx;
@@ -551,6 +559,16 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
                 c->x86_capability[4] = excap;
         }
 
+        /* Additional Intel-defined flags: level 0x00000007 */
+        if (c->cpuid_level >= 0x00000007) {
+                u32 eax, ebx, ecx, edx;
+
+                cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
+
+                if (eax > 0)
+                        c->x86_capability[9] = ebx;
+        }
+
         /* AMD-defined flags: level 0x80000001 */
         xlvl = cpuid_eax(0x80000000);
         c->extended_cpuid_level = xlvl;
@@ -576,6 +594,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
         if (c->extended_cpuid_level >= 0x80000007)
                 c->x86_power = cpuid_edx(0x80000007);
 
+        init_scattered_cpuid_features(c);
 }
 
 static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
@@ -731,7 +750,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
 
         get_model_name(c); /* Default name */
 
-        init_scattered_cpuid_features(c);
         detect_nopl(c);
 }
 
@@ -1192,6 +1210,7 @@ void __cpuinit cpu_init(void)
         dbg_restore_debug_regs();
 
         fpu_init();
+        xsave_init();
 
         raw_local_save_flags(kernel_eflags);
 
@@ -1252,12 +1271,7 @@ void __cpuinit cpu_init(void)
         clear_used_math();
         mxcsr_feature_mask_init();
 
-        /*
-         * Boot processor to setup the FP and extended state context info.
-         */
-        if (smp_processor_id() == boot_cpu_id)
-                init_thread_xstate();
-
+        fpu_init();
         xsave_init();
 }
 #endif
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 3624e8a0f71b..f668bb1f7d43 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -33,5 +33,6 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[],
                             *const __x86_cpu_dev_end[];
 
 extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
+extern void get_cpu_cap(struct cpuinfo_x86 *c);
 
 #endif
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 246cd3afbb5f..cd8da247dda1 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -72,7 +72,7 @@ struct acpi_cpufreq_data {
 static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
 
 /* acpi_perf_data is a pointer to percpu data. */
-static struct acpi_processor_performance *acpi_perf_data;
+static struct acpi_processor_performance __percpu *acpi_perf_data;
 
 static struct cpufreq_driver acpi_cpufreq_driver;
 
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
index a36de5bbb622..4f6f679f2799 100644
--- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
@@ -110,7 +110,7 @@ struct pcc_cpu {
         u32 output_offset;
 };
 
-static struct pcc_cpu *pcc_cpu_info;
+static struct pcc_cpu __percpu *pcc_cpu_info;
 
 static int pcc_cpufreq_verify(struct cpufreq_policy *policy)
 {
@@ -368,16 +368,22 @@ static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
                 return -ENODEV;
 
         out_obj = output.pointer;
-        if (out_obj->type != ACPI_TYPE_BUFFER)
-                return -ENODEV;
+        if (out_obj->type != ACPI_TYPE_BUFFER) {
+                ret = -ENODEV;
+                goto out_free;
+        }
 
         errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
-        if (errors)
-                return -ENODEV;
+        if (errors) {
+                ret = -ENODEV;
+                goto out_free;
+        }
 
         supported = *((u32 *)(out_obj->buffer.pointer + 4));
-        if (!(supported & 0x1))
-                return -ENODEV;
+        if (!(supported & 0x1)) {
+                ret = -ENODEV;
+                goto out_free;
+        }
 
 out_free:
         kfree(output.pointer);
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index dd531cc56a8f..8095f8611f8a 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -34,6 +34,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
 {
         &x86_hyper_vmware,
         &x86_hyper_ms_hyperv,
+#ifdef CONFIG_XEN_PVHVM
+        &x86_hyper_xen_hvm,
+#endif
 };
 
 const struct hypervisor_x86 *x86_hyper;
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 85f69cdeae10..b4389441efbb 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -39,6 +39,7 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
                         misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
                         wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
                         c->cpuid_level = cpuid_eax(0);
+                        get_cpu_cap(c);
                 }
         }
 
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 33eae2062cf5..898c2f4eab88 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -347,8 +347,8 @@ static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
         return l3;
 }
 
-static void __cpuinit
-amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
+static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
+                                           int index)
 {
         int node;
 
@@ -396,20 +396,39 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
         this_leaf->l3 = l3_caches[node];
 }
 
+/*
+ * check whether a slot used for disabling an L3 index is occupied.
+ * @l3: L3 cache descriptor
+ * @slot: slot number (0..1)
+ *
+ * @returns: the disabled index if used or negative value if slot free.
+ */
+int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot)
+{
+        unsigned int reg = 0;
+
+        pci_read_config_dword(l3->dev, 0x1BC + slot * 4, &reg);
+
+        /* check whether this slot is activated already */
+        if (reg & (3UL << 30))
+                return reg & 0xfff;
+
+        return -1;
+}
+
 static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
                                   unsigned int slot)
 {
-        struct pci_dev *dev = this_leaf->l3->dev;
-        unsigned int reg = 0;
+        int index;
 
         if (!this_leaf->l3 || !this_leaf->l3->can_disable)
                 return -EINVAL;
 
-        if (!dev)
-                return -EINVAL;
+        index = amd_get_l3_disable_slot(this_leaf->l3, slot);
+        if (index >= 0)
+                return sprintf(buf, "%d\n", index);
 
-        pci_read_config_dword(dev, 0x1BC + slot * 4, &reg);
-        return sprintf(buf, "0x%08x\n", reg);
+        return sprintf(buf, "FREE\n");
 }
 
 #define SHOW_CACHE_DISABLE(slot) \
@@ -451,37 +470,74 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
         }
 }
 
-
-static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
-                                  const char *buf, size_t count,
-                                  unsigned int slot)
+/*
+ * disable a L3 cache index by using a disable-slot
+ *
+ * @l3:    L3 cache descriptor
+ * @cpu:   A CPU on the node containing the L3 cache
+ * @slot:  slot number (0..1)
+ * @index: index to disable
+ *
+ * @return: 0 on success, error status on failure
+ */
+int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot,
+                            unsigned long index)
 {
-        struct pci_dev *dev = this_leaf->l3->dev;
-        int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
-        unsigned long val = 0;
+        int ret = 0;
 
 #define SUBCACHE_MASK   (3UL << 20)
 #define SUBCACHE_INDEX  0xfff
 
-        if (!this_leaf->l3 || !this_leaf->l3->can_disable)
+        /*
+         * check whether this slot is already used or
+         * the index is already disabled
+         */
+        ret = amd_get_l3_disable_slot(l3, slot);
+        if (ret >= 0)
                 return -EINVAL;
 
+        /*
+         * check whether the other slot has disabled the
+         * same index already
+         */
+        if (index == amd_get_l3_disable_slot(l3, !slot))
+                return -EINVAL;
+
+        /* do not allow writes outside of allowed bits */
+        if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
+            ((index & SUBCACHE_INDEX) > l3->indices))
+                return -EINVAL;
+
+        amd_l3_disable_index(l3, cpu, slot, index);
+
+        return 0;
+}
+
+static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
+                                  const char *buf, size_t count,
+                                  unsigned int slot)
+{
+        unsigned long val = 0;
+        int cpu, err = 0;
+
         if (!capable(CAP_SYS_ADMIN))
                 return -EPERM;
 
-        if (!dev)
+        if (!this_leaf->l3 || !this_leaf->l3->can_disable)
                 return -EINVAL;
 
-        if (strict_strtoul(buf, 10, &val) < 0)
-                return -EINVAL;
+        cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
 
-        /* do not allow writes outside of allowed bits */
-        if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
-            ((val & SUBCACHE_INDEX) > this_leaf->l3->indices))
+        if (strict_strtoul(buf, 10, &val) < 0)
                 return -EINVAL;
 
-        amd_l3_disable_index(this_leaf->l3, cpu, slot, val);
-
+        err = amd_set_l3_disable_slot(this_leaf->l3, cpu, slot, val);
+        if (err) {
+                if (err == -EEXIST)
+                        printk(KERN_WARNING "L3 disable slot %d in use!\n",
+                                            slot);
+                return err;
+        }
         return count;
 }
 
@@ -502,7 +558,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
 
 #else   /* CONFIG_CPU_SUP_AMD */
 static void __cpuinit
-amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
+amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index)
 {
 };
 #endif /* CONFIG_CPU_SUP_AMD */
@@ -518,7 +574,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
 
         if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
                 amd_cpuid4(index, &eax, &ebx, &ecx);
-                amd_check_l3_disable(index, this_leaf);
+                amd_check_l3_disable(this_leaf, index);
         } else {
                 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
         }
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index 745b54f9be89..8209472b27a5 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -80,7 +80,7 @@ int apei_write_mce(struct mce *m)
         rcd.hdr.revision = CPER_RECORD_REV;
         rcd.hdr.signature_end = CPER_SIG_END;
         rcd.hdr.section_count = 1;
-        rcd.hdr.error_severity = CPER_SER_FATAL;
+        rcd.hdr.error_severity = CPER_SEV_FATAL;
         /* timestamp, platform_id, partition_id are all invalid */
         rcd.hdr.validation_bits = 0;
         rcd.hdr.record_length = sizeof(rcd);
@@ -96,7 +96,7 @@ int apei_write_mce(struct mce *m)
         rcd.sec_hdr.validation_bits = 0;
         rcd.sec_hdr.flags = CPER_SEC_PRIMARY;
         rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE;
-        rcd.sec_hdr.section_severity = CPER_SER_FATAL;
+        rcd.sec_hdr.section_severity = CPER_SEV_FATAL;
 
         memcpy(&rcd.mce, m, sizeof(*m));
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 1970ef911c99..ed41562909fe 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -51,7 +51,7 @@
 static DEFINE_MUTEX(mce_read_mutex);
 
 #define rcu_dereference_check_mce(p) \
-        rcu_dereference_check((p), \
+        rcu_dereference_index_check((p), \
                               rcu_read_lock_sched_held() || \
                               lockdep_is_held(&mce_read_mutex))
 
@@ -107,8 +107,8 @@ EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
 static int default_decode_mce(struct notifier_block *nb, unsigned long val,
                                void *data)
 {
-        pr_emerg("No human readable MCE decoding support on this CPU type.\n");
-        pr_emerg("Run the message through 'mcelog --ascii' to decode.\n");
+        pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n");
+        pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n");
 
         return NOTIFY_STOP;
 }
@@ -211,11 +211,11 @@ void mce_log(struct mce *mce)
 
 static void print_mce(struct mce *m)
 {
-        pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
+        pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
                m->extcpu, m->mcgstatus, m->bank, m->status);
 
         if (m->ip) {
-                pr_emerg("RIP%s %02x:<%016Lx> ",
+                pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
                         !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
                         m->cs, m->ip);
 
@@ -224,14 +224,14 @@ static void print_mce(struct mce *m)
                 pr_cont("\n");
         }
 
-        pr_emerg("TSC %llx ", m->tsc);
+        pr_emerg(HW_ERR "TSC %llx ", m->tsc);
         if (m->addr)
                 pr_cont("ADDR %llx ", m->addr);
         if (m->misc)
                 pr_cont("MISC %llx ", m->misc);
 
         pr_cont("\n");
-        pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
+        pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
                 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
 
         /*
@@ -241,16 +241,6 @@ static void print_mce(struct mce *m)
         atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
 }
 
-static void print_mce_head(void)
-{
-        pr_emerg("\nHARDWARE ERROR\n");
-}
-
-static void print_mce_tail(void)
-{
-        pr_emerg("This is not a software problem!\n");
-}
-
 #define PANIC_TIMEOUT 5 /* 5 seconds */
 
 static atomic_t mce_paniced;
@@ -291,7 +281,6 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
                 if (atomic_inc_return(&mce_fake_paniced) > 1)
                         return;
         }
-        print_mce_head();
         /* First print corrected ones that are still unlogged */
         for (i = 0; i < MCE_LOG_LEN; i++) {
                 struct mce *m = &mcelog.entry[i];
@@ -322,16 +311,15 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
                         apei_err = apei_write_mce(final);
         }
         if (cpu_missing)
-                printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
-        print_mce_tail();
+                pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
         if (exp)
-                printk(KERN_EMERG "Machine check: %s\n", exp);
+                pr_emerg(HW_ERR "Machine check: %s\n", exp);
         if (!fake_panic) {
                 if (panic_timeout == 0)
                         panic_timeout = mce_panic_timeout;
                 panic(msg);
         } else
-                printk(KERN_EMERG "Fake kernel panic: %s\n", msg);
+                pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
 }
 
 /* Support code for software error injection */
@@ -1221,7 +1209,7 @@ int mce_notify_irq(void)
                 schedule_work(&mce_trigger_work);
 
                 if (__ratelimit(&ratelimit))
-                        printk(KERN_INFO "Machine check events logged\n");
+                        pr_info(HW_ERR "Machine check events logged\n");
 
                 return 1;
         }
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 224392d8fe8c..5e975298fa81 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -530,7 +530,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
                 err = -ENOMEM;
                 goto out;
         }
-        if (!alloc_cpumask_var(&b->cpus, GFP_KERNEL)) {
+        if (!zalloc_cpumask_var(&b->cpus, GFP_KERNEL)) {
                 kfree(b);
                 err = -ENOMEM;
                 goto out;
@@ -543,7 +543,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 #ifndef CONFIG_SMP
         cpumask_setall(b->cpus);
 #else
-        cpumask_copy(b->cpus, c->llc_shared_map);
+        cpumask_set_cpu(cpu, b->cpus);
 #endif
 
         per_cpu(threshold_banks, cpu)[bank] = b;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 62b48e40920a..6fcd0936194f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -95,19 +95,20 @@ static void cmci_discover(int banks, int boot)
                 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
 
                 /* Already owned by someone else? */
-                if (val & CMCI_EN) {
+                if (val & MCI_CTL2_CMCI_EN) {
                         if (test_and_clear_bit(i, owned) && !boot)
                                 print_update("SHD", &hdr, i);
                         __clear_bit(i, __get_cpu_var(mce_poll_banks));
                         continue;
                 }
 
-                val |= CMCI_EN | CMCI_THRESHOLD;
+                val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
+                val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD;
                 wrmsrl(MSR_IA32_MCx_CTL2(i), val);
                 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
 
                 /* Did the enable bit stick? -- the bank supports CMCI */
-                if (val & CMCI_EN) {
+                if (val & MCI_CTL2_CMCI_EN) {
                         if (!test_and_set_bit(i, owned) && !boot)
                                 print_update("CMCI", &hdr, i);
                         __clear_bit(i, __get_cpu_var(mce_poll_banks));
@@ -155,7 +156,7 @@ void cmci_clear(void)
                         continue;
                 /* Disable CMCI */
                 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
-                val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
+                val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK);
                 wrmsrl(MSR_IA32_MCx_CTL2(i), val);
                 __clear_bit(i, __get_cpu_var(mce_banks_owned));
         }
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index e1a0a3bf9716..d9368eeda309 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -34,15 +34,25 @@
34/* How long to wait between reporting thermal events */ 34/* How long to wait between reporting thermal events */
35#define CHECK_INTERVAL (300 * HZ) 35#define CHECK_INTERVAL (300 * HZ)
36 36
37#define THERMAL_THROTTLING_EVENT 0
38#define POWER_LIMIT_EVENT 1
39
37/* 40/*
38 * Current thermal throttling state: 41 * Current thermal event state:
39 */ 42 */
40struct thermal_state { 43struct _thermal_state {
41 bool is_throttled; 44 bool new_event;
42 45 int event;
43 u64 next_check; 46 u64 next_check;
44 unsigned long throttle_count; 47 unsigned long count;
45 unsigned long last_throttle_count; 48 unsigned long last_count;
49};
50
51struct thermal_state {
52 struct _thermal_state core_throttle;
53 struct _thermal_state core_power_limit;
54 struct _thermal_state package_throttle;
55 struct _thermal_state package_power_limit;
46}; 56};
47 57
48static DEFINE_PER_CPU(struct thermal_state, thermal_state); 58static DEFINE_PER_CPU(struct thermal_state, thermal_state);
@@ -53,11 +63,13 @@ static u32 lvtthmr_init __read_mostly;
53 63
54#ifdef CONFIG_SYSFS 64#ifdef CONFIG_SYSFS
55#define define_therm_throt_sysdev_one_ro(_name) \ 65#define define_therm_throt_sysdev_one_ro(_name) \
56 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) 66 static SYSDEV_ATTR(_name, 0444, \
67 therm_throt_sysdev_show_##_name, \
68 NULL) \
57 69
58#define define_therm_throt_sysdev_show_func(name) \ 70#define define_therm_throt_sysdev_show_func(event, name) \
59 \ 71 \
60static ssize_t therm_throt_sysdev_show_##name( \ 72static ssize_t therm_throt_sysdev_show_##event##_##name( \
61 struct sys_device *dev, \ 73 struct sys_device *dev, \
62 struct sysdev_attribute *attr, \ 74 struct sysdev_attribute *attr, \
63 char *buf) \ 75 char *buf) \
@@ -66,30 +78,42 @@ static ssize_t therm_throt_sysdev_show_##name( \
66 ssize_t ret; \ 78 ssize_t ret; \
67 \ 79 \
68 preempt_disable(); /* CPU hotplug */ \ 80 preempt_disable(); /* CPU hotplug */ \
69 if (cpu_online(cpu)) \ 81 if (cpu_online(cpu)) { \
70 ret = sprintf(buf, "%lu\n", \ 82 ret = sprintf(buf, "%lu\n", \
71 per_cpu(thermal_state, cpu).name); \ 83 per_cpu(thermal_state, cpu).event.name); \
72 else \ 84 } else \
73 ret = 0; \ 85 ret = 0; \
74 preempt_enable(); \ 86 preempt_enable(); \
75 \ 87 \
76 return ret; \ 88 return ret; \
77} 89}
78 90
79define_therm_throt_sysdev_show_func(throttle_count); 91define_therm_throt_sysdev_show_func(core_throttle, count);
80define_therm_throt_sysdev_one_ro(throttle_count); 92define_therm_throt_sysdev_one_ro(core_throttle_count);
93
94define_therm_throt_sysdev_show_func(core_power_limit, count);
95define_therm_throt_sysdev_one_ro(core_power_limit_count);
96
97define_therm_throt_sysdev_show_func(package_throttle, count);
98define_therm_throt_sysdev_one_ro(package_throttle_count);
99
100define_therm_throt_sysdev_show_func(package_power_limit, count);
101define_therm_throt_sysdev_one_ro(package_power_limit_count);
81 102
82static struct attribute *thermal_throttle_attrs[] = { 103static struct attribute *thermal_throttle_attrs[] = {
83 &attr_throttle_count.attr, 104 &attr_core_throttle_count.attr,
84 NULL 105 NULL
85}; 106};
86 107
87static struct attribute_group thermal_throttle_attr_group = { 108static struct attribute_group thermal_attr_group = {
88 .attrs = thermal_throttle_attrs, 109 .attrs = thermal_throttle_attrs,
89 .name = "thermal_throttle" 110 .name = "thermal_throttle"
90}; 111};
91#endif /* CONFIG_SYSFS */ 112#endif /* CONFIG_SYSFS */
92 113
114#define CORE_LEVEL 0
115#define PACKAGE_LEVEL 1
116
93/*** 117/***
94 * therm_throt_process - Process thermal throttling event from interrupt 118 * therm_throt_process - Process thermal throttling event from interrupt
95 * @curr: Whether the condition is current or not (boolean), since the 119 * @curr: Whether the condition is current or not (boolean), since the
@@ -106,39 +130,70 @@ static struct attribute_group thermal_throttle_attr_group = {
106 * 1 : Event should be logged further, and a message has been 130 * 1 : Event should be logged further, and a message has been
107 * printed to the syslog. 131 * printed to the syslog.
108 */ 132 */
109static int therm_throt_process(bool is_throttled) 133static int therm_throt_process(bool new_event, int event, int level)
110{ 134{
111 struct thermal_state *state; 135 struct _thermal_state *state;
112 unsigned int this_cpu; 136 unsigned int this_cpu = smp_processor_id();
113 bool was_throttled; 137 bool old_event;
114 u64 now; 138 u64 now;
139 struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
115 140
116 this_cpu = smp_processor_id();
117 now = get_jiffies_64(); 141 now = get_jiffies_64();
118 state = &per_cpu(thermal_state, this_cpu); 142 if (level == CORE_LEVEL) {
143 if (event == THERMAL_THROTTLING_EVENT)
144 state = &pstate->core_throttle;
145 else if (event == POWER_LIMIT_EVENT)
146 state = &pstate->core_power_limit;
147 else
148 return 0;
149 } else if (level == PACKAGE_LEVEL) {
150 if (event == THERMAL_THROTTLING_EVENT)
151 state = &pstate->package_throttle;
152 else if (event == POWER_LIMIT_EVENT)
153 state = &pstate->package_power_limit;
154 else
155 return 0;
156 } else
157 return 0;
119 158
120 was_throttled = state->is_throttled; 159 old_event = state->new_event;
121 state->is_throttled = is_throttled; 160 state->new_event = new_event;
122 161
123 if (is_throttled) 162 if (new_event)
124 state->throttle_count++; 163 state->count++;
125 164
126 if (time_before64(now, state->next_check) && 165 if (time_before64(now, state->next_check) &&
127 state->throttle_count != state->last_throttle_count) 166 state->count != state->last_count)
128 return 0; 167 return 0;
129 168
130 state->next_check = now + CHECK_INTERVAL; 169 state->next_check = now + CHECK_INTERVAL;
131 state->last_throttle_count = state->throttle_count; 170 state->last_count = state->count;
132 171
133 /* if we just entered the thermal event */ 172 /* if we just entered the thermal event */
134 if (is_throttled) { 173 if (new_event) {
135 printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count); 174 if (event == THERMAL_THROTTLING_EVENT)
175 printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
176 this_cpu,
177 level == CORE_LEVEL ? "Core" : "Package",
178 state->count);
179 else
180 printk(KERN_CRIT "CPU%d: %s power limit notification (total events = %lu)\n",
181 this_cpu,
182 level == CORE_LEVEL ? "Core" : "Package",
183 state->count);
136 184
137 add_taint(TAINT_MACHINE_CHECK); 185 add_taint(TAINT_MACHINE_CHECK);
138 return 1; 186 return 1;
139 } 187 }
140 if (was_throttled) { 188 if (old_event) {
141 printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu); 189 if (event == THERMAL_THROTTLING_EVENT)
190 printk(KERN_INFO "CPU%d: %s temperature/speed normal\n",
191 this_cpu,
192 level == CORE_LEVEL ? "Core" : "Package");
193 else
194 printk(KERN_INFO "CPU%d: %s power limit normal\n",
195 this_cpu,
196 level == CORE_LEVEL ? "Core" : "Package");
142 return 1; 197 return 1;
143 } 198 }
144 199
@@ -147,15 +202,35 @@ static int therm_throt_process(bool is_throttled)
147 202
148#ifdef CONFIG_SYSFS 203#ifdef CONFIG_SYSFS
149/* Add/Remove thermal_throttle interface for CPU device: */ 204/* Add/Remove thermal_throttle interface for CPU device: */
150static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) 205static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev,
206 unsigned int cpu)
151{ 207{
152 return sysfs_create_group(&sys_dev->kobj, 208 int err;
153 &thermal_throttle_attr_group); 209 struct cpuinfo_x86 *c = &cpu_data(cpu);
210
211 err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group);
212 if (err)
213 return err;
214
215 if (cpu_has(c, X86_FEATURE_PLN))
216 err = sysfs_add_file_to_group(&sys_dev->kobj,
217 &attr_core_power_limit_count.attr,
218 thermal_attr_group.name);
219 if (cpu_has(c, X86_FEATURE_PTS))
220 err = sysfs_add_file_to_group(&sys_dev->kobj,
221 &attr_package_throttle_count.attr,
222 thermal_attr_group.name);
223 if (cpu_has(c, X86_FEATURE_PLN))
224 err = sysfs_add_file_to_group(&sys_dev->kobj,
225 &attr_package_power_limit_count.attr,
226 thermal_attr_group.name);
227
228 return err;
154} 229}
155 230
156static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) 231static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
157{ 232{
158 sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); 233 sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group);
159} 234}
160 235
161/* Mutex protecting device creation against CPU hotplug: */ 236/* Mutex protecting device creation against CPU hotplug: */
@@ -177,7 +252,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
177 case CPU_UP_PREPARE: 252 case CPU_UP_PREPARE:
178 case CPU_UP_PREPARE_FROZEN: 253 case CPU_UP_PREPARE_FROZEN:
179 mutex_lock(&therm_cpu_lock); 254 mutex_lock(&therm_cpu_lock);
180 err = thermal_throttle_add_dev(sys_dev); 255 err = thermal_throttle_add_dev(sys_dev, cpu);
181 mutex_unlock(&therm_cpu_lock); 256 mutex_unlock(&therm_cpu_lock);
182 WARN_ON(err); 257 WARN_ON(err);
183 break; 258 break;
@@ -213,7 +288,7 @@ static __init int thermal_throttle_init_device(void)
213#endif 288#endif
214 /* connect live CPUs to sysfs */ 289 /* connect live CPUs to sysfs */
215 for_each_online_cpu(cpu) { 290 for_each_online_cpu(cpu) {
216 err = thermal_throttle_add_dev(get_cpu_sysdev(cpu)); 291 err = thermal_throttle_add_dev(get_cpu_sysdev(cpu), cpu);
217 WARN_ON(err); 292 WARN_ON(err);
218 } 293 }
219#ifdef CONFIG_HOTPLUG_CPU 294#ifdef CONFIG_HOTPLUG_CPU
@@ -226,14 +301,50 @@ device_initcall(thermal_throttle_init_device);
226 301
227#endif /* CONFIG_SYSFS */ 302#endif /* CONFIG_SYSFS */
228 303
304/*
 305 * Set up the two most significant bits to notify the mce log of the
 306 * thermal event type.
 307 * This is a temporary solution and may be changed in the future with the
 308 * mce log infrastructure.
309 */
310#define CORE_THROTTLED (0)
311#define CORE_POWER_LIMIT ((__u64)1 << 62)
312#define PACKAGE_THROTTLED ((__u64)2 << 62)
313#define PACKAGE_POWER_LIMIT ((__u64)3 << 62)
314
229/* Thermal transition interrupt handler */ 315/* Thermal transition interrupt handler */
230static void intel_thermal_interrupt(void) 316static void intel_thermal_interrupt(void)
231{ 317{
232 __u64 msr_val; 318 __u64 msr_val;
319 struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
233 320
234 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 321 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
235 if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0)) 322
236 mce_log_therm_throt_event(msr_val); 323 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
324 THERMAL_THROTTLING_EVENT,
325 CORE_LEVEL) != 0)
326 mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
327
328 if (cpu_has(c, X86_FEATURE_PLN))
329 if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
330 POWER_LIMIT_EVENT,
331 CORE_LEVEL) != 0)
332 mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
333
334 if (cpu_has(c, X86_FEATURE_PTS)) {
335 rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
336 if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
337 THERMAL_THROTTLING_EVENT,
338 PACKAGE_LEVEL) != 0)
339 mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
340 if (cpu_has(c, X86_FEATURE_PLN))
341 if (therm_throt_process(msr_val &
342 PACKAGE_THERM_STATUS_POWER_LIMIT,
343 POWER_LIMIT_EVENT,
344 PACKAGE_LEVEL) != 0)
345 mce_log_therm_throt_event(PACKAGE_POWER_LIMIT
346 | msr_val);
347 }
237} 348}
238 349
239static void unexpected_thermal_interrupt(void) 350static void unexpected_thermal_interrupt(void)
@@ -335,8 +446,26 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
335 apic_write(APIC_LVTTHMR, h); 446 apic_write(APIC_LVTTHMR, h);
336 447
337 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); 448 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
338 wrmsr(MSR_IA32_THERM_INTERRUPT, 449 if (cpu_has(c, X86_FEATURE_PLN))
339 l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); 450 wrmsr(MSR_IA32_THERM_INTERRUPT,
451 l | (THERM_INT_LOW_ENABLE
452 | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h);
453 else
454 wrmsr(MSR_IA32_THERM_INTERRUPT,
455 l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
456
457 if (cpu_has(c, X86_FEATURE_PTS)) {
458 rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
459 if (cpu_has(c, X86_FEATURE_PLN))
460 wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
461 l | (PACKAGE_THERM_INT_LOW_ENABLE
462 | PACKAGE_THERM_INT_HIGH_ENABLE
463 | PACKAGE_THERM_INT_PLN_ENABLE), h);
464 else
465 wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
466 l | (PACKAGE_THERM_INT_LOW_ENABLE
467 | PACKAGE_THERM_INT_HIGH_ENABLE), h);
468 }
340 469
341 smp_thermal_vector = intel_thermal_interrupt; 470 smp_thermal_vector = intel_thermal_interrupt;
342 471
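therm_throt.c now tags the value handed to mce_log_therm_throt_event() with the event type in bits 63:62, as defined by CORE_THROTTLED through PACKAGE_POWER_LIMIT above. A small sketch, assuming only those encodings, of how a log consumer could split such a value back into event type and raw THERM_STATUS bits:

#include <stdint.h>
#include <stdio.h>

#define THERM_EVENT_SHIFT 62
#define THERM_EVENT_MASK  (3ULL << THERM_EVENT_SHIFT)

static const char * const therm_event_name[4] = {
        "core throttled",       /* CORE_THROTTLED      = 0 << 62 */
        "core power limit",     /* CORE_POWER_LIMIT    = 1 << 62 */
        "package throttled",    /* PACKAGE_THROTTLED   = 2 << 62 */
        "package power limit",  /* PACKAGE_POWER_LIMIT = 3 << 62 */
};

int main(void)
{
        /* example: a package-throttled event with PROCHOT (bit 0) set */
        uint64_t logged = (2ULL << THERM_EVENT_SHIFT) | 0x1;
        unsigned int type = logged >> THERM_EVENT_SHIFT;
        uint64_t status = logged & ~THERM_EVENT_MASK;

        printf("%s, THERM_STATUS bits %#llx\n",
               therm_event_name[type], (unsigned long long)status);
        return 0;
}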
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 16f41bbe46b6..d944bf6c50e9 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -18,6 +18,7 @@
18#include <asm/mshyperv.h> 18#include <asm/mshyperv.h>
19 19
20struct ms_hyperv_info ms_hyperv; 20struct ms_hyperv_info ms_hyperv;
21EXPORT_SYMBOL_GPL(ms_hyperv);
21 22
22static bool __init ms_hyperv_platform(void) 23static bool __init ms_hyperv_platform(void)
23{ 24{
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 06130b52f012..c5f59d071425 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -632,9 +632,9 @@ static void __init mtrr_print_out_one_result(int i)
632 unsigned long gran_base, chunk_base, lose_base; 632 unsigned long gran_base, chunk_base, lose_base;
633 char gran_factor, chunk_factor, lose_factor; 633 char gran_factor, chunk_factor, lose_factor;
634 634
635 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), 635 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor);
636 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), 636 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor);
637 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), 637 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor);
638 638
639 pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t", 639 pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t",
640 result[i].bad ? "*BAD*" : " ", 640 result[i].bad ? "*BAD*" : " ",
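The mtrr_print_out_one_result() fix above replaces trailing commas with semicolons: the old lines compiled because the comma operator glues the three assignments into one expression statement, but that is easy to misread. A tiny demonstration that both forms behave identically, which is why this is purely a readability fix:

#include <stdio.h>

static int f(void) { return 1; }
static int g(void) { return 2; }

int main(void)
{
        int a, b;

        /* legal C: one expression statement joined by the comma operator */
        a = f(), b = g();
        printf("%d %d\n", a, b);        /* prints "1 2", same as with semicolons */
        return 0;
}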
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index fd31a441c61c..7d28d7d03885 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -433,13 +433,12 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
433{ 433{
434 unsigned int mask_lo, mask_hi, base_lo, base_hi; 434 unsigned int mask_lo, mask_hi, base_lo, base_hi;
435 unsigned int tmp, hi; 435 unsigned int tmp, hi;
436 int cpu;
437 436
438 /* 437 /*
439 * get_mtrr doesn't need to update mtrr_state, also it could be called 438 * get_mtrr doesn't need to update mtrr_state, also it could be called
440 * from any cpu, so try to print it out directly. 439 * from any cpu, so try to print it out directly.
441 */ 440 */
442 cpu = get_cpu(); 441 get_cpu();
443 442
444 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); 443 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
445 444
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 79556bd9b602..01c0f3ee6cc3 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -35,6 +35,7 @@
35 35
36#include <linux/types.h> /* FIXME: kvm_para.h needs this */ 36#include <linux/types.h> /* FIXME: kvm_para.h needs this */
37 37
38#include <linux/stop_machine.h>
38#include <linux/kvm_para.h> 39#include <linux/kvm_para.h>
39#include <linux/uaccess.h> 40#include <linux/uaccess.h>
40#include <linux/module.h> 41#include <linux/module.h>
@@ -143,22 +144,28 @@ struct set_mtrr_data {
143 mtrr_type smp_type; 144 mtrr_type smp_type;
144}; 145};
145 146
147static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work);
148
146/** 149/**
147 * ipi_handler - Synchronisation handler. Executed by "other" CPUs. 150 * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs.
148 * @info: pointer to mtrr configuration data 151 * @info: pointer to mtrr configuration data
149 * 152 *
150 * Returns nothing. 153 * Returns nothing.
151 */ 154 */
152static void ipi_handler(void *info) 155static int mtrr_work_handler(void *info)
153{ 156{
154#ifdef CONFIG_SMP 157#ifdef CONFIG_SMP
155 struct set_mtrr_data *data = info; 158 struct set_mtrr_data *data = info;
156 unsigned long flags; 159 unsigned long flags;
157 160
161 atomic_dec(&data->count);
162 while (!atomic_read(&data->gate))
163 cpu_relax();
164
158 local_irq_save(flags); 165 local_irq_save(flags);
159 166
160 atomic_dec(&data->count); 167 atomic_dec(&data->count);
161 while (!atomic_read(&data->gate)) 168 while (atomic_read(&data->gate))
162 cpu_relax(); 169 cpu_relax();
163 170
164 /* The master has cleared me to execute */ 171 /* The master has cleared me to execute */
@@ -173,12 +180,13 @@ static void ipi_handler(void *info)
173 } 180 }
174 181
175 atomic_dec(&data->count); 182 atomic_dec(&data->count);
176 while (atomic_read(&data->gate)) 183 while (!atomic_read(&data->gate))
177 cpu_relax(); 184 cpu_relax();
178 185
179 atomic_dec(&data->count); 186 atomic_dec(&data->count);
180 local_irq_restore(flags); 187 local_irq_restore(flags);
181#endif 188#endif
189 return 0;
182} 190}
183 191
184static inline int types_compatible(mtrr_type type1, mtrr_type type2) 192static inline int types_compatible(mtrr_type type1, mtrr_type type2)
@@ -198,7 +206,7 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
198 * 206 *
199 * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: 207 * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly:
200 * 208 *
201 * 1. Send IPI to do the following: 209 * 1. Queue work to do the following on all processors:
202 * 2. Disable Interrupts 210 * 2. Disable Interrupts
203 * 3. Wait for all procs to do so 211 * 3. Wait for all procs to do so
204 * 4. Enter no-fill cache mode 212 * 4. Enter no-fill cache mode
@@ -215,14 +223,17 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
215 * 15. Enable interrupts. 223 * 15. Enable interrupts.
216 * 224 *
217 * What does that mean for us? Well, first we set data.count to the number 225 * What does that mean for us? Well, first we set data.count to the number
218 * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait 226 * of CPUs. As each CPU announces that it started the rendezvous handler by
219 * until it hits 0 and proceed. We set the data.gate flag and reset data.count. 227 * decrementing the count, we reset data.count and set the data.gate flag
220 * Meanwhile, they are waiting for that flag to be set. Once it's set, each 228 * allowing all the CPUs to proceed with the work. As each CPU disables
229 * interrupts, it'll decrement data.count once. We wait until it hits 0 and
230 * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they
231 * are waiting for that flag to be cleared. Once it's cleared, each
221 * CPU goes through the transition of updating MTRRs. 232 * CPU goes through the transition of updating MTRRs.
222 * The CPU vendors may each do it differently, 233 * The CPU vendors may each do it differently,
223 * so we call mtrr_if->set() callback and let them take care of it. 234 * so we call mtrr_if->set() callback and let them take care of it.
224 * When they're done, they again decrement data->count and wait for data.gate 235 * When they're done, they again decrement data->count and wait for data.gate
225 * to be reset. 236 * to be set.
226 * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag 237 * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag
227 * Everyone then enables interrupts and we all continue on. 238 * Everyone then enables interrupts and we all continue on.
228 * 239 *
@@ -234,6 +245,9 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
234{ 245{
235 struct set_mtrr_data data; 246 struct set_mtrr_data data;
236 unsigned long flags; 247 unsigned long flags;
248 int cpu;
249
250 preempt_disable();
237 251
238 data.smp_reg = reg; 252 data.smp_reg = reg;
239 data.smp_base = base; 253 data.smp_base = base;
@@ -246,10 +260,15 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
246 atomic_set(&data.gate, 0); 260 atomic_set(&data.gate, 0);
247 261
248 /* Start the ball rolling on other CPUs */ 262 /* Start the ball rolling on other CPUs */
249 if (smp_call_function(ipi_handler, &data, 0) != 0) 263 for_each_online_cpu(cpu) {
250 panic("mtrr: timed out waiting for other CPUs\n"); 264 struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu);
265
266 if (cpu == smp_processor_id())
267 continue;
268
269 stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work);
270 }
251 271
252 local_irq_save(flags);
253 272
254 while (atomic_read(&data.count)) 273 while (atomic_read(&data.count))
255 cpu_relax(); 274 cpu_relax();
@@ -259,6 +278,16 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
259 smp_wmb(); 278 smp_wmb();
260 atomic_set(&data.gate, 1); 279 atomic_set(&data.gate, 1);
261 280
281 local_irq_save(flags);
282
283 while (atomic_read(&data.count))
284 cpu_relax();
285
286 /* Ok, reset count and toggle gate */
287 atomic_set(&data.count, num_booting_cpus() - 1);
288 smp_wmb();
289 atomic_set(&data.gate, 0);
290
262 /* Do our MTRR business */ 291 /* Do our MTRR business */
263 292
264 /* 293 /*
@@ -279,7 +308,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
279 308
280 atomic_set(&data.count, num_booting_cpus() - 1); 309 atomic_set(&data.count, num_booting_cpus() - 1);
281 smp_wmb(); 310 smp_wmb();
282 atomic_set(&data.gate, 0); 311 atomic_set(&data.gate, 1);
283 312
284 /* 313 /*
285 * Wait here for everyone to have seen the gate change 314 * Wait here for everyone to have seen the gate change
@@ -289,6 +318,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
289 cpu_relax(); 318 cpu_relax();
290 319
291 local_irq_restore(flags); 320 local_irq_restore(flags);
321 preempt_enable();
292} 322}
293 323
294/** 324/**
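The rewritten comment above describes the count/gate hand-shake that set_mtrr() now drives through stop_one_cpu_nowait() instead of an IPI. A compressed pthread model of that rendezvous, using C11 atomics and busy-waiting in place of the kernel's atomic_t and cpu_relax() (both assumptions of the sketch; the real code also disables interrupts and updates the MTRRs between the two phases):

/* build: cc -pthread -std=c11 rendezvous.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NCPUS 4

static atomic_int count;
static atomic_int gate;

static void *worker(void *arg)
{
        (void)arg;

        /* phase 1: announce arrival, wait for the master to open the gate */
        atomic_fetch_sub(&count, 1);
        while (!atomic_load(&gate))
                ;       /* cpu_relax() in the kernel */

        /* ...interrupt disabling and the MTRR update would happen here... */

        /* phase 2: announce completion, wait for the gate to close again */
        atomic_fetch_sub(&count, 1);
        while (atomic_load(&gate))
                ;
        return NULL;
}

int main(void)
{
        pthread_t tid[NCPUS - 1];
        int i;

        atomic_store(&count, NCPUS - 1);
        atomic_store(&gate, 0);
        for (i = 0; i < NCPUS - 1; i++)
                pthread_create(&tid[i], NULL, worker, NULL);

        /* wait for everyone to arrive, then re-arm the counter and open the gate */
        while (atomic_load(&count))
                ;
        atomic_store(&count, NCPUS - 1);
        atomic_store(&gate, 1);

        /* wait for everyone to finish, then close the gate to release them */
        while (atomic_load(&count))
                ;
        atomic_store(&gate, 0);

        for (i = 0; i < NCPUS - 1; i++)
                pthread_join(tid[i], NULL);
        puts("rendezvous complete");
        return 0;
}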
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 5db5b7d65a18..03a5b0385ad6 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -102,6 +102,7 @@ struct cpu_hw_events {
102 */ 102 */
103 struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ 103 struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
104 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 104 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
105 unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
105 int enabled; 106 int enabled;
106 107
107 int n_events; 108 int n_events;
@@ -220,6 +221,7 @@ struct x86_pmu {
220 struct perf_event *event); 221 struct perf_event *event);
221 struct event_constraint *event_constraints; 222 struct event_constraint *event_constraints;
222 void (*quirks)(void); 223 void (*quirks)(void);
224 int perfctr_second_write;
223 225
224 int (*cpu_prepare)(int cpu); 226 int (*cpu_prepare)(int cpu);
225 void (*cpu_starting)(int cpu); 227 void (*cpu_starting)(int cpu);
@@ -295,10 +297,10 @@ x86_perf_event_update(struct perf_event *event)
295 * count to the generic event atomically: 297 * count to the generic event atomically:
296 */ 298 */
297again: 299again:
298 prev_raw_count = atomic64_read(&hwc->prev_count); 300 prev_raw_count = local64_read(&hwc->prev_count);
299 rdmsrl(hwc->event_base + idx, new_raw_count); 301 rdmsrl(hwc->event_base + idx, new_raw_count);
300 302
301 if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, 303 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
302 new_raw_count) != prev_raw_count) 304 new_raw_count) != prev_raw_count)
303 goto again; 305 goto again;
304 306
@@ -313,8 +315,8 @@ again:
313 delta = (new_raw_count << shift) - (prev_raw_count << shift); 315 delta = (new_raw_count << shift) - (prev_raw_count << shift);
314 delta >>= shift; 316 delta >>= shift;
315 317
316 atomic64_add(delta, &event->count); 318 local64_add(delta, &event->count);
317 atomic64_sub(delta, &hwc->period_left); 319 local64_sub(delta, &hwc->period_left);
318 320
319 return new_raw_count; 321 return new_raw_count;
320} 322}
@@ -438,7 +440,7 @@ static int x86_setup_perfctr(struct perf_event *event)
438 if (!hwc->sample_period) { 440 if (!hwc->sample_period) {
439 hwc->sample_period = x86_pmu.max_period; 441 hwc->sample_period = x86_pmu.max_period;
440 hwc->last_period = hwc->sample_period; 442 hwc->last_period = hwc->sample_period;
441 atomic64_set(&hwc->period_left, hwc->sample_period); 443 local64_set(&hwc->period_left, hwc->sample_period);
442 } else { 444 } else {
443 /* 445 /*
444 * If we have a PMU initialized but no APIC 446 * If we have a PMU initialized but no APIC
@@ -885,7 +887,7 @@ static int
885x86_perf_event_set_period(struct perf_event *event) 887x86_perf_event_set_period(struct perf_event *event)
886{ 888{
887 struct hw_perf_event *hwc = &event->hw; 889 struct hw_perf_event *hwc = &event->hw;
888 s64 left = atomic64_read(&hwc->period_left); 890 s64 left = local64_read(&hwc->period_left);
889 s64 period = hwc->sample_period; 891 s64 period = hwc->sample_period;
890 int ret = 0, idx = hwc->idx; 892 int ret = 0, idx = hwc->idx;
891 893
@@ -897,14 +899,14 @@ x86_perf_event_set_period(struct perf_event *event)
897 */ 899 */
898 if (unlikely(left <= -period)) { 900 if (unlikely(left <= -period)) {
899 left = period; 901 left = period;
900 atomic64_set(&hwc->period_left, left); 902 local64_set(&hwc->period_left, left);
901 hwc->last_period = period; 903 hwc->last_period = period;
902 ret = 1; 904 ret = 1;
903 } 905 }
904 906
905 if (unlikely(left <= 0)) { 907 if (unlikely(left <= 0)) {
906 left += period; 908 left += period;
907 atomic64_set(&hwc->period_left, left); 909 local64_set(&hwc->period_left, left);
908 hwc->last_period = period; 910 hwc->last_period = period;
909 ret = 1; 911 ret = 1;
910 } 912 }
@@ -923,10 +925,19 @@ x86_perf_event_set_period(struct perf_event *event)
923 * The hw event starts counting from this event offset, 925 * The hw event starts counting from this event offset,
924 * mark it to be able to extra future deltas: 926 * mark it to be able to extra future deltas:
925 */ 927 */
926 atomic64_set(&hwc->prev_count, (u64)-left); 928 local64_set(&hwc->prev_count, (u64)-left);
927 929
928 wrmsrl(hwc->event_base + idx, 930 wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask);
931
932 /*
 933 * Due to an erratum on certain CPUs we need
934 * a second write to be sure the register
935 * is updated properly
936 */
937 if (x86_pmu.perfctr_second_write) {
938 wrmsrl(hwc->event_base + idx,
929 (u64)(-left) & x86_pmu.cntval_mask); 939 (u64)(-left) & x86_pmu.cntval_mask);
940 }
930 941
931 perf_event_update_userpage(event); 942 perf_event_update_userpage(event);
932 943
@@ -969,7 +980,7 @@ static int x86_pmu_enable(struct perf_event *event)
969 * skip the schedulability test here, it will be performed 980 * skip the schedulability test here, it will be performed
970 * at commit time (->commit_txn) as a whole 981 * at commit time (->commit_txn) as a whole
971 */ 982 */
972 if (cpuc->group_flag & PERF_EVENT_TXN_STARTED) 983 if (cpuc->group_flag & PERF_EVENT_TXN)
973 goto out; 984 goto out;
974 985
975 ret = x86_pmu.schedule_events(cpuc, n, assign); 986 ret = x86_pmu.schedule_events(cpuc, n, assign);
@@ -1000,6 +1011,7 @@ static int x86_pmu_start(struct perf_event *event)
1000 x86_perf_event_set_period(event); 1011 x86_perf_event_set_period(event);
1001 cpuc->events[idx] = event; 1012 cpuc->events[idx] = event;
1002 __set_bit(idx, cpuc->active_mask); 1013 __set_bit(idx, cpuc->active_mask);
1014 __set_bit(idx, cpuc->running);
1003 x86_pmu.enable(event); 1015 x86_pmu.enable(event);
1004 perf_event_update_userpage(event); 1016 perf_event_update_userpage(event);
1005 1017
@@ -1096,7 +1108,7 @@ static void x86_pmu_disable(struct perf_event *event)
1096 * The events never got scheduled and ->cancel_txn will truncate 1108 * The events never got scheduled and ->cancel_txn will truncate
1097 * the event_list. 1109 * the event_list.
1098 */ 1110 */
1099 if (cpuc->group_flag & PERF_EVENT_TXN_STARTED) 1111 if (cpuc->group_flag & PERF_EVENT_TXN)
1100 return; 1112 return;
1101 1113
1102 x86_pmu_stop(event); 1114 x86_pmu_stop(event);
@@ -1131,8 +1143,16 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1131 cpuc = &__get_cpu_var(cpu_hw_events); 1143 cpuc = &__get_cpu_var(cpu_hw_events);
1132 1144
1133 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1145 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1134 if (!test_bit(idx, cpuc->active_mask)) 1146 if (!test_bit(idx, cpuc->active_mask)) {
1147 /*
1148 * Though we deactivated the counter, some CPUs
1149 * might still deliver spurious interrupts that were
1150 * already in flight. Catch them:
1151 */
1152 if (__test_and_clear_bit(idx, cpuc->running))
1153 handled++;
1135 continue; 1154 continue;
1155 }
1136 1156
1137 event = cpuc->events[idx]; 1157 event = cpuc->events[idx];
1138 hwc = &event->hw; 1158 hwc = &event->hw;
@@ -1144,7 +1164,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1144 /* 1164 /*
1145 * event overflow 1165 * event overflow
1146 */ 1166 */
1147 handled = 1; 1167 handled++;
1148 data.period = event->hw.last_period; 1168 data.period = event->hw.last_period;
1149 1169
1150 if (!x86_perf_event_set_period(event)) 1170 if (!x86_perf_event_set_period(event))
@@ -1190,12 +1210,20 @@ void perf_events_lapic_init(void)
1190 apic_write(APIC_LVTPC, APIC_DM_NMI); 1210 apic_write(APIC_LVTPC, APIC_DM_NMI);
1191} 1211}
1192 1212
1213struct pmu_nmi_state {
1214 unsigned int marked;
1215 int handled;
1216};
1217
1218static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi);
1219
1193static int __kprobes 1220static int __kprobes
1194perf_event_nmi_handler(struct notifier_block *self, 1221perf_event_nmi_handler(struct notifier_block *self,
1195 unsigned long cmd, void *__args) 1222 unsigned long cmd, void *__args)
1196{ 1223{
1197 struct die_args *args = __args; 1224 struct die_args *args = __args;
1198 struct pt_regs *regs; 1225 unsigned int this_nmi;
1226 int handled;
1199 1227
1200 if (!atomic_read(&active_events)) 1228 if (!atomic_read(&active_events))
1201 return NOTIFY_DONE; 1229 return NOTIFY_DONE;
@@ -1204,22 +1232,47 @@ perf_event_nmi_handler(struct notifier_block *self,
1204 case DIE_NMI: 1232 case DIE_NMI:
1205 case DIE_NMI_IPI: 1233 case DIE_NMI_IPI:
1206 break; 1234 break;
1207 1235 case DIE_NMIUNKNOWN:
1236 this_nmi = percpu_read(irq_stat.__nmi_count);
1237 if (this_nmi != __get_cpu_var(pmu_nmi).marked)
1238 /* let the kernel handle the unknown nmi */
1239 return NOTIFY_DONE;
1240 /*
1241 * This one is a PMU back-to-back nmi. Two events
1242 * trigger 'simultaneously' raising two back-to-back
1243 * NMIs. If the first NMI handles both, the latter
1244 * will be empty and daze the CPU. So, we drop it to
1245 * avoid false-positive 'unknown nmi' messages.
1246 */
1247 return NOTIFY_STOP;
1208 default: 1248 default:
1209 return NOTIFY_DONE; 1249 return NOTIFY_DONE;
1210 } 1250 }
1211 1251
1212 regs = args->regs;
1213
1214 apic_write(APIC_LVTPC, APIC_DM_NMI); 1252 apic_write(APIC_LVTPC, APIC_DM_NMI);
1215 /* 1253
1216 * Can't rely on the handled return value to say it was our NMI, two 1254 handled = x86_pmu.handle_irq(args->regs);
1217 * events could trigger 'simultaneously' raising two back-to-back NMIs. 1255 if (!handled)
1218 * 1256 return NOTIFY_DONE;
1219 * If the first NMI handles both, the latter will be empty and daze 1257
1220 * the CPU. 1258 this_nmi = percpu_read(irq_stat.__nmi_count);
1221 */ 1259 if ((handled > 1) ||
1222 x86_pmu.handle_irq(regs); 1260 /* the next nmi could be a back-to-back nmi */
1261 ((__get_cpu_var(pmu_nmi).marked == this_nmi) &&
1262 (__get_cpu_var(pmu_nmi).handled > 1))) {
1263 /*
1264 * We could have two subsequent back-to-back nmis: The
1265 * first handles more than one counter, the 2nd
1266 * handles only one counter and the 3rd handles no
1267 * counter.
1268 *
1269 * This is the 2nd nmi because the previous was
1270 * handling more than one counter. We will mark the
1271 * next (3rd) and then drop it if unhandled.
1272 */
1273 __get_cpu_var(pmu_nmi).marked = this_nmi + 1;
1274 __get_cpu_var(pmu_nmi).handled = handled;
1275 }
1223 1276
1224 return NOTIFY_STOP; 1277 return NOTIFY_STOP;
1225} 1278}
@@ -1388,7 +1441,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu)
1388{ 1441{
1389 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1442 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1390 1443
1391 cpuc->group_flag |= PERF_EVENT_TXN_STARTED; 1444 cpuc->group_flag |= PERF_EVENT_TXN;
1392 cpuc->n_txn = 0; 1445 cpuc->n_txn = 0;
1393} 1446}
1394 1447
@@ -1401,7 +1454,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
1401{ 1454{
1402 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1455 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1403 1456
1404 cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED; 1457 cpuc->group_flag &= ~PERF_EVENT_TXN;
1405 /* 1458 /*
1406 * Truncate the collected events. 1459 * Truncate the collected events.
1407 */ 1460 */
@@ -1435,11 +1488,7 @@ static int x86_pmu_commit_txn(const struct pmu *pmu)
1435 */ 1488 */
1436 memcpy(cpuc->assign, assign, n*sizeof(int)); 1489 memcpy(cpuc->assign, assign, n*sizeof(int));
1437 1490
1438 /* 1491 cpuc->group_flag &= ~PERF_EVENT_TXN;
1439 * Clear out the txn count so that ->cancel_txn() which gets
1440 * run after ->commit_txn() doesn't undo things.
1441 */
1442 cpuc->n_txn = 0;
1443 1492
1444 return 0; 1493 return 0;
1445} 1494}
@@ -1607,8 +1656,6 @@ static const struct stacktrace_ops backtrace_ops = {
1607 .walk_stack = print_context_stack_bp, 1656 .walk_stack = print_context_stack_bp,
1608}; 1657};
1609 1658
1610#include "../dumpstack.h"
1611
1612static void 1659static void
1613perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) 1660perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
1614{ 1661{
@@ -1730,22 +1777,6 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1730 return entry; 1777 return entry;
1731} 1778}
1732 1779
1733void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
1734{
1735 regs->ip = ip;
1736 /*
1737 * perf_arch_fetch_caller_regs adds another call, we need to increment
1738 * the skip level
1739 */
1740 regs->bp = rewind_frame_pointer(skip + 1);
1741 regs->cs = __KERNEL_CS;
1742 /*
1743 * We abuse bit 3 to pass exact information, see perf_misc_flags
1744 * and the comment with PERF_EFLAGS_EXACT.
1745 */
1746 regs->flags = 0;
1747}
1748
1749unsigned long perf_instruction_pointer(struct pt_regs *regs) 1780unsigned long perf_instruction_pointer(struct pt_regs *regs)
1750{ 1781{
1751 unsigned long ip; 1782 unsigned long ip;
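perf_event_nmi_handler() above starts using the per-CPU NMI count to predict and swallow one potential back-to-back PMU NMI: whenever a single NMI handled more than one counter, the next NMI number is "marked", and if that exact NMI later shows up as DIE_NMIUNKNOWN it is dropped. A user-space model of just that bookkeeping, with the NMI number passed in explicitly (an assumption of the sketch, standing in for irq_stat.__nmi_count; the helper names are invented for illustration):

#include <stdbool.h>
#include <stdio.h>

struct pmu_nmi_state {
        unsigned int marked;    /* NMI number expected to be a back-to-back NMI */
        int handled;            /* counters handled by the NMI that marked it */
};

static struct pmu_nmi_state pmu_nmi;

/* called for an unknown NMI: true means "drop it, it was ours" */
static bool drop_unknown_nmi(unsigned int this_nmi)
{
        return this_nmi == pmu_nmi.marked;
}

/* called only when the PMU actually handled 'handled' counters in NMI 'this_nmi' */
static void account_pmu_nmi(unsigned int this_nmi, int handled)
{
        if (handled > 1 ||
            (pmu_nmi.marked == this_nmi && pmu_nmi.handled > 1)) {
                /* the next NMI may be an empty back-to-back one: mark it */
                pmu_nmi.marked = this_nmi + 1;
                pmu_nmi.handled = handled;
        }
}

int main(void)
{
        account_pmu_nmi(10, 2);         /* NMI 10 handled two counters */
        printf("drop NMI 11? %d\n", drop_unknown_nmi(11));      /* 1: swallowed */
        printf("drop NMI 12? %d\n", drop_unknown_nmi(12));      /* 0: real unknown NMI */
        return 0;
}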
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 214ac860ebe0..ee05c90012d2 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -491,33 +491,78 @@ static void intel_pmu_enable_all(int added)
491 * Intel Errata AAP53 (model 30) 491 * Intel Errata AAP53 (model 30)
492 * Intel Errata BD53 (model 44) 492 * Intel Errata BD53 (model 44)
493 * 493 *
494 * These chips need to be 'reset' when adding counters by programming 494 * The official story:
495 * the magic three (non counting) events 0x4300D2, 0x4300B1 and 0x4300B5 495 * These chips need to be 'reset' when adding counters by programming the
496 * either in sequence on the same PMC or on different PMCs. 496 * magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either
497 * in sequence on the same PMC or on different PMCs.
498 *
 499 * In practice it appears some of these events do in fact count, and
 500 * we need to program all 4 events.
497 */ 501 */
498static void intel_pmu_nhm_enable_all(int added) 502static void intel_pmu_nhm_workaround(void)
499{ 503{
500 if (added) { 504 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
501 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 505 static const unsigned long nhm_magic[4] = {
502 int i; 506 0x4300B5,
507 0x4300D2,
508 0x4300B1,
509 0x4300B1
510 };
511 struct perf_event *event;
512 int i;
513
514 /*
 515 * The erratum requires the steps below:
516 * 1) Clear MSR_IA32_PEBS_ENABLE and MSR_CORE_PERF_GLOBAL_CTRL;
517 * 2) Configure 4 PERFEVTSELx with the magic events and clear
518 * the corresponding PMCx;
519 * 3) set bit0~bit3 of MSR_CORE_PERF_GLOBAL_CTRL;
520 * 4) Clear MSR_CORE_PERF_GLOBAL_CTRL;
 521 * 5) Clear 4 pairs of PERFEVTSELx and PMCx;
522 */
503 523
504 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 0, 0x4300D2); 524 /*
505 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x4300B1); 525 * The real steps we choose are a little different from above.
506 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x4300B5); 526 * A) To reduce MSR operations, we don't run step 1) as they
527 * are already cleared before this function is called;
528 * B) Call x86_perf_event_update to save PMCx before configuring
529 * PERFEVTSELx with magic number;
 530 * C) With step 5), we clear a pair only when its PERFEVTSELx is
 531 * not currently in use;
532 * D) Call x86_perf_event_set_period to restore PMCx;
533 */
534
535 /* We always operate 4 pairs of PERF Counters */
536 for (i = 0; i < 4; i++) {
537 event = cpuc->events[i];
538 if (event)
539 x86_perf_event_update(event);
540 }
507 541
508 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x3); 542 for (i = 0; i < 4; i++) {
509 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0); 543 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]);
544 wrmsrl(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0);
545 }
510 546
511 for (i = 0; i < 3; i++) { 547 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0xf);
512 struct perf_event *event = cpuc->events[i]; 548 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
513 549
514 if (!event) 550 for (i = 0; i < 4; i++) {
515 continue; 551 event = cpuc->events[i];
516 552
553 if (event) {
554 x86_perf_event_set_period(event);
517 __x86_pmu_enable_event(&event->hw, 555 __x86_pmu_enable_event(&event->hw,
518 ARCH_PERFMON_EVENTSEL_ENABLE); 556 ARCH_PERFMON_EVENTSEL_ENABLE);
519 } 557 } else
558 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0);
520 } 559 }
560}
561
562static void intel_pmu_nhm_enable_all(int added)
563{
564 if (added)
565 intel_pmu_nhm_workaround();
521 intel_pmu_enable_all(added); 566 intel_pmu_enable_all(added);
522} 567}
523 568
@@ -667,7 +712,8 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
667 struct perf_sample_data data; 712 struct perf_sample_data data;
668 struct cpu_hw_events *cpuc; 713 struct cpu_hw_events *cpuc;
669 int bit, loops; 714 int bit, loops;
670 u64 ack, status; 715 u64 status;
716 int handled = 0;
671 717
672 perf_sample_data_init(&data, 0); 718 perf_sample_data_init(&data, 0);
673 719
@@ -683,6 +729,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
683 729
684 loops = 0; 730 loops = 0;
685again: 731again:
732 intel_pmu_ack_status(status);
686 if (++loops > 100) { 733 if (++loops > 100) {
687 WARN_ONCE(1, "perfevents: irq loop stuck!\n"); 734 WARN_ONCE(1, "perfevents: irq loop stuck!\n");
688 perf_event_print_debug(); 735 perf_event_print_debug();
@@ -691,19 +738,22 @@ again:
691 } 738 }
692 739
693 inc_irq_stat(apic_perf_irqs); 740 inc_irq_stat(apic_perf_irqs);
694 ack = status;
695 741
696 intel_pmu_lbr_read(); 742 intel_pmu_lbr_read();
697 743
698 /* 744 /*
699 * PEBS overflow sets bit 62 in the global status register 745 * PEBS overflow sets bit 62 in the global status register
700 */ 746 */
701 if (__test_and_clear_bit(62, (unsigned long *)&status)) 747 if (__test_and_clear_bit(62, (unsigned long *)&status)) {
748 handled++;
702 x86_pmu.drain_pebs(regs); 749 x86_pmu.drain_pebs(regs);
750 }
703 751
704 for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { 752 for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
705 struct perf_event *event = cpuc->events[bit]; 753 struct perf_event *event = cpuc->events[bit];
706 754
755 handled++;
756
707 if (!test_bit(bit, cpuc->active_mask)) 757 if (!test_bit(bit, cpuc->active_mask))
708 continue; 758 continue;
709 759
@@ -716,8 +766,6 @@ again:
716 x86_pmu_stop(event); 766 x86_pmu_stop(event);
717 } 767 }
718 768
719 intel_pmu_ack_status(ack);
720
721 /* 769 /*
722 * Repeat if there is more work to be done: 770 * Repeat if there is more work to be done:
723 */ 771 */
@@ -727,7 +775,7 @@ again:
727 775
728done: 776done:
729 intel_pmu_enable_all(0); 777 intel_pmu_enable_all(0);
730 return 1; 778 return handled;
731} 779}
732 780
733static struct event_constraint * 781static struct event_constraint *
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index ae85d69644d1..249015173992 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -21,22 +21,36 @@ struct p4_event_bind {
21 char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */ 21 char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */
22}; 22};
23 23
24struct p4_cache_event_bind { 24struct p4_pebs_bind {
25 unsigned int metric_pebs; 25 unsigned int metric_pebs;
26 unsigned int metric_vert; 26 unsigned int metric_vert;
27}; 27};
28 28
29#define P4_GEN_CACHE_EVENT_BIND(name) \ 29/* it sets P4_PEBS_ENABLE_UOP_TAG as well */
30 [P4_CACHE__##name] = { \ 30#define P4_GEN_PEBS_BIND(name, pebs, vert) \
31 .metric_pebs = P4_PEBS__##name, \ 31 [P4_PEBS_METRIC__##name] = { \
32 .metric_vert = P4_VERT__##name, \ 32 .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG, \
33 .metric_vert = vert, \
33 } 34 }
34 35
35static struct p4_cache_event_bind p4_cache_event_bind_map[] = { 36/*
36 P4_GEN_CACHE_EVENT_BIND(1stl_cache_load_miss_retired), 37 * note we have P4_PEBS_ENABLE_UOP_TAG always set here
37 P4_GEN_CACHE_EVENT_BIND(2ndl_cache_load_miss_retired), 38 *
38 P4_GEN_CACHE_EVENT_BIND(dtlb_load_miss_retired), 39 * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of
39 P4_GEN_CACHE_EVENT_BIND(dtlb_store_miss_retired), 40 * event configuration to find out which values are to be
41 * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT
 42 * registers
43 */
44static struct p4_pebs_bind p4_pebs_bind_map[] = {
45 P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001),
46 P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired, 0x0000002, 0x0000001),
47 P4_GEN_PEBS_BIND(dtlb_load_miss_retired, 0x0000004, 0x0000001),
48 P4_GEN_PEBS_BIND(dtlb_store_miss_retired, 0x0000004, 0x0000002),
49 P4_GEN_PEBS_BIND(dtlb_all_miss_retired, 0x0000004, 0x0000003),
50 P4_GEN_PEBS_BIND(tagged_mispred_branch, 0x0018000, 0x0000010),
51 P4_GEN_PEBS_BIND(mob_load_replay_retired, 0x0000200, 0x0000001),
52 P4_GEN_PEBS_BIND(split_load_retired, 0x0000400, 0x0000001),
53 P4_GEN_PEBS_BIND(split_store_retired, 0x0000400, 0x0000002),
40}; 54};
41 55
42/* 56/*
@@ -281,10 +295,10 @@ static struct p4_event_bind p4_event_bind_map[] = {
281 }, 295 },
282}; 296};
283 297
284#define P4_GEN_CACHE_EVENT(event, bit, cache_event) \ 298#define P4_GEN_CACHE_EVENT(event, bit, metric) \
285 p4_config_pack_escr(P4_ESCR_EVENT(event) | \ 299 p4_config_pack_escr(P4_ESCR_EVENT(event) | \
286 P4_ESCR_EMASK_BIT(event, bit)) | \ 300 P4_ESCR_EMASK_BIT(event, bit)) | \
287 p4_config_pack_cccr(cache_event | \ 301 p4_config_pack_cccr(metric | \
288 P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event)))) 302 P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
289 303
290static __initconst const u64 p4_hw_cache_event_ids 304static __initconst const u64 p4_hw_cache_event_ids
@@ -296,34 +310,34 @@ static __initconst const u64 p4_hw_cache_event_ids
296 [ C(OP_READ) ] = { 310 [ C(OP_READ) ] = {
297 [ C(RESULT_ACCESS) ] = 0x0, 311 [ C(RESULT_ACCESS) ] = 0x0,
298 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, 312 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
299 P4_CACHE__1stl_cache_load_miss_retired), 313 P4_PEBS_METRIC__1stl_cache_load_miss_retired),
300 }, 314 },
301 }, 315 },
302 [ C(LL ) ] = { 316 [ C(LL ) ] = {
303 [ C(OP_READ) ] = { 317 [ C(OP_READ) ] = {
304 [ C(RESULT_ACCESS) ] = 0x0, 318 [ C(RESULT_ACCESS) ] = 0x0,
305 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, 319 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
306 P4_CACHE__2ndl_cache_load_miss_retired), 320 P4_PEBS_METRIC__2ndl_cache_load_miss_retired),
307 }, 321 },
308}, 322},
309 [ C(DTLB) ] = { 323 [ C(DTLB) ] = {
310 [ C(OP_READ) ] = { 324 [ C(OP_READ) ] = {
311 [ C(RESULT_ACCESS) ] = 0x0, 325 [ C(RESULT_ACCESS) ] = 0x0,
312 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, 326 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
313 P4_CACHE__dtlb_load_miss_retired), 327 P4_PEBS_METRIC__dtlb_load_miss_retired),
314 }, 328 },
315 [ C(OP_WRITE) ] = { 329 [ C(OP_WRITE) ] = {
316 [ C(RESULT_ACCESS) ] = 0x0, 330 [ C(RESULT_ACCESS) ] = 0x0,
317 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, 331 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
318 P4_CACHE__dtlb_store_miss_retired), 332 P4_PEBS_METRIC__dtlb_store_miss_retired),
319 }, 333 },
320 }, 334 },
321 [ C(ITLB) ] = { 335 [ C(ITLB) ] = {
322 [ C(OP_READ) ] = { 336 [ C(OP_READ) ] = {
323 [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT, 337 [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT,
324 P4_CACHE__itlb_reference_hit), 338 P4_PEBS_METRIC__none),
325 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS, 339 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS,
326 P4_CACHE__itlb_reference_miss), 340 P4_PEBS_METRIC__none),
327 }, 341 },
328 [ C(OP_WRITE) ] = { 342 [ C(OP_WRITE) ] = {
329 [ C(RESULT_ACCESS) ] = -1, 343 [ C(RESULT_ACCESS) ] = -1,
@@ -414,11 +428,37 @@ static u64 p4_pmu_event_map(int hw_event)
414 return config; 428 return config;
415} 429}
416 430
431static int p4_validate_raw_event(struct perf_event *event)
432{
433 unsigned int v;
434
435 /* user data may have out-of-bound event index */
436 v = p4_config_unpack_event(event->attr.config);
437 if (v >= ARRAY_SIZE(p4_event_bind_map)) {
438 pr_warning("P4 PMU: Unknown event code: %d\n", v);
439 return -EINVAL;
440 }
441
442 /*
443 * it may have some screwed PEBS bits
444 */
445 if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) {
446 pr_warning("P4 PMU: PEBS are not supported yet\n");
447 return -EINVAL;
448 }
449 v = p4_config_unpack_metric(event->attr.config);
450 if (v >= ARRAY_SIZE(p4_pebs_bind_map)) {
451 pr_warning("P4 PMU: Unknown metric code: %d\n", v);
452 return -EINVAL;
453 }
454
455 return 0;
456}
457
417static int p4_hw_config(struct perf_event *event) 458static int p4_hw_config(struct perf_event *event)
418{ 459{
419 int cpu = get_cpu(); 460 int cpu = get_cpu();
420 int rc = 0; 461 int rc = 0;
421 unsigned int evnt;
422 u32 escr, cccr; 462 u32 escr, cccr;
423 463
424 /* 464 /*
@@ -438,12 +478,9 @@ static int p4_hw_config(struct perf_event *event)
438 478
439 if (event->attr.type == PERF_TYPE_RAW) { 479 if (event->attr.type == PERF_TYPE_RAW) {
440 480
441 /* user data may have out-of-bound event index */ 481 rc = p4_validate_raw_event(event);
442 evnt = p4_config_unpack_event(event->attr.config); 482 if (rc)
443 if (evnt >= ARRAY_SIZE(p4_event_bind_map)) {
444 rc = -EINVAL;
445 goto out; 483 goto out;
446 }
447 484
448 /* 485 /*
449 * We don't control raw events so it's up to the caller 486 * We don't control raw events so it's up to the caller
@@ -451,12 +488,17 @@ static int p4_hw_config(struct perf_event *event)
451 * on HT machine but allow HT-compatible specifics to be 488 * on HT machine but allow HT-compatible specifics to be
452 * passed on) 489 * passed on)
453 * 490 *
 491 * Note that for RAW events we allow the user to use P4_CCCR_RESERVED
 492 * bits since we keep additional info here (for cache events etc.)
493 *
454 * XXX: HT wide things should check perf_paranoid_cpu() && 494 * XXX: HT wide things should check perf_paranoid_cpu() &&
455 * CAP_SYS_ADMIN 495 * CAP_SYS_ADMIN
456 */ 496 */
457 event->hw.config |= event->attr.config & 497 event->hw.config |= event->attr.config &
458 (p4_config_pack_escr(P4_ESCR_MASK_HT) | 498 (p4_config_pack_escr(P4_ESCR_MASK_HT) |
459 p4_config_pack_cccr(P4_CCCR_MASK_HT)); 499 p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED));
500
501 event->hw.config &= ~P4_CCCR_FORCE_OVF;
460 } 502 }
461 503
462 rc = x86_setup_perfctr(event); 504 rc = x86_setup_perfctr(event);
@@ -482,6 +524,29 @@ static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
482 return overflow; 524 return overflow;
483} 525}
484 526
527static void p4_pmu_disable_pebs(void)
528{
529 /*
530 * FIXME
531 *
 532 * It's still allowed that two threads set up the same cache
 533 * events, so we can't simply clear the metrics until we know
 534 * no one is depending on us; we would need some kind of counter
 535 * for "ReplayEvent" users.
 536 *
 537 * What is more complex -- with RAW events, if the user (for some
 538 * reason) passes a cache event metric with an improper
 539 * event opcode -- it's fine from the hardware's point of view
 540 * but complete nonsense as far as the "meaning" of the action goes.
 541 *
 542 * So for the moment we leave the metrics turned on forever -- it's
 543 * ok for now but this needs to be revisited!
544 *
545 * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0);
546 * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0);
547 */
548}
549
485static inline void p4_pmu_disable_event(struct perf_event *event) 550static inline void p4_pmu_disable_event(struct perf_event *event)
486{ 551{
487 struct hw_perf_event *hwc = &event->hw; 552 struct hw_perf_event *hwc = &event->hw;
@@ -507,6 +572,26 @@ static void p4_pmu_disable_all(void)
507 continue; 572 continue;
508 p4_pmu_disable_event(event); 573 p4_pmu_disable_event(event);
509 } 574 }
575
576 p4_pmu_disable_pebs();
577}
578
579/* configuration must be valid */
580static void p4_pmu_enable_pebs(u64 config)
581{
582 struct p4_pebs_bind *bind;
583 unsigned int idx;
584
585 BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK);
586
587 idx = p4_config_unpack_metric(config);
588 if (idx == P4_PEBS_METRIC__none)
589 return;
590
591 bind = &p4_pebs_bind_map[idx];
592
593 (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs);
594 (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert);
510} 595}
511 596
512static void p4_pmu_enable_event(struct perf_event *event) 597static void p4_pmu_enable_event(struct perf_event *event)
@@ -515,9 +600,7 @@ static void p4_pmu_enable_event(struct perf_event *event)
515 int thread = p4_ht_config_thread(hwc->config); 600 int thread = p4_ht_config_thread(hwc->config);
516 u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config)); 601 u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
517 unsigned int idx = p4_config_unpack_event(hwc->config); 602 unsigned int idx = p4_config_unpack_event(hwc->config);
518 unsigned int idx_cache = p4_config_unpack_cache_event(hwc->config);
519 struct p4_event_bind *bind; 603 struct p4_event_bind *bind;
520 struct p4_cache_event_bind *bind_cache;
521 u64 escr_addr, cccr; 604 u64 escr_addr, cccr;
522 605
523 bind = &p4_event_bind_map[idx]; 606 bind = &p4_event_bind_map[idx];
@@ -537,16 +620,10 @@ static void p4_pmu_enable_event(struct perf_event *event)
537 cccr = p4_config_unpack_cccr(hwc->config); 620 cccr = p4_config_unpack_cccr(hwc->config);
538 621
539 /* 622 /*
540 * it could be Cache event so that we need to 623 * it could be Cache event so we need to write metrics
541 * set metrics into additional MSRs 624 * into additional MSRs
542 */ 625 */
543 BUILD_BUG_ON(P4_CACHE__MAX > P4_CCCR_CACHE_OPS_MASK); 626 p4_pmu_enable_pebs(hwc->config);
544 if (idx_cache > P4_CACHE__NONE &&
545 idx_cache < ARRAY_SIZE(p4_cache_event_bind_map)) {
546 bind_cache = &p4_cache_event_bind_map[idx_cache];
547 (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind_cache->metric_pebs);
548 (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind_cache->metric_vert);
549 }
550 627
551 (void)checking_wrmsrl(escr_addr, escr_conf); 628 (void)checking_wrmsrl(escr_addr, escr_conf);
552 (void)checking_wrmsrl(hwc->config_base + hwc->idx, 629 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
@@ -581,9 +658,14 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
581 cpuc = &__get_cpu_var(cpu_hw_events); 658 cpuc = &__get_cpu_var(cpu_hw_events);
582 659
583 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 660 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
661 int overflow;
584 662
585 if (!test_bit(idx, cpuc->active_mask)) 663 if (!test_bit(idx, cpuc->active_mask)) {
664 /* catch in-flight IRQs */
665 if (__test_and_clear_bit(idx, cpuc->running))
666 handled++;
586 continue; 667 continue;
668 }
587 669
588 event = cpuc->events[idx]; 670 event = cpuc->events[idx];
589 hwc = &event->hw; 671 hwc = &event->hw;
@@ -591,12 +673,14 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
591 WARN_ON_ONCE(hwc->idx != idx); 673 WARN_ON_ONCE(hwc->idx != idx);
592 674
593 /* it might be unflagged overflow */ 675 /* it might be unflagged overflow */
594 handled = p4_pmu_clear_cccr_ovf(hwc); 676 overflow = p4_pmu_clear_cccr_ovf(hwc);
595 677
596 val = x86_perf_event_update(event); 678 val = x86_perf_event_update(event);
597 if (!handled && (val & (1ULL << (x86_pmu.cntval_bits - 1)))) 679 if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1))))
598 continue; 680 continue;
599 681
682 handled += overflow;
683
600 /* event overflow for sure */ 684 /* event overflow for sure */
601 data.period = event->hw.last_period; 685 data.period = event->hw.last_period;
602 686
@@ -829,6 +913,15 @@ static __initconst const struct x86_pmu p4_pmu = {
829 .max_period = (1ULL << 39) - 1, 913 .max_period = (1ULL << 39) - 1,
830 .hw_config = p4_hw_config, 914 .hw_config = p4_hw_config,
831 .schedule_events = p4_pmu_schedule_events, 915 .schedule_events = p4_pmu_schedule_events,
916 /*
 917 * This handles erratum N15 in Intel doc 249199-029:
 918 * the counter may not be updated correctly on a write,
 919 * so we need a second write operation to do the trick
 920 * (the official workaround didn't work).
 921 *
 922 * The idea is taken from the OProfile code.
923 */
924 .perfctr_second_write = 1,
832}; 925};
833 926
834static __init int p4_pmu_init(void) 927static __init int p4_pmu_init(void)
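The p4_pmu structure above sets perfctr_second_write for the erratum described in its comment; the matching generic code (in the perf_event.c hunk earlier in this diff) simply issues the masked counter write a second time. A tiny sketch of that pattern against a fake MSR array (wrmsrl() here is a stub standing in for the kernel macro, and the mask width is an assumption for the example):

#include <stdint.h>
#include <stdio.h>

static uint64_t fake_msr[4];                    /* stand-in for the perfctr MSRs */
static const uint64_t cntval_mask = (1ULL << 40) - 1;
static const int perfctr_second_write = 1;      /* quirk flag, as on P4 */

static void wrmsrl(unsigned int msr, uint64_t val)
{
        fake_msr[msr] = val;    /* stub: a real wrmsrl programs the MSR */
}

static void program_period(unsigned int idx, int64_t left)
{
        wrmsrl(idx, (uint64_t)(-left) & cntval_mask);
        /* erratum workaround: some CPUs may not latch the first write */
        if (perfctr_second_write)
                wrmsrl(idx, (uint64_t)(-left) & cntval_mask);
}

int main(void)
{
        program_period(0, 100000);
        printf("counter 0 programmed to %#llx\n",
               (unsigned long long)fake_msr[0]);
        return 0;
}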
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
new file mode 100644
index 000000000000..d49079515122
--- /dev/null
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -0,0 +1,64 @@
1/*
 2 * Routines to identify additional cpu features that are scattered in
3 * cpuid space.
4 */
5#include <linux/cpu.h>
6
7#include <asm/pat.h>
8#include <asm/processor.h>
9
10#include <asm/apic.h>
11
12struct cpuid_bit {
13 u16 feature;
14 u8 reg;
15 u8 bit;
16 u32 level;
17 u32 sub_leaf;
18};
19
20enum cpuid_regs {
21 CR_EAX = 0,
22 CR_ECX,
23 CR_EDX,
24 CR_EBX
25};
26
27void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
28{
29 u32 max_level;
30 u32 regs[4];
31 const struct cpuid_bit *cb;
32
33 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
34 { X86_FEATURE_DTS, CR_EAX, 0, 0x00000006, 0 },
35 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 },
36 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 },
37 { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 },
38 { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 },
39 { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 },
40 { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
41 { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 },
42 { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 },
43 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 },
44 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 },
45 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 },
46 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 },
47 { 0, 0, 0, 0, 0 }
48 };
49
50 for (cb = cpuid_bits; cb->feature; cb++) {
51
52 /* Verify that the level is valid */
53 max_level = cpuid_eax(cb->level & 0xffff0000);
54 if (max_level < cb->level ||
55 max_level > (cb->level | 0xffff))
56 continue;
57
58 cpuid_count(cb->level, cb->sub_leaf, &regs[CR_EAX],
59 &regs[CR_EBX], &regs[CR_ECX], &regs[CR_EDX]);
60
61 if (regs[cb->reg] & (1 << cb->bit))
62 set_cpu_cap(c, cb->feature);
63 }
64}
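
The level check above works because cb->level & 0xffff0000 yields the base leaf of the cpuid range (0x00000000 for standard leaves, 0x80000000 for extended ones), and cpuid_eax() on that base returns the highest leaf the CPU implements; a max_level outside base..(base | 0xffff) means the range is absent altogether. Adding a scattered bit is then a one-line table edit. A purely hypothetical example entry (X86_FEATURE_FOO is not a real flag), assuming it lived at CPUID.(EAX=7,ECX=0):EBX bit 0:

	{ X86_FEATURE_FOO, CR_EBX, 0, 0x00000007, 0 },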
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/topology.c
index 10fa5684a662..4397e987a1cf 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -1,62 +1,14 @@
1/* 1/*
2 * Routines to identify additional cpu features that are scattered in 2 * Check for extended topology enumeration cpuid leaf 0xb and if it
3 * cpuid space. 3 * exists, use it for populating initial_apicid and cpu topology
4 * detection.
4 */ 5 */
5#include <linux/cpu.h>
6 6
7#include <linux/cpu.h>
8#include <asm/apic.h>
7#include <asm/pat.h> 9#include <asm/pat.h>
8#include <asm/processor.h> 10#include <asm/processor.h>
9 11
10#include <asm/apic.h>
11
12struct cpuid_bit {
13 u16 feature;
14 u8 reg;
15 u8 bit;
16 u32 level;
17};
18
19enum cpuid_regs {
20 CR_EAX = 0,
21 CR_ECX,
22 CR_EDX,
23 CR_EBX
24};
25
26void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
27{
28 u32 max_level;
29 u32 regs[4];
30 const struct cpuid_bit *cb;
31
32 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
34 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
35 { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006 },
36 { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007 },
37 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a },
38 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a },
39 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a },
40 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a },
41 { 0, 0, 0, 0 }
42 };
43
44 for (cb = cpuid_bits; cb->feature; cb++) {
45
46 /* Verify that the level is valid */
47 max_level = cpuid_eax(cb->level & 0xffff0000);
48 if (max_level < cb->level ||
49 max_level > (cb->level | 0xffff))
50 continue;
51
52 cpuid(cb->level, &regs[CR_EAX], &regs[CR_EBX],
53 &regs[CR_ECX], &regs[CR_EDX]);
54
55 if (regs[cb->reg] & (1 << cb->bit))
56 set_cpu_cap(c, cb->feature);
57 }
58}
59
60/* leaf 0xb SMT level */ 12/* leaf 0xb SMT level */
61#define SMT_LEVEL 0 13#define SMT_LEVEL 0
62 14
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index b9d1ff588445..227b0448960d 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -51,7 +51,7 @@ static inline int __vmware_platform(void)
51 51
52static unsigned long vmware_get_tsc_khz(void) 52static unsigned long vmware_get_tsc_khz(void)
53{ 53{
54 uint64_t tsc_hz; 54 uint64_t tsc_hz, lpj;
55 uint32_t eax, ebx, ecx, edx; 55 uint32_t eax, ebx, ecx, edx;
56 56
57 VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); 57 VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
@@ -62,6 +62,13 @@ static unsigned long vmware_get_tsc_khz(void)
62 printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", 62 printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n",
63 (unsigned long) tsc_hz / 1000, 63 (unsigned long) tsc_hz / 1000,
64 (unsigned long) tsc_hz % 1000); 64 (unsigned long) tsc_hz % 1000);
65
66 if (!preset_lpj) {
67 lpj = ((u64)tsc_hz * 1000);
68 do_div(lpj, HZ);
69 preset_lpj = lpj;
70 }
71
65 return tsc_hz; 72 return tsc_hz;
66} 73}
67 74
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index ebd4c51d096a..764c7c2b1811 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -28,6 +28,8 @@
28#include <asm/reboot.h> 28#include <asm/reboot.h>
29#include <asm/virtext.h> 29#include <asm/virtext.h>
30 30
31int in_crash_kexec;
32
31#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) 33#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
32 34
33static void kdump_nmi_callback(int cpu, struct die_args *args) 35static void kdump_nmi_callback(int cpu, struct die_args *args)
@@ -61,6 +63,7 @@ static void kdump_nmi_callback(int cpu, struct die_args *args)
61 63
62static void kdump_nmi_shootdown_cpus(void) 64static void kdump_nmi_shootdown_cpus(void)
63{ 65{
66 in_crash_kexec = 1;
64 nmi_shootdown_cpus(kdump_nmi_callback); 67 nmi_shootdown_cpus(kdump_nmi_callback);
65 68
66 disable_local_APIC(); 69 disable_local_APIC();
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index c89a386930b7..6e8752c1bd52 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -18,7 +18,6 @@
18 18
19#include <asm/stacktrace.h> 19#include <asm/stacktrace.h>
20 20
21#include "dumpstack.h"
22 21
23int panic_on_unrecovered_nmi; 22int panic_on_unrecovered_nmi;
24int panic_on_io_nmi; 23int panic_on_io_nmi;
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
deleted file mode 100644
index e1a93be4fd44..000000000000
--- a/arch/x86/kernel/dumpstack.h
+++ /dev/null
@@ -1,56 +0,0 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 */
5
6#ifndef DUMPSTACK_H
7#define DUMPSTACK_H
8
9#ifdef CONFIG_X86_32
10#define STACKSLOTS_PER_LINE 8
11#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
12#else
13#define STACKSLOTS_PER_LINE 4
14#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
15#endif
16
17#include <linux/uaccess.h>
18
19extern void
20show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
21 unsigned long *stack, unsigned long bp, char *log_lvl);
22
23extern void
24show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
25 unsigned long *sp, unsigned long bp, char *log_lvl);
26
27extern unsigned int code_bytes;
28
29/* The form of the top of the frame on the stack */
30struct stack_frame {
31 struct stack_frame *next_frame;
32 unsigned long return_address;
33};
34
35struct stack_frame_ia32 {
36 u32 next_frame;
37 u32 return_address;
38};
39
40static inline unsigned long rewind_frame_pointer(int n)
41{
42 struct stack_frame *frame;
43
44 get_bp(frame);
45
46#ifdef CONFIG_FRAME_POINTER
47 while (n--) {
48 if (probe_kernel_address(&frame->next_frame, frame))
49 break;
50 }
51#endif
52
53 return (unsigned long)frame;
54}
55
56#endif /* DUMPSTACK_H */
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 11540a189d93..0f6376ffa2d9 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -16,8 +16,6 @@
16 16
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
19#include "dumpstack.h"
20
21 19
22void dump_trace(struct task_struct *task, struct pt_regs *regs, 20void dump_trace(struct task_struct *task, struct pt_regs *regs,
23 unsigned long *stack, unsigned long bp, 21 unsigned long *stack, unsigned long bp,
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 272c9f1f05f3..57a21f11c791 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -16,7 +16,6 @@
16 16
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
19#include "dumpstack.h"
20 19
21#define N_EXCEPTION_STACKS_END \ 20#define N_EXCEPTION_STACKS_END \
22 (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) 21 (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2)
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index e5cc7e82e60d..ebdb85cf2686 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -18,7 +18,6 @@
18#include <asm/apic.h> 18#include <asm/apic.h>
19#include <asm/iommu.h> 19#include <asm/iommu.h>
20#include <asm/gart.h> 20#include <asm/gart.h>
21#include <asm/hpet.h>
22 21
23static void __init fix_hypertransport_config(int num, int slot, int func) 22static void __init fix_hypertransport_config(int num, int slot, int func)
24{ 23{
@@ -192,21 +191,6 @@ static void __init ati_bugs_contd(int num, int slot, int func)
192} 191}
193#endif 192#endif
194 193
195/*
196 * Force the read back of the CMP register in hpet_next_event()
197 * to work around the problem that the CMP register write seems to be
198 * delayed. See hpet_next_event() for details.
199 *
200 * We do this on all SMBUS incarnations for now until we have more
201 * information about the affected chipsets.
202 */
203static void __init ati_hpet_bugs(int num, int slot, int func)
204{
205#ifdef CONFIG_HPET_TIMER
206 hpet_readback_cmp = 1;
207#endif
208}
209
210#define QFLAG_APPLY_ONCE 0x1 194#define QFLAG_APPLY_ONCE 0x1
211#define QFLAG_APPLIED 0x2 195#define QFLAG_APPLIED 0x2
212#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) 196#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -236,8 +220,6 @@ static struct chipset early_qrk[] __initdata = {
236 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs }, 220 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs },
237 { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, 221 { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
238 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd }, 222 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd },
239 { PCI_VENDOR_ID_ATI, PCI_ANY_ID,
240 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_hpet_bugs },
241 {} 223 {}
242}; 224};
243 225
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index cd49141cf153..227d00920d2f 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -611,14 +611,14 @@ ldt_ss:
611 * compensating for the offset by changing to the ESPFIX segment with 611 * compensating for the offset by changing to the ESPFIX segment with
612 * a base address that matches for the difference. 612 * a base address that matches for the difference.
613 */ 613 */
614#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
614 mov %esp, %edx /* load kernel esp */ 615 mov %esp, %edx /* load kernel esp */
615 mov PT_OLDESP(%esp), %eax /* load userspace esp */ 616 mov PT_OLDESP(%esp), %eax /* load userspace esp */
616 mov %dx, %ax /* eax: new kernel esp */ 617 mov %dx, %ax /* eax: new kernel esp */
617 sub %eax, %edx /* offset (low word is 0) */ 618 sub %eax, %edx /* offset (low word is 0) */
618 PER_CPU(gdt_page, %ebx)
619 shr $16, %edx 619 shr $16, %edx
620 mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */ 620 mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
621 mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */ 621 mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
622 pushl $__ESPFIX_SS 622 pushl $__ESPFIX_SS
623 CFI_ADJUST_CFA_OFFSET 4 623 CFI_ADJUST_CFA_OFFSET 4
624 push %eax /* new kernel esp */ 624 push %eax /* new kernel esp */
@@ -791,9 +791,8 @@ ptregs_clone:
791 * normal stack and adjusts ESP with the matching offset. 791 * normal stack and adjusts ESP with the matching offset.
792 */ 792 */
793 /* fixup the stack */ 793 /* fixup the stack */
794 PER_CPU(gdt_page, %ebx) 794 mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
795 mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */ 795 mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
796 mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */
797 shl $16, %eax 796 shl $16, %eax
798 addl %esp, %eax /* the adjusted stack pointer */ 797 addl %esp, %eax /* the adjusted stack pointer */
799 pushl $__KERNEL_DS 798 pushl $__KERNEL_DS
@@ -914,7 +913,7 @@ ENTRY(simd_coprocessor_error)
914 .balign 4 913 .balign 4
915 .long 661b 914 .long 661b
916 .long 663f 915 .long 663f
917 .byte X86_FEATURE_XMM 916 .word X86_FEATURE_XMM
918 .byte 662b-661b 917 .byte 662b-661b
919 .byte 664f-663f 918 .byte 664f-663f
920.previous 919.previous
@@ -1166,6 +1165,9 @@ ENTRY(xen_failsafe_callback)
1166.previous 1165.previous
1167ENDPROC(xen_failsafe_callback) 1166ENDPROC(xen_failsafe_callback)
1168 1167
1168BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK,
1169 xen_evtchn_do_upcall)
1170
1169#endif /* CONFIG_XEN */ 1171#endif /* CONFIG_XEN */
1170 1172
1171#ifdef CONFIG_FUNCTION_TRACER 1173#ifdef CONFIG_FUNCTION_TRACER
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 4db7c4d12ffa..17be5ec7cbba 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1065,6 +1065,7 @@ ENTRY(\sym)
1065END(\sym) 1065END(\sym)
1066.endm 1066.endm
1067 1067
1068#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
1068.macro paranoidzeroentry_ist sym do_sym ist 1069.macro paranoidzeroentry_ist sym do_sym ist
1069ENTRY(\sym) 1070ENTRY(\sym)
1070 INTR_FRAME 1071 INTR_FRAME
@@ -1076,10 +1077,9 @@ ENTRY(\sym)
1076 TRACE_IRQS_OFF 1077 TRACE_IRQS_OFF
1077 movq %rsp,%rdi /* pt_regs pointer */ 1078 movq %rsp,%rdi /* pt_regs pointer */
1078 xorl %esi,%esi /* no error code */ 1079 xorl %esi,%esi /* no error code */
1079 PER_CPU(init_tss, %r12) 1080 subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
1080 subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
1081 call \do_sym 1081 call \do_sym
1082 addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) 1082 addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
1083 jmp paranoid_exit /* %ebx: no swapgs flag */ 1083 jmp paranoid_exit /* %ebx: no swapgs flag */
1084 CFI_ENDPROC 1084 CFI_ENDPROC
1085END(\sym) 1085END(\sym)
@@ -1185,13 +1185,13 @@ END(kernel_thread_helper)
1185 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. 1185 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
1186 * 1186 *
1187 * C extern interface: 1187 * C extern interface:
1188 * extern long execve(char *name, char **argv, char **envp) 1188 * extern long execve(const char *name, char **argv, char **envp)
1189 * 1189 *
1190 * asm input arguments: 1190 * asm input arguments:
1191 * rdi: name, rsi: argv, rdx: envp 1191 * rdi: name, rsi: argv, rdx: envp
1192 * 1192 *
1193 * We want to fallback into: 1193 * We want to fallback into:
1194 * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs) 1194 * extern long sys_execve(const char *name, char **argv,char **envp, struct pt_regs *regs)
1195 * 1195 *
1196 * do_sys_execve asm fallback arguments: 1196 * do_sys_execve asm fallback arguments:
1197 * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack 1197 * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
@@ -1329,6 +1329,9 @@ ENTRY(xen_failsafe_callback)
1329 CFI_ENDPROC 1329 CFI_ENDPROC
1330END(xen_failsafe_callback) 1330END(xen_failsafe_callback)
1331 1331
1332apicinterrupt XEN_HVM_EVTCHN_CALLBACK \
1333 xen_hvm_callback_vector xen_evtchn_do_upcall
1334
1332#endif /* CONFIG_XEN */ 1335#endif /* CONFIG_XEN */
1333 1336
1334/* 1337/*
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 37c3d4b17d85..fa8c1b8e09fb 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -131,6 +131,12 @@ ENTRY(startup_32)
131 movsl 131 movsl
1321: 1321:
133 133
134#ifdef CONFIG_OLPC_OPENFIRMWARE
135 /* save OFW's pgdir table for later use when calling into OFW */
136 movl %cr3, %eax
137 movl %eax, pa(olpc_ofw_pgd)
138#endif
139
134#ifdef CONFIG_PARAVIRT 140#ifdef CONFIG_PARAVIRT
135 /* This can only trip for a broken bootloader... */ 141
136 cmpw $0x207, pa(boot_params + BP_version) 142 cmpw $0x207, pa(boot_params + BP_version)
@@ -328,7 +334,7 @@ ENTRY(startup_32_smp)
328/* 334/*
329 * Enable paging 335 * Enable paging
330 */ 336 */
331 movl $pa(swapper_pg_dir),%eax 337 movl pa(initial_page_table), %eax
332 movl %eax,%cr3 /* set the page table pointer.. */ 338 movl %eax,%cr3 /* set the page table pointer.. */
333 movl %cr0,%eax 339 movl %cr0,%eax
334 orl $X86_CR0_PG,%eax 340 orl $X86_CR0_PG,%eax
@@ -608,6 +614,8 @@ ignore_int:
608.align 4 614.align 4
609ENTRY(initial_code) 615ENTRY(initial_code)
610 .long i386_start_kernel 616 .long i386_start_kernel
617ENTRY(initial_page_table)
618 .long pa(swapper_pg_dir)
611 619
612/* 620/*
613 * BSS section 621 * BSS section
@@ -623,6 +631,10 @@ ENTRY(swapper_pg_dir)
623#endif 631#endif
624swapper_pg_fixmap: 632swapper_pg_fixmap:
625 .fill 1024,4,0 633 .fill 1024,4,0
634#ifdef CONFIG_X86_TRAMPOLINE
635ENTRY(trampoline_pg_dir)
636 .fill 1024,4,0
637#endif
626ENTRY(empty_zero_page) 638ENTRY(empty_zero_page)
627 .fill 4096,1,0 639 .fill 4096,1,0
628 640
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 3d1e6f16b7a6..239046bd447f 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -234,9 +234,8 @@ ENTRY(secondary_startup_64)
234 * init data section till per cpu areas are set up. 234 * init data section till per cpu areas are set up.
235 */ 235 */
236 movl $MSR_GS_BASE,%ecx 236 movl $MSR_GS_BASE,%ecx
237 movq initial_gs(%rip),%rax 237 movl initial_gs(%rip),%eax
238 movq %rax,%rdx 238 movl initial_gs+4(%rip),%edx
239 shrq $32,%rdx
240 wrmsr 239 wrmsr
241 240
242 /* esi is pointer to real mode structure with interesting info. 241 /* esi is pointer to real mode structure with interesting info.
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ba390d731175..7494999141b3 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -16,7 +16,6 @@
16#include <asm/hpet.h> 16#include <asm/hpet.h>
17 17
18#define HPET_MASK CLOCKSOURCE_MASK(32) 18#define HPET_MASK CLOCKSOURCE_MASK(32)
19#define HPET_SHIFT 22
20 19
21/* FSEC = 10^-15 20/* FSEC = 10^-15
22 NSEC = 10^-9 */ 21 NSEC = 10^-9 */
@@ -36,7 +35,6 @@
36unsigned long hpet_address; 35unsigned long hpet_address;
37u8 hpet_blockid; /* OS timer block num */ 36u8 hpet_blockid; /* OS timer block num */
38u8 hpet_msi_disable; 37u8 hpet_msi_disable;
39u8 hpet_readback_cmp;
40 38
41#ifdef CONFIG_PCI_MSI 39#ifdef CONFIG_PCI_MSI
42static unsigned long hpet_num_timers; 40static unsigned long hpet_num_timers;
@@ -396,23 +394,27 @@ static int hpet_next_event(unsigned long delta,
396 * at that point and we would wait for the next hpet interrupt 394 * at that point and we would wait for the next hpet interrupt
397 * forever. We found out that reading the CMP register back 395 * forever. We found out that reading the CMP register back
398 * forces the transfer so we can rely on the comparison with 396 * forces the transfer so we can rely on the comparison with
399 * the counter register below. 397 * the counter register below. If the read back from the
398 * compare register does not match the value we programmed
399 * then we might have a real hardware problem. We can not do
400 * much about it here, but at least alert the user/admin with
401 * a prominent warning.
400 * 402 *
401 * That works fine on those ATI chipsets, but on newer Intel 403 * An erratum on some chipsets (ICH9, ...) results in the
402 * chipsets (ICH9...) this triggers due to an erratum: Reading 404 * comparator read immediately following a write returning
403 * the comparator immediately following a write is returning 405 * the old value. The workaround for this is to read the
404 * the old value. 406 * value a second time, when the first read returns the old value.
405 * 407 *
406 * We restrict the read back to the affected ATI chipsets (set 408 * In fact the write to the comparator register is delayed up
407 * by quirks) and also run it with hpet=verbose for debugging 409 * to two HPET cycles, so the workaround we tried, restricting
408 * purposes. 410 * the readback to the ATI chipsets known to be borked,
411 * failed miserably. So we give up on optimizations forever
412 * and penalize all HPET incarnations unconditionally.
409 */ 413 */
410 if (hpet_readback_cmp || hpet_verbose) { 414 if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) {
411 u32 cmp = hpet_readl(HPET_Tn_CMP(timer)); 415 if (hpet_readl(HPET_Tn_CMP(timer)) != cnt)
412
413 if (cmp != cnt)
414 printk_once(KERN_WARNING 416 printk_once(KERN_WARNING
415 "hpet: compare register read back failed.\n"); 417 "hpet: compare register read back failed.\n");
416 } 418 }
417 419
418 return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; 420 return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
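
The -ETIME return above leans on wrap-safe 32-bit arithmetic: the subtraction happens modulo 2^32 and the result is reinterpreted as signed, so the comparison stays correct across counter rollover. A standalone illustration (plain C, hypothetical values):

	#include <stdint.h>
	#include <stdio.h>

	/* nonzero if 'counter' has already passed 'cmp', modulo 2^32 */
	static int hpet_passed(uint32_t counter, uint32_t cmp)
	{
		return (int32_t)(counter - cmp) >= 0;
	}

	int main(void)
	{
		printf("%d\n", hpet_passed(0x00000005, 0xfffffffb)); /* 1: past it */
		printf("%d\n", hpet_passed(0xfffffffb, 0x00000005)); /* 0: 10 ticks away */
		return 0;
	}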
@@ -504,7 +506,7 @@ static int hpet_assign_irq(struct hpet_dev *dev)
504{ 506{
505 unsigned int irq; 507 unsigned int irq;
506 508
507 irq = create_irq(); 509 irq = create_irq_nr(0, -1);
508 if (!irq) 510 if (!irq)
509 return -EINVAL; 511 return -EINVAL;
510 512
@@ -583,7 +585,7 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
583 * scaled math multiplication factor for nanosecond to hpet tick 585 * scaled math multiplication factor for nanosecond to hpet tick
584 * conversion. 586 * conversion.
585 */ 587 */
586 hpet_freq = 1000000000000000ULL; 588 hpet_freq = FSEC_PER_SEC;
587 do_div(hpet_freq, hpet_period); 589 do_div(hpet_freq, hpet_period);
588 evt->mult = div_sc((unsigned long) hpet_freq, 590 evt->mult = div_sc((unsigned long) hpet_freq,
589 NSEC_PER_SEC, evt->shift); 591 NSEC_PER_SEC, evt->shift);
@@ -787,7 +789,6 @@ static struct clocksource clocksource_hpet = {
787 .rating = 250, 789 .rating = 250,
788 .read = read_hpet, 790 .read = read_hpet,
789 .mask = HPET_MASK, 791 .mask = HPET_MASK,
790 .shift = HPET_SHIFT,
791 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 792 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
792 .resume = hpet_resume_counter, 793 .resume = hpet_resume_counter,
793#ifdef CONFIG_X86_64 794#ifdef CONFIG_X86_64
@@ -798,6 +799,7 @@ static struct clocksource clocksource_hpet = {
798static int hpet_clocksource_register(void) 799static int hpet_clocksource_register(void)
799{ 800{
800 u64 start, now; 801 u64 start, now;
802 u64 hpet_freq;
801 cycle_t t1; 803 cycle_t t1;
802 804
803 /* Start the counter */ 805 /* Start the counter */
@@ -832,9 +834,15 @@ static int hpet_clocksource_register(void)
832 * mult = (hpet_period * 2^shift)/10^6 834 * mult = (hpet_period * 2^shift)/10^6
833 * mult = (hpet_period << shift)/FSEC_PER_NSEC 835 * mult = (hpet_period << shift)/FSEC_PER_NSEC
834 */ 836 */
835 clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT);
836 837
837 clocksource_register(&clocksource_hpet); 838 /* Need to convert hpet_period (fsec/cyc) to cyc/sec:
839 *
840 * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc)
841 * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period
842 */
843 hpet_freq = FSEC_PER_SEC;
844 do_div(hpet_freq, hpet_period);
845 clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
838 846
839 return 0; 847 return 0;
840} 848}
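
To make the conversion concrete: hpet_period is the tick length in femtoseconds, so the value handed to clocksource_register_hz() is FSEC_PER_SEC / hpet_period. With the common PC period of 69841279 fs that is 10^15 / 69841279 = 14318179 Hz, i.e. the classic 14.31818 MHz HPET. A throwaway check in plain C:

	#include <stdio.h>

	#define FSEC_PER_SEC 1000000000000000ULL

	int main(void)
	{
		unsigned long long hpet_period = 69841279ULL; /* fs per tick */
		printf("%llu Hz\n", FSEC_PER_SEC / hpet_period); /* 14318179 */
		return 0;
	}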
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index a8f1b803d2fd..ff15c9dcc25d 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -206,6 +206,25 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp)
206int arch_bp_generic_fields(int x86_len, int x86_type, 206int arch_bp_generic_fields(int x86_len, int x86_type,
207 int *gen_len, int *gen_type) 207 int *gen_len, int *gen_type)
208{ 208{
209 /* Type */
210 switch (x86_type) {
211 case X86_BREAKPOINT_EXECUTE:
212 if (x86_len != X86_BREAKPOINT_LEN_X)
213 return -EINVAL;
214
215 *gen_type = HW_BREAKPOINT_X;
216 *gen_len = sizeof(long);
217 return 0;
218 case X86_BREAKPOINT_WRITE:
219 *gen_type = HW_BREAKPOINT_W;
220 break;
221 case X86_BREAKPOINT_RW:
222 *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
223 break;
224 default:
225 return -EINVAL;
226 }
227
209 /* Len */ 228 /* Len */
210 switch (x86_len) { 229 switch (x86_len) {
211 case X86_BREAKPOINT_LEN_1: 230 case X86_BREAKPOINT_LEN_1:
@@ -226,21 +245,6 @@ int arch_bp_generic_fields(int x86_len, int x86_type,
226 return -EINVAL; 245 return -EINVAL;
227 } 246 }
228 247
229 /* Type */
230 switch (x86_type) {
231 case X86_BREAKPOINT_EXECUTE:
232 *gen_type = HW_BREAKPOINT_X;
233 break;
234 case X86_BREAKPOINT_WRITE:
235 *gen_type = HW_BREAKPOINT_W;
236 break;
237 case X86_BREAKPOINT_RW:
238 *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
239 break;
240 default:
241 return -EINVAL;
242 }
243
244 return 0; 248 return 0;
245} 249}
246 250
@@ -251,6 +255,29 @@ static int arch_build_bp_info(struct perf_event *bp)
251 255
252 info->address = bp->attr.bp_addr; 256 info->address = bp->attr.bp_addr;
253 257
258 /* Type */
259 switch (bp->attr.bp_type) {
260 case HW_BREAKPOINT_W:
261 info->type = X86_BREAKPOINT_WRITE;
262 break;
263 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
264 info->type = X86_BREAKPOINT_RW;
265 break;
266 case HW_BREAKPOINT_X:
267 info->type = X86_BREAKPOINT_EXECUTE;
268 /*
269 * x86 inst breakpoints need to have a specific undefined len.
270 * But we still need to check that userspace is not trying to
271 * set up an unsupported length, e.g. to get a range breakpoint.
272 */
273 if (bp->attr.bp_len == sizeof(long)) {
274 info->len = X86_BREAKPOINT_LEN_X;
275 return 0;
276 }
277 default:
278 return -EINVAL;
279 }
280
254 /* Len */ 281 /* Len */
255 switch (bp->attr.bp_len) { 282 switch (bp->attr.bp_len) {
256 case HW_BREAKPOINT_LEN_1: 283 case HW_BREAKPOINT_LEN_1:
@@ -271,21 +298,6 @@ static int arch_build_bp_info(struct perf_event *bp)
271 return -EINVAL; 298 return -EINVAL;
272 } 299 }
273 300
274 /* Type */
275 switch (bp->attr.bp_type) {
276 case HW_BREAKPOINT_W:
277 info->type = X86_BREAKPOINT_WRITE;
278 break;
279 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
280 info->type = X86_BREAKPOINT_RW;
281 break;
282 case HW_BREAKPOINT_X:
283 info->type = X86_BREAKPOINT_EXECUTE;
284 break;
285 default:
286 return -EINVAL;
287 }
288
289 return 0; 301 return 0;
290} 302}
291/* 303/*
@@ -466,6 +478,13 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
466 478
467 perf_bp_event(bp, args->regs); 479 perf_bp_event(bp, args->regs);
468 480
481 /*
482 * Set up resume flag to avoid breakpoint recursion when
483 * returning to the origin.
484 */
485 if (bp->hw.info.type == X86_BREAKPOINT_EXECUTE)
486 args->regs->flags |= X86_EFLAGS_RF;
487
469 rcu_read_unlock(); 488 rcu_read_unlock();
470 } 489 }
471 /* 490 /*
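
One consequence of the execute-breakpoint handling above is visible from userspace: a perf instruction breakpoint has to pass bp_len == sizeof(long), which arch_build_bp_info() maps to X86_BREAKPOINT_LEN_X. A hedged sketch of such an attribute setup, using the uapi field names (the address argument is hypothetical):

	#include <linux/hw_breakpoint.h>
	#include <linux/perf_event.h>
	#include <string.h>

	static void init_exec_bp_attr(struct perf_event_attr *attr, unsigned long addr)
	{
		memset(attr, 0, sizeof(*attr));
		attr->type = PERF_TYPE_BREAKPOINT;
		attr->size = sizeof(*attr);
		attr->bp_type = HW_BREAKPOINT_X;
		attr->bp_addr = addr;		/* e.g. a function's address */
		attr->bp_len = sizeof(long);	/* required sentinel for execute bps */
	}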
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index c4444bce8469..a46cb3522c0c 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -40,6 +40,7 @@
40 40
41static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; 41static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
42unsigned int xstate_size; 42unsigned int xstate_size;
43EXPORT_SYMBOL_GPL(xstate_size);
43unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32); 44unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32);
44static struct i387_fxsave_struct fx_scratch __cpuinitdata; 45static struct i387_fxsave_struct fx_scratch __cpuinitdata;
45 46
@@ -59,18 +60,18 @@ void __cpuinit mxcsr_feature_mask_init(void)
59 stts(); 60 stts();
60} 61}
61 62
62void __cpuinit init_thread_xstate(void) 63static void __cpuinit init_thread_xstate(void)
63{ 64{
65 /*
66 * Note that xstate_size might be overwritten later during
67 * xsave_init().
68 */
69
64 if (!HAVE_HWFP) { 70 if (!HAVE_HWFP) {
65 xstate_size = sizeof(struct i387_soft_struct); 71 xstate_size = sizeof(struct i387_soft_struct);
66 return; 72 return;
67 } 73 }
68 74
69 if (cpu_has_xsave) {
70 xsave_cntxt_init();
71 return;
72 }
73
74 if (cpu_has_fxsr) 75 if (cpu_has_fxsr)
75 xstate_size = sizeof(struct i387_fxsave_struct); 76 xstate_size = sizeof(struct i387_fxsave_struct);
76#ifdef CONFIG_X86_32 77#ifdef CONFIG_X86_32
@@ -84,6 +85,7 @@ void __cpuinit init_thread_xstate(void)
84 * Called at bootup to set up the initial FPU state that is later cloned 85 * Called at bootup to set up the initial FPU state that is later cloned
85 * into all processes. 86 * into all processes.
86 */ 87 */
88
87void __cpuinit fpu_init(void) 89void __cpuinit fpu_init(void)
88{ 90{
89 unsigned long oldcr0 = read_cr0(); 91 unsigned long oldcr0 = read_cr0();
@@ -93,19 +95,24 @@ void __cpuinit fpu_init(void)
93 95
94 write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ 96 write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */
95 97
96 /*
97 * Boot processor to setup the FP and extended state context info.
98 */
99 if (!smp_processor_id()) 98 if (!smp_processor_id())
100 init_thread_xstate(); 99 init_thread_xstate();
101 xsave_init();
102 100
103 mxcsr_feature_mask_init(); 101 mxcsr_feature_mask_init();
104 /* clean state in init */ 102 /* clean state in init */
105 current_thread_info()->status = 0; 103 current_thread_info()->status = 0;
106 clear_used_math(); 104 clear_used_math();
107} 105}
108#endif /* CONFIG_X86_64 */ 106
107#else /* CONFIG_X86_64 */
108
109void __cpuinit fpu_init(void)
110{
111 if (!smp_processor_id())
112 init_thread_xstate();
113}
114
115#endif /* CONFIG_X86_32 */
109 116
110void fpu_finit(struct fpu *fpu) 117void fpu_finit(struct fpu *fpu)
111{ 118{
@@ -191,6 +198,8 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
191 if (ret) 198 if (ret)
192 return ret; 199 return ret;
193 200
201 sanitize_i387_state(target);
202
194 return user_regset_copyout(&pos, &count, &kbuf, &ubuf, 203 return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
195 &target->thread.fpu.state->fxsave, 0, -1); 204 &target->thread.fpu.state->fxsave, 0, -1);
196} 205}
@@ -208,6 +217,8 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
208 if (ret) 217 if (ret)
209 return ret; 218 return ret;
210 219
220 sanitize_i387_state(target);
221
211 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, 222 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
212 &target->thread.fpu.state->fxsave, 0, -1); 223 &target->thread.fpu.state->fxsave, 0, -1);
213 224
@@ -447,6 +458,8 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
447 -1); 458 -1);
448 } 459 }
449 460
461 sanitize_i387_state(target);
462
450 if (kbuf && pos == 0 && count == sizeof(env)) { 463 if (kbuf && pos == 0 && count == sizeof(env)) {
451 convert_from_fxsr(kbuf, target); 464 convert_from_fxsr(kbuf, target);
452 return 0; 465 return 0;
@@ -468,6 +481,8 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
468 if (ret) 481 if (ret)
469 return ret; 482 return ret;
470 483
484 sanitize_i387_state(target);
485
471 if (!HAVE_HWFP) 486 if (!HAVE_HWFP)
472 return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); 487 return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
473 488
@@ -534,6 +549,9 @@ static int save_i387_xsave(void __user *buf)
534 struct _fpstate_ia32 __user *fx = buf; 549 struct _fpstate_ia32 __user *fx = buf;
535 int err = 0; 550 int err = 0;
536 551
552
553 sanitize_i387_state(tsk);
554
537 /* 555 /*
538 * For legacy compatible, we always set FP/SSE bits in the bit 556 * For legacy compatible, we always set FP/SSE bits in the bit
539 * vector while saving the state to the user context. 557 * vector while saving the state to the user context.
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 01ab17ae2ae7..852b81967a37 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -49,55 +49,94 @@
49#include <asm/system.h> 49#include <asm/system.h>
50#include <asm/apic.h> 50#include <asm/apic.h>
51 51
52/** 52struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
53 * pt_regs_to_gdb_regs - Convert ptrace regs to GDB regs
54 * @gdb_regs: A pointer to hold the registers in the order GDB wants.
55 * @regs: The &struct pt_regs of the current process.
56 *
57 * Convert the pt_regs in @regs into the format for registers that
58 * GDB expects, stored in @gdb_regs.
59 */
60void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
61{ 53{
62#ifndef CONFIG_X86_32 54#ifdef CONFIG_X86_32
63 u32 *gdb_regs32 = (u32 *)gdb_regs; 55 { "ax", 4, offsetof(struct pt_regs, ax) },
56 { "cx", 4, offsetof(struct pt_regs, cx) },
57 { "dx", 4, offsetof(struct pt_regs, dx) },
58 { "bx", 4, offsetof(struct pt_regs, bx) },
59 { "sp", 4, offsetof(struct pt_regs, sp) },
60 { "bp", 4, offsetof(struct pt_regs, bp) },
61 { "si", 4, offsetof(struct pt_regs, si) },
62 { "di", 4, offsetof(struct pt_regs, di) },
63 { "ip", 4, offsetof(struct pt_regs, ip) },
64 { "flags", 4, offsetof(struct pt_regs, flags) },
65 { "cs", 4, offsetof(struct pt_regs, cs) },
66 { "ss", 4, offsetof(struct pt_regs, ss) },
67 { "ds", 4, offsetof(struct pt_regs, ds) },
68 { "es", 4, offsetof(struct pt_regs, es) },
69 { "fs", 4, -1 },
70 { "gs", 4, -1 },
71#else
72 { "ax", 8, offsetof(struct pt_regs, ax) },
73 { "bx", 8, offsetof(struct pt_regs, bx) },
74 { "cx", 8, offsetof(struct pt_regs, cx) },
75 { "dx", 8, offsetof(struct pt_regs, dx) },
76 { "si", 8, offsetof(struct pt_regs, dx) },
77 { "di", 8, offsetof(struct pt_regs, di) },
78 { "bp", 8, offsetof(struct pt_regs, bp) },
79 { "sp", 8, offsetof(struct pt_regs, sp) },
80 { "r8", 8, offsetof(struct pt_regs, r8) },
81 { "r9", 8, offsetof(struct pt_regs, r9) },
82 { "r10", 8, offsetof(struct pt_regs, r10) },
83 { "r11", 8, offsetof(struct pt_regs, r11) },
84 { "r12", 8, offsetof(struct pt_regs, r12) },
85 { "r13", 8, offsetof(struct pt_regs, r13) },
86 { "r14", 8, offsetof(struct pt_regs, r14) },
87 { "r15", 8, offsetof(struct pt_regs, r15) },
88 { "ip", 8, offsetof(struct pt_regs, ip) },
89 { "flags", 4, offsetof(struct pt_regs, flags) },
90 { "cs", 4, offsetof(struct pt_regs, cs) },
91 { "ss", 4, offsetof(struct pt_regs, ss) },
64#endif 92#endif
65 gdb_regs[GDB_AX] = regs->ax; 93};
66 gdb_regs[GDB_BX] = regs->bx; 94
67 gdb_regs[GDB_CX] = regs->cx; 95int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
68 gdb_regs[GDB_DX] = regs->dx; 96{
69 gdb_regs[GDB_SI] = regs->si; 97 if (
70 gdb_regs[GDB_DI] = regs->di;
71 gdb_regs[GDB_BP] = regs->bp;
72 gdb_regs[GDB_PC] = regs->ip;
73#ifdef CONFIG_X86_32 98#ifdef CONFIG_X86_32
74 gdb_regs[GDB_PS] = regs->flags; 99 regno == GDB_SS || regno == GDB_FS || regno == GDB_GS ||
75 gdb_regs[GDB_DS] = regs->ds; 100#endif
76 gdb_regs[GDB_ES] = regs->es; 101 regno == GDB_SP || regno == GDB_ORIG_AX)
77 gdb_regs[GDB_CS] = regs->cs; 102 return 0;
78 gdb_regs[GDB_FS] = 0xFFFF; 103
79 gdb_regs[GDB_GS] = 0xFFFF; 104 if (dbg_reg_def[regno].offset != -1)
80 if (user_mode_vm(regs)) { 105 memcpy((void *)regs + dbg_reg_def[regno].offset, mem,
81 gdb_regs[GDB_SS] = regs->ss; 106 dbg_reg_def[regno].size);
82 gdb_regs[GDB_SP] = regs->sp; 107 return 0;
83 } else { 108}
84 gdb_regs[GDB_SS] = __KERNEL_DS; 109
85 gdb_regs[GDB_SP] = kernel_stack_pointer(regs); 110char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
111{
112 if (regno == GDB_ORIG_AX) {
113 memcpy(mem, &regs->orig_ax, sizeof(regs->orig_ax));
114 return "orig_ax";
86 } 115 }
87#else 116 if (regno >= DBG_MAX_REG_NUM || regno < 0)
88 gdb_regs[GDB_R8] = regs->r8; 117 return NULL;
89 gdb_regs[GDB_R9] = regs->r9; 118
90 gdb_regs[GDB_R10] = regs->r10; 119 if (dbg_reg_def[regno].offset != -1)
91 gdb_regs[GDB_R11] = regs->r11; 120 memcpy(mem, (void *)regs + dbg_reg_def[regno].offset,
92 gdb_regs[GDB_R12] = regs->r12; 121 dbg_reg_def[regno].size);
93 gdb_regs[GDB_R13] = regs->r13; 122
94 gdb_regs[GDB_R14] = regs->r14; 123 switch (regno) {
95 gdb_regs[GDB_R15] = regs->r15; 124#ifdef CONFIG_X86_32
96 gdb_regs32[GDB_PS] = regs->flags; 125 case GDB_SS:
97 gdb_regs32[GDB_CS] = regs->cs; 126 if (!user_mode_vm(regs))
98 gdb_regs32[GDB_SS] = regs->ss; 127 *(unsigned long *)mem = __KERNEL_DS;
99 gdb_regs[GDB_SP] = kernel_stack_pointer(regs); 128 break;
129 case GDB_SP:
130 if (!user_mode_vm(regs))
131 *(unsigned long *)mem = kernel_stack_pointer(regs);
132 break;
133 case GDB_GS:
134 case GDB_FS:
135 *(unsigned long *)mem = 0xFFFF;
136 break;
100#endif 137#endif
138 }
139 return dbg_reg_def[regno].name;
101} 140}
102 141
103/** 142/**
@@ -150,54 +189,13 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
150 gdb_regs[GDB_SP] = p->thread.sp; 189 gdb_regs[GDB_SP] = p->thread.sp;
151} 190}
152 191
153/**
154 * gdb_regs_to_pt_regs - Convert GDB regs to ptrace regs.
155 * @gdb_regs: A pointer to hold the registers we've received from GDB.
156 * @regs: A pointer to a &struct pt_regs to hold these values in.
157 *
158 * Convert the GDB regs in @gdb_regs into the pt_regs, and store them
159 * in @regs.
160 */
161void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
162{
163#ifndef CONFIG_X86_32
164 u32 *gdb_regs32 = (u32 *)gdb_regs;
165#endif
166 regs->ax = gdb_regs[GDB_AX];
167 regs->bx = gdb_regs[GDB_BX];
168 regs->cx = gdb_regs[GDB_CX];
169 regs->dx = gdb_regs[GDB_DX];
170 regs->si = gdb_regs[GDB_SI];
171 regs->di = gdb_regs[GDB_DI];
172 regs->bp = gdb_regs[GDB_BP];
173 regs->ip = gdb_regs[GDB_PC];
174#ifdef CONFIG_X86_32
175 regs->flags = gdb_regs[GDB_PS];
176 regs->ds = gdb_regs[GDB_DS];
177 regs->es = gdb_regs[GDB_ES];
178 regs->cs = gdb_regs[GDB_CS];
179#else
180 regs->r8 = gdb_regs[GDB_R8];
181 regs->r9 = gdb_regs[GDB_R9];
182 regs->r10 = gdb_regs[GDB_R10];
183 regs->r11 = gdb_regs[GDB_R11];
184 regs->r12 = gdb_regs[GDB_R12];
185 regs->r13 = gdb_regs[GDB_R13];
186 regs->r14 = gdb_regs[GDB_R14];
187 regs->r15 = gdb_regs[GDB_R15];
188 regs->flags = gdb_regs32[GDB_PS];
189 regs->cs = gdb_regs32[GDB_CS];
190 regs->ss = gdb_regs32[GDB_SS];
191#endif
192}
193
194static struct hw_breakpoint { 192static struct hw_breakpoint {
195 unsigned enabled; 193 unsigned enabled;
196 unsigned long addr; 194 unsigned long addr;
197 int len; 195 int len;
198 int type; 196 int type;
199 struct perf_event **pev; 197 struct perf_event * __percpu *pev;
200} breakinfo[4]; 198} breakinfo[HBP_NUM];
201 199
202static unsigned long early_dr7; 200static unsigned long early_dr7;
203 201
@@ -205,7 +203,7 @@ static void kgdb_correct_hw_break(void)
205{ 203{
206 int breakno; 204 int breakno;
207 205
208 for (breakno = 0; breakno < 4; breakno++) { 206 for (breakno = 0; breakno < HBP_NUM; breakno++) {
209 struct perf_event *bp; 207 struct perf_event *bp;
210 struct arch_hw_breakpoint *info; 208 struct arch_hw_breakpoint *info;
211 int val; 209 int val;
@@ -292,10 +290,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
292{ 290{
293 int i; 291 int i;
294 292
295 for (i = 0; i < 4; i++) 293 for (i = 0; i < HBP_NUM; i++)
296 if (breakinfo[i].addr == addr && breakinfo[i].enabled) 294 if (breakinfo[i].addr == addr && breakinfo[i].enabled)
297 break; 295 break;
298 if (i == 4) 296 if (i == HBP_NUM)
299 return -1; 297 return -1;
300 298
301 if (hw_break_release_slot(i)) { 299 if (hw_break_release_slot(i)) {
@@ -313,7 +311,7 @@ static void kgdb_remove_all_hw_break(void)
313 int cpu = raw_smp_processor_id(); 311 int cpu = raw_smp_processor_id();
314 struct perf_event *bp; 312 struct perf_event *bp;
315 313
316 for (i = 0; i < 4; i++) { 314 for (i = 0; i < HBP_NUM; i++) {
317 if (!breakinfo[i].enabled) 315 if (!breakinfo[i].enabled)
318 continue; 316 continue;
319 bp = *per_cpu_ptr(breakinfo[i].pev, cpu); 317 bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
@@ -333,10 +331,10 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
333{ 331{
334 int i; 332 int i;
335 333
336 for (i = 0; i < 4; i++) 334 for (i = 0; i < HBP_NUM; i++)
337 if (!breakinfo[i].enabled) 335 if (!breakinfo[i].enabled)
338 break; 336 break;
339 if (i == 4) 337 if (i == HBP_NUM)
340 return -1; 338 return -1;
341 339
342 switch (bptype) { 340 switch (bptype) {
@@ -397,7 +395,7 @@ void kgdb_disable_hw_debug(struct pt_regs *regs)
397 395
398 /* Disable hardware debugging while we are in kgdb: */ 396 /* Disable hardware debugging while we are in kgdb: */
399 set_debugreg(0UL, 7); 397 set_debugreg(0UL, 7);
400 for (i = 0; i < 4; i++) { 398 for (i = 0; i < HBP_NUM; i++) {
401 if (!breakinfo[i].enabled) 399 if (!breakinfo[i].enabled)
402 continue; 400 continue;
403 if (dbg_is_early) { 401 if (dbg_is_early) {
@@ -458,7 +456,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
458{ 456{
459 unsigned long addr; 457 unsigned long addr;
460 char *ptr; 458 char *ptr;
461 int newPC;
462 459
463 switch (remcomInBuffer[0]) { 460 switch (remcomInBuffer[0]) {
464 case 'c': 461 case 'c':
@@ -469,8 +466,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
469 linux_regs->ip = addr; 466 linux_regs->ip = addr;
470 case 'D': 467 case 'D':
471 case 'k': 468 case 'k':
472 newPC = linux_regs->ip;
473
474 /* clear the trace bit */ 469 /* clear the trace bit */
475 linux_regs->flags &= ~X86_EFLAGS_TF; 470 linux_regs->flags &= ~X86_EFLAGS_TF;
476 atomic_set(&kgdb_cpu_doing_single_step, -1); 471 atomic_set(&kgdb_cpu_doing_single_step, -1);
@@ -645,7 +640,7 @@ void kgdb_arch_late(void)
645 attr.bp_len = HW_BREAKPOINT_LEN_1; 640 attr.bp_len = HW_BREAKPOINT_LEN_1;
646 attr.bp_type = HW_BREAKPOINT_W; 641 attr.bp_type = HW_BREAKPOINT_W;
647 attr.disabled = 1; 642 attr.disabled = 1;
648 for (i = 0; i < 4; i++) { 643 for (i = 0; i < HBP_NUM; i++) {
649 if (breakinfo[i].pev) 644 if (breakinfo[i].pev)
650 continue; 645 continue;
651 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); 646 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 675879b65ce6..770ebfb349e9 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -126,16 +126,22 @@ static void __kprobes synthesize_reljump(void *from, void *to)
126} 126}
127 127
128/* 128/*
129 * Check for the REX prefix which can only exist on X86_64 129 * Skip the prefixes of the instruction.
130 * X86_32 always returns 0
131 */ 130 */
132static int __kprobes is_REX_prefix(kprobe_opcode_t *insn) 131static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn)
133{ 132{
133 insn_attr_t attr;
134
135 attr = inat_get_opcode_attribute((insn_byte_t)*insn);
136 while (inat_is_legacy_prefix(attr)) {
137 insn++;
138 attr = inat_get_opcode_attribute((insn_byte_t)*insn);
139 }
134#ifdef CONFIG_X86_64 140#ifdef CONFIG_X86_64
135 if ((*insn & 0xf0) == 0x40) 141 if (inat_is_rex_prefix(attr))
136 return 1; 142 insn++;
137#endif 143#endif
138 return 0; 144 return insn;
139} 145}
140 146
141/* 147/*
@@ -272,6 +278,9 @@ static int __kprobes can_probe(unsigned long paddr)
272 */ 278 */
273static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) 279static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
274{ 280{
281 /* Skip prefixes */
282 insn = skip_prefixes(insn);
283
275 switch (*insn) { 284 switch (*insn) {
276 case 0xfa: /* cli */ 285 case 0xfa: /* cli */
277 case 0xfb: /* sti */ 286 case 0xfb: /* sti */
@@ -280,13 +289,6 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
280 return 1; 289 return 1;
281 } 290 }
282 291
283 /*
284 * on X86_64, 0x40-0x4f are REX prefixes so we need to look
285 * at the next byte instead.. but of course not recurse infinitely
286 */
287 if (is_REX_prefix(insn))
288 return is_IF_modifier(++insn);
289
290 return 0; 292 return 0;
291} 293}
292 294
@@ -707,6 +709,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
707 struct hlist_node *node, *tmp; 709 struct hlist_node *node, *tmp;
708 unsigned long flags, orig_ret_address = 0; 710 unsigned long flags, orig_ret_address = 0;
709 unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; 711 unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
712 kprobe_opcode_t *correct_ret_addr = NULL;
710 713
711 INIT_HLIST_HEAD(&empty_rp); 714 INIT_HLIST_HEAD(&empty_rp);
712 kretprobe_hash_lock(current, &head, &flags); 715 kretprobe_hash_lock(current, &head, &flags);
@@ -738,14 +741,34 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
738 /* another task is sharing our hash bucket */ 741 /* another task is sharing our hash bucket */
739 continue; 742 continue;
740 743
744 orig_ret_address = (unsigned long)ri->ret_addr;
745
746 if (orig_ret_address != trampoline_address)
747 /*
748 * This is the real return address. Any other
749 * instances associated with this task are for
750 * other calls deeper on the call stack
751 */
752 break;
753 }
754
755 kretprobe_assert(ri, orig_ret_address, trampoline_address);
756
757 correct_ret_addr = ri->ret_addr;
758 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
759 if (ri->task != current)
760 /* another task is sharing our hash bucket */
761 continue;
762
763 orig_ret_address = (unsigned long)ri->ret_addr;
741 if (ri->rp && ri->rp->handler) { 764 if (ri->rp && ri->rp->handler) {
742 __get_cpu_var(current_kprobe) = &ri->rp->kp; 765 __get_cpu_var(current_kprobe) = &ri->rp->kp;
743 get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; 766 get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
767 ri->ret_addr = correct_ret_addr;
744 ri->rp->handler(ri, regs); 768 ri->rp->handler(ri, regs);
745 __get_cpu_var(current_kprobe) = NULL; 769 __get_cpu_var(current_kprobe) = NULL;
746 } 770 }
747 771
748 orig_ret_address = (unsigned long)ri->ret_addr;
749 recycle_rp_inst(ri, &empty_rp); 772 recycle_rp_inst(ri, &empty_rp);
750 773
751 if (orig_ret_address != trampoline_address) 774 if (orig_ret_address != trampoline_address)
@@ -757,8 +780,6 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
757 break; 780 break;
758 } 781 }
759 782
760 kretprobe_assert(ri, orig_ret_address, trampoline_address);
761
762 kretprobe_hash_unlock(current, &flags); 783 kretprobe_hash_unlock(current, &flags);
763 784
764 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { 785 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
@@ -803,9 +824,8 @@ static void __kprobes resume_execution(struct kprobe *p,
803 unsigned long orig_ip = (unsigned long)p->addr; 824 unsigned long orig_ip = (unsigned long)p->addr;
804 kprobe_opcode_t *insn = p->ainsn.insn; 825 kprobe_opcode_t *insn = p->ainsn.insn;
805 826
806 /*skip the REX prefix*/ 827 /* Skip prefixes */
807 if (is_REX_prefix(insn)) 828 insn = skip_prefixes(insn);
808 insn++;
809 829
810 regs->flags &= ~X86_EFLAGS_TF; 830 regs->flags &= ~X86_EFLAGS_TF;
811 switch (*insn) { 831 switch (*insn) {
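
As an illustration of skip_prefixes() above: on x86-64 the byte sequence 2e 48 ff e0 (a cs segment override, a REX.W prefix, then jmp *%rax) yields the opcode byte 0xff after two skips. A toy userspace re-implementation, for illustration only, since the real code consults the kernel's inat_* attribute tables:

	#include <stdio.h>

	static int is_legacy_prefix(unsigned char b)
	{
		switch (b) {
		case 0x26: case 0x2e: case 0x36: case 0x3e:	/* segment overrides */
		case 0x64: case 0x65:				/* fs/gs overrides */
		case 0x66: case 0x67:				/* opsize/addrsize */
		case 0xf0: case 0xf2: case 0xf3:		/* lock/repne/rep */
			return 1;
		}
		return 0;
	}

	static const unsigned char *skip_prefixes_toy(const unsigned char *insn)
	{
		while (is_legacy_prefix(*insn))
			insn++;
		if ((*insn & 0xf0) == 0x40)	/* x86-64 REX prefix */
			insn++;
		return insn;
	}

	int main(void)
	{
		const unsigned char insn[] = { 0x2e, 0x48, 0xff, 0xe0 };
		printf("opcode byte: 0x%02x\n", *skip_prefixes_toy(insn)); /* 0xff */
		return 0;
	}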
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index e0bc186d7501..1c355c550960 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -239,11 +239,10 @@ int module_finalize(const Elf_Ehdr *hdr,
239 apply_paravirt(pseg, pseg + para->sh_size); 239 apply_paravirt(pseg, pseg + para->sh_size);
240 } 240 }
241 241
242 return module_bug_finalize(hdr, sechdrs, me); 242 return 0;
243} 243}
244 244
245void module_arch_cleanup(struct module *mod) 245void module_arch_cleanup(struct module *mod)
246{ 246{
247 alternatives_smp_module_del(mod); 247 alternatives_smp_module_del(mod);
248 module_bug_cleanup(mod);
249} 248}
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index d86dbf7e54be..d7b6f7fb4fec 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -274,6 +274,18 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
274 274
275void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } 275void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { }
276 276
277static void __init smp_register_lapic_address(unsigned long address)
278{
279 mp_lapic_addr = address;
280
281 set_fixmap_nocache(FIX_APIC_BASE, address);
282 if (boot_cpu_physical_apicid == -1U) {
283 boot_cpu_physical_apicid = read_apic_id();
284 apic_version[boot_cpu_physical_apicid] =
285 GET_APIC_VERSION(apic_read(APIC_LVR));
286 }
287}
288
277static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) 289static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
278{ 290{
279 char str[16]; 291 char str[16];
@@ -295,6 +307,10 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
295 if (early) 307 if (early)
296 return 1; 308 return 1;
297 309
310 /* Initialize the lapic mapping */
311 if (!acpi_lapic)
312 smp_register_lapic_address(mpc->lapic);
313
298 if (mpc->oemptr) 314 if (mpc->oemptr)
299 x86_init.mpparse.smp_read_mpc_oem(mpc); 315 x86_init.mpparse.smp_read_mpc_oem(mpc);
300 316
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
index 5915e0b33303..79ae68154e87 100644
--- a/arch/x86/kernel/mrst.c
+++ b/arch/x86/kernel/mrst.c
@@ -25,8 +25,34 @@
25#include <asm/i8259.h> 25#include <asm/i8259.h>
26#include <asm/apb_timer.h> 26#include <asm/apb_timer.h>
27 27
28/*
29 * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock,
30 * cmdline option x86_mrst_timer can be used to override the configuration
31 * to prefer one or the other.
32 * at runtime, there are basically three timer configurations:
33 * 1. per cpu apbt clock only
34 * 2. per cpu always-on lapic clocks only, this is Penwell/Medfield only
35 * 3. per cpu lapic clock (C3STOP) and one apbt clock, with broadcast.
36 *
37 * by default (without cmdline option), platform code first detects cpu type
38 * to see if we are on lincroft or penwell, then set up both lapic or apbt
39 * clocks accordingly.
40 * i.e. by default, medfield uses configuration #2, moorestown uses #1.
41 * config #3 is supported but not recommended on medfield.
42 *
43 * rating and feature summary:
44 * lapic (with C3STOP) --------- 100
45 * apbt (always-on) ------------ 110
46 * lapic (always-on,ARAT) ------ 150
47 */
48
49__cpuinitdata enum mrst_timer_options mrst_timer_options;
50
28static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; 51static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM];
29static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; 52static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM];
53enum mrst_cpu_type __mrst_cpu_chip;
54EXPORT_SYMBOL_GPL(__mrst_cpu_chip);
55
30int sfi_mtimer_num; 56int sfi_mtimer_num;
31 57
32struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; 58struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX];
@@ -167,18 +193,6 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table)
167 return 0; 193 return 0;
168} 194}
169 195
170/*
171 * the secondary clock in Moorestown can be APBT or LAPIC clock, default to
172 * APBT but cmdline option can also override it.
173 */
174static void __cpuinit mrst_setup_secondary_clock(void)
175{
176 /* restore default lapic clock if disabled by cmdline */
177 if (disable_apbt_percpu)
178 return setup_secondary_APIC_clock();
179 apbt_setup_secondary_clock();
180}
181
182static unsigned long __init mrst_calibrate_tsc(void) 196static unsigned long __init mrst_calibrate_tsc(void)
183{ 197{
184 unsigned long flags, fast_calibrate; 198 unsigned long flags, fast_calibrate;
@@ -195,6 +209,21 @@ static unsigned long __init mrst_calibrate_tsc(void)
195 209
196void __init mrst_time_init(void) 210void __init mrst_time_init(void)
197{ 211{
212 switch (mrst_timer_options) {
213 case MRST_TIMER_APBT_ONLY:
214 break;
215 case MRST_TIMER_LAPIC_APBT:
216 x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
217 x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
218 break;
219 default:
220 if (!boot_cpu_has(X86_FEATURE_ARAT))
221 break;
222 x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
223 x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
224 return;
225 }
226 /* we need at least one APB timer */
198 sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); 227 sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
199 pre_init_apic_IRQ0(); 228 pre_init_apic_IRQ0();
200 apbt_time_init(); 229 apbt_time_init();
@@ -205,16 +234,21 @@ void __init mrst_rtc_init(void)
205 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); 234 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
206} 235}
207 236
208/* 237void __cpuinit mrst_arch_setup(void)
209 * if we use per cpu apb timer, the bootclock already setup. if we use lapic
210 * timer and one apbt timer for broadcast, we need to set up lapic boot clock.
211 */
212static void __init mrst_setup_boot_clock(void)
213{ 238{
214 pr_info("%s: per cpu apbt flag %d \n", __func__, disable_apbt_percpu); 239 if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27)
215 if (disable_apbt_percpu) 240 __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL;
216 setup_boot_APIC_clock(); 241 else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26)
217}; 242 __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
243 else {
244 pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n",
245 boot_cpu_data.x86, boot_cpu_data.x86_model);
246 __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
247 }
248 pr_debug("Moorestown CPU %s identified\n",
249 (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ?
250 "Lincroft" : "Penwell");
251}
218 252
219/* MID systems don't have i8042 controller */ 253/* MID systems don't have i8042 controller */
220static int mrst_i8042_detect(void) 254static int mrst_i8042_detect(void)
@@ -232,11 +266,13 @@ void __init x86_mrst_early_setup(void)
232 x86_init.resources.reserve_resources = x86_init_noop; 266 x86_init.resources.reserve_resources = x86_init_noop;
233 267
234 x86_init.timers.timer_init = mrst_time_init; 268 x86_init.timers.timer_init = mrst_time_init;
235 x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock; 269 x86_init.timers.setup_percpu_clockev = x86_init_noop;
236 270
237 x86_init.irqs.pre_vector_init = x86_init_noop; 271 x86_init.irqs.pre_vector_init = x86_init_noop;
238 272
239 x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock; 273 x86_init.oem.arch_setup = mrst_arch_setup;
274
275 x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock;
240 276
241 x86_platform.calibrate_tsc = mrst_calibrate_tsc; 277 x86_platform.calibrate_tsc = mrst_calibrate_tsc;
242 x86_platform.i8042_detect = mrst_i8042_detect; 278 x86_platform.i8042_detect = mrst_i8042_detect;
@@ -250,3 +286,26 @@ void __init x86_mrst_early_setup(void)
250 x86_init.mpparse.get_smp_config = x86_init_uint_noop; 286 x86_init.mpparse.get_smp_config = x86_init_uint_noop;
251 287
252} 288}
289
290/*
291 * if the user does not want to use the per CPU apb timer, just give it a
292 * lower rating than the local apic timer and skip the late per cpu timer init.
293 */
294static inline int __init setup_x86_mrst_timer(char *arg)
295{
296 if (!arg)
297 return -EINVAL;
298
299 if (strcmp("apbt_only", arg) == 0)
300 mrst_timer_options = MRST_TIMER_APBT_ONLY;
301 else if (strcmp("lapic_and_apbt", arg) == 0)
302 mrst_timer_options = MRST_TIMER_LAPIC_APBT;
303 else {
304 pr_warning("X86 MRST timer option %s not recognised,"
305 " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
306 arg);
307 return -EINVAL;
308 }
309 return 0;
310}
311__setup("x86_mrst_timer=", setup_x86_mrst_timer);
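
For reference, the two strings above are the complete set of accepted values: booting with the command line option below selects configuration #1 from the comment at the top of this file,

	x86_mrst_timer=apbt_only

while lapic_and_apbt selects configuration #3, and anything else falls through to the ARAT-based default in mrst_time_init().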
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 8297160c41b3..0e0cdde519be 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -21,10 +21,7 @@
21#include <asm/geode.h> 21#include <asm/geode.h>
22#include <asm/setup.h> 22#include <asm/setup.h>
23#include <asm/olpc.h> 23#include <asm/olpc.h>
24 24#include <asm/olpc_ofw.h>
25#ifdef CONFIG_OPEN_FIRMWARE
26#include <asm/ofw.h>
27#endif
28 25
29struct olpc_platform_t olpc_platform_info; 26struct olpc_platform_t olpc_platform_info;
30EXPORT_SYMBOL_GPL(olpc_platform_info); 27EXPORT_SYMBOL_GPL(olpc_platform_info);
@@ -145,7 +142,7 @@ restart:
145 * The OBF flag will sometimes misbehave due to what we believe 142 * The OBF flag will sometimes misbehave due to what we believe
146 * is a hardware quirk.. 143 * is a hardware quirk..
147 */ 144 */
148 printk(KERN_DEBUG "olpc-ec: running cmd 0x%x\n", cmd); 145 pr_devel("olpc-ec: running cmd 0x%x\n", cmd);
149 outb(cmd, 0x6c); 146 outb(cmd, 0x6c);
150 147
151 if (wait_on_ibf(0x6c, 0)) { 148 if (wait_on_ibf(0x6c, 0)) {
@@ -162,8 +159,7 @@ restart:
162 " EC accept data!\n"); 159 " EC accept data!\n");
163 goto err; 160 goto err;
164 } 161 }
165 printk(KERN_DEBUG "olpc-ec: sending cmd arg 0x%x\n", 162 pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]);
166 inbuf[i]);
167 outb(inbuf[i], 0x68); 163 outb(inbuf[i], 0x68);
168 } 164 }
169 } 165 }
@@ -176,8 +172,7 @@ restart:
176 goto restart; 172 goto restart;
177 } 173 }
178 outbuf[i] = inb(0x68); 174 outbuf[i] = inb(0x68);
179 printk(KERN_DEBUG "olpc-ec: received 0x%x\n", 175 pr_devel("olpc-ec: received 0x%x\n", outbuf[i]);
180 outbuf[i]);
181 } 176 }
182 } 177 }
183 178
@@ -188,14 +183,15 @@ err:
188} 183}
189EXPORT_SYMBOL_GPL(olpc_ec_cmd); 184EXPORT_SYMBOL_GPL(olpc_ec_cmd);
190 185
191#ifdef CONFIG_OPEN_FIRMWARE 186#ifdef CONFIG_OLPC_OPENFIRMWARE
192static void __init platform_detect(void) 187static void __init platform_detect(void)
193{ 188{
194 size_t propsize; 189 size_t propsize;
195 __be32 rev; 190 __be32 rev;
191 const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 };
192 void *res[] = { &propsize };
196 193
197 if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4, 194 if (olpc_ofw("getprop", args, res) || propsize != 4) {
198 &propsize) || propsize != 4) {
199 printk(KERN_ERR "ofw: getprop call failed!\n"); 195 printk(KERN_ERR "ofw: getprop call failed!\n");
200 rev = cpu_to_be32(0); 196 rev = cpu_to_be32(0);
201 } 197 }
diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c
new file mode 100644
index 000000000000..3218aa71ab5e
--- /dev/null
+++ b/arch/x86/kernel/olpc_ofw.c
@@ -0,0 +1,106 @@
1#include <linux/kernel.h>
2#include <linux/module.h>
3#include <linux/init.h>
4#include <asm/page.h>
5#include <asm/setup.h>
6#include <asm/io.h>
7#include <asm/pgtable.h>
8#include <asm/olpc_ofw.h>
9
10/* address of OFW callback interface; will be NULL if OFW isn't found */
11static int (*olpc_ofw_cif)(int *);
12
13/* page dir entry containing OFW's pgdir table; filled in by head_32.S */
14u32 olpc_ofw_pgd __initdata;
15
16static DEFINE_SPINLOCK(ofw_lock);
17
18#define MAXARGS 10
19
20void __init setup_olpc_ofw_pgd(void)
21{
22 pgd_t *base, *ofw_pde;
23
24 if (!olpc_ofw_cif)
25 return;
26
27 /* fetch OFW's PDE */
28 base = early_ioremap(olpc_ofw_pgd, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD);
29 if (!base) {
30 printk(KERN_ERR "failed to remap OFW's pgd - disabling OFW!\n");
31 olpc_ofw_cif = NULL;
32 return;
33 }
34 ofw_pde = &base[OLPC_OFW_PDE_NR];
35
36 /* install OFW's PDE permanently into the kernel's pgtable */
37 set_pgd(&swapper_pg_dir[OLPC_OFW_PDE_NR], *ofw_pde);
38 /* implicit optimization barrier here due to uninline function return */
39
40 early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD);
41}
42
43int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res,
44 void **res)
45{
46 int ofw_args[MAXARGS + 3];
47 unsigned long flags;
48 int ret, i, *p;
49
50 BUG_ON(nr_args + nr_res > MAXARGS);
51
52 if (!olpc_ofw_cif)
53 return -EIO;
54
55 ofw_args[0] = (int)name;
56 ofw_args[1] = nr_args;
57 ofw_args[2] = nr_res;
58
59 p = &ofw_args[3];
60 for (i = 0; i < nr_args; i++, p++)
61 *p = (int)args[i];
62
63 /* call into ofw */
64 spin_lock_irqsave(&ofw_lock, flags);
65 ret = olpc_ofw_cif(ofw_args);
66 spin_unlock_irqrestore(&ofw_lock, flags);
67
68 if (!ret) {
69 for (i = 0; i < nr_res; i++, p++)
70 *((int *)res[i]) = *p;
71 }
72
73 return ret;
74}
75EXPORT_SYMBOL_GPL(__olpc_ofw);
76
77/* OFW cif _should_ be above this address */
78#define OFW_MIN 0xff000000
79
80/* OFW starts on a 1MB boundary */
81#define OFW_BOUND (1<<20)
82
83void __init olpc_ofw_detect(void)
84{
85 struct olpc_ofw_header *hdr = &boot_params.olpc_ofw_header;
86 unsigned long start;
87
88 /* ensure OFW booted us by checking for "OFW " string */
89 if (hdr->ofw_magic != OLPC_OFW_SIG)
90 return;
91
92 olpc_ofw_cif = (int (*)(int *))hdr->cif_handler;
93
94 if ((unsigned long)olpc_ofw_cif < OFW_MIN) {
95 printk(KERN_ERR "OFW detected, but cif has invalid address 0x%lx - disabling.\n",
96 (unsigned long)olpc_ofw_cif);
97 olpc_ofw_cif = NULL;
98 return;
99 }
100
101 /* determine where OFW starts in memory */
102 start = round_down((unsigned long)olpc_ofw_cif, OFW_BOUND);
103 printk(KERN_INFO "OFW detected in memory, cif @ 0x%lx (reserving top %ldMB)\n",
104 (unsigned long)olpc_ofw_cif, (-start) >> 20);
105 reserve_top_address(-start);
106}
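As a usage sketch of the new CIF entry point: callers drive __olpc_ofw() with plain pointer arrays, and the olpc.c hunk above suggests olpc_ofw() in <asm/olpc_ofw.h> is a convenience macro that forwards the two array sizes. A minimal, hedged caller (illustrative only, mirroring the getprop call in platform_detect(); the helper name is hypothetical) might look like:

	#include <linux/types.h>
	#include <asm/olpc_ofw.h>

	/* illustrative only: query the board revision property */
	static int __init board_revision(__be32 *rev)
	{
		size_t propsize;
		/* args follow the original call: node handle, property name,
		 * output buffer, buffer length */
		const void *args[] = { NULL, "board-revision-int", rev, (void *)4 };
		void *res[] = { &propsize };

		/* olpc_ofw() is assumed to expand to
		 * __olpc_ofw(name, ARRAY_SIZE(args), args, ARRAY_SIZE(res), res) */
		if (olpc_ofw("getprop", args, res) || propsize != 4)
			return -EIO;
		return 0;
	}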
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 4b7e3d8b01dd..9f07cfcbd3a5 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -13,6 +13,7 @@
13#include <asm/calgary.h> 13#include <asm/calgary.h>
14#include <asm/amd_iommu.h> 14#include <asm/amd_iommu.h>
15#include <asm/x86_init.h> 15#include <asm/x86_init.h>
16#include <asm/xen/swiotlb-xen.h>
16 17
17static int forbid_dac __read_mostly; 18static int forbid_dac __read_mostly;
18 19
@@ -132,7 +133,7 @@ void __init pci_iommu_alloc(void)
132 /* free the range so iommu could get some range less than 4G */ 133 /* free the range so iommu could get some range less than 4G */
133 dma32_free_bootmem(); 134 dma32_free_bootmem();
134 135
135 if (pci_swiotlb_detect()) 136 if (pci_xen_swiotlb_detect() || pci_swiotlb_detect())
136 goto out; 137 goto out;
137 138
138 gart_iommu_hole_init(); 139 gart_iommu_hole_init();
@@ -144,6 +145,8 @@ void __init pci_iommu_alloc(void)
144 /* needs to be called after gart_iommu_hole_init */ 145 /* needs to be called after gart_iommu_hole_init */
145 amd_iommu_detect(); 146 amd_iommu_detect();
146out: 147out:
148 pci_xen_swiotlb_init();
149
147 pci_swiotlb_init(); 150 pci_swiotlb_init();
148} 151}
149 152
@@ -296,7 +299,7 @@ static int __init pci_iommu_init(void)
296#endif 299#endif
297 x86_init.iommu.iommu_init(); 300 x86_init.iommu.iommu_init();
298 301
299 if (swiotlb) { 302 if (swiotlb || xen_swiotlb) {
300 printk(KERN_INFO "PCI-DMA: " 303 printk(KERN_INFO "PCI-DMA: "
301 "Using software bounce buffering for IO (SWIOTLB)\n"); 304 "Using software bounce buffering for IO (SWIOTLB)\n");
302 swiotlb_print_info(); 305 swiotlb_print_info();
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index cbcf013a0ec6..57d1868a86aa 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -301,8 +301,9 @@ EXPORT_SYMBOL(kernel_thread);
301/* 301/*
302 * sys_execve() executes a new program. 302 * sys_execve() executes a new program.
303 */ 303 */
304long sys_execve(char __user *name, char __user * __user *argv, 304long sys_execve(const char __user *name,
305 char __user * __user *envp, struct pt_regs *regs) 305 const char __user *const __user *argv,
306 const char __user *const __user *envp, struct pt_regs *regs)
306{ 307{
307 long error; 308 long error;
308 char *filename; 309 char *filename;
@@ -526,44 +527,10 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
526 return (edx & MWAIT_EDX_C1); 527 return (edx & MWAIT_EDX_C1);
527} 528}
528 529
529/* 530bool c1e_detected;
530 * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e. 531EXPORT_SYMBOL(c1e_detected);
531 * For more information see
532 * - Erratum #400 for NPT family 0xf and family 0x10 CPUs
533 * - Erratum #365 for family 0x11 (not affected because C1e not in use)
534 */
535static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
536{
537 u64 val;
538 if (c->x86_vendor != X86_VENDOR_AMD)
539 goto no_c1e_idle;
540
541 /* Family 0x0f models < rev F do not have C1E */
542 if (c->x86 == 0x0F && c->x86_model >= 0x40)
543 return 1;
544
545 if (c->x86 == 0x10) {
546 /*
547 * check OSVW bit for CPUs that are not affected
548 * by erratum #400
549 */
550 if (cpu_has(c, X86_FEATURE_OSVW)) {
551 rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
552 if (val >= 2) {
553 rdmsrl(MSR_AMD64_OSVW_STATUS, val);
554 if (!(val & BIT(1)))
555 goto no_c1e_idle;
556 }
557 }
558 return 1;
559 }
560
561no_c1e_idle:
562 return 0;
563}
564 532
565static cpumask_var_t c1e_mask; 533static cpumask_var_t c1e_mask;
566static int c1e_detected;
567 534
568void c1e_remove_cpu(int cpu) 535void c1e_remove_cpu(int cpu)
569{ 536{
@@ -585,12 +552,12 @@ static void c1e_idle(void)
585 u32 lo, hi; 552 u32 lo, hi;
586 553
587 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); 554 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
555
588 if (lo & K8_INTP_C1E_ACTIVE_MASK) { 556 if (lo & K8_INTP_C1E_ACTIVE_MASK) {
589 c1e_detected = 1; 557 c1e_detected = true;
590 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) 558 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
591 mark_tsc_unstable("TSC halt in AMD C1E"); 559 mark_tsc_unstable("TSC halt in AMD C1E");
592 printk(KERN_INFO "System has AMD C1E enabled\n"); 560 printk(KERN_INFO "System has AMD C1E enabled\n");
593 set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
594 } 561 }
595 } 562 }
596 563
@@ -639,7 +606,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
639 */ 606 */
640 printk(KERN_INFO "using mwait in idle threads.\n"); 607 printk(KERN_INFO "using mwait in idle threads.\n");
641 pm_idle = mwait_idle; 608 pm_idle = mwait_idle;
642 } else if (check_c1e_idle(c)) { 609 } else if (cpu_has_amd_erratum(amd_erratum_400)) {
610 /* E400: APIC timer interrupt does not wake up CPU from C1e */
643 printk(KERN_INFO "using C1E aware idle routine\n"); 611 printk(KERN_INFO "using C1E aware idle routine\n");
644 pm_idle = c1e_idle; 612 pm_idle = c1e_idle;
645 } else 613 } else
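The check_c1e_idle() logic removed above is what the new cpu_has_amd_erratum(amd_erratum_400) test folds into cpu/amd.c. A minimal sketch of the OSVW half of that check, following the deleted code (the real helper additionally matches family/model ranges; the function name here is illustrative):

	/* sketch only: OSVW-based "is this CPU affected by erratum N?" test */
	static bool osvw_affected(const struct cpuinfo_x86 *c, unsigned int osvw_id)
	{
		u64 len, status;

		if (!cpu_has(c, X86_FEATURE_OSVW))
			return true;		/* no OSVW support: assume affected */

		rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, len);
		if (osvw_id >= len)
			return true;		/* id not described: assume affected */

		rdmsrl(MSR_AMD64_OSVW_STATUS, status);
		return status & BIT(osvw_id);	/* erratum 400 uses bit 1 */
	}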
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 8d128783af47..96586c3cbbbf 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -57,6 +57,8 @@
57#include <asm/syscalls.h> 57#include <asm/syscalls.h>
58#include <asm/debugreg.h> 58#include <asm/debugreg.h>
59 59
60#include <trace/events/power.h>
61
60asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 62asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
61 63
62/* 64/*
@@ -111,6 +113,8 @@ void cpu_idle(void)
111 stop_critical_timings(); 113 stop_critical_timings();
112 pm_idle(); 114 pm_idle();
113 start_critical_timings(); 115 start_critical_timings();
116
117 trace_power_end(smp_processor_id());
114 } 118 }
115 tick_nohz_restart_sched_tick(); 119 tick_nohz_restart_sched_tick();
116 preempt_enable_no_resched(); 120 preempt_enable_no_resched();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3c2422a99f1f..3d9ea531ddd1 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -51,6 +51,8 @@
51#include <asm/syscalls.h> 51#include <asm/syscalls.h>
52#include <asm/debugreg.h> 52#include <asm/debugreg.h>
53 53
54#include <trace/events/power.h>
55
54asmlinkage extern void ret_from_fork(void); 56asmlinkage extern void ret_from_fork(void);
55 57
56DEFINE_PER_CPU(unsigned long, old_rsp); 58DEFINE_PER_CPU(unsigned long, old_rsp);
@@ -138,6 +140,9 @@ void cpu_idle(void)
138 stop_critical_timings(); 140 stop_critical_timings();
139 pm_idle(); 141 pm_idle();
140 start_critical_timings(); 142 start_critical_timings();
143
144 trace_power_end(smp_processor_id());
145
141 /* In many cases the interrupt that ended idle 146 /* In many cases the interrupt that ended idle
142 has already called exit_idle. But some idle 147 has already called exit_idle. But some idle
143 loops can be woken up without interrupt. */ 148 loops can be woken up without interrupt. */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b4ae4acbd031..c3a4fbb2b996 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -102,6 +102,7 @@
102 102
103#include <asm/paravirt.h> 103#include <asm/paravirt.h>
104#include <asm/hypervisor.h> 104#include <asm/hypervisor.h>
105#include <asm/olpc_ofw.h>
105 106
106#include <asm/percpu.h> 107#include <asm/percpu.h>
107#include <asm/topology.h> 108#include <asm/topology.h>
@@ -736,10 +737,15 @@ void __init setup_arch(char **cmdline_p)
736 /* VMI may relocate the fixmap; do this before touching ioremap area */ 737 /* VMI may relocate the fixmap; do this before touching ioremap area */
737 vmi_init(); 738 vmi_init();
738 739
740 /* OFW also may relocate the fixmap */
741 olpc_ofw_detect();
742
739 early_trap_init(); 743 early_trap_init();
740 early_cpu_init(); 744 early_cpu_init();
741 early_ioremap_init(); 745 early_ioremap_init();
742 746
747 setup_olpc_ofw_pgd();
748
743 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); 749 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
744 screen_info = boot_params.screen_info; 750 screen_info = boot_params.screen_info;
745 edid_info = boot_params.edid_info; 751 edid_info = boot_params.edid_info;
@@ -1008,6 +1014,8 @@ void __init setup_arch(char **cmdline_p)
1008 paging_init(); 1014 paging_init();
1009 x86_init.paging.pagetable_setup_done(swapper_pg_dir); 1015 x86_init.paging.pagetable_setup_done(swapper_pg_dir);
1010 1016
1017 setup_trampoline_page_table();
1018
1011 tboot_probe(); 1019 tboot_probe();
1012 1020
1013#ifdef CONFIG_X86_64 1021#ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c4f33b2e77d6..8b3bfc4dd708 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -73,7 +73,6 @@
73 73
74#ifdef CONFIG_X86_32 74#ifdef CONFIG_X86_32
75u8 apicid_2_node[MAX_APICID]; 75u8 apicid_2_node[MAX_APICID];
76static int low_mappings;
77#endif 76#endif
78 77
79/* State of each CPU */ 78/* State of each CPU */
@@ -91,6 +90,25 @@ DEFINE_PER_CPU(int, cpu_state) = { 0 };
91static DEFINE_PER_CPU(struct task_struct *, idle_thread_array); 90static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
92#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x)) 91#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x))
93#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p)) 92#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p))
93
94/*
95 * We need this for trampoline_base protection from concurrent accesses when
96 * off- and onlining cores wildly.
97 */
98static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex);
99
100void cpu_hotplug_driver_lock()
101{
102 mutex_lock(&x86_cpu_hotplug_driver_mutex);
103}
104
105void cpu_hotplug_driver_unlock()
106{
107 mutex_unlock(&x86_cpu_hotplug_driver_mutex);
108}
109
110ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; }
111ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; }
94#else 112#else
95static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; 113static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
96#define get_idle_for_cpu(x) (idle_thread_array[(x)]) 114#define get_idle_for_cpu(x) (idle_thread_array[(x)])
@@ -281,6 +299,18 @@ notrace static void __cpuinit start_secondary(void *unused)
281 * fragile that we want to limit the things done here to the 299 * fragile that we want to limit the things done here to the
282 * most necessary things. 300 * most necessary things.
283 */ 301 */
302
303#ifdef CONFIG_X86_32
304 /*
305 * Switch away from the trampoline page-table
306 *
307 * Do this before cpu_init() because it needs to access per-cpu
308 * data which may not be mapped in the trampoline page-table.
309 */
310 load_cr3(swapper_pg_dir);
311 __flush_tlb_all();
312#endif
313
284 vmi_bringup(); 314 vmi_bringup();
285 cpu_init(); 315 cpu_init();
286 preempt_disable(); 316 preempt_disable();
@@ -299,12 +329,6 @@ notrace static void __cpuinit start_secondary(void *unused)
299 legacy_pic->chip->unmask(0); 329 legacy_pic->chip->unmask(0);
300 } 330 }
301 331
302#ifdef CONFIG_X86_32
303 while (low_mappings)
304 cpu_relax();
305 __flush_tlb_all();
306#endif
307
308 /* This must be done before setting cpu_online_mask */ 332 /* This must be done before setting cpu_online_mask */
309 set_cpu_sibling_map(raw_smp_processor_id()); 333 set_cpu_sibling_map(raw_smp_processor_id());
310 wmb(); 334 wmb();
@@ -735,12 +759,8 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
735 goto do_rest; 759 goto do_rest;
736 } 760 }
737 761
738 if (!keventd_up() || current_is_keventd()) 762 schedule_work(&c_idle.work);
739 c_idle.work.func(&c_idle.work); 763 wait_for_completion(&c_idle.done);
740 else {
741 schedule_work(&c_idle.work);
742 wait_for_completion(&c_idle.done);
743 }
744 764
745 if (IS_ERR(c_idle.idle)) { 765 if (IS_ERR(c_idle.idle)) {
746 printk("failed fork for CPU %d\n", cpu); 766 printk("failed fork for CPU %d\n", cpu);
@@ -754,6 +774,7 @@ do_rest:
754#ifdef CONFIG_X86_32 774#ifdef CONFIG_X86_32
755 /* Stack for startup_32 can be just as for start_secondary onwards */ 775 /* Stack for startup_32 can be just as for start_secondary onwards */
756 irq_ctx_init(cpu); 776 irq_ctx_init(cpu);
777 initial_page_table = __pa(&trampoline_pg_dir);
757#else 778#else
758 clear_tsk_thread_flag(c_idle.idle, TIF_FORK); 779 clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
759 initial_gs = per_cpu_offset(cpu); 780 initial_gs = per_cpu_offset(cpu);
@@ -816,6 +837,13 @@ do_rest:
816 if (cpumask_test_cpu(cpu, cpu_callin_mask)) 837 if (cpumask_test_cpu(cpu, cpu_callin_mask))
817 break; /* It has booted */ 838 break; /* It has booted */
818 udelay(100); 839 udelay(100);
840 /*
841 * Allow other tasks to run while we wait for the
842 * AP to come online. This also gives a chance
 843			 * for the MTRR work (triggered by the AP coming online)
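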
844 * to be completed in the stop machine context.
845 */
846 schedule();
819 } 847 }
820 848
821 if (cpumask_test_cpu(cpu, cpu_callin_mask)) 849 if (cpumask_test_cpu(cpu, cpu_callin_mask))
@@ -894,20 +922,8 @@ int __cpuinit native_cpu_up(unsigned int cpu)
894 922
895 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; 923 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
896 924
897#ifdef CONFIG_X86_32
898 /* init low mem mapping */
899 clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
900 min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
901 flush_tlb_all();
902 low_mappings = 1;
903
904 err = do_boot_cpu(apicid, cpu); 925 err = do_boot_cpu(apicid, cpu);
905 926
906 zap_low_mappings(false);
907 low_mappings = 0;
908#else
909 err = do_boot_cpu(apicid, cpu);
910#endif
911 if (err) { 927 if (err) {
912 pr_debug("do_boot_cpu failed %d\n", err); 928 pr_debug("do_boot_cpu failed %d\n", err);
913 return -EIO; 929 return -EIO;
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 922eefbb3f6c..b53c525368a7 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -23,11 +23,16 @@ static int save_stack_stack(void *data, char *name)
23 return 0; 23 return 0;
24} 24}
25 25
26static void save_stack_address(void *data, unsigned long addr, int reliable) 26static void
27__save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched)
27{ 28{
28 struct stack_trace *trace = data; 29 struct stack_trace *trace = data;
30#ifdef CONFIG_FRAME_POINTER
29 if (!reliable) 31 if (!reliable)
30 return; 32 return;
33#endif
34 if (nosched && in_sched_functions(addr))
35 return;
31 if (trace->skip > 0) { 36 if (trace->skip > 0) {
32 trace->skip--; 37 trace->skip--;
33 return; 38 return;
@@ -36,20 +41,15 @@ static void save_stack_address(void *data, unsigned long addr, int reliable)
36 trace->entries[trace->nr_entries++] = addr; 41 trace->entries[trace->nr_entries++] = addr;
37} 42}
38 43
44static void save_stack_address(void *data, unsigned long addr, int reliable)
45{
46 return __save_stack_address(data, addr, reliable, false);
47}
48
39static void 49static void
40save_stack_address_nosched(void *data, unsigned long addr, int reliable) 50save_stack_address_nosched(void *data, unsigned long addr, int reliable)
41{ 51{
42 struct stack_trace *trace = (struct stack_trace *)data; 52 return __save_stack_address(data, addr, reliable, true);
43 if (!reliable)
44 return;
45 if (in_sched_functions(addr))
46 return;
47 if (trace->skip > 0) {
48 trace->skip--;
49 return;
50 }
51 if (trace->nr_entries < trace->max_entries)
52 trace->entries[trace->nr_entries++] = addr;
53} 53}
54 54
55static const struct stacktrace_ops save_stack_ops = { 55static const struct stacktrace_ops save_stack_ops = {
@@ -96,12 +96,13 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
96 96
97/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ 97/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */
98 98
99struct stack_frame { 99struct stack_frame_user {
100 const void __user *next_fp; 100 const void __user *next_fp;
101 unsigned long ret_addr; 101 unsigned long ret_addr;
102}; 102};
103 103
104static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) 104static int
105copy_stack_frame(const void __user *fp, struct stack_frame_user *frame)
105{ 106{
106 int ret; 107 int ret;
107 108
@@ -126,7 +127,7 @@ static inline void __save_stack_trace_user(struct stack_trace *trace)
126 trace->entries[trace->nr_entries++] = regs->ip; 127 trace->entries[trace->nr_entries++] = regs->ip;
127 128
128 while (trace->nr_entries < trace->max_entries) { 129 while (trace->nr_entries < trace->max_entries) {
129 struct stack_frame frame; 130 struct stack_frame_user frame;
130 131
131 frame.next_fp = NULL; 132 frame.next_fp = NULL;
132 frame.ret_addr = 0; 133 frame.ret_addr = 0;
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index 196552bb412c..d5e06624e34a 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -28,7 +28,9 @@
28 * Do a system call from kernel instead of calling sys_execve so we 28 * Do a system call from kernel instead of calling sys_execve so we
29 * end up with proper pt_regs. 29 * end up with proper pt_regs.
30 */ 30 */
31int kernel_execve(const char *filename, char *const argv[], char *const envp[]) 31int kernel_execve(const char *filename,
32 const char *const argv[],
33 const char *const envp[])
32{ 34{
33 long __res; 35 long __res;
34 asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx" 36 asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx"
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 8b3729341216..b35786dc9b8f 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -337,3 +337,6 @@ ENTRY(sys_call_table)
337 .long sys_rt_tgsigqueueinfo /* 335 */ 337 .long sys_rt_tgsigqueueinfo /* 335 */
338 .long sys_perf_event_open 338 .long sys_perf_event_open
339 .long sys_recvmmsg 339 .long sys_recvmmsg
340 .long sys_fanotify_init
341 .long sys_fanotify_mark
342 .long sys_prlimit64 /* 340 */
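The three new entries land at syscall numbers 338 (fanotify_init), 339 (fanotify_mark) and 340 (prlimit64) on 32-bit x86. A hedged userspace sketch calling prlimit64 through the raw syscall number, for use before a libc wrapper exists (the local struct definition and the constant 7 for RLIMIT_NOFILE are spelled out only for illustration):

	#include <stdio.h>
	#include <stdint.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	struct rlimit64 { uint64_t rlim_cur, rlim_max; };

	int main(void)
	{
		struct rlimit64 old;

		/* prlimit64(pid = 0 (self), resource = 7 (RLIMIT_NOFILE),
		 *           new_rlim = NULL, old_rlim = &old) */
		if (syscall(340, 0, 7, NULL, &old) == 0)
			printf("nofile: cur=%llu max=%llu\n",
			       (unsigned long long)old.rlim_cur,
			       (unsigned long long)old.rlim_max);
		return 0;
	}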
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 7fea555929e2..312ef0292815 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -8,6 +8,7 @@
8 */ 8 */
9#include <linux/seq_file.h> 9#include <linux/seq_file.h>
10#include <linux/proc_fs.h> 10#include <linux/proc_fs.h>
11#include <linux/debugfs.h>
11#include <linux/kernel.h> 12#include <linux/kernel.h>
12#include <linux/slab.h> 13#include <linux/slab.h>
13 14
@@ -22,19 +23,37 @@
22#include <asm/irq_vectors.h> 23#include <asm/irq_vectors.h>
23#include <asm/timer.h> 24#include <asm/timer.h>
24 25
25struct msg_desc { 26/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */
26 struct bau_payload_queue_entry *msg; 27static int timeout_base_ns[] = {
27 int msg_slot; 28 20,
28 int sw_ack_slot; 29 160,
29 struct bau_payload_queue_entry *va_queue_first; 30 1280,
30 struct bau_payload_queue_entry *va_queue_last; 31 10240,
32 81920,
33 655360,
34 5242880,
35 167772160
31}; 36};
32 37static int timeout_us;
33#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
34
35static int uv_bau_max_concurrent __read_mostly;
36
37static int nobau; 38static int nobau;
39static int baudisabled;
40static spinlock_t disable_lock;
41static cycles_t congested_cycles;
42
43/* tunables: */
44static int max_bau_concurrent = MAX_BAU_CONCURRENT;
45static int max_bau_concurrent_constant = MAX_BAU_CONCURRENT;
46static int plugged_delay = PLUGGED_DELAY;
47static int plugsb4reset = PLUGSB4RESET;
48static int timeoutsb4reset = TIMEOUTSB4RESET;
49static int ipi_reset_limit = IPI_RESET_LIMIT;
50static int complete_threshold = COMPLETE_THRESHOLD;
51static int congested_response_us = CONGESTED_RESPONSE_US;
52static int congested_reps = CONGESTED_REPS;
53static int congested_period = CONGESTED_PERIOD;
54static struct dentry *tunables_dir;
55static struct dentry *tunables_file;
56
38static int __init setup_nobau(char *arg) 57static int __init setup_nobau(char *arg)
39{ 58{
40 nobau = 1; 59 nobau = 1;
@@ -52,10 +71,6 @@ static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
52static DEFINE_PER_CPU(struct bau_control, bau_control); 71static DEFINE_PER_CPU(struct bau_control, bau_control);
53static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); 72static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
54 73
55struct reset_args {
56 int sender;
57};
58
59/* 74/*
60 * Determine the first node on a uvhub. 'Nodes' are used for kernel 75 * Determine the first node on a uvhub. 'Nodes' are used for kernel
61 * memory allocation. 76 * memory allocation.
@@ -126,7 +141,7 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
126 struct ptc_stats *stat; 141 struct ptc_stats *stat;
127 142
128 msg = mdp->msg; 143 msg = mdp->msg;
129 stat = &per_cpu(ptcstats, bcp->cpu); 144 stat = bcp->statp;
130 stat->d_retries++; 145 stat->d_retries++;
131 /* 146 /*
132 * cancel any message from msg+1 to the retry itself 147 * cancel any message from msg+1 to the retry itself
@@ -146,15 +161,14 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
146 slot2 = msg2 - mdp->va_queue_first; 161 slot2 = msg2 - mdp->va_queue_first;
147 mmr = uv_read_local_mmr 162 mmr = uv_read_local_mmr
148 (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); 163 (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
149 msg_res = ((msg2->sw_ack_vector << 8) | 164 msg_res = msg2->sw_ack_vector;
150 msg2->sw_ack_vector);
151 /* 165 /*
152 * This is a message retry; clear the resources held 166 * This is a message retry; clear the resources held
153 * by the previous message only if they timed out. 167 * by the previous message only if they timed out.
154 * If it has not timed out we have an unexpected 168 * If it has not timed out we have an unexpected
155 * situation to report. 169 * situation to report.
156 */ 170 */
157 if (mmr & (msg_res << 8)) { 171 if (mmr & (msg_res << UV_SW_ACK_NPENDING)) {
158 /* 172 /*
159 * is the resource timed out? 173 * is the resource timed out?
160 * make everyone ignore the cancelled message. 174 * make everyone ignore the cancelled message.
@@ -164,9 +178,9 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
164 cancel_count++; 178 cancel_count++;
165 uv_write_local_mmr( 179 uv_write_local_mmr(
166 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, 180 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
167 (msg_res << 8) | msg_res); 181 (msg_res << UV_SW_ACK_NPENDING) |
168 } else 182 msg_res);
169 printk(KERN_INFO "note bau retry: no effect\n"); 183 }
170 } 184 }
171 } 185 }
172 if (!cancel_count) 186 if (!cancel_count)
@@ -190,7 +204,7 @@ static void uv_bau_process_message(struct msg_desc *mdp,
190 * This must be a normal message, or retry of a normal message 204 * This must be a normal message, or retry of a normal message
191 */ 205 */
192 msg = mdp->msg; 206 msg = mdp->msg;
193 stat = &per_cpu(ptcstats, bcp->cpu); 207 stat = bcp->statp;
194 if (msg->address == TLB_FLUSH_ALL) { 208 if (msg->address == TLB_FLUSH_ALL) {
195 local_flush_tlb(); 209 local_flush_tlb();
196 stat->d_alltlb++; 210 stat->d_alltlb++;
@@ -274,7 +288,7 @@ uv_do_reset(void *ptr)
274 288
275 bcp = &per_cpu(bau_control, smp_processor_id()); 289 bcp = &per_cpu(bau_control, smp_processor_id());
276 rap = (struct reset_args *)ptr; 290 rap = (struct reset_args *)ptr;
277 stat = &per_cpu(ptcstats, bcp->cpu); 291 stat = bcp->statp;
278 stat->d_resets++; 292 stat->d_resets++;
279 293
280 /* 294 /*
@@ -302,13 +316,13 @@ uv_do_reset(void *ptr)
302 */ 316 */
303 mmr = uv_read_local_mmr 317 mmr = uv_read_local_mmr
304 (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); 318 (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
305 msg_res = ((msg->sw_ack_vector << 8) | 319 msg_res = msg->sw_ack_vector;
306 msg->sw_ack_vector);
307 if (mmr & msg_res) { 320 if (mmr & msg_res) {
308 stat->d_rcanceled++; 321 stat->d_rcanceled++;
309 uv_write_local_mmr( 322 uv_write_local_mmr(
310 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, 323 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
311 msg_res); 324 (msg_res << UV_SW_ACK_NPENDING) |
325 msg_res);
312 } 326 }
313 } 327 }
314 } 328 }
@@ -386,17 +400,12 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
386 unsigned long mmr_offset, int right_shift, int this_cpu, 400 unsigned long mmr_offset, int right_shift, int this_cpu,
387 struct bau_control *bcp, struct bau_control *smaster, long try) 401 struct bau_control *bcp, struct bau_control *smaster, long try)
388{ 402{
389 int relaxes = 0;
390 unsigned long descriptor_status; 403 unsigned long descriptor_status;
391 unsigned long mmr;
392 unsigned long mask;
393 cycles_t ttime; 404 cycles_t ttime;
394 cycles_t timeout_time; 405 struct ptc_stats *stat = bcp->statp;
395 struct ptc_stats *stat = &per_cpu(ptcstats, this_cpu);
396 struct bau_control *hmaster; 406 struct bau_control *hmaster;
397 407
398 hmaster = bcp->uvhub_master; 408 hmaster = bcp->uvhub_master;
399 timeout_time = get_cycles() + bcp->timeout_interval;
400 409
401 /* spin on the status MMR, waiting for it to go idle */ 410 /* spin on the status MMR, waiting for it to go idle */
402 while ((descriptor_status = (((unsigned long) 411 while ((descriptor_status = (((unsigned long)
@@ -423,7 +432,8 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
423 * pending. In that case hardware returns the 432 * pending. In that case hardware returns the
424 * ERROR that looks like a destination timeout. 433 * ERROR that looks like a destination timeout.
425 */ 434 */
426 if (cycles_2_us(ttime - bcp->send_message) < BIOS_TO) { 435 if (cycles_2_us(ttime - bcp->send_message) <
436 timeout_us) {
427 bcp->conseccompletes = 0; 437 bcp->conseccompletes = 0;
428 return FLUSH_RETRY_PLUGGED; 438 return FLUSH_RETRY_PLUGGED;
429 } 439 }
@@ -435,26 +445,6 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
435 * descriptor_status is still BUSY 445 * descriptor_status is still BUSY
436 */ 446 */
437 cpu_relax(); 447 cpu_relax();
438 relaxes++;
439 if (relaxes >= 10000) {
440 relaxes = 0;
441 if (get_cycles() > timeout_time) {
442 quiesce_local_uvhub(hmaster);
443
444 /* single-thread the register change */
445 spin_lock(&hmaster->masks_lock);
446 mmr = uv_read_local_mmr(mmr_offset);
447 mask = 0UL;
448 mask |= (3UL < right_shift);
449 mask = ~mask;
450 mmr &= mask;
451 uv_write_local_mmr(mmr_offset, mmr);
452 spin_unlock(&hmaster->masks_lock);
453 end_uvhub_quiesce(hmaster);
454 stat->s_busy++;
455 return FLUSH_GIVEUP;
456 }
457 }
458 } 448 }
459 } 449 }
460 bcp->conseccompletes++; 450 bcp->conseccompletes++;
@@ -494,56 +484,116 @@ static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
494 return 1; 484 return 1;
495} 485}
496 486
487/*
488 * Our retries are blocked by all destination swack resources being
489 * in use, and a timeout is pending. In that case hardware immediately
490 * returns the ERROR that looks like a destination timeout.
491 */
492static void
493destination_plugged(struct bau_desc *bau_desc, struct bau_control *bcp,
494 struct bau_control *hmaster, struct ptc_stats *stat)
495{
496 udelay(bcp->plugged_delay);
497 bcp->plugged_tries++;
498 if (bcp->plugged_tries >= bcp->plugsb4reset) {
499 bcp->plugged_tries = 0;
500 quiesce_local_uvhub(hmaster);
501 spin_lock(&hmaster->queue_lock);
502 uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu);
503 spin_unlock(&hmaster->queue_lock);
504 end_uvhub_quiesce(hmaster);
505 bcp->ipi_attempts++;
506 stat->s_resets_plug++;
507 }
508}
509
510static void
511destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp,
512 struct bau_control *hmaster, struct ptc_stats *stat)
513{
514 hmaster->max_bau_concurrent = 1;
515 bcp->timeout_tries++;
516 if (bcp->timeout_tries >= bcp->timeoutsb4reset) {
517 bcp->timeout_tries = 0;
518 quiesce_local_uvhub(hmaster);
519 spin_lock(&hmaster->queue_lock);
520 uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu);
521 spin_unlock(&hmaster->queue_lock);
522 end_uvhub_quiesce(hmaster);
523 bcp->ipi_attempts++;
524 stat->s_resets_timeout++;
525 }
526}
527
528/*
529 * Completions are taking a very long time due to a congested numalink
530 * network.
531 */
532static void
533disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat)
534{
535 int tcpu;
536 struct bau_control *tbcp;
537
538 /* let only one cpu do this disabling */
539 spin_lock(&disable_lock);
540 if (!baudisabled && bcp->period_requests &&
541 ((bcp->period_time / bcp->period_requests) > congested_cycles)) {
542 /* it becomes this cpu's job to turn on the use of the
543 BAU again */
544 baudisabled = 1;
545 bcp->set_bau_off = 1;
546 bcp->set_bau_on_time = get_cycles() +
547 sec_2_cycles(bcp->congested_period);
548 stat->s_bau_disabled++;
549 for_each_present_cpu(tcpu) {
550 tbcp = &per_cpu(bau_control, tcpu);
551 tbcp->baudisabled = 1;
552 }
553 }
554 spin_unlock(&disable_lock);
555}
556
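The condition guarded by disable_lock above can be read as a simple average-latency predicate; a minimal sketch (congested_cycles is congested_response_us converted to cycles, and the helper name here is illustrative):

	/* sketch: sends are considered congested when the mean send time over
	 * the current measurement period exceeds the response-time threshold */
	static bool bau_period_congested(struct bau_control *bcp,
					 cycles_t congested_cycles)
	{
		if (!bcp->period_requests)
			return false;
		return (bcp->period_time / bcp->period_requests) > congested_cycles;
	}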
497/** 557/**
498 * uv_flush_send_and_wait 558 * uv_flush_send_and_wait
499 * 559 *
500 * Send a broadcast and wait for it to complete. 560 * Send a broadcast and wait for it to complete.
501 * 561 *
502 * The flush_mask contains the cpus the broadcast is to be sent to, plus 562 * The flush_mask contains the cpus the broadcast is to be sent to including
503 * cpus that are on the local uvhub. 563 * cpus that are on the local uvhub.
504 * 564 *
505 * Returns NULL if all flushing represented in the mask was done. The mask 565 * Returns 0 if all flushing represented in the mask was done.
506 * is zeroed. 566 * Returns 1 if it gives up entirely and the original cpu mask is to be
507 * Returns @flush_mask if some remote flushing remains to be done. The 567 * returned to the kernel.
508 * mask will have some bits still set, representing any cpus on the local
509 * uvhub (not current cpu) and any on remote uvhubs if the broadcast failed.
510 */ 568 */
511const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, 569int uv_flush_send_and_wait(struct bau_desc *bau_desc,
512 struct cpumask *flush_mask, 570 struct cpumask *flush_mask, struct bau_control *bcp)
513 struct bau_control *bcp)
514{ 571{
515 int right_shift; 572 int right_shift;
516 int uvhub;
517 int bit;
518 int completion_status = 0; 573 int completion_status = 0;
519 int seq_number = 0; 574 int seq_number = 0;
520 long try = 0; 575 long try = 0;
521 int cpu = bcp->uvhub_cpu; 576 int cpu = bcp->uvhub_cpu;
522 int this_cpu = bcp->cpu; 577 int this_cpu = bcp->cpu;
523 int this_uvhub = bcp->uvhub;
524 unsigned long mmr_offset; 578 unsigned long mmr_offset;
525 unsigned long index; 579 unsigned long index;
526 cycles_t time1; 580 cycles_t time1;
527 cycles_t time2; 581 cycles_t time2;
528 struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu); 582 cycles_t elapsed;
583 struct ptc_stats *stat = bcp->statp;
529 struct bau_control *smaster = bcp->socket_master; 584 struct bau_control *smaster = bcp->socket_master;
530 struct bau_control *hmaster = bcp->uvhub_master; 585 struct bau_control *hmaster = bcp->uvhub_master;
531 586
532 /*
533 * Spin here while there are hmaster->max_concurrent or more active
534 * descriptors. This is the per-uvhub 'throttle'.
535 */
536 if (!atomic_inc_unless_ge(&hmaster->uvhub_lock, 587 if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
537 &hmaster->active_descriptor_count, 588 &hmaster->active_descriptor_count,
538 hmaster->max_concurrent)) { 589 hmaster->max_bau_concurrent)) {
539 stat->s_throttles++; 590 stat->s_throttles++;
540 do { 591 do {
541 cpu_relax(); 592 cpu_relax();
542 } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock, 593 } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
543 &hmaster->active_descriptor_count, 594 &hmaster->active_descriptor_count,
544 hmaster->max_concurrent)); 595 hmaster->max_bau_concurrent));
545 } 596 }
546
547 while (hmaster->uvhub_quiesce) 597 while (hmaster->uvhub_quiesce)
548 cpu_relax(); 598 cpu_relax();
549 599
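The per-uvhub throttle above relies on atomic_inc_unless_ge(); its intended semantics, as a hedged sketch following the signature shown earlier in this file (the real helper takes the hub spinlock so the test and increment form one atomic step):

	/* sketch: increment *v only while it is still below the limit u */
	static int inc_unless_ge_sketch(spinlock_t *lock, atomic_t *v, int u)
	{
		int ok = 0;

		spin_lock(lock);
		if (atomic_read(v) < u) {
			atomic_inc(v);
			ok = 1;
		}
		spin_unlock(lock);
		return ok;
	}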
@@ -557,23 +607,10 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
557 } 607 }
558 time1 = get_cycles(); 608 time1 = get_cycles();
559 do { 609 do {
560 /*
561 * Every message from any given cpu gets a unique message
562 * sequence number. But retries use that same number.
563 * Our message may have timed out at the destination because
564 * all sw-ack resources are in use and there is a timeout
565 * pending there. In that case, our last send never got
566 * placed into the queue and we need to persist until it
567 * does.
568 *
569 * Make any retry a type MSG_RETRY so that the destination will
570 * free any resource held by a previous message from this cpu.
571 */
572 if (try == 0) { 610 if (try == 0) {
573 /* use message type set by the caller the first time */ 611 bau_desc->header.msg_type = MSG_REGULAR;
574 seq_number = bcp->message_number++; 612 seq_number = bcp->message_number++;
575 } else { 613 } else {
576 /* use RETRY type on all the rest; same sequence */
577 bau_desc->header.msg_type = MSG_RETRY; 614 bau_desc->header.msg_type = MSG_RETRY;
578 stat->s_retry_messages++; 615 stat->s_retry_messages++;
579 } 616 }
@@ -581,50 +618,17 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
581 index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | 618 index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
582 bcp->uvhub_cpu; 619 bcp->uvhub_cpu;
583 bcp->send_message = get_cycles(); 620 bcp->send_message = get_cycles();
584
585 uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); 621 uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
586
587 try++; 622 try++;
588 completion_status = uv_wait_completion(bau_desc, mmr_offset, 623 completion_status = uv_wait_completion(bau_desc, mmr_offset,
589 right_shift, this_cpu, bcp, smaster, try); 624 right_shift, this_cpu, bcp, smaster, try);
590 625
591 if (completion_status == FLUSH_RETRY_PLUGGED) { 626 if (completion_status == FLUSH_RETRY_PLUGGED) {
592 /* 627 destination_plugged(bau_desc, bcp, hmaster, stat);
593 * Our retries may be blocked by all destination swack
594 * resources being consumed, and a timeout pending. In
595 * that case hardware immediately returns the ERROR
596 * that looks like a destination timeout.
597 */
598 udelay(TIMEOUT_DELAY);
599 bcp->plugged_tries++;
600 if (bcp->plugged_tries >= PLUGSB4RESET) {
601 bcp->plugged_tries = 0;
602 quiesce_local_uvhub(hmaster);
603 spin_lock(&hmaster->queue_lock);
604 uv_reset_with_ipi(&bau_desc->distribution,
605 this_cpu);
606 spin_unlock(&hmaster->queue_lock);
607 end_uvhub_quiesce(hmaster);
608 bcp->ipi_attempts++;
609 stat->s_resets_plug++;
610 }
611 } else if (completion_status == FLUSH_RETRY_TIMEOUT) { 628 } else if (completion_status == FLUSH_RETRY_TIMEOUT) {
612 hmaster->max_concurrent = 1; 629 destination_timeout(bau_desc, bcp, hmaster, stat);
613 bcp->timeout_tries++;
614 udelay(TIMEOUT_DELAY);
615 if (bcp->timeout_tries >= TIMEOUTSB4RESET) {
616 bcp->timeout_tries = 0;
617 quiesce_local_uvhub(hmaster);
618 spin_lock(&hmaster->queue_lock);
619 uv_reset_with_ipi(&bau_desc->distribution,
620 this_cpu);
621 spin_unlock(&hmaster->queue_lock);
622 end_uvhub_quiesce(hmaster);
623 bcp->ipi_attempts++;
624 stat->s_resets_timeout++;
625 }
626 } 630 }
627 if (bcp->ipi_attempts >= 3) { 631 if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
628 bcp->ipi_attempts = 0; 632 bcp->ipi_attempts = 0;
629 completion_status = FLUSH_GIVEUP; 633 completion_status = FLUSH_GIVEUP;
630 break; 634 break;
@@ -633,49 +637,36 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
633 } while ((completion_status == FLUSH_RETRY_PLUGGED) || 637 } while ((completion_status == FLUSH_RETRY_PLUGGED) ||
634 (completion_status == FLUSH_RETRY_TIMEOUT)); 638 (completion_status == FLUSH_RETRY_TIMEOUT));
635 time2 = get_cycles(); 639 time2 = get_cycles();
636 640 bcp->plugged_tries = 0;
637 if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > 5) 641 bcp->timeout_tries = 0;
638 && (hmaster->max_concurrent < hmaster->max_concurrent_constant)) 642 if ((completion_status == FLUSH_COMPLETE) &&
639 hmaster->max_concurrent++; 643 (bcp->conseccompletes > bcp->complete_threshold) &&
640 644 (hmaster->max_bau_concurrent <
641 /* 645 hmaster->max_bau_concurrent_constant))
642 * hold any cpu not timing out here; no other cpu currently held by 646 hmaster->max_bau_concurrent++;
643 * the 'throttle' should enter the activation code
644 */
645 while (hmaster->uvhub_quiesce) 647 while (hmaster->uvhub_quiesce)
646 cpu_relax(); 648 cpu_relax();
647 atomic_dec(&hmaster->active_descriptor_count); 649 atomic_dec(&hmaster->active_descriptor_count);
648 650 if (time2 > time1) {
649 /* guard against cycles wrap */ 651 elapsed = time2 - time1;
650 if (time2 > time1) 652 stat->s_time += elapsed;
651 stat->s_time += (time2 - time1); 653 if ((completion_status == FLUSH_COMPLETE) && (try == 1)) {
652 else 654 bcp->period_requests++;
653 stat->s_requestor--; /* don't count this one */ 655 bcp->period_time += elapsed;
656 if ((elapsed > congested_cycles) &&
657 (bcp->period_requests > bcp->congested_reps)) {
658 disable_for_congestion(bcp, stat);
659 }
660 }
661 } else
662 stat->s_requestor--;
654 if (completion_status == FLUSH_COMPLETE && try > 1) 663 if (completion_status == FLUSH_COMPLETE && try > 1)
655 stat->s_retriesok++; 664 stat->s_retriesok++;
656 else if (completion_status == FLUSH_GIVEUP) { 665 else if (completion_status == FLUSH_GIVEUP) {
657 /*
658 * Cause the caller to do an IPI-style TLB shootdown on
659 * the target cpu's, all of which are still in the mask.
660 */
661 stat->s_giveup++; 666 stat->s_giveup++;
662 return flush_mask; 667 return 1;
663 }
664
665 /*
666 * Success, so clear the remote cpu's from the mask so we don't
667 * use the IPI method of shootdown on them.
668 */
669 for_each_cpu(bit, flush_mask) {
670 uvhub = uv_cpu_to_blade_id(bit);
671 if (uvhub == this_uvhub)
672 continue;
673 cpumask_clear_cpu(bit, flush_mask);
674 } 668 }
675 if (!cpumask_empty(flush_mask)) 669 return 0;
676 return flush_mask;
677
678 return NULL;
679} 670}
680 671
681/** 672/**
@@ -707,70 +698,89 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
707 struct mm_struct *mm, 698 struct mm_struct *mm,
708 unsigned long va, unsigned int cpu) 699 unsigned long va, unsigned int cpu)
709{ 700{
710 int remotes;
711 int tcpu; 701 int tcpu;
712 int uvhub; 702 int uvhub;
713 int locals = 0; 703 int locals = 0;
704 int remotes = 0;
705 int hubs = 0;
714 struct bau_desc *bau_desc; 706 struct bau_desc *bau_desc;
715 struct cpumask *flush_mask; 707 struct cpumask *flush_mask;
716 struct ptc_stats *stat; 708 struct ptc_stats *stat;
717 struct bau_control *bcp; 709 struct bau_control *bcp;
710 struct bau_control *tbcp;
718 711
712 /* kernel was booted 'nobau' */
719 if (nobau) 713 if (nobau)
720 return cpumask; 714 return cpumask;
721 715
722 bcp = &per_cpu(bau_control, cpu); 716 bcp = &per_cpu(bau_control, cpu);
717 stat = bcp->statp;
718
719 /* bau was disabled due to slow response */
720 if (bcp->baudisabled) {
721 /* the cpu that disabled it must re-enable it */
722 if (bcp->set_bau_off) {
723 if (get_cycles() >= bcp->set_bau_on_time) {
724 stat->s_bau_reenabled++;
725 baudisabled = 0;
726 for_each_present_cpu(tcpu) {
727 tbcp = &per_cpu(bau_control, tcpu);
728 tbcp->baudisabled = 0;
729 tbcp->period_requests = 0;
730 tbcp->period_time = 0;
731 }
732 }
733 }
734 return cpumask;
735 }
736
723 /* 737 /*
724 * Each sending cpu has a per-cpu mask which it fills from the caller's 738 * Each sending cpu has a per-cpu mask which it fills from the caller's
725 * cpu mask. Only remote cpus are converted to uvhubs and copied. 739 * cpu mask. All cpus are converted to uvhubs and copied to the
740 * activation descriptor.
726 */ 741 */
727 flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu); 742 flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
728 /* 743 /* don't actually do a shootdown of the local cpu */
729 * copy cpumask to flush_mask, removing current cpu
730 * (current cpu should already have been flushed by the caller and
731 * should never be returned if we return flush_mask)
732 */
733 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); 744 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
734 if (cpu_isset(cpu, *cpumask)) 745 if (cpu_isset(cpu, *cpumask))
735 locals++; /* current cpu was targeted */ 746 stat->s_ntargself++;
736 747
737 bau_desc = bcp->descriptor_base; 748 bau_desc = bcp->descriptor_base;
738 bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; 749 bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
739
740 bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); 750 bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
741 remotes = 0; 751
752 /* cpu statistics */
742 for_each_cpu(tcpu, flush_mask) { 753 for_each_cpu(tcpu, flush_mask) {
743 uvhub = uv_cpu_to_blade_id(tcpu); 754 uvhub = uv_cpu_to_blade_id(tcpu);
744 if (uvhub == bcp->uvhub) {
745 locals++;
746 continue;
747 }
748 bau_uvhub_set(uvhub, &bau_desc->distribution); 755 bau_uvhub_set(uvhub, &bau_desc->distribution);
749 remotes++; 756 if (uvhub == bcp->uvhub)
750 } 757 locals++;
751 if (remotes == 0) {
752 /*
753 * No off_hub flushing; return status for local hub.
754 * Return the caller's mask if all were local (the current
755 * cpu may be in that mask).
756 */
757 if (locals)
758 return cpumask;
759 else 758 else
760 return NULL; 759 remotes++;
761 } 760 }
762 stat = &per_cpu(ptcstats, cpu); 761 if ((locals + remotes) == 0)
762 return NULL;
763 stat->s_requestor++; 763 stat->s_requestor++;
764 stat->s_ntargcpu += remotes; 764 stat->s_ntargcpu += remotes + locals;
765 stat->s_ntargremotes += remotes;
766 stat->s_ntarglocals += locals;
765 remotes = bau_uvhub_weight(&bau_desc->distribution); 767 remotes = bau_uvhub_weight(&bau_desc->distribution);
766 stat->s_ntarguvhub += remotes; 768
767 if (remotes >= 16) 769 /* uvhub statistics */
770 hubs = bau_uvhub_weight(&bau_desc->distribution);
771 if (locals) {
772 stat->s_ntarglocaluvhub++;
773 stat->s_ntargremoteuvhub += (hubs - 1);
774 } else
775 stat->s_ntargremoteuvhub += hubs;
776 stat->s_ntarguvhub += hubs;
777 if (hubs >= 16)
768 stat->s_ntarguvhub16++; 778 stat->s_ntarguvhub16++;
769 else if (remotes >= 8) 779 else if (hubs >= 8)
770 stat->s_ntarguvhub8++; 780 stat->s_ntarguvhub8++;
771 else if (remotes >= 4) 781 else if (hubs >= 4)
772 stat->s_ntarguvhub4++; 782 stat->s_ntarguvhub4++;
773 else if (remotes >= 2) 783 else if (hubs >= 2)
774 stat->s_ntarguvhub2++; 784 stat->s_ntarguvhub2++;
775 else 785 else
776 stat->s_ntarguvhub1++; 786 stat->s_ntarguvhub1++;
@@ -779,10 +789,13 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
779 bau_desc->payload.sending_cpu = cpu; 789 bau_desc->payload.sending_cpu = cpu;
780 790
781 /* 791 /*
782 * uv_flush_send_and_wait returns null if all cpu's were messaged, or 792 * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
783 * the adjusted flush_mask if any cpu's were not messaged. 793 * or 1 if it gave up and the original cpumask should be returned.
784 */ 794 */
785 return uv_flush_send_and_wait(bau_desc, flush_mask, bcp); 795 if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp))
796 return NULL;
797 else
798 return cpumask;
786} 799}
787 800
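For context on the NULL-vs-cpumask convention, a hedged caller-side sketch; the wrapper name is hypothetical and flush_tlb_others_ipi() stands in for the generic IPI fallback in the flush path, which is not part of this patch:

	static void flush_with_bau_or_ipi(const struct cpumask *cpumask,
					  struct mm_struct *mm,
					  unsigned long va, unsigned int cpu)
	{
		const struct cpumask *rest;

		rest = uv_flush_tlb_others(cpumask, mm, va, cpu);
		if (rest)	/* BAU disabled or gave up: fall back to IPIs */
			flush_tlb_others_ipi(rest, mm, va);
	}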
788/* 801/*
@@ -810,7 +823,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
810 823
811 time_start = get_cycles(); 824 time_start = get_cycles();
812 bcp = &per_cpu(bau_control, smp_processor_id()); 825 bcp = &per_cpu(bau_control, smp_processor_id());
813 stat = &per_cpu(ptcstats, smp_processor_id()); 826 stat = bcp->statp;
814 msgdesc.va_queue_first = bcp->va_queue_first; 827 msgdesc.va_queue_first = bcp->va_queue_first;
815 msgdesc.va_queue_last = bcp->va_queue_last; 828 msgdesc.va_queue_last = bcp->va_queue_last;
816 msg = bcp->bau_msg_head; 829 msg = bcp->bau_msg_head;
@@ -908,12 +921,12 @@ static void uv_ptc_seq_stop(struct seq_file *file, void *data)
908} 921}
909 922
910static inline unsigned long long 923static inline unsigned long long
911millisec_2_cycles(unsigned long millisec) 924microsec_2_cycles(unsigned long microsec)
912{ 925{
913 unsigned long ns; 926 unsigned long ns;
914 unsigned long long cyc; 927 unsigned long long cyc;
915 928
916 ns = millisec * 1000; 929 ns = microsec * 1000;
917 cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); 930 cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
918 return cyc; 931 return cyc;
919} 932}
@@ -931,15 +944,19 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
931 944
932 if (!cpu) { 945 if (!cpu) {
933 seq_printf(file, 946 seq_printf(file,
934 "# cpu sent stime numuvhubs numuvhubs16 numuvhubs8 "); 947 "# cpu sent stime self locals remotes ncpus localhub ");
948 seq_printf(file,
949 "remotehub numuvhubs numuvhubs16 numuvhubs8 ");
935 seq_printf(file, 950 seq_printf(file,
936 "numuvhubs4 numuvhubs2 numuvhubs1 numcpus dto "); 951 "numuvhubs4 numuvhubs2 numuvhubs1 dto ");
937 seq_printf(file, 952 seq_printf(file,
938 "retries rok resetp resett giveup sto bz throt "); 953 "retries rok resetp resett giveup sto bz throt ");
939 seq_printf(file, 954 seq_printf(file,
940 "sw_ack recv rtime all "); 955 "sw_ack recv rtime all ");
941 seq_printf(file, 956 seq_printf(file,
942 "one mult none retry canc nocan reset rcan\n"); 957 "one mult none retry canc nocan reset rcan ");
958 seq_printf(file,
959 "disable enable\n");
943 } 960 }
944 if (cpu < num_possible_cpus() && cpu_online(cpu)) { 961 if (cpu < num_possible_cpus() && cpu_online(cpu)) {
945 stat = &per_cpu(ptcstats, cpu); 962 stat = &per_cpu(ptcstats, cpu);
@@ -947,18 +964,23 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
947 seq_printf(file, 964 seq_printf(file,
948 "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", 965 "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
949 cpu, stat->s_requestor, cycles_2_us(stat->s_time), 966 cpu, stat->s_requestor, cycles_2_us(stat->s_time),
950 stat->s_ntarguvhub, stat->s_ntarguvhub16, 967 stat->s_ntargself, stat->s_ntarglocals,
968 stat->s_ntargremotes, stat->s_ntargcpu,
969 stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub,
970 stat->s_ntarguvhub, stat->s_ntarguvhub16);
971 seq_printf(file, "%ld %ld %ld %ld %ld ",
951 stat->s_ntarguvhub8, stat->s_ntarguvhub4, 972 stat->s_ntarguvhub8, stat->s_ntarguvhub4,
952 stat->s_ntarguvhub2, stat->s_ntarguvhub1, 973 stat->s_ntarguvhub2, stat->s_ntarguvhub1,
953 stat->s_ntargcpu, stat->s_dtimeout); 974 stat->s_dtimeout);
954 seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ", 975 seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
955 stat->s_retry_messages, stat->s_retriesok, 976 stat->s_retry_messages, stat->s_retriesok,
956 stat->s_resets_plug, stat->s_resets_timeout, 977 stat->s_resets_plug, stat->s_resets_timeout,
957 stat->s_giveup, stat->s_stimeout, 978 stat->s_giveup, stat->s_stimeout,
958 stat->s_busy, stat->s_throttles); 979 stat->s_busy, stat->s_throttles);
980
959 /* destination side statistics */ 981 /* destination side statistics */
960 seq_printf(file, 982 seq_printf(file,
961 "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", 983 "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
962 uv_read_global_mmr64(uv_cpu_to_pnode(cpu), 984 uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
963 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), 985 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
964 stat->d_requestee, cycles_2_us(stat->d_time), 986 stat->d_requestee, cycles_2_us(stat->d_time),
@@ -966,15 +988,36 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
966 stat->d_nomsg, stat->d_retries, stat->d_canceled, 988 stat->d_nomsg, stat->d_retries, stat->d_canceled,
967 stat->d_nocanceled, stat->d_resets, 989 stat->d_nocanceled, stat->d_resets,
968 stat->d_rcanceled); 990 stat->d_rcanceled);
991 seq_printf(file, "%ld %ld\n",
992 stat->s_bau_disabled, stat->s_bau_reenabled);
969 } 993 }
970 994
971 return 0; 995 return 0;
972} 996}
973 997
974/* 998/*
999 * Display the tunables thru debugfs
1000 */
1001static ssize_t tunables_read(struct file *file, char __user *userbuf,
1002 size_t count, loff_t *ppos)
1003{
1004 char buf[300];
1005 int ret;
1006
1007 ret = snprintf(buf, 300, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n",
1008 "max_bau_concurrent plugged_delay plugsb4reset",
1009 "timeoutsb4reset ipi_reset_limit complete_threshold",
1010 "congested_response_us congested_reps congested_period",
1011 max_bau_concurrent, plugged_delay, plugsb4reset,
1012 timeoutsb4reset, ipi_reset_limit, complete_threshold,
1013 congested_response_us, congested_reps, congested_period);
1014
1015 return simple_read_from_buffer(userbuf, count, ppos, buf, ret);
1016}
1017
1018/*
 975 * -1: reset the statistics 1019
976 * 0: display meaning of the statistics 1020 * 0: display meaning of the statistics
977 * >0: maximum concurrent active descriptors per uvhub (throttle)
978 */ 1021 */
979static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, 1022static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
980 size_t count, loff_t *data) 1023 size_t count, loff_t *data)
@@ -983,7 +1026,6 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
983 long input_arg; 1026 long input_arg;
984 char optstr[64]; 1027 char optstr[64];
985 struct ptc_stats *stat; 1028 struct ptc_stats *stat;
986 struct bau_control *bcp;
987 1029
988 if (count == 0 || count > sizeof(optstr)) 1030 if (count == 0 || count > sizeof(optstr))
989 return -EINVAL; 1031 return -EINVAL;
@@ -1059,29 +1101,158 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
1059 "reset: number of ipi-style reset requests processed\n"); 1101 "reset: number of ipi-style reset requests processed\n");
1060 printk(KERN_DEBUG 1102 printk(KERN_DEBUG
1061 "rcan: number messages canceled by reset requests\n"); 1103 "rcan: number messages canceled by reset requests\n");
1104 printk(KERN_DEBUG
1105 "disable: number times use of the BAU was disabled\n");
1106 printk(KERN_DEBUG
1107 "enable: number times use of the BAU was re-enabled\n");
1062 } else if (input_arg == -1) { 1108 } else if (input_arg == -1) {
1063 for_each_present_cpu(cpu) { 1109 for_each_present_cpu(cpu) {
1064 stat = &per_cpu(ptcstats, cpu); 1110 stat = &per_cpu(ptcstats, cpu);
1065 memset(stat, 0, sizeof(struct ptc_stats)); 1111 memset(stat, 0, sizeof(struct ptc_stats));
1066 } 1112 }
1067 } else { 1113 }
1068 uv_bau_max_concurrent = input_arg; 1114
1069 bcp = &per_cpu(bau_control, smp_processor_id()); 1115 return count;
1070 if (uv_bau_max_concurrent < 1 || 1116}
1071 uv_bau_max_concurrent > bcp->cpus_in_uvhub) { 1117
1072 printk(KERN_DEBUG 1118static int local_atoi(const char *name)
1073 "Error: BAU max concurrent %d; %d is invalid\n", 1119{
1074 bcp->max_concurrent, uv_bau_max_concurrent); 1120 int val = 0;
1075 return -EINVAL; 1121
1076 } 1122 for (;; name++) {
1077 printk(KERN_DEBUG "Set BAU max concurrent:%d\n", 1123 switch (*name) {
1078 uv_bau_max_concurrent); 1124 case '0' ... '9':
1079 for_each_present_cpu(cpu) { 1125 val = 10*val+(*name-'0');
1080 bcp = &per_cpu(bau_control, cpu); 1126 break;
1081 bcp->max_concurrent = uv_bau_max_concurrent; 1127 default:
1128 return val;
1082 } 1129 }
1083 } 1130 }
1131}
1132
1133/*
1134 * set the tunables
1135 * 0 values reset them to defaults
1136 */
1137static ssize_t tunables_write(struct file *file, const char __user *user,
1138 size_t count, loff_t *data)
1139{
1140 int cpu;
1141 int cnt = 0;
1142 int val;
1143 char *p;
1144 char *q;
1145 char instr[64];
1146 struct bau_control *bcp;
1147
1148 if (count == 0 || count > sizeof(instr)-1)
1149 return -EINVAL;
1150 if (copy_from_user(instr, user, count))
1151 return -EFAULT;
1084 1152
1153 instr[count] = '\0';
1154 /* count the fields */
1155 p = instr + strspn(instr, WHITESPACE);
1156 q = p;
1157 for (; *p; p = q + strspn(q, WHITESPACE)) {
1158 q = p + strcspn(p, WHITESPACE);
1159 cnt++;
1160 if (q == p)
1161 break;
1162 }
1163 if (cnt != 9) {
1164 printk(KERN_INFO "bau tunable error: should be 9 numbers\n");
1165 return -EINVAL;
1166 }
1167
1168 p = instr + strspn(instr, WHITESPACE);
1169 q = p;
1170 for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) {
1171 q = p + strcspn(p, WHITESPACE);
1172 val = local_atoi(p);
1173 switch (cnt) {
1174 case 0:
1175 if (val == 0) {
1176 max_bau_concurrent = MAX_BAU_CONCURRENT;
1177 max_bau_concurrent_constant =
1178 MAX_BAU_CONCURRENT;
1179 continue;
1180 }
1181 bcp = &per_cpu(bau_control, smp_processor_id());
1182 if (val < 1 || val > bcp->cpus_in_uvhub) {
1183 printk(KERN_DEBUG
1184 "Error: BAU max concurrent %d is invalid\n",
1185 val);
1186 return -EINVAL;
1187 }
1188 max_bau_concurrent = val;
1189 max_bau_concurrent_constant = val;
1190 continue;
1191 case 1:
1192 if (val == 0)
1193 plugged_delay = PLUGGED_DELAY;
1194 else
1195 plugged_delay = val;
1196 continue;
1197 case 2:
1198 if (val == 0)
1199 plugsb4reset = PLUGSB4RESET;
1200 else
1201 plugsb4reset = val;
1202 continue;
1203 case 3:
1204 if (val == 0)
1205 timeoutsb4reset = TIMEOUTSB4RESET;
1206 else
1207 timeoutsb4reset = val;
1208 continue;
1209 case 4:
1210 if (val == 0)
1211 ipi_reset_limit = IPI_RESET_LIMIT;
1212 else
1213 ipi_reset_limit = val;
1214 continue;
1215 case 5:
1216 if (val == 0)
1217 complete_threshold = COMPLETE_THRESHOLD;
1218 else
1219 complete_threshold = val;
1220 continue;
1221 case 6:
1222 if (val == 0)
1223 congested_response_us = CONGESTED_RESPONSE_US;
1224 else
1225 congested_response_us = val;
1226 continue;
1227 case 7:
1228 if (val == 0)
1229 congested_reps = CONGESTED_REPS;
1230 else
1231 congested_reps = val;
1232 continue;
1233 case 8:
1234 if (val == 0)
1235 congested_period = CONGESTED_PERIOD;
1236 else
1237 congested_period = val;
1238 continue;
1239 }
1240 if (q == p)
1241 break;
1242 }
1243 for_each_present_cpu(cpu) {
1244 bcp = &per_cpu(bau_control, cpu);
1245 bcp->max_bau_concurrent = max_bau_concurrent;
1246 bcp->max_bau_concurrent_constant = max_bau_concurrent;
1247 bcp->plugged_delay = plugged_delay;
1248 bcp->plugsb4reset = plugsb4reset;
1249 bcp->timeoutsb4reset = timeoutsb4reset;
1250 bcp->ipi_reset_limit = ipi_reset_limit;
1251 bcp->complete_threshold = complete_threshold;
1252 bcp->congested_response_us = congested_response_us;
1253 bcp->congested_reps = congested_reps;
1254 bcp->congested_period = congested_period;
1255 }
1085 return count; 1256 return count;
1086} 1257}
1087 1258
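As a usage example for the nine-field tunables format parsed above (hedged: the debugfs path assumes the usual mount point and the UV_BAU_TUNABLES_DIR/UV_BAU_TUNABLES_FILE names created below, and the values are purely illustrative; a 0 in any field restores that field's default):

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		/* max_bau_concurrent plugged_delay plugsb4reset timeoutsb4reset
		 * ipi_reset_limit complete_threshold congested_response_us
		 * congested_reps congested_period */
		const char *vals = "2 10 100 3 2 5 1000 10 30\n";
		int fd = open("/sys/kernel/debug/sgi_uv/bau_tunables", O_WRONLY);

		if (fd < 0)
			return 1;
		if (write(fd, vals, strlen(vals)) < 0)
			return 1;
		close(fd);
		return 0;
	}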
@@ -1097,6 +1268,11 @@ static int uv_ptc_proc_open(struct inode *inode, struct file *file)
1097 return seq_open(file, &uv_ptc_seq_ops); 1268 return seq_open(file, &uv_ptc_seq_ops);
1098} 1269}
1099 1270
1271static int tunables_open(struct inode *inode, struct file *file)
1272{
1273 return 0;
1274}
1275
1100static const struct file_operations proc_uv_ptc_operations = { 1276static const struct file_operations proc_uv_ptc_operations = {
1101 .open = uv_ptc_proc_open, 1277 .open = uv_ptc_proc_open,
1102 .read = seq_read, 1278 .read = seq_read,
@@ -1105,6 +1281,12 @@ static const struct file_operations proc_uv_ptc_operations = {
1105 .release = seq_release, 1281 .release = seq_release,
1106}; 1282};
1107 1283
1284static const struct file_operations tunables_fops = {
1285 .open = tunables_open,
1286 .read = tunables_read,
1287 .write = tunables_write,
1288};
1289
1108static int __init uv_ptc_init(void) 1290static int __init uv_ptc_init(void)
1109{ 1291{
1110 struct proc_dir_entry *proc_uv_ptc; 1292 struct proc_dir_entry *proc_uv_ptc;
@@ -1119,6 +1301,20 @@ static int __init uv_ptc_init(void)
1119 UV_PTC_BASENAME); 1301 UV_PTC_BASENAME);
1120 return -EINVAL; 1302 return -EINVAL;
1121 } 1303 }
1304
1305 tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL);
1306 if (!tunables_dir) {
1307 printk(KERN_ERR "unable to create debugfs directory %s\n",
1308 UV_BAU_TUNABLES_DIR);
1309 return -EINVAL;
1310 }
1311 tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600,
1312 tunables_dir, NULL, &tunables_fops);
1313 if (!tunables_file) {
1314 printk(KERN_ERR "unable to create debugfs file %s\n",
1315 UV_BAU_TUNABLES_FILE);
1316 return -EINVAL;
1317 }
1122 return 0; 1318 return 0;
1123} 1319}
1124 1320
@@ -1259,15 +1455,45 @@ static void __init uv_init_uvhub(int uvhub, int vector)
1259} 1455}
1260 1456
1261/* 1457/*
1458 * We will set BAU_MISC_CONTROL with a timeout period.
1459 * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT.
 1460 * So the destination timeout period has to be calculated from them.
1461 */
1462static int
1463calculate_destination_timeout(void)
1464{
1465 unsigned long mmr_image;
1466 int mult1;
1467 int mult2;
1468 int index;
1469 int base;
1470 int ret;
1471 unsigned long ts_ns;
1472
1473 mult1 = UV_INTD_SOFT_ACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK;
1474 mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
1475 index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
1476 mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
1477 mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK;
1478 base = timeout_base_ns[index];
1479 ts_ns = base * mult1 * mult2;
1480 ret = ts_ns / 1000;
1481 return ret;
1482}
1483
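calculate_destination_timeout() above combines a base period, selected by the urgency index the BIOS programmed into UVH_AGING_PRESCALE_SEL, with two multipliers and converts the product from nanoseconds to microseconds. A worked userspace sketch of that arithmetic; the table contents and field values below are placeholders, not the real UV register values:

#include <stdio.h>

/* placeholder base periods indexed by the urgency field, in nanoseconds */
static const int timeout_base_ns[] = { 20, 160, 1280, 10240 };

int main(void)
{
        int mult1 = 3;          /* low bits of the soft-ack timeout period */
        int index = 2;          /* urgency field from UVH_AGING_PRESCALE_SEL */
        int mult2 = 10;         /* field from UVH_TRANSACTION_TIMEOUT */
        unsigned long ts_ns;

        ts_ns = (unsigned long)timeout_base_ns[index] * mult1 * mult2;
        printf("destination timeout: %lu ns = %lu us\n", ts_ns, ts_ns / 1000);
        return 0;
}
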
1484/*
1262 * initialize the bau_control structure for each cpu 1485 * initialize the bau_control structure for each cpu
1263 */ 1486 */
1264static void uv_init_per_cpu(int nuvhubs) 1487static void __init uv_init_per_cpu(int nuvhubs)
1265{ 1488{
1266 int i, j, k; 1489 int i;
1267 int cpu; 1490 int cpu;
1268 int pnode; 1491 int pnode;
1269 int uvhub; 1492 int uvhub;
1493 int have_hmaster;
1270 short socket = 0; 1494 short socket = 0;
1495 unsigned short socket_mask;
1496 unsigned char *uvhub_mask;
1271 struct bau_control *bcp; 1497 struct bau_control *bcp;
1272 struct uvhub_desc *bdp; 1498 struct uvhub_desc *bdp;
1273 struct socket_desc *sdp; 1499 struct socket_desc *sdp;
@@ -1278,7 +1504,7 @@ static void uv_init_per_cpu(int nuvhubs)
1278 short cpu_number[16]; 1504 short cpu_number[16];
1279 }; 1505 };
1280 struct uvhub_desc { 1506 struct uvhub_desc {
1281 short num_sockets; 1507 unsigned short socket_mask;
1282 short num_cpus; 1508 short num_cpus;
1283 short uvhub; 1509 short uvhub;
1284 short pnode; 1510 short pnode;
@@ -1286,57 +1512,84 @@ static void uv_init_per_cpu(int nuvhubs)
1286 }; 1512 };
1287 struct uvhub_desc *uvhub_descs; 1513 struct uvhub_desc *uvhub_descs;
1288 1514
1515 timeout_us = calculate_destination_timeout();
1516
1289 uvhub_descs = (struct uvhub_desc *) 1517 uvhub_descs = (struct uvhub_desc *)
1290 kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); 1518 kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
1291 memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); 1519 memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
1520 uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);
1292 for_each_present_cpu(cpu) { 1521 for_each_present_cpu(cpu) {
1293 bcp = &per_cpu(bau_control, cpu); 1522 bcp = &per_cpu(bau_control, cpu);
1294 memset(bcp, 0, sizeof(struct bau_control)); 1523 memset(bcp, 0, sizeof(struct bau_control));
1295 spin_lock_init(&bcp->masks_lock);
1296 bcp->max_concurrent = uv_bau_max_concurrent;
1297 pnode = uv_cpu_hub_info(cpu)->pnode; 1524 pnode = uv_cpu_hub_info(cpu)->pnode;
1298 uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; 1525 uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
1526 *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8));
1299 bdp = &uvhub_descs[uvhub]; 1527 bdp = &uvhub_descs[uvhub];
1300 bdp->num_cpus++; 1528 bdp->num_cpus++;
1301 bdp->uvhub = uvhub; 1529 bdp->uvhub = uvhub;
1302 bdp->pnode = pnode; 1530 bdp->pnode = pnode;
1303 /* time interval to catch a hardware stay-busy bug */ 1531 /* kludge: 'assuming' one node per socket, and assuming that
1304 bcp->timeout_interval = millisec_2_cycles(3); 1532 disabling a socket just leaves a gap in node numbers */
1305 /* kludge: assume uv_hub.h is constant */ 1533 socket = (cpu_to_node(cpu) & 1);
1306 socket = (cpu_physical_id(cpu)>>5)&1; 1534 bdp->socket_mask |= (1 << socket);
1307 if (socket >= bdp->num_sockets)
1308 bdp->num_sockets = socket+1;
1309 sdp = &bdp->socket[socket]; 1535 sdp = &bdp->socket[socket];
1310 sdp->cpu_number[sdp->num_cpus] = cpu; 1536 sdp->cpu_number[sdp->num_cpus] = cpu;
1311 sdp->num_cpus++; 1537 sdp->num_cpus++;
1312 } 1538 }
1313 socket = 0; 1539 for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
1314 for_each_possible_blade(uvhub) { 1540 if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8))))
1541 continue;
1542 have_hmaster = 0;
1315 bdp = &uvhub_descs[uvhub]; 1543 bdp = &uvhub_descs[uvhub];
1316 for (i = 0; i < bdp->num_sockets; i++) { 1544 socket_mask = bdp->socket_mask;
1317 sdp = &bdp->socket[i]; 1545 socket = 0;
1318 for (j = 0; j < sdp->num_cpus; j++) { 1546 while (socket_mask) {
1319 cpu = sdp->cpu_number[j]; 1547 if (!(socket_mask & 1))
1548 goto nextsocket;
1549 sdp = &bdp->socket[socket];
1550 for (i = 0; i < sdp->num_cpus; i++) {
1551 cpu = sdp->cpu_number[i];
1320 bcp = &per_cpu(bau_control, cpu); 1552 bcp = &per_cpu(bau_control, cpu);
1321 bcp->cpu = cpu; 1553 bcp->cpu = cpu;
1322 if (j == 0) { 1554 if (i == 0) {
1323 smaster = bcp; 1555 smaster = bcp;
1324 if (i == 0) 1556 if (!have_hmaster) {
1557 have_hmaster++;
1325 hmaster = bcp; 1558 hmaster = bcp;
1559 }
1326 } 1560 }
1327 bcp->cpus_in_uvhub = bdp->num_cpus; 1561 bcp->cpus_in_uvhub = bdp->num_cpus;
1328 bcp->cpus_in_socket = sdp->num_cpus; 1562 bcp->cpus_in_socket = sdp->num_cpus;
1329 bcp->socket_master = smaster; 1563 bcp->socket_master = smaster;
1564 bcp->uvhub = bdp->uvhub;
1330 bcp->uvhub_master = hmaster; 1565 bcp->uvhub_master = hmaster;
1331 for (k = 0; k < DEST_Q_SIZE; k++) 1566 bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->
1332 bcp->socket_acknowledge_count[k] = 0; 1567 blade_processor_id;
1333 bcp->uvhub_cpu =
1334 uv_cpu_hub_info(cpu)->blade_processor_id;
1335 } 1568 }
1569nextsocket:
1336 socket++; 1570 socket++;
1571 socket_mask = (socket_mask >> 1);
1337 } 1572 }
1338 } 1573 }
1339 kfree(uvhub_descs); 1574 kfree(uvhub_descs);
1575 kfree(uvhub_mask);
1576 for_each_present_cpu(cpu) {
1577 bcp = &per_cpu(bau_control, cpu);
1578 bcp->baudisabled = 0;
1579 bcp->statp = &per_cpu(ptcstats, cpu);
1580 /* time interval to catch a hardware stay-busy bug */
1581 bcp->timeout_interval = microsec_2_cycles(2*timeout_us);
1582 bcp->max_bau_concurrent = max_bau_concurrent;
1583 bcp->max_bau_concurrent_constant = max_bau_concurrent;
1584 bcp->plugged_delay = plugged_delay;
1585 bcp->plugsb4reset = plugsb4reset;
1586 bcp->timeoutsb4reset = timeoutsb4reset;
1587 bcp->ipi_reset_limit = ipi_reset_limit;
1588 bcp->complete_threshold = complete_threshold;
1589 bcp->congested_response_us = congested_response_us;
1590 bcp->congested_reps = congested_reps;
1591 bcp->congested_period = congested_period;
1592 }
1340} 1593}
1341 1594
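uv_init_per_cpu() above records the populated hubs and sockets in bitmasks on its first pass and then walks socket_mask bit by bit on the second, skipping the gaps left by disabled sockets. The shift-and-test idiom in isolation (standalone sketch, not kernel code):

#include <stdio.h>

int main(void)
{
        unsigned short socket_mask = 0x5;       /* sockets 0 and 2 populated */
        int socket = 0;

        while (socket_mask) {
                if (socket_mask & 1)
                        printf("initializing socket %d\n", socket);
                /* else: gap left by a disabled socket, nothing to do */
                socket++;
                socket_mask >>= 1;
        }
        return 0;
}
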
1342/* 1595/*
@@ -1361,10 +1614,11 @@ static int __init uv_bau_init(void)
1361 zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), 1614 zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
1362 GFP_KERNEL, cpu_to_node(cur_cpu)); 1615 GFP_KERNEL, cpu_to_node(cur_cpu));
1363 1616
1364 uv_bau_max_concurrent = MAX_BAU_CONCURRENT;
1365 uv_nshift = uv_hub_info->m_val; 1617 uv_nshift = uv_hub_info->m_val;
1366 uv_mmask = (1UL << uv_hub_info->m_val) - 1; 1618 uv_mmask = (1UL << uv_hub_info->m_val) - 1;
1367 nuvhubs = uv_num_possible_blades(); 1619 nuvhubs = uv_num_possible_blades();
1620 spin_lock_init(&disable_lock);
1621 congested_cycles = microsec_2_cycles(congested_response_us);
1368 1622
1369 uv_init_per_cpu(nuvhubs); 1623 uv_init_per_cpu(nuvhubs);
1370 1624
@@ -1383,15 +1637,19 @@ static int __init uv_bau_init(void)
1383 alloc_intr_gate(vector, uv_bau_message_intr1); 1637 alloc_intr_gate(vector, uv_bau_message_intr1);
1384 1638
1385 for_each_possible_blade(uvhub) { 1639 for_each_possible_blade(uvhub) {
1386 pnode = uv_blade_to_pnode(uvhub); 1640 if (uv_blade_nr_possible_cpus(uvhub)) {
1387 /* INIT the bau */ 1641 pnode = uv_blade_to_pnode(uvhub);
1388 uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL, 1642 /* INIT the bau */
1389 ((unsigned long)1 << 63)); 1643 uv_write_global_mmr64(pnode,
1390 mmr = 1; /* should be 1 to broadcast to both sockets */ 1644 UVH_LB_BAU_SB_ACTIVATION_CONTROL,
1391 uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, mmr); 1645 ((unsigned long)1 << 63));
1646 mmr = 1; /* should be 1 to broadcast to both sockets */
1647 uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST,
1648 mmr);
1649 }
1392 } 1650 }
1393 1651
1394 return 0; 1652 return 0;
1395} 1653}
1396core_initcall(uv_bau_init); 1654core_initcall(uv_bau_init);
1397core_initcall(uv_ptc_init); 1655fs_initcall(uv_ptc_init);
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index c652ef62742d..e2a595257390 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -1,6 +1,7 @@
1#include <linux/io.h> 1#include <linux/io.h>
2 2
3#include <asm/trampoline.h> 3#include <asm/trampoline.h>
4#include <asm/pgtable.h>
4#include <asm/e820.h> 5#include <asm/e820.h>
5 6
6#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) 7#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP)
@@ -37,3 +38,19 @@ unsigned long __trampinit setup_trampoline(void)
37 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); 38 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE);
38 return virt_to_phys(trampoline_base); 39 return virt_to_phys(trampoline_base);
39} 40}
41
42void __init setup_trampoline_page_table(void)
43{
44#ifdef CONFIG_X86_32
45 /* Copy kernel address range */
46 clone_pgd_range(trampoline_pg_dir + KERNEL_PGD_BOUNDARY,
47 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
48 KERNEL_PGD_PTRS);
49
50 /* Initialize low mappings */
51 clone_pgd_range(trampoline_pg_dir,
52 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
53 min_t(unsigned long, KERNEL_PGD_PTRS,
54 KERNEL_PGD_BOUNDARY));
55#endif
56}
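
setup_trampoline_page_table() copies the kernel portion of swapper_pg_dir into trampoline_pg_dir and then reuses those kernel entries to populate the low slots, and clone_pgd_range() amounts to a memcpy of page-directory entries. A simplified userspace sketch of the same copy pattern; the entry counts, the boundary and the pgd_t stand-in below are illustrative, not the real mm definitions:

#include <stdio.h>
#include <string.h>

#define PGD_ENTRIES             1024    /* illustrative */
#define KERNEL_PGD_BOUNDARY     768     /* illustrative 3G/1G split */
#define KERNEL_PGD_PTRS         (PGD_ENTRIES - KERNEL_PGD_BOUNDARY)

typedef unsigned long pgd_t;            /* stand-in for the real pgd_t */

static pgd_t swapper_pg_dir[PGD_ENTRIES];
static pgd_t trampoline_pg_dir[PGD_ENTRIES];

/* clone_pgd_range() boils down to copying page-directory entries */
static void clone_pgd_range_sketch(pgd_t *dst, const pgd_t *src, unsigned int n)
{
        memcpy(dst, src, n * sizeof(pgd_t));
}

int main(void)
{
        unsigned int low = KERNEL_PGD_PTRS < KERNEL_PGD_BOUNDARY ?
                           KERNEL_PGD_PTRS : KERNEL_PGD_BOUNDARY;

        /* kernel address range, as in setup_trampoline_page_table() */
        clone_pgd_range_sketch(trampoline_pg_dir + KERNEL_PGD_BOUNDARY,
                               swapper_pg_dir + KERNEL_PGD_BOUNDARY,
                               KERNEL_PGD_PTRS);
        /* low mappings: reuse the kernel entries for the low slots */
        clone_pgd_range_sketch(trampoline_pg_dir,
                               swapper_pg_dir + KERNEL_PGD_BOUNDARY, low);

        printf("copied %d + %u page-directory entries\n",
               KERNEL_PGD_PTRS, low);
        return 0;
}
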
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 725ef4d17cd5..60788dee0f8a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -392,7 +392,13 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
392 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) 392 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
393 == NOTIFY_STOP) 393 == NOTIFY_STOP)
394 return; 394 return;
395
395#ifdef CONFIG_X86_LOCAL_APIC 396#ifdef CONFIG_X86_LOCAL_APIC
397 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
398 == NOTIFY_STOP)
399 return;
400
401#ifndef CONFIG_LOCKUP_DETECTOR
396 /* 402 /*
397 * Ok, so this is none of the documented NMI sources, 403 * Ok, so this is none of the documented NMI sources,
398 * so it must be the NMI watchdog. 404 * so it must be the NMI watchdog.
@@ -400,6 +406,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
400 if (nmi_watchdog_tick(regs, reason)) 406 if (nmi_watchdog_tick(regs, reason))
401 return; 407 return;
402 if (!do_nmi_callback(regs, cpu)) 408 if (!do_nmi_callback(regs, cpu))
409#endif /* !CONFIG_LOCKUP_DETECTOR */
403 unknown_nmi_error(reason, regs); 410 unknown_nmi_error(reason, regs);
404#else 411#else
405 unknown_nmi_error(reason, regs); 412 unknown_nmi_error(reason, regs);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 9faf91ae1841..26a863a9c2a8 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -626,6 +626,44 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
626 local_irq_restore(flags); 626 local_irq_restore(flags);
627} 627}
628 628
629static unsigned long long cyc2ns_suspend;
630
631void save_sched_clock_state(void)
632{
633 if (!sched_clock_stable)
634 return;
635
636 cyc2ns_suspend = sched_clock();
637}
638
639/*
 640 * Even on processors with invariant TSC, the TSC gets reset in some of the
 641 * ACPI system sleep states. And in some systems the BIOS seems to reinit the
 642 * TSC to an arbitrary value (still sync'd across cpus) during resume from such
 643 * sleep states. To cope with this, recompute the cyc2ns_offset for each cpu so
644 * that sched_clock() continues from the point where it was left off during
645 * suspend.
646 */
647void restore_sched_clock_state(void)
648{
649 unsigned long long offset;
650 unsigned long flags;
651 int cpu;
652
653 if (!sched_clock_stable)
654 return;
655
656 local_irq_save(flags);
657
658 __get_cpu_var(cyc2ns_offset) = 0;
659 offset = cyc2ns_suspend - sched_clock();
660
661 for_each_possible_cpu(cpu)
662 per_cpu(cyc2ns_offset, cpu) = offset;
663
664 local_irq_restore(flags);
665}
666
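save_sched_clock_state() records sched_clock() at suspend time; restore_sched_clock_state() zeroes the per-cpu offset and recomputes it so the clock continues from the saved value even though the underlying counter was reset. The arithmetic in isolation (standalone sketch; a plain variable stands in for the TSC-derived clock):

#include <stdio.h>

static unsigned long long counter;      /* stand-in for the raw TSC-based clock */
static long long offset;                /* stand-in for cyc2ns_offset */

static unsigned long long sched_clock_sketch(void)
{
        return counter + offset;
}

int main(void)
{
        unsigned long long saved;

        counter = 1000;                         /* clock has been running */
        saved = sched_clock_sketch();           /* save_sched_clock_state() */

        counter = 7;                            /* counter reset across suspend */
        offset = 0;                             /* clear the stale offset first */
        offset = saved - sched_clock_sketch();  /* restore_sched_clock_state() */

        printf("clock resumes at %llu (saved %llu)\n",
               sched_clock_sketch(), saved);
        return 0;
}
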
629#ifdef CONFIG_CPU_FREQ 667#ifdef CONFIG_CPU_FREQ
630 668
631/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency 669/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
@@ -751,7 +789,6 @@ static struct clocksource clocksource_tsc = {
751 .read = read_tsc, 789 .read = read_tsc,
752 .resume = resume_tsc, 790 .resume = resume_tsc,
753 .mask = CLOCKSOURCE_MASK(64), 791 .mask = CLOCKSOURCE_MASK(64),
754 .shift = 22,
755 .flags = CLOCK_SOURCE_IS_CONTINUOUS | 792 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
756 CLOCK_SOURCE_MUST_VERIFY, 793 CLOCK_SOURCE_MUST_VERIFY,
757#ifdef CONFIG_X86_64 794#ifdef CONFIG_X86_64
@@ -845,8 +882,6 @@ __cpuinit int unsynchronized_tsc(void)
845 882
846static void __init init_tsc_clocksource(void) 883static void __init init_tsc_clocksource(void)
847{ 884{
848 clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
849 clocksource_tsc.shift);
850 if (tsc_clocksource_reliable) 885 if (tsc_clocksource_reliable)
851 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; 886 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
852 /* lower the rating if we already know its unstable: */ 887 /* lower the rating if we already know its unstable: */
@@ -854,7 +889,7 @@ static void __init init_tsc_clocksource(void)
854 clocksource_tsc.rating = 0; 889 clocksource_tsc.rating = 0;
855 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; 890 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
856 } 891 }
857 clocksource_register(&clocksource_tsc); 892 clocksource_register_khz(&clocksource_tsc, tsc_khz);
858} 893}
859 894
860#ifdef CONFIG_X86_64 895#ifdef CONFIG_X86_64
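
The tsc.c hunks also drop the hard-coded shift and the manual clocksource_khz2mult() call and let clocksource_register_khz() choose the scaling factors. The underlying conversion is ns = (cycles * mult) >> shift, with mult picked so that a counter rated in kHz maps to nanoseconds; a worked userspace example of that arithmetic with illustrative values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t khz = 2400000;         /* a 2.4 GHz TSC */
        uint32_t shift = 22;
        /* mult chosen so that (cycles * mult) >> shift yields nanoseconds */
        uint32_t mult = (uint32_t)(((uint64_t)1000000 << shift) / khz);
        uint64_t cycles = 2400000000ULL;        /* one second's worth */

        printf("mult=%u, 1s of cycles -> %llu ns\n", mult,
               (unsigned long long)((cycles * mult) >> shift));
        return 0;
}
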
diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu_64.S
index 45b6f8a975a1..56a8c2a867d9 100644
--- a/arch/x86/kernel/verify_cpu_64.S
+++ b/arch/x86/kernel/verify_cpu_64.S
@@ -31,6 +31,7 @@
31 */ 31 */
32 32
33#include <asm/cpufeature.h> 33#include <asm/cpufeature.h>
34#include <asm/msr-index.h>
34 35
35verify_cpu: 36verify_cpu:
36 pushfl # Save caller passed flags 37 pushfl # Save caller passed flags
@@ -88,7 +89,7 @@ verify_cpu_sse_test:
88 je verify_cpu_sse_ok 89 je verify_cpu_sse_ok
89 test %di,%di 90 test %di,%di
90 jz verify_cpu_no_longmode # only try to force SSE on AMD 91 jz verify_cpu_no_longmode # only try to force SSE on AMD
91 movl $0xc0010015,%ecx # HWCR 92 movl $MSR_K7_HWCR,%ecx
92 rdmsr 93 rdmsr
93 btr $15,%eax # enable SSE 94 btr $15,%eax # enable SSE
94 wrmsr 95 wrmsr
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 1c0c6ab9c60f..dcbb28c4b694 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -73,8 +73,8 @@ void update_vsyscall_tz(void)
73 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); 73 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
74} 74}
75 75
76void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, 76void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
77 u32 mult) 77 struct clocksource *clock, u32 mult)
78{ 78{
79 unsigned long flags; 79 unsigned long flags;
80 80
@@ -87,7 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
87 vsyscall_gtod_data.clock.shift = clock->shift; 87 vsyscall_gtod_data.clock.shift = clock->shift;
88 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; 88 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
89 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; 89 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
90 vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; 90 vsyscall_gtod_data.wall_to_monotonic = *wtm;
91 vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); 91 vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
92 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); 92 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
93} 93}
@@ -169,13 +169,18 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
169 * unlikely */ 169 * unlikely */
170time_t __vsyscall(1) vtime(time_t *t) 170time_t __vsyscall(1) vtime(time_t *t)
171{ 171{
172 struct timeval tv; 172 unsigned seq;
173 time_t result; 173 time_t result;
174 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) 174 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
175 return time_syscall(t); 175 return time_syscall(t);
176 176
177 vgettimeofday(&tv, NULL); 177 do {
178 result = tv.tv_sec; 178 seq = read_seqbegin(&__vsyscall_gtod_data.lock);
179
180 result = __vsyscall_gtod_data.wall_time_sec;
181
182 } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
183
179 if (t) 184 if (t)
180 *t = result; 185 *t = result;
181 return result; 186 return result;
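
vtime() now reads wall_time_sec directly under the vsyscall gtod seqlock, retrying if a writer updated the data mid-read, instead of paying for a full vgettimeofday() call. A userspace sketch of that lockless reader protocol; C11 atomics and a bare sequence counter stand in for the kernel's read_seqbegin()/read_seqretry(), which add the barriers needed for real cross-CPU use:

#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

static atomic_uint seq;
static time_t wall_time_sec;

/* writer: bump the counter to odd, update the data, bump back to even */
static void writer_update(time_t now)
{
        atomic_fetch_add_explicit(&seq, 1, memory_order_release);
        wall_time_sec = now;
        atomic_fetch_add_explicit(&seq, 1, memory_order_release);
}

/* reader: retry if the counter was odd or changed while we were reading */
static time_t reader_vtime(void)
{
        unsigned int start;
        time_t result;

        do {
                start = atomic_load_explicit(&seq, memory_order_acquire);
                result = wall_time_sec;
        } while ((start & 1) ||
                 start != atomic_load_explicit(&seq, memory_order_acquire));

        return result;
}

int main(void)
{
        writer_update(time(NULL));
        printf("vtime sketch: %ld\n", (long)reader_vtime());
        return 0;
}
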
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 37e68fc5e24a..9c253bd65e24 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -16,11 +16,88 @@
16 */ 16 */
17u64 pcntxt_mask; 17u64 pcntxt_mask;
18 18
19/*
20 * Represents init state for the supported extended state.
21 */
22static struct xsave_struct *init_xstate_buf;
23
19struct _fpx_sw_bytes fx_sw_reserved; 24struct _fpx_sw_bytes fx_sw_reserved;
20#ifdef CONFIG_IA32_EMULATION 25#ifdef CONFIG_IA32_EMULATION
21struct _fpx_sw_bytes fx_sw_reserved_ia32; 26struct _fpx_sw_bytes fx_sw_reserved_ia32;
22#endif 27#endif
23 28
29static unsigned int *xstate_offsets, *xstate_sizes, xstate_features;
30
31/*
 32 * If a processor implementation discerns that a processor state component is
 33 * in its initialized state it may modify the corresponding bit in the
 34 * xsave_hdr.xstate_bv to '0', without modifying the corresponding memory
35 * layout in the case of xsaveopt. While presenting the xstate information to
36 * the user, we always ensure that the memory layout of a feature will be in
37 * the init state if the corresponding header bit is zero. This is to ensure
38 * that the user doesn't see some stale state in the memory layout during
39 * signal handling, debugging etc.
40 */
41void __sanitize_i387_state(struct task_struct *tsk)
42{
43 u64 xstate_bv;
44 int feature_bit = 0x2;
45 struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
46
47 if (!fx)
48 return;
49
50 BUG_ON(task_thread_info(tsk)->status & TS_USEDFPU);
51
52 xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv;
53
54 /*
55 * None of the feature bits are in init state. So nothing else
 56 * to do for us, as the memory layout is up to date.
57 */
58 if ((xstate_bv & pcntxt_mask) == pcntxt_mask)
59 return;
60
61 /*
62 * FP is in init state
63 */
64 if (!(xstate_bv & XSTATE_FP)) {
65 fx->cwd = 0x37f;
66 fx->swd = 0;
67 fx->twd = 0;
68 fx->fop = 0;
69 fx->rip = 0;
70 fx->rdp = 0;
71 memset(&fx->st_space[0], 0, 128);
72 }
73
74 /*
75 * SSE is in init state
76 */
77 if (!(xstate_bv & XSTATE_SSE))
78 memset(&fx->xmm_space[0], 0, 256);
79
80 xstate_bv = (pcntxt_mask & ~xstate_bv) >> 2;
81
82 /*
83 * Update all the other memory layouts for which the corresponding
84 * header bit is in the init state.
85 */
86 while (xstate_bv) {
87 if (xstate_bv & 0x1) {
88 int offset = xstate_offsets[feature_bit];
89 int size = xstate_sizes[feature_bit];
90
91 memcpy(((void *) fx) + offset,
92 ((void *) init_xstate_buf) + offset,
93 size);
94 }
95
96 xstate_bv >>= 1;
97 feature_bit++;
98 }
99}
100
24/* 101/*
25 * Check for the presence of extended state information in the 102 * Check for the presence of extended state information in the
26 * user fpstate pointer in the sigcontext. 103 * user fpstate pointer in the sigcontext.
@@ -36,15 +113,14 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf,
36 113
37 err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0], 114 err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0],
38 sizeof(struct _fpx_sw_bytes)); 115 sizeof(struct _fpx_sw_bytes));
39
40 if (err) 116 if (err)
41 return err; 117 return -EFAULT;
42 118
43 /* 119 /*
44 * First Magic check failed. 120 * First Magic check failed.
45 */ 121 */
46 if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1) 122 if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1)
47 return -1; 123 return -EINVAL;
48 124
49 /* 125 /*
50 * Check for error scenarios. 126 * Check for error scenarios.
@@ -52,19 +128,21 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf,
52 if (fx_sw_user->xstate_size < min_xstate_size || 128 if (fx_sw_user->xstate_size < min_xstate_size ||
53 fx_sw_user->xstate_size > xstate_size || 129 fx_sw_user->xstate_size > xstate_size ||
54 fx_sw_user->xstate_size > fx_sw_user->extended_size) 130 fx_sw_user->xstate_size > fx_sw_user->extended_size)
55 return -1; 131 return -EINVAL;
56 132
57 err = __get_user(magic2, (__u32 *) (((void *)fpstate) + 133 err = __get_user(magic2, (__u32 *) (((void *)fpstate) +
58 fx_sw_user->extended_size - 134 fx_sw_user->extended_size -
59 FP_XSTATE_MAGIC2_SIZE)); 135 FP_XSTATE_MAGIC2_SIZE));
136 if (err)
137 return err;
60 /* 138 /*
61 * Check for the presence of second magic word at the end of memory 139 * Check for the presence of second magic word at the end of memory
62 * layout. This detects the case where the user just copied the legacy 140 * layout. This detects the case where the user just copied the legacy
 63 * fpstate layout without copying the extended state information 141
64 * in the memory layout. 142 * in the memory layout.
65 */ 143 */
66 if (err || magic2 != FP_XSTATE_MAGIC2) 144 if (magic2 != FP_XSTATE_MAGIC2)
67 return -1; 145 return -EFAULT;
68 146
69 return 0; 147 return 0;
70} 148}
@@ -91,14 +169,6 @@ int save_i387_xstate(void __user *buf)
91 return 0; 169 return 0;
92 170
93 if (task_thread_info(tsk)->status & TS_USEDFPU) { 171 if (task_thread_info(tsk)->status & TS_USEDFPU) {
94 /*
95 * Start with clearing the user buffer. This will present a
96 * clean context for the bytes not touched by the fxsave/xsave.
97 */
98 err = __clear_user(buf, sig_xstate_size);
99 if (err)
100 return err;
101
102 if (use_xsave()) 172 if (use_xsave())
103 err = xsave_user(buf); 173 err = xsave_user(buf);
104 else 174 else
@@ -109,6 +179,7 @@ int save_i387_xstate(void __user *buf)
109 task_thread_info(tsk)->status &= ~TS_USEDFPU; 179 task_thread_info(tsk)->status &= ~TS_USEDFPU;
110 stts(); 180 stts();
111 } else { 181 } else {
182 sanitize_i387_state(tsk);
112 if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave, 183 if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave,
113 xstate_size)) 184 xstate_size))
114 return -1; 185 return -1;
@@ -184,8 +255,8 @@ static int restore_user_xstate(void __user *buf)
184 * init the state skipped by the user. 255 * init the state skipped by the user.
185 */ 256 */
186 mask = pcntxt_mask & ~mask; 257 mask = pcntxt_mask & ~mask;
187 258 if (unlikely(mask))
188 xrstor_state(init_xstate_buf, mask); 259 xrstor_state(init_xstate_buf, mask);
189 260
190 return 0; 261 return 0;
191 262
@@ -274,11 +345,6 @@ static void prepare_fx_sw_frame(void)
274#endif 345#endif
275} 346}
276 347
277/*
278 * Represents init state for the supported extended state.
279 */
280struct xsave_struct *init_xstate_buf;
281
282#ifdef CONFIG_X86_64 348#ifdef CONFIG_X86_64
283unsigned int sig_xstate_size = sizeof(struct _fpstate); 349unsigned int sig_xstate_size = sizeof(struct _fpstate);
284#endif 350#endif
@@ -286,37 +352,77 @@ unsigned int sig_xstate_size = sizeof(struct _fpstate);
286/* 352/*
287 * Enable the extended processor state save/restore feature 353 * Enable the extended processor state save/restore feature
288 */ 354 */
289void __cpuinit xsave_init(void) 355static inline void xstate_enable(void)
290{ 356{
291 if (!cpu_has_xsave)
292 return;
293
294 set_in_cr4(X86_CR4_OSXSAVE); 357 set_in_cr4(X86_CR4_OSXSAVE);
295
296 /*
297 * Enable all the features that the HW is capable of
298 * and the Linux kernel is aware of.
299 */
300 xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); 358 xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
301} 359}
302 360
303/* 361/*
362 * Record the offsets and sizes of different state managed by the xsave
363 * memory layout.
364 */
365static void __init setup_xstate_features(void)
366{
367 int eax, ebx, ecx, edx, leaf = 0x2;
368
369 xstate_features = fls64(pcntxt_mask);
370 xstate_offsets = alloc_bootmem(xstate_features * sizeof(int));
371 xstate_sizes = alloc_bootmem(xstate_features * sizeof(int));
372
373 do {
374 cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx);
375
376 if (eax == 0)
377 break;
378
379 xstate_offsets[leaf] = ebx;
380 xstate_sizes[leaf] = eax;
381
382 leaf++;
383 } while (1);
384}
385
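setup_xstate_features() above walks the subleaves of CPUID leaf 0xD, recording each extended-state component's size (EAX) and offset (EBX) until a subleaf reports a zero size. The same enumeration can be done from userspace, assuming an x86 compiler whose <cpuid.h> provides __get_cpuid_count():

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;
        unsigned int subleaf;

        /* subleaves 2 and up describe the extended states beyond x87/SSE */
        for (subleaf = 2; ; subleaf++) {
                if (!__get_cpuid_count(0x0d, subleaf, &eax, &ebx, &ecx, &edx))
                        break;          /* CPUID leaf 0xd not supported */
                if (eax == 0)
                        break;          /* no more populated subleaves */
                printf("xstate component %u: size %u, offset %u\n",
                       subleaf, eax, ebx);
        }
        return 0;
}
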
386/*
304 * setup the xstate image representing the init state 387 * setup the xstate image representing the init state
305 */ 388 */
306static void __init setup_xstate_init(void) 389static void __init setup_xstate_init(void)
307{ 390{
391 setup_xstate_features();
392
393 /*
394 * Setup init_xstate_buf to represent the init state of
395 * all the features managed by the xsave
396 */
308 init_xstate_buf = alloc_bootmem(xstate_size); 397 init_xstate_buf = alloc_bootmem(xstate_size);
309 init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; 398 init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
399
400 clts();
401 /*
402 * Init all the features state with header_bv being 0x0
403 */
404 xrstor_state(init_xstate_buf, -1);
405 /*
406 * Dump the init state again. This is to identify the init state
 407 * of any feature which is not represented by all zeros.
408 */
409 xsave_state(init_xstate_buf, -1);
410 stts();
310} 411}
311 412
312/* 413/*
313 * Enable and initialize the xsave feature. 414 * Enable and initialize the xsave feature.
314 */ 415 */
315void __ref xsave_cntxt_init(void) 416static void __init xstate_enable_boot_cpu(void)
316{ 417{
317 unsigned int eax, ebx, ecx, edx; 418 unsigned int eax, ebx, ecx, edx;
318 419
319 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); 420 if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
421 WARN(1, KERN_ERR "XSTATE_CPUID missing\n");
422 return;
423 }
424
425 cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
320 pcntxt_mask = eax + ((u64)edx << 32); 426 pcntxt_mask = eax + ((u64)edx << 32);
321 427
322 if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { 428 if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
@@ -329,12 +435,13 @@ void __ref xsave_cntxt_init(void)
329 * Support only the state known to OS. 435 * Support only the state known to OS.
330 */ 436 */
331 pcntxt_mask = pcntxt_mask & XCNTXT_MASK; 437 pcntxt_mask = pcntxt_mask & XCNTXT_MASK;
332 xsave_init(); 438
439 xstate_enable();
333 440
334 /* 441 /*
335 * Recompute the context size for enabled features 442 * Recompute the context size for enabled features
336 */ 443 */
337 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); 444 cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
338 xstate_size = ebx; 445 xstate_size = ebx;
339 446
340 update_regset_xstate_info(xstate_size, pcntxt_mask); 447 update_regset_xstate_info(xstate_size, pcntxt_mask);
@@ -346,3 +453,23 @@ void __ref xsave_cntxt_init(void)
346 "cntxt size 0x%x\n", 453 "cntxt size 0x%x\n",
347 pcntxt_mask, xstate_size); 454 pcntxt_mask, xstate_size);
348} 455}
456
457/*
458 * For the very first instance, this calls xstate_enable_boot_cpu();
459 * for all subsequent instances, this calls xstate_enable().
460 *
461 * This is somewhat obfuscated due to the lack of powerful enough
462 * overrides for the section checks.
463 */
464void __cpuinit xsave_init(void)
465{
466 static __refdata void (*next_func)(void) = xstate_enable_boot_cpu;
467 void (*this_func)(void);
468
469 if (!cpu_has_xsave)
470 return;
471
472 this_func = next_func;
473 next_func = xstate_enable;
474 this_func();
475}
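
xsave_init() dispatches through a static function pointer that starts out aimed at the boot-cpu path and rewrites itself to the lightweight per-cpu path after the first call, so no separate "already initialized" flag is needed. The pattern in isolation (standalone sketch):

#include <stdio.h>

static void enable_only(void)
{
        printf("secondary cpu: just enable the feature\n");
}

static void enable_boot_cpu(void)
{
        printf("boot cpu: detect and allocate, then enable\n");
        enable_only();
}

static void feature_init(void)
{
        /* first call takes the boot-cpu path, later calls the cheap path */
        static void (*next_func)(void) = enable_boot_cpu;
        void (*this_func)(void);

        this_func = next_func;
        next_func = enable_only;
        this_func();
}

int main(void)
{
        feature_init();         /* boot cpu */
        feature_init();         /* a secondary cpu */
        return 0;
}

The first call pays the detection and allocation cost once; every later caller sees only the cheap enable path.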