author     Marcelo Tosatti <mtosatti@redhat.com>    2013-03-04 18:10:32 -0500
committer  Marcelo Tosatti <mtosatti@redhat.com>    2013-03-04 18:10:32 -0500
commit     ee2c25efdd46d7ed5605d6fe877bdf4b47a4ab2e (patch)
tree       35890281e93e667a8e262d76ef250025eb30a8c1 /arch/x86/kernel
parent     3ab66e8a455a4877889c65a848f2fb32be502f2c (diff)
parent     6dbe51c251a327e012439c4772097a13df43c5b8 (diff)
Merge branch 'master' into queue
* master: (15791 commits)
  Linux 3.9-rc1
  btrfs/raid56: Add missing #include <linux/vmalloc.h>
  fix compat_sys_rt_sigprocmask()
  SUNRPC: One line comment fix
  ext4: enable quotas before orphan cleanup
  ext4: don't allow quota mount options when quota feature enabled
  ext4: fix a warning from sparse check for ext4_dir_llseek
  ext4: convert number of blocks to clusters properly
  ext4: fix possible memory leak in ext4_remount()
  jbd2: fix ERR_PTR dereference in jbd2__journal_start
  metag: Provide dma_get_sgtable()
  metag: prom.h: remove declaration of metag_dt_memblock_reserve()
  metag: copy devicetree to non-init memory
  metag: cleanup metag_ksyms.c includes
  metag: move mm/init.c exports out of metag_ksyms.c
  metag: move usercopy.c exports out of metag_ksyms.c
  metag: move setup.c exports out of metag_ksyms.c
  metag: move kick.c exports out of metag_ksyms.c
  metag: move traps.c exports out of metag_ksyms.c
  metag: move irq enable out of irqflags.h on SMP
  ...

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Conflicts:
	arch/x86/kernel/kvmclock.c
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile | 6
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 11
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 2
-rw-r--r--  arch/x86/kernel/amd_gart_64.c | 5
-rw-r--r--  arch/x86/kernel/apb_timer.c | 10
-rw-r--r--  arch/x86/kernel/apic/apic.c | 30
-rw-r--r--  arch/x86/kernel/apic/apic_numachip.c | 1
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 457
-rw-r--r--  arch/x86/kernel/apic/ipi.c | 2
-rw-r--r--  arch/x86/kernel/apic/x2apic_phys.c | 21
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 206
-rw-r--r--  arch/x86/kernel/apm_32.c | 68
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 68
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 27
-rw-r--r--  arch/x86/kernel/cpu/common.c | 17
-rw-r--r--  arch/x86/kernel/cpu/hypervisor.c | 7
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 3
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 9
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-internal.h | 2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-severity.c | 4
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 225
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c | 8
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p5.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/winchip.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mshyperv.c | 54
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 2
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 21
-rw-r--r--  arch/x86/kernel/cpu/perf_event.h | 25
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd.c | 322
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd_ibs.c | 2
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 29
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_uncore.c | 6
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p6.c | 2
-rw-r--r--  arch/x86/kernel/cpu/proc.c | 9
-rw-r--r--  arch/x86/kernel/cpu/vmware.c | 13
-rw-r--r--  arch/x86/kernel/cpuid.c | 4
-rw-r--r--  arch/x86/kernel/dumpstack.c | 2
-rw-r--r--  arch/x86/kernel/e820.c | 16
-rw-r--r--  arch/x86/kernel/entry_32.S | 56
-rw-r--r--  arch/x86/kernel/entry_64.S | 51
-rw-r--r--  arch/x86/kernel/ftrace.c | 4
-rw-r--r--  arch/x86/kernel/head.c | 53
-rw-r--r--  arch/x86/kernel/head32.c | 21
-rw-r--r--  arch/x86/kernel/head64.c | 146
-rw-r--r--  arch/x86/kernel/head_32.S | 113
-rw-r--r--  arch/x86/kernel/head_64.S | 216
-rw-r--r--  arch/x86/kernel/hpet.c | 2
-rw-r--r--  arch/x86/kernel/i386_ksyms_32.c | 1
-rw-r--r--  arch/x86/kernel/ioport.c | 3
-rw-r--r--  arch/x86/kernel/irqinit.c | 40
-rw-r--r--  arch/x86/kernel/kprobes/Makefile | 7
-rw-r--r--  arch/x86/kernel/kprobes/common.h (renamed from arch/x86/kernel/kprobes-common.h) | 11
-rw-r--r--  arch/x86/kernel/kprobes/core.c (renamed from arch/x86/kernel/kprobes.c) | 84
-rw-r--r--  arch/x86/kernel/kprobes/ftrace.c | 93
-rw-r--r--  arch/x86/kernel/kprobes/opt.c (renamed from arch/x86/kernel/kprobes-opt.c) | 2
-rw-r--r--  arch/x86/kernel/kvm.c | 24
-rw-r--r--  arch/x86/kernel/kvmclock.c | 4
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c | 171
-rw-r--r--  arch/x86/kernel/microcode_core.c | 7
-rw-r--r--  arch/x86/kernel/microcode_core_early.c | 76
-rw-r--r--  arch/x86/kernel/microcode_intel.c | 198
-rw-r--r--  arch/x86/kernel/microcode_intel_early.c | 796
-rw-r--r--  arch/x86/kernel/microcode_intel_lib.c | 174
-rw-r--r--  arch/x86/kernel/msr.c | 12
-rw-r--r--  arch/x86/kernel/nmi.c | 1
-rw-r--r--  arch/x86/kernel/pci-dma.c | 4
-rw-r--r--  arch/x86/kernel/process.c | 122
-rw-r--r--  arch/x86/kernel/process_64.c | 2
-rw-r--r--  arch/x86/kernel/ptrace.c | 2
-rw-r--r--  arch/x86/kernel/pvclock.c | 2
-rw-r--r--  arch/x86/kernel/quirks.c | 4
-rw-r--r--  arch/x86/kernel/reboot.c | 2
-rw-r--r--  arch/x86/kernel/rtc.c | 1
-rw-r--r--  arch/x86/kernel/setup.c | 367
-rw-r--r--  arch/x86/kernel/signal.c | 213
-rw-r--r--  arch/x86/kernel/smpboot.c | 2
-rw-r--r--  arch/x86/kernel/step.c | 9
-rw-r--r--  arch/x86/kernel/sys_x86_64.c | 2
-rw-r--r--  arch/x86/kernel/traps.c | 15
-rw-r--r--  arch/x86/kernel/tsc.c | 3
-rw-r--r--  arch/x86/kernel/uprobes.c | 4
-rw-r--r--  arch/x86/kernel/vm86_32.c | 8
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 110
-rw-r--r--  arch/x86/kernel/x8664_ksyms_64.c | 3
-rw-r--r--  arch/x86/kernel/x86_init.c | 28
85 files changed, 3100 insertions, 1869 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 34e923a53762..7bd3bd310106 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -65,8 +65,7 @@ obj-$(CONFIG_X86_TSC) += trace_clock.o
 obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
-obj-$(CONFIG_KPROBES) += kprobes.o
-obj-$(CONFIG_OPTPROBES) += kprobes-opt.o
+obj-y += kprobes/
 obj-$(CONFIG_MODULES) += module.o
 obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
 obj-$(CONFIG_KGDB) += kgdb.o
@@ -88,6 +87,9 @@ obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
 
 obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
 
+obj-$(CONFIG_MICROCODE_EARLY) += microcode_core_early.o
+obj-$(CONFIG_MICROCODE_INTEL_EARLY) += microcode_intel_early.o
+obj-$(CONFIG_MICROCODE_INTEL_LIB) += microcode_intel_lib.o
 microcode-y := microcode_core.o
 microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
 microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index e48cafcf92ae..230c8ea878e5 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -51,7 +51,6 @@ EXPORT_SYMBOL(acpi_disabled);
 
 #ifdef CONFIG_X86_64
 # include <asm/proto.h>
-# include <asm/numa_64.h>
 #endif /* X86 */
 
 #define BAD_MADT_ENTRY(entry, end) ( \
@@ -697,6 +696,10 @@ EXPORT_SYMBOL(acpi_map_lsapic);
 
 int acpi_unmap_lsapic(int cpu)
 {
+#ifdef CONFIG_ACPI_NUMA
+	set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE);
+#endif
+
 	per_cpu(x86_cpu_to_apicid, cpu) = -1;
 	set_cpu_present(cpu, false);
 	num_processors--;
@@ -1706,3 +1709,9 @@ int __acpi_release_global_lock(unsigned int *lock)
 	} while (unlikely (val != old));
 	return old & 0x1;
 }
+
+void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size)
+{
+	e820_add_region(addr, size, E820_ACPI);
+	update_e820();
+}
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index d5e0d717005a..0532f5d6e4ef 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -69,7 +69,7 @@ int acpi_suspend_lowlevel(void)
 
 #ifndef CONFIG_64BIT
 	header->pmode_entry = (u32)&wakeup_pmode_return;
-	header->pmode_cr3 = (u32)__pa(&initial_page_table);
+	header->pmode_cr3 = (u32)__pa_symbol(initial_page_table);
 	saved_magic = 0x12345678;
 #else /* CONFIG_64BIT */
 #ifdef CONFIG_SMP
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index e66311200cbd..b574b295a2f9 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -768,10 +768,9 @@ int __init gart_iommu_init(void)
 	aper_base = info.aper_base;
 	end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
 
-	if (end_pfn > max_low_pfn_mapped) {
-		start_pfn = (aper_base>>PAGE_SHIFT);
+	start_pfn = PFN_DOWN(aper_base);
+	if (!pfn_range_is_mapped(start_pfn, end_pfn))
 		init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
-	}
 
 	pr_info("PCI-DMA: using GART IOMMU.\n");
 	iommu_size = check_iommu_size(info.aper_base, aper_size);
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index afdc3f756dea..c9876efecafb 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -240,7 +240,7 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
 	dw_apb_clockevent_pause(adev->timer);
 	if (system_state == SYSTEM_RUNNING) {
 		pr_debug("skipping APBT CPU %lu offline\n", cpu);
-	} else if (adev) {
+	} else {
 		pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
 		dw_apb_clockevent_stop(adev->timer);
 	}
@@ -311,7 +311,6 @@ void __init apbt_time_init(void)
 #ifdef CONFIG_SMP
 	int i;
 	struct sfi_timer_table_entry *p_mtmr;
-	unsigned int percpu_timer;
 	struct apbt_dev *adev;
 #endif
 
@@ -346,13 +345,10 @@ void __init apbt_time_init(void)
 		return;
 	}
 	pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus());
-	if (num_possible_cpus() <= sfi_mtimer_num) {
-		percpu_timer = 1;
+	if (num_possible_cpus() <= sfi_mtimer_num)
 		apbt_num_timers_used = num_possible_cpus();
-	} else {
-		percpu_timer = 0;
+	else
 		apbt_num_timers_used = 1;
-	}
 	pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used);
 
 	/* here we set up per CPU timer data structure */
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index b994cc84aa7e..904611bf0e5a 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -131,7 +131,7 @@ static int __init parse_lapic(char *arg)
 {
 	if (config_enabled(CONFIG_X86_32) && !arg)
 		force_enable_local_apic = 1;
-	else if (!strncmp(arg, "notscdeadline", 13))
+	else if (arg && !strncmp(arg, "notscdeadline", 13))
 		setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
 	return 0;
 }
@@ -1477,8 +1477,7 @@ void __init bsp_end_local_APIC_setup(void)
 	 * Now that local APIC setup is completed for BP, configure the fault
 	 * handling for interrupt remapping.
 	 */
-	if (irq_remapping_enabled)
-		irq_remap_enable_fault_handling();
+	irq_remap_enable_fault_handling();
 
 }
 
@@ -2251,8 +2250,7 @@ static int lapic_suspend(void)
 	local_irq_save(flags);
 	disable_local_APIC();
 
-	if (irq_remapping_enabled)
-		irq_remapping_disable();
+	irq_remapping_disable();
 
 	local_irq_restore(flags);
 	return 0;
@@ -2268,16 +2266,15 @@ static void lapic_resume(void)
 		return;
 
 	local_irq_save(flags);
-	if (irq_remapping_enabled) {
-		/*
-		 * IO-APIC and PIC have their own resume routines.
-		 * We just mask them here to make sure the interrupt
-		 * subsystem is completely quiet while we enable x2apic
-		 * and interrupt-remapping.
-		 */
-		mask_ioapic_entries();
-		legacy_pic->mask_all();
-	}
+
+	/*
+	 * IO-APIC and PIC have their own resume routines.
+	 * We just mask them here to make sure the interrupt
+	 * subsystem is completely quiet while we enable x2apic
+	 * and interrupt-remapping.
+	 */
+	mask_ioapic_entries();
+	legacy_pic->mask_all();
 
 	if (x2apic_mode)
 		enable_x2apic();
@@ -2320,8 +2317,7 @@ static void lapic_resume(void)
 	apic_write(APIC_ESR, 0);
 	apic_read(APIC_ESR);
 
-	if (irq_remapping_enabled)
-		irq_remapping_reenable(x2apic_mode);
+	irq_remapping_reenable(x2apic_mode);
 
 	local_irq_restore(flags);
 }
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 9c2aa89a11cb..9a9110918ca7 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -28,6 +28,7 @@
 #include <asm/apic.h>
 #include <asm/ipi.h>
 #include <asm/apic_flat_64.h>
+#include <asm/pgtable.h>
 
 static int numachip_system __read_mostly;
 
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index b739d398bb29..9ed796ccc32c 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -68,22 +68,6 @@
68#define for_each_irq_pin(entry, head) \ 68#define for_each_irq_pin(entry, head) \
69 for (entry = head; entry; entry = entry->next) 69 for (entry = head; entry; entry = entry->next)
70 70
71#ifdef CONFIG_IRQ_REMAP
72static void irq_remap_modify_chip_defaults(struct irq_chip *chip);
73static inline bool irq_remapped(struct irq_cfg *cfg)
74{
75 return cfg->irq_2_iommu.iommu != NULL;
76}
77#else
78static inline bool irq_remapped(struct irq_cfg *cfg)
79{
80 return false;
81}
82static inline void irq_remap_modify_chip_defaults(struct irq_chip *chip)
83{
84}
85#endif
86
87/* 71/*
88 * Is the SiS APIC rmw bug present ? 72 * Is the SiS APIC rmw bug present ?
89 * -1 = don't know, 0 = no, 1 = yes 73 * -1 = don't know, 0 = no, 1 = yes
@@ -300,9 +284,9 @@ static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
300 return cfg; 284 return cfg;
301} 285}
302 286
303static int alloc_irq_from(unsigned int from, int node) 287static int alloc_irqs_from(unsigned int from, unsigned int count, int node)
304{ 288{
305 return irq_alloc_desc_from(from, node); 289 return irq_alloc_descs_from(from, count, node);
306} 290}
307 291
308static void free_irq_at(unsigned int at, struct irq_cfg *cfg) 292static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
@@ -326,7 +310,7 @@ static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
326 + (mpc_ioapic_addr(idx) & ~PAGE_MASK); 310 + (mpc_ioapic_addr(idx) & ~PAGE_MASK);
327} 311}
328 312
329static inline void io_apic_eoi(unsigned int apic, unsigned int vector) 313void io_apic_eoi(unsigned int apic, unsigned int vector)
330{ 314{
331 struct io_apic __iomem *io_apic = io_apic_base(apic); 315 struct io_apic __iomem *io_apic = io_apic_base(apic);
332 writel(vector, &io_apic->eoi); 316 writel(vector, &io_apic->eoi);
@@ -573,19 +557,10 @@ static void unmask_ioapic_irq(struct irq_data *data)
573 * Otherwise, we simulate the EOI message manually by changing the trigger 557 * Otherwise, we simulate the EOI message manually by changing the trigger
574 * mode to edge and then back to level, with RTE being masked during this. 558 * mode to edge and then back to level, with RTE being masked during this.
575 */ 559 */
576static void __eoi_ioapic_pin(int apic, int pin, int vector, struct irq_cfg *cfg) 560void native_eoi_ioapic_pin(int apic, int pin, int vector)
577{ 561{
578 if (mpc_ioapic_ver(apic) >= 0x20) { 562 if (mpc_ioapic_ver(apic) >= 0x20) {
579 /* 563 io_apic_eoi(apic, vector);
580 * Intr-remapping uses pin number as the virtual vector
581 * in the RTE. Actual vector is programmed in
582 * intr-remapping table entry. Hence for the io-apic
583 * EOI we use the pin number.
584 */
585 if (cfg && irq_remapped(cfg))
586 io_apic_eoi(apic, pin);
587 else
588 io_apic_eoi(apic, vector);
589 } else { 564 } else {
590 struct IO_APIC_route_entry entry, entry1; 565 struct IO_APIC_route_entry entry, entry1;
591 566
@@ -606,14 +581,15 @@ static void __eoi_ioapic_pin(int apic, int pin, int vector, struct irq_cfg *cfg)
606 } 581 }
607} 582}
608 583
609static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) 584void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
610{ 585{
611 struct irq_pin_list *entry; 586 struct irq_pin_list *entry;
612 unsigned long flags; 587 unsigned long flags;
613 588
614 raw_spin_lock_irqsave(&ioapic_lock, flags); 589 raw_spin_lock_irqsave(&ioapic_lock, flags);
615 for_each_irq_pin(entry, cfg->irq_2_pin) 590 for_each_irq_pin(entry, cfg->irq_2_pin)
616 __eoi_ioapic_pin(entry->apic, entry->pin, cfg->vector, cfg); 591 x86_io_apic_ops.eoi_ioapic_pin(entry->apic, entry->pin,
592 cfg->vector);
617 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 593 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
618} 594}
619 595
@@ -650,7 +626,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
650 } 626 }
651 627
652 raw_spin_lock_irqsave(&ioapic_lock, flags); 628 raw_spin_lock_irqsave(&ioapic_lock, flags);
653 __eoi_ioapic_pin(apic, pin, entry.vector, NULL); 629 x86_io_apic_ops.eoi_ioapic_pin(apic, pin, entry.vector);
654 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 630 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
655 } 631 }
656 632
@@ -1304,25 +1280,18 @@ static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg,
1304 fasteoi = false; 1280 fasteoi = false;
1305 } 1281 }
1306 1282
1307 if (irq_remapped(cfg)) { 1283 if (setup_remapped_irq(irq, cfg, chip))
1308 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
1309 irq_remap_modify_chip_defaults(chip);
1310 fasteoi = trigger != 0; 1284 fasteoi = trigger != 0;
1311 }
1312 1285
1313 hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq; 1286 hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq;
1314 irq_set_chip_and_handler_name(irq, chip, hdl, 1287 irq_set_chip_and_handler_name(irq, chip, hdl,
1315 fasteoi ? "fasteoi" : "edge"); 1288 fasteoi ? "fasteoi" : "edge");
1316} 1289}
1317 1290
1318static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry, 1291int native_setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry,
1319 unsigned int destination, int vector, 1292 unsigned int destination, int vector,
1320 struct io_apic_irq_attr *attr) 1293 struct io_apic_irq_attr *attr)
1321{ 1294{
1322 if (irq_remapping_enabled)
1323 return setup_ioapic_remapped_entry(irq, entry, destination,
1324 vector, attr);
1325
1326 memset(entry, 0, sizeof(*entry)); 1295 memset(entry, 0, sizeof(*entry));
1327 1296
1328 entry->delivery_mode = apic->irq_delivery_mode; 1297 entry->delivery_mode = apic->irq_delivery_mode;
@@ -1370,8 +1339,8 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
1370 attr->ioapic, mpc_ioapic_id(attr->ioapic), attr->ioapic_pin, 1339 attr->ioapic, mpc_ioapic_id(attr->ioapic), attr->ioapic_pin,
1371 cfg->vector, irq, attr->trigger, attr->polarity, dest); 1340 cfg->vector, irq, attr->trigger, attr->polarity, dest);
1372 1341
1373 if (setup_ioapic_entry(irq, &entry, dest, cfg->vector, attr)) { 1342 if (x86_io_apic_ops.setup_entry(irq, &entry, dest, cfg->vector, attr)) {
1374 pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n", 1343 pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n",
1375 mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); 1344 mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
1376 __clear_irq_vector(irq, cfg); 1345 __clear_irq_vector(irq, cfg);
1377 1346
@@ -1479,9 +1448,6 @@ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,
1479 struct IO_APIC_route_entry entry; 1448 struct IO_APIC_route_entry entry;
1480 unsigned int dest; 1449 unsigned int dest;
1481 1450
1482 if (irq_remapping_enabled)
1483 return;
1484
1485 memset(&entry, 0, sizeof(entry)); 1451 memset(&entry, 0, sizeof(entry));
1486 1452
1487 /* 1453 /*
@@ -1513,9 +1479,63 @@ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,
1513 ioapic_write_entry(ioapic_idx, pin, entry); 1479 ioapic_write_entry(ioapic_idx, pin, entry);
1514} 1480}
1515 1481
1516__apicdebuginit(void) print_IO_APIC(int ioapic_idx) 1482void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
1517{ 1483{
1518 int i; 1484 int i;
1485
1486 pr_debug(" NR Dst Mask Trig IRR Pol Stat Dmod Deli Vect:\n");
1487
1488 for (i = 0; i <= nr_entries; i++) {
1489 struct IO_APIC_route_entry entry;
1490
1491 entry = ioapic_read_entry(apic, i);
1492
1493 pr_debug(" %02x %02X ", i, entry.dest);
1494 pr_cont("%1d %1d %1d %1d %1d "
1495 "%1d %1d %02X\n",
1496 entry.mask,
1497 entry.trigger,
1498 entry.irr,
1499 entry.polarity,
1500 entry.delivery_status,
1501 entry.dest_mode,
1502 entry.delivery_mode,
1503 entry.vector);
1504 }
1505}
1506
1507void intel_ir_io_apic_print_entries(unsigned int apic,
1508 unsigned int nr_entries)
1509{
1510 int i;
1511
1512 pr_debug(" NR Indx Fmt Mask Trig IRR Pol Stat Indx2 Zero Vect:\n");
1513
1514 for (i = 0; i <= nr_entries; i++) {
1515 struct IR_IO_APIC_route_entry *ir_entry;
1516 struct IO_APIC_route_entry entry;
1517
1518 entry = ioapic_read_entry(apic, i);
1519
1520 ir_entry = (struct IR_IO_APIC_route_entry *)&entry;
1521
1522 pr_debug(" %02x %04X ", i, ir_entry->index);
1523 pr_cont("%1d %1d %1d %1d %1d "
1524 "%1d %1d %X %02X\n",
1525 ir_entry->format,
1526 ir_entry->mask,
1527 ir_entry->trigger,
1528 ir_entry->irr,
1529 ir_entry->polarity,
1530 ir_entry->delivery_status,
1531 ir_entry->index2,
1532 ir_entry->zero,
1533 ir_entry->vector);
1534 }
1535}
1536
1537__apicdebuginit(void) print_IO_APIC(int ioapic_idx)
1538{
1519 union IO_APIC_reg_00 reg_00; 1539 union IO_APIC_reg_00 reg_00;
1520 union IO_APIC_reg_01 reg_01; 1540 union IO_APIC_reg_01 reg_01;
1521 union IO_APIC_reg_02 reg_02; 1541 union IO_APIC_reg_02 reg_02;
@@ -1568,58 +1588,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
1568 1588
1569 printk(KERN_DEBUG ".... IRQ redirection table:\n"); 1589 printk(KERN_DEBUG ".... IRQ redirection table:\n");
1570 1590
1571 if (irq_remapping_enabled) { 1591 x86_io_apic_ops.print_entries(ioapic_idx, reg_01.bits.entries);
1572 printk(KERN_DEBUG " NR Indx Fmt Mask Trig IRR"
1573 " Pol Stat Indx2 Zero Vect:\n");
1574 } else {
1575 printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
1576 " Stat Dmod Deli Vect:\n");
1577 }
1578
1579 for (i = 0; i <= reg_01.bits.entries; i++) {
1580 if (irq_remapping_enabled) {
1581 struct IO_APIC_route_entry entry;
1582 struct IR_IO_APIC_route_entry *ir_entry;
1583
1584 entry = ioapic_read_entry(ioapic_idx, i);
1585 ir_entry = (struct IR_IO_APIC_route_entry *) &entry;
1586 printk(KERN_DEBUG " %02x %04X ",
1587 i,
1588 ir_entry->index
1589 );
1590 pr_cont("%1d %1d %1d %1d %1d "
1591 "%1d %1d %X %02X\n",
1592 ir_entry->format,
1593 ir_entry->mask,
1594 ir_entry->trigger,
1595 ir_entry->irr,
1596 ir_entry->polarity,
1597 ir_entry->delivery_status,
1598 ir_entry->index2,
1599 ir_entry->zero,
1600 ir_entry->vector
1601 );
1602 } else {
1603 struct IO_APIC_route_entry entry;
1604
1605 entry = ioapic_read_entry(ioapic_idx, i);
1606 printk(KERN_DEBUG " %02x %02X ",
1607 i,
1608 entry.dest
1609 );
1610 pr_cont("%1d %1d %1d %1d %1d "
1611 "%1d %1d %02X\n",
1612 entry.mask,
1613 entry.trigger,
1614 entry.irr,
1615 entry.polarity,
1616 entry.delivery_status,
1617 entry.dest_mode,
1618 entry.delivery_mode,
1619 entry.vector
1620 );
1621 }
1622 }
1623} 1592}
1624 1593
1625__apicdebuginit(void) print_IO_APICs(void) 1594__apicdebuginit(void) print_IO_APICs(void)
@@ -1921,30 +1890,14 @@ void __init enable_IO_APIC(void)
1921 clear_IO_APIC(); 1890 clear_IO_APIC();
1922} 1891}
1923 1892
1924/* 1893void native_disable_io_apic(void)
1925 * Not an __init, needed by the reboot code
1926 */
1927void disable_IO_APIC(void)
1928{ 1894{
1929 /* 1895 /*
1930 * Clear the IO-APIC before rebooting:
1931 */
1932 clear_IO_APIC();
1933
1934 if (!legacy_pic->nr_legacy_irqs)
1935 return;
1936
1937 /*
1938 * If the i8259 is routed through an IOAPIC 1896 * If the i8259 is routed through an IOAPIC
1939 * Put that IOAPIC in virtual wire mode 1897 * Put that IOAPIC in virtual wire mode
1940 * so legacy interrupts can be delivered. 1898 * so legacy interrupts can be delivered.
1941 *
1942 * With interrupt-remapping, for now we will use virtual wire A mode,
1943 * as virtual wire B is little complex (need to configure both
1944 * IOAPIC RTE as well as interrupt-remapping table entry).
1945 * As this gets called during crash dump, keep this simple for now.
1946 */ 1899 */
1947 if (ioapic_i8259.pin != -1 && !irq_remapping_enabled) { 1900 if (ioapic_i8259.pin != -1) {
1948 struct IO_APIC_route_entry entry; 1901 struct IO_APIC_route_entry entry;
1949 1902
1950 memset(&entry, 0, sizeof(entry)); 1903 memset(&entry, 0, sizeof(entry));
@@ -1964,12 +1917,25 @@ void disable_IO_APIC(void)
1964 ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); 1917 ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
1965 } 1918 }
1966 1919
1920 if (cpu_has_apic || apic_from_smp_config())
1921 disconnect_bsp_APIC(ioapic_i8259.pin != -1);
1922
1923}
1924
1925/*
1926 * Not an __init, needed by the reboot code
1927 */
1928void disable_IO_APIC(void)
1929{
1967 /* 1930 /*
1968 * Use virtual wire A mode when interrupt remapping is enabled. 1931 * Clear the IO-APIC before rebooting:
1969 */ 1932 */
1970 if (cpu_has_apic || apic_from_smp_config()) 1933 clear_IO_APIC();
1971 disconnect_bsp_APIC(!irq_remapping_enabled && 1934
1972 ioapic_i8259.pin != -1); 1935 if (!legacy_pic->nr_legacy_irqs)
1936 return;
1937
1938 x86_io_apic_ops.disable();
1973} 1939}
1974 1940
1975#ifdef CONFIG_X86_32 1941#ifdef CONFIG_X86_32
@@ -2322,12 +2288,8 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
2322 2288
2323 apic = entry->apic; 2289 apic = entry->apic;
2324 pin = entry->pin; 2290 pin = entry->pin;
2325 /* 2291
2326 * With interrupt-remapping, destination information comes 2292 io_apic_write(apic, 0x11 + pin*2, dest);
2327 * from interrupt-remapping table entry.
2328 */
2329 if (!irq_remapped(cfg))
2330 io_apic_write(apic, 0x11 + pin*2, dest);
2331 reg = io_apic_read(apic, 0x10 + pin*2); 2293 reg = io_apic_read(apic, 0x10 + pin*2);
2332 reg &= ~IO_APIC_REDIR_VECTOR_MASK; 2294 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
2333 reg |= vector; 2295 reg |= vector;
@@ -2369,9 +2331,10 @@ int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2369 return 0; 2331 return 0;
2370} 2332}
2371 2333
2372static int 2334
2373ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, 2335int native_ioapic_set_affinity(struct irq_data *data,
2374 bool force) 2336 const struct cpumask *mask,
2337 bool force)
2375{ 2338{
2376 unsigned int dest, irq = data->irq; 2339 unsigned int dest, irq = data->irq;
2377 unsigned long flags; 2340 unsigned long flags;
@@ -2548,33 +2511,6 @@ static void ack_apic_level(struct irq_data *data)
2548 ioapic_irqd_unmask(data, cfg, masked); 2511 ioapic_irqd_unmask(data, cfg, masked);
2549} 2512}
2550 2513
2551#ifdef CONFIG_IRQ_REMAP
2552static void ir_ack_apic_edge(struct irq_data *data)
2553{
2554 ack_APIC_irq();
2555}
2556
2557static void ir_ack_apic_level(struct irq_data *data)
2558{
2559 ack_APIC_irq();
2560 eoi_ioapic_irq(data->irq, data->chip_data);
2561}
2562
2563static void ir_print_prefix(struct irq_data *data, struct seq_file *p)
2564{
2565 seq_printf(p, " IR-%s", data->chip->name);
2566}
2567
2568static void irq_remap_modify_chip_defaults(struct irq_chip *chip)
2569{
2570 chip->irq_print_chip = ir_print_prefix;
2571 chip->irq_ack = ir_ack_apic_edge;
2572 chip->irq_eoi = ir_ack_apic_level;
2573
2574 chip->irq_set_affinity = set_remapped_irq_affinity;
2575}
2576#endif /* CONFIG_IRQ_REMAP */
2577
2578static struct irq_chip ioapic_chip __read_mostly = { 2514static struct irq_chip ioapic_chip __read_mostly = {
2579 .name = "IO-APIC", 2515 .name = "IO-APIC",
2580 .irq_startup = startup_ioapic_irq, 2516 .irq_startup = startup_ioapic_irq,
@@ -2582,7 +2518,7 @@ static struct irq_chip ioapic_chip __read_mostly = {
2582 .irq_unmask = unmask_ioapic_irq, 2518 .irq_unmask = unmask_ioapic_irq,
2583 .irq_ack = ack_apic_edge, 2519 .irq_ack = ack_apic_edge,
2584 .irq_eoi = ack_apic_level, 2520 .irq_eoi = ack_apic_level,
2585 .irq_set_affinity = ioapic_set_affinity, 2521 .irq_set_affinity = native_ioapic_set_affinity,
2586 .irq_retrigger = ioapic_retrigger_irq, 2522 .irq_retrigger = ioapic_retrigger_irq,
2587}; 2523};
2588 2524
@@ -2781,8 +2717,7 @@ static inline void __init check_timer(void)
2781 * 8259A. 2717 * 8259A.
2782 */ 2718 */
2783 if (pin1 == -1) { 2719 if (pin1 == -1) {
2784 if (irq_remapping_enabled) 2720 panic_if_irq_remap("BIOS bug: timer not connected to IO-APIC");
2785 panic("BIOS bug: timer not connected to IO-APIC");
2786 pin1 = pin2; 2721 pin1 = pin2;
2787 apic1 = apic2; 2722 apic1 = apic2;
2788 no_pin1 = 1; 2723 no_pin1 = 1;
@@ -2814,8 +2749,7 @@ static inline void __init check_timer(void)
2814 clear_IO_APIC_pin(0, pin1); 2749 clear_IO_APIC_pin(0, pin1);
2815 goto out; 2750 goto out;
2816 } 2751 }
2817 if (irq_remapping_enabled) 2752 panic_if_irq_remap("timer doesn't work through Interrupt-remapped IO-APIC");
2818 panic("timer doesn't work through Interrupt-remapped IO-APIC");
2819 local_irq_disable(); 2753 local_irq_disable();
2820 clear_IO_APIC_pin(apic1, pin1); 2754 clear_IO_APIC_pin(apic1, pin1);
2821 if (!no_pin1) 2755 if (!no_pin1)
@@ -2982,37 +2916,58 @@ device_initcall(ioapic_init_ops);
2982/* 2916/*
2983 * Dynamic irq allocate and deallocation 2917 * Dynamic irq allocate and deallocation
2984 */ 2918 */
2985unsigned int create_irq_nr(unsigned int from, int node) 2919unsigned int __create_irqs(unsigned int from, unsigned int count, int node)
2986{ 2920{
2987 struct irq_cfg *cfg; 2921 struct irq_cfg **cfg;
2988 unsigned long flags; 2922 unsigned long flags;
2989 unsigned int ret = 0; 2923 int irq, i;
2990 int irq;
2991 2924
2992 if (from < nr_irqs_gsi) 2925 if (from < nr_irqs_gsi)
2993 from = nr_irqs_gsi; 2926 from = nr_irqs_gsi;
2994 2927
2995 irq = alloc_irq_from(from, node); 2928 cfg = kzalloc_node(count * sizeof(cfg[0]), GFP_KERNEL, node);
2996 if (irq < 0) 2929 if (!cfg)
2997 return 0;
2998 cfg = alloc_irq_cfg(irq, node);
2999 if (!cfg) {
3000 free_irq_at(irq, NULL);
3001 return 0; 2930 return 0;
2931
2932 irq = alloc_irqs_from(from, count, node);
2933 if (irq < 0)
2934 goto out_cfgs;
2935
2936 for (i = 0; i < count; i++) {
2937 cfg[i] = alloc_irq_cfg(irq + i, node);
2938 if (!cfg[i])
2939 goto out_irqs;
3002 } 2940 }
3003 2941
3004 raw_spin_lock_irqsave(&vector_lock, flags); 2942 raw_spin_lock_irqsave(&vector_lock, flags);
3005 if (!__assign_irq_vector(irq, cfg, apic->target_cpus())) 2943 for (i = 0; i < count; i++)
3006 ret = irq; 2944 if (__assign_irq_vector(irq + i, cfg[i], apic->target_cpus()))
2945 goto out_vecs;
3007 raw_spin_unlock_irqrestore(&vector_lock, flags); 2946 raw_spin_unlock_irqrestore(&vector_lock, flags);
3008 2947
3009 if (ret) { 2948 for (i = 0; i < count; i++) {
3010 irq_set_chip_data(irq, cfg); 2949 irq_set_chip_data(irq + i, cfg[i]);
3011 irq_clear_status_flags(irq, IRQ_NOREQUEST); 2950 irq_clear_status_flags(irq + i, IRQ_NOREQUEST);
3012 } else {
3013 free_irq_at(irq, cfg);
3014 } 2951 }
3015 return ret; 2952
2953 kfree(cfg);
2954 return irq;
2955
2956out_vecs:
2957 for (i--; i >= 0; i--)
2958 __clear_irq_vector(irq + i, cfg[i]);
2959 raw_spin_unlock_irqrestore(&vector_lock, flags);
2960out_irqs:
2961 for (i = 0; i < count; i++)
2962 free_irq_at(irq + i, cfg[i]);
2963out_cfgs:
2964 kfree(cfg);
2965 return 0;
2966}
2967
2968unsigned int create_irq_nr(unsigned int from, int node)
2969{
2970 return __create_irqs(from, 1, node);
3016} 2971}
3017 2972
3018int create_irq(void) 2973int create_irq(void)
@@ -3037,48 +2992,35 @@ void destroy_irq(unsigned int irq)
3037 2992
3038 irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); 2993 irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE);
3039 2994
3040 if (irq_remapped(cfg)) 2995 free_remapped_irq(irq);
3041 free_remapped_irq(irq); 2996
3042 raw_spin_lock_irqsave(&vector_lock, flags); 2997 raw_spin_lock_irqsave(&vector_lock, flags);
3043 __clear_irq_vector(irq, cfg); 2998 __clear_irq_vector(irq, cfg);
3044 raw_spin_unlock_irqrestore(&vector_lock, flags); 2999 raw_spin_unlock_irqrestore(&vector_lock, flags);
3045 free_irq_at(irq, cfg); 3000 free_irq_at(irq, cfg);
3046} 3001}
3047 3002
3003void destroy_irqs(unsigned int irq, unsigned int count)
3004{
3005 unsigned int i;
3006
3007 for (i = 0; i < count; i++)
3008 destroy_irq(irq + i);
3009}
3010
3048/* 3011/*
3049 * MSI message composition 3012 * MSI message composition
3050 */ 3013 */
3051#ifdef CONFIG_PCI_MSI 3014void native_compose_msi_msg(struct pci_dev *pdev,
3052static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, 3015 unsigned int irq, unsigned int dest,
3053 struct msi_msg *msg, u8 hpet_id) 3016 struct msi_msg *msg, u8 hpet_id)
3054{ 3017{
3055 struct irq_cfg *cfg; 3018 struct irq_cfg *cfg = irq_cfg(irq);
3056 int err;
3057 unsigned dest;
3058
3059 if (disable_apic)
3060 return -ENXIO;
3061
3062 cfg = irq_cfg(irq);
3063 err = assign_irq_vector(irq, cfg, apic->target_cpus());
3064 if (err)
3065 return err;
3066 3019
3067 err = apic->cpu_mask_to_apicid_and(cfg->domain, 3020 msg->address_hi = MSI_ADDR_BASE_HI;
3068 apic->target_cpus(), &dest);
3069 if (err)
3070 return err;
3071
3072 if (irq_remapped(cfg)) {
3073 compose_remapped_msi_msg(pdev, irq, dest, msg, hpet_id);
3074 return err;
3075 }
3076 3021
3077 if (x2apic_enabled()) 3022 if (x2apic_enabled())
3078 msg->address_hi = MSI_ADDR_BASE_HI | 3023 msg->address_hi |= MSI_ADDR_EXT_DEST_ID(dest);
3079 MSI_ADDR_EXT_DEST_ID(dest);
3080 else
3081 msg->address_hi = MSI_ADDR_BASE_HI;
3082 3024
3083 msg->address_lo = 3025 msg->address_lo =
3084 MSI_ADDR_BASE_LO | 3026 MSI_ADDR_BASE_LO |
@@ -3097,8 +3039,32 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3097 MSI_DATA_DELIVERY_FIXED: 3039 MSI_DATA_DELIVERY_FIXED:
3098 MSI_DATA_DELIVERY_LOWPRI) | 3040 MSI_DATA_DELIVERY_LOWPRI) |
3099 MSI_DATA_VECTOR(cfg->vector); 3041 MSI_DATA_VECTOR(cfg->vector);
3042}
3100 3043
3101 return err; 3044#ifdef CONFIG_PCI_MSI
3045static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3046 struct msi_msg *msg, u8 hpet_id)
3047{
3048 struct irq_cfg *cfg;
3049 int err;
3050 unsigned dest;
3051
3052 if (disable_apic)
3053 return -ENXIO;
3054
3055 cfg = irq_cfg(irq);
3056 err = assign_irq_vector(irq, cfg, apic->target_cpus());
3057 if (err)
3058 return err;
3059
3060 err = apic->cpu_mask_to_apicid_and(cfg->domain,
3061 apic->target_cpus(), &dest);
3062 if (err)
3063 return err;
3064
3065 x86_msi.compose_msi_msg(pdev, irq, dest, msg, hpet_id);
3066
3067 return 0;
3102} 3068}
3103 3069
3104static int 3070static int
@@ -3136,23 +3102,28 @@ static struct irq_chip msi_chip = {
3136 .irq_retrigger = ioapic_retrigger_irq, 3102 .irq_retrigger = ioapic_retrigger_irq,
3137}; 3103};
3138 3104
3139static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) 3105int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
3106 unsigned int irq_base, unsigned int irq_offset)
3140{ 3107{
3141 struct irq_chip *chip = &msi_chip; 3108 struct irq_chip *chip = &msi_chip;
3142 struct msi_msg msg; 3109 struct msi_msg msg;
3110 unsigned int irq = irq_base + irq_offset;
3143 int ret; 3111 int ret;
3144 3112
3145 ret = msi_compose_msg(dev, irq, &msg, -1); 3113 ret = msi_compose_msg(dev, irq, &msg, -1);
3146 if (ret < 0) 3114 if (ret < 0)
3147 return ret; 3115 return ret;
3148 3116
3149 irq_set_msi_desc(irq, msidesc); 3117 irq_set_msi_desc_off(irq_base, irq_offset, msidesc);
3150 write_msi_msg(irq, &msg);
3151 3118
3152 if (irq_remapped(irq_get_chip_data(irq))) { 3119 /*
3153 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); 3120 * MSI-X message is written per-IRQ, the offset is always 0.
3154 irq_remap_modify_chip_defaults(chip); 3121 * MSI message denotes a contiguous group of IRQs, written for 0th IRQ.
3155 } 3122 */
3123 if (!irq_offset)
3124 write_msi_msg(irq, &msg);
3125
3126 setup_remapped_irq(irq, irq_get_chip_data(irq), chip);
3156 3127
3157 irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); 3128 irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
3158 3129
@@ -3163,46 +3134,26 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3163 3134
3164int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) 3135int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3165{ 3136{
3166 int node, ret, sub_handle, index = 0;
3167 unsigned int irq, irq_want; 3137 unsigned int irq, irq_want;
3168 struct msi_desc *msidesc; 3138 struct msi_desc *msidesc;
3139 int node, ret;
3169 3140
3170 /* x86 doesn't support multiple MSI yet */ 3141 /* Multiple MSI vectors only supported with interrupt remapping */
3171 if (type == PCI_CAP_ID_MSI && nvec > 1) 3142 if (type == PCI_CAP_ID_MSI && nvec > 1)
3172 return 1; 3143 return 1;
3173 3144
3174 node = dev_to_node(&dev->dev); 3145 node = dev_to_node(&dev->dev);
3175 irq_want = nr_irqs_gsi; 3146 irq_want = nr_irqs_gsi;
3176 sub_handle = 0;
3177 list_for_each_entry(msidesc, &dev->msi_list, list) { 3147 list_for_each_entry(msidesc, &dev->msi_list, list) {
3178 irq = create_irq_nr(irq_want, node); 3148 irq = create_irq_nr(irq_want, node);
3179 if (irq == 0) 3149 if (irq == 0)
3180 return -1; 3150 return -ENOSPC;
3151
3181 irq_want = irq + 1; 3152 irq_want = irq + 1;
3182 if (!irq_remapping_enabled)
3183 goto no_ir;
3184 3153
3185 if (!sub_handle) { 3154 ret = setup_msi_irq(dev, msidesc, irq, 0);
3186 /*
3187 * allocate the consecutive block of IRTE's
3188 * for 'nvec'
3189 */
3190 index = msi_alloc_remapped_irq(dev, irq, nvec);
3191 if (index < 0) {
3192 ret = index;
3193 goto error;
3194 }
3195 } else {
3196 ret = msi_setup_remapped_irq(dev, irq, index,
3197 sub_handle);
3198 if (ret < 0)
3199 goto error;
3200 }
3201no_ir:
3202 ret = setup_msi_irq(dev, msidesc, irq);
3203 if (ret < 0) 3155 if (ret < 0)
3204 goto error; 3156 goto error;
3205 sub_handle++;
3206 } 3157 }
3207 return 0; 3158 return 0;
3208 3159
@@ -3298,26 +3249,19 @@ static struct irq_chip hpet_msi_type = {
3298 .irq_retrigger = ioapic_retrigger_irq, 3249 .irq_retrigger = ioapic_retrigger_irq,
3299}; 3250};
3300 3251
3301int arch_setup_hpet_msi(unsigned int irq, unsigned int id) 3252int default_setup_hpet_msi(unsigned int irq, unsigned int id)
3302{ 3253{
3303 struct irq_chip *chip = &hpet_msi_type; 3254 struct irq_chip *chip = &hpet_msi_type;
3304 struct msi_msg msg; 3255 struct msi_msg msg;
3305 int ret; 3256 int ret;
3306 3257
3307 if (irq_remapping_enabled) {
3308 ret = setup_hpet_msi_remapped(irq, id);
3309 if (ret)
3310 return ret;
3311 }
3312
3313 ret = msi_compose_msg(NULL, irq, &msg, id); 3258 ret = msi_compose_msg(NULL, irq, &msg, id);
3314 if (ret < 0) 3259 if (ret < 0)
3315 return ret; 3260 return ret;
3316 3261
3317 hpet_msi_write(irq_get_handler_data(irq), &msg); 3262 hpet_msi_write(irq_get_handler_data(irq), &msg);
3318 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); 3263 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
3319 if (irq_remapped(irq_get_chip_data(irq))) 3264 setup_remapped_irq(irq, irq_get_chip_data(irq), chip);
3320 irq_remap_modify_chip_defaults(chip);
3321 3265
3322 irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); 3266 irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
3323 return 0; 3267 return 0;
@@ -3683,10 +3627,7 @@ void __init setup_ioapic_dest(void)
3683 else 3627 else
3684 mask = apic->target_cpus(); 3628 mask = apic->target_cpus();
3685 3629
3686 if (irq_remapping_enabled) 3630 x86_io_apic_ops.set_affinity(idata, mask, false);
3687 set_remapped_irq_affinity(idata, mask, false);
3688 else
3689 ioapic_set_affinity(idata, mask, false);
3690 } 3631 }
3691 3632
3692} 3633}
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index cce91bf26676..7434d8556d09 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -106,7 +106,7 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
 	unsigned long mask = cpumask_bits(cpumask)[0];
 	unsigned long flags;
 
-	if (WARN_ONCE(!mask, "empty IPI mask"))
+	if (!mask)
 		return;
 
 	local_irq_save(flags);
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index e03a1e180e81..562a76d433c8 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -20,18 +20,19 @@ static int set_x2apic_phys_mode(char *arg)
 }
 early_param("x2apic_phys", set_x2apic_phys_mode);
 
-static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+static bool x2apic_fadt_phys(void)
 {
-	if (x2apic_phys)
-		return x2apic_enabled();
-	else if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) &&
-		(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL) &&
-		x2apic_enabled()) {
+	if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) &&
+		(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
 		printk(KERN_DEBUG "System requires x2apic physical mode\n");
-		return 1;
+		return true;
 	}
-	else
-		return 0;
+	return false;
+}
+
+static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+	return x2apic_enabled() && (x2apic_phys || x2apic_fadt_phys());
 }
 
 static void
@@ -82,7 +83,7 @@ static void init_x2apic_ldr(void)
 
 static int x2apic_phys_probe(void)
 {
-	if (x2apic_mode && x2apic_phys)
+	if (x2apic_mode && (x2apic_phys || x2apic_fadt_phys()))
 		return 1;
 
 	return apic == &apic_x2apic_phys;
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 8cfade9510a4..794f6eb54cd3 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -5,7 +5,7 @@
5 * 5 *
6 * SGI UV APIC functions (note: not an Intel compatible APIC) 6 * SGI UV APIC functions (note: not an Intel compatible APIC)
7 * 7 *
8 * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved. 8 * Copyright (C) 2007-2013 Silicon Graphics, Inc. All rights reserved.
9 */ 9 */
10#include <linux/cpumask.h> 10#include <linux/cpumask.h>
11#include <linux/hardirq.h> 11#include <linux/hardirq.h>
@@ -91,10 +91,16 @@ static int __init early_get_pnodeid(void)
91 m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR); 91 m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR);
92 uv_min_hub_revision_id = node_id.s.revision; 92 uv_min_hub_revision_id = node_id.s.revision;
93 93
94 if (node_id.s.part_number == UV2_HUB_PART_NUMBER) 94 switch (node_id.s.part_number) {
95 uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1; 95 case UV2_HUB_PART_NUMBER:
96 if (node_id.s.part_number == UV2_HUB_PART_NUMBER_X) 96 case UV2_HUB_PART_NUMBER_X:
97 uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1; 97 uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1;
98 break;
99 case UV3_HUB_PART_NUMBER:
100 case UV3_HUB_PART_NUMBER_X:
101 uv_min_hub_revision_id += UV3_HUB_REVISION_BASE - 1;
102 break;
103 }
98 104
99 uv_hub_info->hub_revision = uv_min_hub_revision_id; 105 uv_hub_info->hub_revision = uv_min_hub_revision_id;
100 pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1); 106 pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1);
@@ -130,13 +136,16 @@ static void __init uv_set_apicid_hibit(void)
130 136
131static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 137static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
132{ 138{
133 int pnodeid, is_uv1, is_uv2; 139 int pnodeid, is_uv1, is_uv2, is_uv3;
134 140
135 is_uv1 = !strcmp(oem_id, "SGI"); 141 is_uv1 = !strcmp(oem_id, "SGI");
136 is_uv2 = !strcmp(oem_id, "SGI2"); 142 is_uv2 = !strcmp(oem_id, "SGI2");
137 if (is_uv1 || is_uv2) { 143 is_uv3 = !strncmp(oem_id, "SGI3", 4); /* there are varieties of UV3 */
144 if (is_uv1 || is_uv2 || is_uv3) {
138 uv_hub_info->hub_revision = 145 uv_hub_info->hub_revision =
139 is_uv1 ? UV1_HUB_REVISION_BASE : UV2_HUB_REVISION_BASE; 146 (is_uv1 ? UV1_HUB_REVISION_BASE :
147 (is_uv2 ? UV2_HUB_REVISION_BASE :
148 UV3_HUB_REVISION_BASE));
140 pnodeid = early_get_pnodeid(); 149 pnodeid = early_get_pnodeid();
141 early_get_apic_pnode_shift(); 150 early_get_apic_pnode_shift();
142 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; 151 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
@@ -450,14 +459,17 @@ static __init void map_high(char *id, unsigned long base, int pshift,
450 459
451 paddr = base << pshift; 460 paddr = base << pshift;
452 bytes = (1UL << bshift) * (max_pnode + 1); 461 bytes = (1UL << bshift) * (max_pnode + 1);
453 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, 462 if (!paddr) {
454 paddr + bytes); 463 pr_info("UV: Map %s_HI base address NULL\n", id);
464 return;
465 }
466 pr_info("UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, paddr + bytes);
455 if (map_type == map_uc) 467 if (map_type == map_uc)
456 init_extra_mapping_uc(paddr, bytes); 468 init_extra_mapping_uc(paddr, bytes);
457 else 469 else
458 init_extra_mapping_wb(paddr, bytes); 470 init_extra_mapping_wb(paddr, bytes);
459
460} 471}
472
461static __init void map_gru_high(int max_pnode) 473static __init void map_gru_high(int max_pnode)
462{ 474{
463 union uvh_rh_gam_gru_overlay_config_mmr_u gru; 475 union uvh_rh_gam_gru_overlay_config_mmr_u gru;
@@ -468,7 +480,8 @@ static __init void map_gru_high(int max_pnode)
468 map_high("GRU", gru.s.base, shift, shift, max_pnode, map_wb); 480 map_high("GRU", gru.s.base, shift, shift, max_pnode, map_wb);
469 gru_start_paddr = ((u64)gru.s.base << shift); 481 gru_start_paddr = ((u64)gru.s.base << shift);
470 gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1); 482 gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1);
471 483 } else {
484 pr_info("UV: GRU disabled\n");
472 } 485 }
473} 486}
474 487
@@ -480,23 +493,146 @@ static __init void map_mmr_high(int max_pnode)
480 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); 493 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
481 if (mmr.s.enable) 494 if (mmr.s.enable)
482 map_high("MMR", mmr.s.base, shift, shift, max_pnode, map_uc); 495 map_high("MMR", mmr.s.base, shift, shift, max_pnode, map_uc);
496 else
497 pr_info("UV: MMR disabled\n");
498}
499
500/*
501 * This commonality works because both 0 & 1 versions of the MMIOH OVERLAY
502 * and REDIRECT MMR regs are exactly the same on UV3.
503 */
504struct mmioh_config {
505 unsigned long overlay;
506 unsigned long redirect;
507 char *id;
508};
509
510static __initdata struct mmioh_config mmiohs[] = {
511 {
512 UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR,
513 UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR,
514 "MMIOH0"
515 },
516 {
517 UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR,
518 UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR,
519 "MMIOH1"
520 },
521};
522
523static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode)
524{
525 union uv3h_rh_gam_mmioh_overlay_config0_mmr_u overlay;
526 unsigned long mmr;
527 unsigned long base;
528 int i, n, shift, m_io, max_io;
529 int nasid, lnasid, fi, li;
530 char *id;
531
532 id = mmiohs[index].id;
533 overlay.v = uv_read_local_mmr(mmiohs[index].overlay);
534 pr_info("UV: %s overlay 0x%lx base:0x%x m_io:%d\n",
535 id, overlay.v, overlay.s3.base, overlay.s3.m_io);
536 if (!overlay.s3.enable) {
537 pr_info("UV: %s disabled\n", id);
538 return;
539 }
540
541 shift = UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_SHFT;
542 base = (unsigned long)overlay.s3.base;
543 m_io = overlay.s3.m_io;
544 mmr = mmiohs[index].redirect;
545 n = UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH;
546 min_pnode *= 2; /* convert to NASID */
547 max_pnode *= 2;
548 max_io = lnasid = fi = li = -1;
549
550 for (i = 0; i < n; i++) {
551 union uv3h_rh_gam_mmioh_redirect_config0_mmr_u redirect;
552
553 redirect.v = uv_read_local_mmr(mmr + i * 8);
554 nasid = redirect.s3.nasid;
555 if (nasid < min_pnode || max_pnode < nasid)
556 nasid = -1; /* invalid NASID */
557
558 if (nasid == lnasid) {
559 li = i;
560 if (i != n-1) /* last entry check */
561 continue;
562 }
563
564 /* check if we have a cached (or last) redirect to print */
565 if (lnasid != -1 || (i == n-1 && nasid != -1)) {
566 unsigned long addr1, addr2;
567 int f, l;
568
569 if (lnasid == -1) {
570 f = l = i;
571 lnasid = nasid;
572 } else {
573 f = fi;
574 l = li;
575 }
576 addr1 = (base << shift) +
577 f * (unsigned long)(1 << m_io);
578 addr2 = (base << shift) +
579 (l + 1) * (unsigned long)(1 << m_io);
580 pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n",
581 id, fi, li, lnasid, addr1, addr2);
582 if (max_io < l)
583 max_io = l;
584 }
585 fi = li = i;
586 lnasid = nasid;
587 }
588
589 pr_info("UV: %s base:0x%lx shift:%d M_IO:%d MAX_IO:%d\n",
590 id, base, shift, m_io, max_io);
591
592 if (max_io >= 0)
593 map_high(id, base, shift, m_io, max_io, map_uc);
483} 594}
484 595
485static __init void map_mmioh_high(int max_pnode) 596static __init void map_mmioh_high(int min_pnode, int max_pnode)
486{ 597{
487 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; 598 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
488 int shift; 599 unsigned long mmr, base;
600 int shift, enable, m_io, n_io;
489 601
490 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); 602 if (is_uv3_hub()) {
491 if (is_uv1_hub() && mmioh.s1.enable) { 603 /* Map both MMIOH Regions */
492 shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; 604 map_mmioh_high_uv3(0, min_pnode, max_pnode);
493 map_high("MMIOH", mmioh.s1.base, shift, mmioh.s1.m_io, 605 map_mmioh_high_uv3(1, min_pnode, max_pnode);
494 max_pnode, map_uc); 606 return;
495 } 607 }
496 if (is_uv2_hub() && mmioh.s2.enable) { 608
609 if (is_uv1_hub()) {
610 mmr = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR;
611 shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
612 mmioh.v = uv_read_local_mmr(mmr);
613 enable = !!mmioh.s1.enable;
614 base = mmioh.s1.base;
615 m_io = mmioh.s1.m_io;
616 n_io = mmioh.s1.n_io;
617 } else if (is_uv2_hub()) {
618 mmr = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR;
497 shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; 619 shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
498 map_high("MMIOH", mmioh.s2.base, shift, mmioh.s2.m_io, 620 mmioh.v = uv_read_local_mmr(mmr);
499 max_pnode, map_uc); 621 enable = !!mmioh.s2.enable;
622 base = mmioh.s2.base;
623 m_io = mmioh.s2.m_io;
624 n_io = mmioh.s2.n_io;
625 } else
626 return;
627
628 if (enable) {
629 max_pnode &= (1 << n_io) - 1;
630 pr_info(
631 "UV: base:0x%lx shift:%d N_IO:%d M_IO:%d max_pnode:0x%x\n",
632 base, shift, m_io, n_io, max_pnode);
633 map_high("MMIOH", base, shift, m_io, max_pnode, map_uc);
634 } else {
635 pr_info("UV: MMIOH disabled\n");
500 } 636 }
501} 637}
502 638
@@ -724,42 +860,41 @@ void uv_nmi_init(void)
724void __init uv_system_init(void) 860void __init uv_system_init(void)
725{ 861{
726 union uvh_rh_gam_config_mmr_u m_n_config; 862 union uvh_rh_gam_config_mmr_u m_n_config;
727 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
728 union uvh_node_id_u node_id; 863 union uvh_node_id_u node_id;
729 unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; 864 unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
730 int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val, n_io; 865 int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
731 int gnode_extra, max_pnode = 0; 866 int gnode_extra, min_pnode = 999999, max_pnode = -1;
732 unsigned long mmr_base, present, paddr; 867 unsigned long mmr_base, present, paddr;
733 unsigned short pnode_mask, pnode_io_mask; 868 unsigned short pnode_mask;
869 char *hub = (is_uv1_hub() ? "UV1" :
870 (is_uv2_hub() ? "UV2" :
871 "UV3"));
734 872
735 printk(KERN_INFO "UV: Found %s hub\n", is_uv1_hub() ? "UV1" : "UV2"); 873 pr_info("UV: Found %s hub\n", hub);
736 map_low_mmrs(); 874 map_low_mmrs();
737 875
738 m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR ); 876 m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR );
739 m_val = m_n_config.s.m_skt; 877 m_val = m_n_config.s.m_skt;
740 n_val = m_n_config.s.n_skt; 878 n_val = m_n_config.s.n_skt;
741 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); 879 pnode_mask = (1 << n_val) - 1;
742 n_io = is_uv1_hub() ? mmioh.s1.n_io : mmioh.s2.n_io;
743 mmr_base = 880 mmr_base =
744 uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & 881 uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
745 ~UV_MMR_ENABLE; 882 ~UV_MMR_ENABLE;
746 pnode_mask = (1 << n_val) - 1;
747 pnode_io_mask = (1 << n_io) - 1;
748 883
749 node_id.v = uv_read_local_mmr(UVH_NODE_ID); 884 node_id.v = uv_read_local_mmr(UVH_NODE_ID);
750 gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1; 885 gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
751 gnode_upper = ((unsigned long)gnode_extra << m_val); 886 gnode_upper = ((unsigned long)gnode_extra << m_val);
752 printk(KERN_INFO "UV: N %d, M %d, N_IO: %d, gnode_upper 0x%lx, gnode_extra 0x%x, pnode_mask 0x%x, pnode_io_mask 0x%x\n", 887 pr_info("UV: N:%d M:%d pnode_mask:0x%x gnode_upper/extra:0x%lx/0x%x\n",
753 n_val, m_val, n_io, gnode_upper, gnode_extra, pnode_mask, pnode_io_mask); 888 n_val, m_val, pnode_mask, gnode_upper, gnode_extra);
754 889
755 printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); 890 pr_info("UV: global MMR base 0x%lx\n", mmr_base);
756 891
757 for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) 892 for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
758 uv_possible_blades += 893 uv_possible_blades +=
759 hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8)); 894 hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8));
760 895
761 /* uv_num_possible_blades() is really the hub count */ 896 /* uv_num_possible_blades() is really the hub count */
762 printk(KERN_INFO "UV: Found %d blades, %d hubs\n", 897 pr_info("UV: Found %d blades, %d hubs\n",
763 is_uv1_hub() ? uv_num_possible_blades() : 898 is_uv1_hub() ? uv_num_possible_blades() :
764 (uv_num_possible_blades() + 1) / 2, 899 (uv_num_possible_blades() + 1) / 2,
765 uv_num_possible_blades()); 900 uv_num_possible_blades());
@@ -794,6 +929,7 @@ void __init uv_system_init(void)
794 uv_blade_info[blade].nr_possible_cpus = 0; 929 uv_blade_info[blade].nr_possible_cpus = 0;
795 uv_blade_info[blade].nr_online_cpus = 0; 930 uv_blade_info[blade].nr_online_cpus = 0;
796 spin_lock_init(&uv_blade_info[blade].nmi_lock); 931 spin_lock_init(&uv_blade_info[blade].nmi_lock);
932 min_pnode = min(pnode, min_pnode);
797 max_pnode = max(pnode, max_pnode); 933 max_pnode = max(pnode, max_pnode);
798 blade++; 934 blade++;
799 } 935 }
@@ -856,7 +992,7 @@ void __init uv_system_init(void)
856 992
857 map_gru_high(max_pnode); 993 map_gru_high(max_pnode);
858 map_mmr_high(max_pnode); 994 map_mmr_high(max_pnode);
859 map_mmioh_high(max_pnode & pnode_io_mask); 995 map_mmioh_high(min_pnode, max_pnode);
860 996
861 uv_cpu_init(); 997 uv_cpu_init();
862 uv_scir_register_cpu_notifier(); 998 uv_scir_register_cpu_notifier();
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index d65464e43503..66b5faffe14a 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -232,6 +232,7 @@
232#include <linux/acpi.h> 232#include <linux/acpi.h>
233#include <linux/syscore_ops.h> 233#include <linux/syscore_ops.h>
234#include <linux/i8253.h> 234#include <linux/i8253.h>
235#include <linux/cpuidle.h>
235 236
236#include <asm/uaccess.h> 237#include <asm/uaccess.h>
237#include <asm/desc.h> 238#include <asm/desc.h>
@@ -360,13 +361,35 @@ struct apm_user {
360 * idle percentage above which bios idle calls are done 361 * idle percentage above which bios idle calls are done
361 */ 362 */
362#ifdef CONFIG_APM_CPU_IDLE 363#ifdef CONFIG_APM_CPU_IDLE
363#warning deprecated CONFIG_APM_CPU_IDLE will be deleted in 2012
364#define DEFAULT_IDLE_THRESHOLD 95 364#define DEFAULT_IDLE_THRESHOLD 95
365#else 365#else
366#define DEFAULT_IDLE_THRESHOLD 100 366#define DEFAULT_IDLE_THRESHOLD 100
367#endif 367#endif
368#define DEFAULT_IDLE_PERIOD (100 / 3) 368#define DEFAULT_IDLE_PERIOD (100 / 3)
369 369
370static int apm_cpu_idle(struct cpuidle_device *dev,
371 struct cpuidle_driver *drv, int index);
372
373static struct cpuidle_driver apm_idle_driver = {
374 .name = "apm_idle",
375 .owner = THIS_MODULE,
376 .en_core_tk_irqen = 1,
377 .states = {
378 { /* entry 0 is for polling */ },
379 { /* entry 1 is for APM idle */
380 .name = "APM",
381 .desc = "APM idle",
382 .flags = CPUIDLE_FLAG_TIME_VALID,
383 .exit_latency = 250, /* WAG */
384 .target_residency = 500, /* WAG */
385 .enter = &apm_cpu_idle
386 },
387 },
388 .state_count = 2,
389};
390
391static struct cpuidle_device apm_cpuidle_device;
392
370/* 393/*
371 * Local variables 394 * Local variables
372 */ 395 */
@@ -377,7 +400,6 @@ static struct {
377static int clock_slowed; 400static int clock_slowed;
378static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD; 401static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD;
379static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD; 402static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD;
380static int set_pm_idle;
381static int suspends_pending; 403static int suspends_pending;
382static int standbys_pending; 404static int standbys_pending;
383static int ignore_sys_suspend; 405static int ignore_sys_suspend;
@@ -884,8 +906,6 @@ static void apm_do_busy(void)
884#define IDLE_CALC_LIMIT (HZ * 100) 906#define IDLE_CALC_LIMIT (HZ * 100)
885#define IDLE_LEAKY_MAX 16 907#define IDLE_LEAKY_MAX 16
886 908
887static void (*original_pm_idle)(void) __read_mostly;
888
889/** 909/**
890 * apm_cpu_idle - cpu idling for APM capable Linux 910 * apm_cpu_idle - cpu idling for APM capable Linux
891 * 911 *
@@ -894,35 +914,36 @@ static void (*original_pm_idle)(void) __read_mostly;
894 * Furthermore it calls the system default idle routine. 914 * Furthermore it calls the system default idle routine.
895 */ 915 */
896 916
897static void apm_cpu_idle(void) 917static int apm_cpu_idle(struct cpuidle_device *dev,
918 struct cpuidle_driver *drv, int index)
898{ 919{
899 static int use_apm_idle; /* = 0 */ 920 static int use_apm_idle; /* = 0 */
900 static unsigned int last_jiffies; /* = 0 */ 921 static unsigned int last_jiffies; /* = 0 */
901 static unsigned int last_stime; /* = 0 */ 922 static unsigned int last_stime; /* = 0 */
923 cputime_t stime;
902 924
903 int apm_idle_done = 0; 925 int apm_idle_done = 0;
904 unsigned int jiffies_since_last_check = jiffies - last_jiffies; 926 unsigned int jiffies_since_last_check = jiffies - last_jiffies;
905 unsigned int bucket; 927 unsigned int bucket;
906 928
907 WARN_ONCE(1, "deprecated apm_cpu_idle will be deleted in 2012");
908recalc: 929recalc:
930 task_cputime(current, NULL, &stime);
909 if (jiffies_since_last_check > IDLE_CALC_LIMIT) { 931 if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
910 use_apm_idle = 0; 932 use_apm_idle = 0;
911 last_jiffies = jiffies;
912 last_stime = current->stime;
913 } else if (jiffies_since_last_check > idle_period) { 933 } else if (jiffies_since_last_check > idle_period) {
914 unsigned int idle_percentage; 934 unsigned int idle_percentage;
915 935
916 idle_percentage = current->stime - last_stime; 936 idle_percentage = stime - last_stime;
917 idle_percentage *= 100; 937 idle_percentage *= 100;
918 idle_percentage /= jiffies_since_last_check; 938 idle_percentage /= jiffies_since_last_check;
919 use_apm_idle = (idle_percentage > idle_threshold); 939 use_apm_idle = (idle_percentage > idle_threshold);
920 if (apm_info.forbid_idle) 940 if (apm_info.forbid_idle)
921 use_apm_idle = 0; 941 use_apm_idle = 0;
922 last_jiffies = jiffies;
923 last_stime = current->stime;
924 } 942 }
925 943
944 last_jiffies = jiffies;
945 last_stime = stime;
946
926 bucket = IDLE_LEAKY_MAX; 947 bucket = IDLE_LEAKY_MAX;
927 948
928 while (!need_resched()) { 949 while (!need_resched()) {
@@ -950,10 +971,7 @@ recalc:
950 break; 971 break;
951 } 972 }
952 } 973 }
953 if (original_pm_idle) 974 default_idle();
954 original_pm_idle();
955 else
956 default_idle();
957 local_irq_disable(); 975 local_irq_disable();
958 jiffies_since_last_check = jiffies - last_jiffies; 976 jiffies_since_last_check = jiffies - last_jiffies;
959 if (jiffies_since_last_check > idle_period) 977 if (jiffies_since_last_check > idle_period)
@@ -963,7 +981,7 @@ recalc:
963 if (apm_idle_done) 981 if (apm_idle_done)
964 apm_do_busy(); 982 apm_do_busy();
965 983
966 local_irq_enable(); 984 return index;
967} 985}
968 986
969/** 987/**
@@ -2381,9 +2399,9 @@ static int __init apm_init(void)
2381 if (HZ != 100) 2399 if (HZ != 100)
2382 idle_period = (idle_period * HZ) / 100; 2400 idle_period = (idle_period * HZ) / 100;
2383 if (idle_threshold < 100) { 2401 if (idle_threshold < 100) {
2384 original_pm_idle = pm_idle; 2402 if (!cpuidle_register_driver(&apm_idle_driver))
2385 pm_idle = apm_cpu_idle; 2403 if (cpuidle_register_device(&apm_cpuidle_device))
2386 set_pm_idle = 1; 2404 cpuidle_unregister_driver(&apm_idle_driver);
2387 } 2405 }
2388 2406
2389 return 0; 2407 return 0;
@@ -2393,15 +2411,9 @@ static void __exit apm_exit(void)
2393{ 2411{
2394 int error; 2412 int error;
2395 2413
2396 if (set_pm_idle) { 2414 cpuidle_unregister_device(&apm_cpuidle_device);
2397 pm_idle = original_pm_idle; 2415 cpuidle_unregister_driver(&apm_idle_driver);
2398 /* 2416
2399 * We are about to unload the current idle thread pm callback
2400 * (pm_idle), Wait for all processors to update cached/local
2401 * copies of pm_idle before proceeding.
2402 */
2403 kick_all_cpus_sync();
2404 }
2405 if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0) 2417 if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0)
2406 && (apm_info.connection_version > 0x0100)) { 2418 && (apm_info.connection_version > 0x0100)) {
2407 error = apm_engage_power_management(APM_DEVICE_ALL, 0); 2419 error = apm_engage_power_management(APM_DEVICE_ALL, 0);
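
The apm_32.c hunks above retire the old pm_idle function-pointer hook: apm_cpu_idle() becomes a cpuidle enter handler, and apm_init()/apm_exit() now register and unregister a cpuidle driver/device pair instead of swapping pm_idle and calling kick_all_cpus_sync(). The sketch below is not part of the patch; it only restates that registration pattern against the same 3.9-era cpuidle API the hunks use, with invented "demo_" names standing in for the APM ones.

    #include <linux/module.h>
    #include <linux/init.h>
    #include <linux/errno.h>
    #include <linux/cpuidle.h>

    /* Illustrative sketch only (not part of the patch above): the same
     * driver/device registration pattern apm_init() adopts.  All "demo_"
     * identifiers are placeholders invented for this example. */

    static int demo_enter(struct cpuidle_device *dev,
                          struct cpuidle_driver *drv, int index)
    {
            /* idle the CPU here; return the index of the state entered */
            return index;
    }

    static struct cpuidle_driver demo_idle_driver = {
            .name             = "demo_idle",
            .owner            = THIS_MODULE,
            .en_core_tk_irqen = 1,
            .states = {
                    { /* entry 0 is reserved for polling */ },
                    {
                            .name             = "DEMO",
                            .desc             = "demo idle state",
                            .flags            = CPUIDLE_FLAG_TIME_VALID,
                            .exit_latency     = 250,
                            .target_residency = 500,
                            .enter            = demo_enter,
                    },
            },
            .state_count = 2,
    };

    static struct cpuidle_device demo_cpuidle_device;

    static int __init demo_init(void)
    {
            /* register the driver first, then the device; unwind the
             * driver if device registration fails, as apm_init() does */
            if (cpuidle_register_driver(&demo_idle_driver))
                    return -ENODEV;
            if (cpuidle_register_device(&demo_cpuidle_device)) {
                    cpuidle_unregister_driver(&demo_idle_driver);
                    return -ENODEV;
            }
            return 0;
    }

    static void __exit demo_exit(void)
    {
            cpuidle_unregister_device(&demo_cpuidle_device);
            cpuidle_unregister_driver(&demo_idle_driver);
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");
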
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 15239fffd6fe..fa96eb0d02fb 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -12,7 +12,6 @@
12#include <asm/pci-direct.h> 12#include <asm/pci-direct.h>
13 13
14#ifdef CONFIG_X86_64 14#ifdef CONFIG_X86_64
15# include <asm/numa_64.h>
16# include <asm/mmconfig.h> 15# include <asm/mmconfig.h>
17# include <asm/cacheflush.h> 16# include <asm/cacheflush.h>
18#endif 17#endif
@@ -220,8 +219,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
220 */ 219 */
221 WARN_ONCE(1, "WARNING: This combination of AMD" 220 WARN_ONCE(1, "WARNING: This combination of AMD"
222 " processors is not suitable for SMP.\n"); 221 " processors is not suitable for SMP.\n");
223 if (!test_taint(TAINT_UNSAFE_SMP)) 222 add_taint(TAINT_UNSAFE_SMP, LOCKDEP_NOW_UNRELIABLE);
224 add_taint(TAINT_UNSAFE_SMP);
225 223
226valid_k7: 224valid_k7:
227 ; 225 ;
@@ -364,9 +362,9 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
364#endif 362#endif
365} 363}
366 364
367int amd_get_nb_id(int cpu) 365u16 amd_get_nb_id(int cpu)
368{ 366{
369 int id = 0; 367 u16 id = 0;
370#ifdef CONFIG_SMP 368#ifdef CONFIG_SMP
371 id = per_cpu(cpu_llc_id, cpu); 369 id = per_cpu(cpu_llc_id, cpu);
372#endif 370#endif
@@ -518,10 +516,9 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
518static void __cpuinit init_amd(struct cpuinfo_x86 *c) 516static void __cpuinit init_amd(struct cpuinfo_x86 *c)
519{ 517{
520 u32 dummy; 518 u32 dummy;
521
522#ifdef CONFIG_SMP
523 unsigned long long value; 519 unsigned long long value;
524 520
521#ifdef CONFIG_SMP
525 /* 522 /*
526 * Disable TLB flush filter by setting HWCR.FFDIS on K8 523 * Disable TLB flush filter by setting HWCR.FFDIS on K8
527 * bit 6 of msr C001_0015 524 * bit 6 of msr C001_0015
@@ -559,12 +556,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
559 * (AMD Erratum #110, docId: 25759). 556 * (AMD Erratum #110, docId: 25759).
560 */ 557 */
561 if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) { 558 if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) {
562 u64 val;
563
564 clear_cpu_cap(c, X86_FEATURE_LAHF_LM); 559 clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
565 if (!rdmsrl_amd_safe(0xc001100d, &val)) { 560 if (!rdmsrl_amd_safe(0xc001100d, &value)) {
566 val &= ~(1ULL << 32); 561 value &= ~(1ULL << 32);
567 wrmsrl_amd_safe(0xc001100d, val); 562 wrmsrl_amd_safe(0xc001100d, value);
568 } 563 }
569 } 564 }
570 565
@@ -617,13 +612,12 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
617 if ((c->x86 == 0x15) && 612 if ((c->x86 == 0x15) &&
618 (c->x86_model >= 0x10) && (c->x86_model <= 0x1f) && 613 (c->x86_model >= 0x10) && (c->x86_model <= 0x1f) &&
619 !cpu_has(c, X86_FEATURE_TOPOEXT)) { 614 !cpu_has(c, X86_FEATURE_TOPOEXT)) {
620 u64 val;
621 615
622 if (!rdmsrl_safe(0xc0011005, &val)) { 616 if (!rdmsrl_safe(0xc0011005, &value)) {
623 val |= 1ULL << 54; 617 value |= 1ULL << 54;
624 wrmsrl_safe(0xc0011005, val); 618 wrmsrl_safe(0xc0011005, value);
625 rdmsrl(0xc0011005, val); 619 rdmsrl(0xc0011005, value);
626 if (val & (1ULL << 54)) { 620 if (value & (1ULL << 54)) {
627 set_cpu_cap(c, X86_FEATURE_TOPOEXT); 621 set_cpu_cap(c, X86_FEATURE_TOPOEXT);
628 printk(KERN_INFO FW_INFO "CPU: Re-enabling " 622 printk(KERN_INFO FW_INFO "CPU: Re-enabling "
629 "disabled Topology Extensions Support\n"); 623 "disabled Topology Extensions Support\n");
@@ -637,11 +631,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
637 */ 631 */
638 if ((c->x86 == 0x15) && 632 if ((c->x86 == 0x15) &&
639 (c->x86_model >= 0x02) && (c->x86_model < 0x20)) { 633 (c->x86_model >= 0x02) && (c->x86_model < 0x20)) {
640 u64 val;
641 634
642 if (!rdmsrl_safe(0xc0011021, &val) && !(val & 0x1E)) { 635 if (!rdmsrl_safe(0xc0011021, &value) && !(value & 0x1E)) {
643 val |= 0x1E; 636 value |= 0x1E;
644 wrmsrl_safe(0xc0011021, val); 637 wrmsrl_safe(0xc0011021, value);
645 } 638 }
646 } 639 }
647 640
@@ -685,12 +678,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
685 * benefit in doing so. 678 * benefit in doing so.
686 */ 679 */
687 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { 680 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
681 unsigned long pfn = tseg >> PAGE_SHIFT;
682
688 printk(KERN_DEBUG "tseg: %010llx\n", tseg); 683 printk(KERN_DEBUG "tseg: %010llx\n", tseg);
689 if ((tseg>>PMD_SHIFT) < 684 if (pfn_range_is_mapped(pfn, pfn + 1))
690 (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
691 ((tseg>>PMD_SHIFT) <
692 (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
693 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
694 set_memory_4k((unsigned long)__va(tseg), 1); 685 set_memory_4k((unsigned long)__va(tseg), 1);
695 } 686 }
696 } 687 }
@@ -703,13 +694,11 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
703 if (c->x86 > 0x11) 694 if (c->x86 > 0x11)
704 set_cpu_cap(c, X86_FEATURE_ARAT); 695 set_cpu_cap(c, X86_FEATURE_ARAT);
705 696
706 /*
707 * Disable GART TLB Walk Errors on Fam10h. We do this here
708 * because this is always needed when GART is enabled, even in a
709 * kernel which has no MCE support built in.
710 */
711 if (c->x86 == 0x10) { 697 if (c->x86 == 0x10) {
712 /* 698 /*
699 * Disable GART TLB Walk Errors on Fam10h. We do this here
700 * because this is always needed when GART is enabled, even in a
701 * kernel which has no MCE support built in.
713 * BIOS should disable GartTlbWlk Errors themself. If 702 * BIOS should disable GartTlbWlk Errors themself. If
714 * it doesn't do it here as suggested by the BKDG. 703 * it doesn't do it here as suggested by the BKDG.
715 * 704 *
@@ -723,6 +712,21 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
723 mask |= (1 << 10); 712 mask |= (1 << 10);
724 wrmsrl_safe(MSR_AMD64_MCx_MASK(4), mask); 713 wrmsrl_safe(MSR_AMD64_MCx_MASK(4), mask);
725 } 714 }
715
716 /*
717 * On family 10h BIOS may not have properly enabled WC+ support,
718 * causing it to be converted to CD memtype. This may result in
719 * performance degradation for certain nested-paging guests.
720 * Prevent this conversion by clearing bit 24 in
721 * MSR_AMD64_BU_CFG2.
722 *
723 * NOTE: we want to use the _safe accessors so as not to #GP kvm
724 * guests on older kvm hosts.
725 */
726
727 rdmsrl_safe(MSR_AMD64_BU_CFG2, &value);
728 value &= ~(1ULL << 24);
729 wrmsrl_safe(MSR_AMD64_BU_CFG2, value);
726 } 730 }
727 731
728 rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); 732 rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 92dfec986a48..af6455e3fcc9 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -17,15 +17,6 @@
17#include <asm/paravirt.h> 17#include <asm/paravirt.h>
18#include <asm/alternative.h> 18#include <asm/alternative.h>
19 19
20static int __init no_halt(char *s)
21{
22 WARN_ONCE(1, "\"no-hlt\" is deprecated, please use \"idle=poll\"\n");
23 boot_cpu_data.hlt_works_ok = 0;
24 return 1;
25}
26
27__setup("no-hlt", no_halt);
28
29static int __init no_387(char *s) 20static int __init no_387(char *s)
30{ 21{
31 boot_cpu_data.hard_math = 0; 22 boot_cpu_data.hard_math = 0;
@@ -89,23 +80,6 @@ static void __init check_fpu(void)
89 pr_warn("Hmm, FPU with FDIV bug\n"); 80 pr_warn("Hmm, FPU with FDIV bug\n");
90} 81}
91 82
92static void __init check_hlt(void)
93{
94 if (boot_cpu_data.x86 >= 5 || paravirt_enabled())
95 return;
96
97 pr_info("Checking 'hlt' instruction... ");
98 if (!boot_cpu_data.hlt_works_ok) {
99 pr_cont("disabled\n");
100 return;
101 }
102 halt();
103 halt();
104 halt();
105 halt();
106 pr_cont("OK\n");
107}
108
109/* 83/*
110 * Check whether we are able to run this kernel safely on SMP. 84 * Check whether we are able to run this kernel safely on SMP.
111 * 85 *
@@ -129,7 +103,6 @@ void __init check_bugs(void)
129 print_cpu_info(&boot_cpu_data); 103 print_cpu_info(&boot_cpu_data);
130#endif 104#endif
131 check_config(); 105 check_config();
132 check_hlt();
133 init_utsname()->machine[1] = 106 init_utsname()->machine[1] =
134 '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); 107 '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
135 alternative_instructions(); 108 alternative_instructions();
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 9c3ab43a6954..d814772c5bed 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -37,6 +37,8 @@
37#include <asm/mce.h> 37#include <asm/mce.h>
38#include <asm/msr.h> 38#include <asm/msr.h>
39#include <asm/pat.h> 39#include <asm/pat.h>
40#include <asm/microcode.h>
41#include <asm/microcode_intel.h>
40 42
41#ifdef CONFIG_X86_LOCAL_APIC 43#ifdef CONFIG_X86_LOCAL_APIC
42#include <asm/uv/uv.h> 44#include <asm/uv/uv.h>
@@ -213,7 +215,7 @@ static inline int flag_is_changeable_p(u32 flag)
213} 215}
214 216
215/* Probe for the CPUID instruction */ 217/* Probe for the CPUID instruction */
216static int __cpuinit have_cpuid_p(void) 218int __cpuinit have_cpuid_p(void)
217{ 219{
218 return flag_is_changeable_p(X86_EFLAGS_ID); 220 return flag_is_changeable_p(X86_EFLAGS_ID);
219} 221}
@@ -249,11 +251,6 @@ static inline int flag_is_changeable_p(u32 flag)
249{ 251{
250 return 1; 252 return 1;
251} 253}
252/* Probe for the CPUID instruction */
253static inline int have_cpuid_p(void)
254{
255 return 1;
256}
257static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) 254static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
258{ 255{
259} 256}
@@ -1223,6 +1220,12 @@ void __cpuinit cpu_init(void)
1223 int cpu; 1220 int cpu;
1224 int i; 1221 int i;
1225 1222
1223 /*
1224 * Load microcode on this cpu if a valid microcode is available.
1225 * This is early microcode loading procedure.
1226 */
1227 load_ucode_ap();
1228
1226 cpu = stack_smp_processor_id(); 1229 cpu = stack_smp_processor_id();
1227 t = &per_cpu(init_tss, cpu); 1230 t = &per_cpu(init_tss, cpu);
1228 oist = &per_cpu(orig_ist, cpu); 1231 oist = &per_cpu(orig_ist, cpu);
@@ -1314,6 +1317,8 @@ void __cpuinit cpu_init(void)
1314 struct tss_struct *t = &per_cpu(init_tss, cpu); 1317 struct tss_struct *t = &per_cpu(init_tss, cpu);
1315 struct thread_struct *thread = &curr->thread; 1318 struct thread_struct *thread = &curr->thread;
1316 1319
1320 show_ucode_info_early();
1321
1317 if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) { 1322 if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) {
1318 printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); 1323 printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
1319 for (;;) 1324 for (;;)
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index a8f8fa9769d6..1e7e84a02eba 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -79,3 +79,10 @@ void __init init_hypervisor_platform(void)
79 if (x86_hyper->init_platform) 79 if (x86_hyper->init_platform)
80 x86_hyper->init_platform(); 80 x86_hyper->init_platform();
81} 81}
82
83bool __init hypervisor_x2apic_available(void)
84{
85 return x86_hyper &&
86 x86_hyper->x2apic_available &&
87 x86_hyper->x2apic_available();
88}
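
The hypervisor.c hunk adds hypervisor_x2apic_available(), which reports x2apic support only when a hypervisor was detected and that hypervisor both provides and confirms the optional x2apic_available callback. Below is a small stand-alone C sketch of the same guarded-optional-callback idiom; the demo_ops structure and its names are invented for illustration and are not the kernel's.

    #include <stdbool.h>
    #include <stddef.h>

    /* Illustration only: mirrors the x86_hyper->x2apic_available() guard.
     * "struct demo_ops" and everything around it is made up for this sketch. */
    struct demo_ops {
            const char *name;
            bool (*x2apic_available)(void);   /* optional hook */
    };

    static const struct demo_ops *active_ops;  /* NULL if nothing detected */

    static bool demo_x2apic_available(void)
    {
            /* true only if an ops table exists, provides the hook,
             * and the hook itself reports support */
            return active_ops &&
                   active_ops->x2apic_available &&
                   active_ops->x2apic_available();
    }

    int main(void)
    {
            return demo_x2apic_available() ? 0 : 1;
    }

Each && short-circuits, so a missing ops table or an absent callback simply reads as "not available" rather than a NULL dereference.
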
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index fcaabd0432c5..1905ce98bee0 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -17,7 +17,6 @@
17 17
18#ifdef CONFIG_X86_64 18#ifdef CONFIG_X86_64
19#include <linux/topology.h> 19#include <linux/topology.h>
20#include <asm/numa_64.h>
21#endif 20#endif
22 21
23#include "cpu.h" 22#include "cpu.h"
@@ -168,7 +167,7 @@ int __cpuinit ppro_with_ram_bug(void)
168#ifdef CONFIG_X86_F00F_BUG 167#ifdef CONFIG_X86_F00F_BUG
169static void __cpuinit trap_init_f00f_bug(void) 168static void __cpuinit trap_init_f00f_bug(void)
170{ 169{
171 __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); 170 __set_fixmap(FIX_F00F_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO);
172 171
173 /* 172 /*
174 * Update the IDT descriptor and reload the IDT so that 173 * Update the IDT descriptor and reload the IDT so that
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index fe9edec6698a..7c6f7d548c0f 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -298,8 +298,7 @@ struct _cache_attr {
298 unsigned int); 298 unsigned int);
299}; 299};
300 300
301#ifdef CONFIG_AMD_NB 301#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS)
302
303/* 302/*
304 * L3 cache descriptors 303 * L3 cache descriptors
305 */ 304 */
@@ -524,9 +523,9 @@ store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count,
524static struct _cache_attr subcaches = 523static struct _cache_attr subcaches =
525 __ATTR(subcaches, 0644, show_subcaches, store_subcaches); 524 __ATTR(subcaches, 0644, show_subcaches, store_subcaches);
526 525
527#else /* CONFIG_AMD_NB */ 526#else
528#define amd_init_l3_cache(x, y) 527#define amd_init_l3_cache(x, y)
529#endif /* CONFIG_AMD_NB */ 528#endif /* CONFIG_AMD_NB && CONFIG_SYSFS */
530 529
531static int 530static int
532__cpuinit cpuid4_cache_lookup_regs(int index, 531__cpuinit cpuid4_cache_lookup_regs(int index,
@@ -1227,7 +1226,7 @@ static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier = {
1227 .notifier_call = cacheinfo_cpu_callback, 1226 .notifier_call = cacheinfo_cpu_callback,
1228}; 1227};
1229 1228
1230static int __cpuinit cache_sysfs_init(void) 1229static int __init cache_sysfs_init(void)
1231{ 1230{
1232 int i; 1231 int i;
1233 1232
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 6a05c1d327a9..5b7d4fa5d3b7 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -24,8 +24,6 @@ struct mce_bank {
24int mce_severity(struct mce *a, int tolerant, char **msg); 24int mce_severity(struct mce *a, int tolerant, char **msg);
25struct dentry *mce_get_debugfs_dir(void); 25struct dentry *mce_get_debugfs_dir(void);
26 26
27extern int mce_ser;
28
29extern struct mce_bank *mce_banks; 27extern struct mce_bank *mce_banks;
30 28
31#ifdef CONFIG_X86_MCE_INTEL 29#ifdef CONFIG_X86_MCE_INTEL
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 13017626f9a8..beb1f1689e52 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -193,9 +193,9 @@ int mce_severity(struct mce *m, int tolerant, char **msg)
193 continue; 193 continue;
194 if ((m->mcgstatus & s->mcgmask) != s->mcgres) 194 if ((m->mcgstatus & s->mcgmask) != s->mcgres)
195 continue; 195 continue;
196 if (s->ser == SER_REQUIRED && !mce_ser) 196 if (s->ser == SER_REQUIRED && !mca_cfg.ser)
197 continue; 197 continue;
198 if (s->ser == NO_SER && mce_ser) 198 if (s->ser == NO_SER && mca_cfg.ser)
199 continue; 199 continue;
200 if (s->context && ctx != s->context) 200 if (s->context && ctx != s->context)
201 continue; 201 continue;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 46cbf8689692..7bc126346ace 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -58,34 +58,26 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
58#define CREATE_TRACE_POINTS 58#define CREATE_TRACE_POINTS
59#include <trace/events/mce.h> 59#include <trace/events/mce.h>
60 60
61int mce_disabled __read_mostly;
62
63#define SPINUNIT 100 /* 100ns */ 61#define SPINUNIT 100 /* 100ns */
64 62
65atomic_t mce_entry; 63atomic_t mce_entry;
66 64
67DEFINE_PER_CPU(unsigned, mce_exception_count); 65DEFINE_PER_CPU(unsigned, mce_exception_count);
68 66
69/* 67struct mce_bank *mce_banks __read_mostly;
70 * Tolerant levels: 68
71 * 0: always panic on uncorrected errors, log corrected errors 69struct mca_config mca_cfg __read_mostly = {
72 * 1: panic or SIGBUS on uncorrected errors, log corrected errors 70 .bootlog = -1,
73 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors 71 /*
74 * 3: never panic or SIGBUS, log all errors (for testing only) 72 * Tolerant levels:
75 */ 73 * 0: always panic on uncorrected errors, log corrected errors
76static int tolerant __read_mostly = 1; 74 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
77static int banks __read_mostly; 75 * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
78static int rip_msr __read_mostly; 76 * 3: never panic or SIGBUS, log all errors (for testing only)
79static int mce_bootlog __read_mostly = -1; 77 */
80static int monarch_timeout __read_mostly = -1; 78 .tolerant = 1,
81static int mce_panic_timeout __read_mostly; 79 .monarch_timeout = -1
82static int mce_dont_log_ce __read_mostly; 80};
83int mce_cmci_disabled __read_mostly;
84int mce_ignore_ce __read_mostly;
85int mce_ser __read_mostly;
86int mce_bios_cmci_threshold __read_mostly;
87
88struct mce_bank *mce_banks __read_mostly;
89 81
90/* User mode helper program triggered by machine check event */ 82/* User mode helper program triggered by machine check event */
91static unsigned long mce_need_notify; 83static unsigned long mce_need_notify;
@@ -302,7 +294,7 @@ static void wait_for_panic(void)
302 while (timeout-- > 0) 294 while (timeout-- > 0)
303 udelay(1); 295 udelay(1);
304 if (panic_timeout == 0) 296 if (panic_timeout == 0)
305 panic_timeout = mce_panic_timeout; 297 panic_timeout = mca_cfg.panic_timeout;
306 panic("Panicing machine check CPU died"); 298 panic("Panicing machine check CPU died");
307} 299}
308 300
@@ -360,7 +352,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
360 pr_emerg(HW_ERR "Machine check: %s\n", exp); 352 pr_emerg(HW_ERR "Machine check: %s\n", exp);
361 if (!fake_panic) { 353 if (!fake_panic) {
362 if (panic_timeout == 0) 354 if (panic_timeout == 0)
363 panic_timeout = mce_panic_timeout; 355 panic_timeout = mca_cfg.panic_timeout;
364 panic(msg); 356 panic(msg);
365 } else 357 } else
366 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); 358 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
@@ -372,7 +364,7 @@ static int msr_to_offset(u32 msr)
372{ 364{
373 unsigned bank = __this_cpu_read(injectm.bank); 365 unsigned bank = __this_cpu_read(injectm.bank);
374 366
375 if (msr == rip_msr) 367 if (msr == mca_cfg.rip_msr)
376 return offsetof(struct mce, ip); 368 return offsetof(struct mce, ip);
377 if (msr == MSR_IA32_MCx_STATUS(bank)) 369 if (msr == MSR_IA32_MCx_STATUS(bank))
378 return offsetof(struct mce, status); 370 return offsetof(struct mce, status);
@@ -451,8 +443,8 @@ static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
451 m->cs |= 3; 443 m->cs |= 3;
452 } 444 }
453 /* Use accurate RIP reporting if available. */ 445 /* Use accurate RIP reporting if available. */
454 if (rip_msr) 446 if (mca_cfg.rip_msr)
455 m->ip = mce_rdmsrl(rip_msr); 447 m->ip = mce_rdmsrl(mca_cfg.rip_msr);
456 } 448 }
457} 449}
458 450
@@ -513,18 +505,15 @@ static int mce_ring_add(unsigned long pfn)
513 505
514int mce_available(struct cpuinfo_x86 *c) 506int mce_available(struct cpuinfo_x86 *c)
515{ 507{
516 if (mce_disabled) 508 if (mca_cfg.disabled)
517 return 0; 509 return 0;
518 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); 510 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
519} 511}
520 512
521static void mce_schedule_work(void) 513static void mce_schedule_work(void)
522{ 514{
523 if (!mce_ring_empty()) { 515 if (!mce_ring_empty())
524 struct work_struct *work = &__get_cpu_var(mce_work); 516 schedule_work(&__get_cpu_var(mce_work));
525 if (!work_pending(work))
526 schedule_work(work);
527 }
528} 517}
529 518
530DEFINE_PER_CPU(struct irq_work, mce_irq_work); 519DEFINE_PER_CPU(struct irq_work, mce_irq_work);
@@ -565,7 +554,7 @@ static void mce_read_aux(struct mce *m, int i)
565 /* 554 /*
566 * Mask the reported address by the reported granularity. 555 * Mask the reported address by the reported granularity.
567 */ 556 */
568 if (mce_ser && (m->status & MCI_STATUS_MISCV)) { 557 if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
569 u8 shift = MCI_MISC_ADDR_LSB(m->misc); 558 u8 shift = MCI_MISC_ADDR_LSB(m->misc);
570 m->addr >>= shift; 559 m->addr >>= shift;
571 m->addr <<= shift; 560 m->addr <<= shift;
@@ -599,7 +588,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
599 588
600 mce_gather_info(&m, NULL); 589 mce_gather_info(&m, NULL);
601 590
602 for (i = 0; i < banks; i++) { 591 for (i = 0; i < mca_cfg.banks; i++) {
603 if (!mce_banks[i].ctl || !test_bit(i, *b)) 592 if (!mce_banks[i].ctl || !test_bit(i, *b))
604 continue; 593 continue;
605 594
@@ -620,7 +609,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
620 * TBD do the same check for MCI_STATUS_EN here? 609 * TBD do the same check for MCI_STATUS_EN here?
621 */ 610 */
622 if (!(flags & MCP_UC) && 611 if (!(flags & MCP_UC) &&
623 (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC))) 612 (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
624 continue; 613 continue;
625 614
626 mce_read_aux(&m, i); 615 mce_read_aux(&m, i);
@@ -631,7 +620,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
631 * Don't get the IP here because it's unlikely to 620 * Don't get the IP here because it's unlikely to
632 * have anything to do with the actual error location. 621 * have anything to do with the actual error location.
633 */ 622 */
634 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) 623 if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
635 mce_log(&m); 624 mce_log(&m);
636 625
637 /* 626 /*
@@ -658,14 +647,14 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
658{ 647{
659 int i, ret = 0; 648 int i, ret = 0;
660 649
661 for (i = 0; i < banks; i++) { 650 for (i = 0; i < mca_cfg.banks; i++) {
662 m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); 651 m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
663 if (m->status & MCI_STATUS_VAL) { 652 if (m->status & MCI_STATUS_VAL) {
664 __set_bit(i, validp); 653 __set_bit(i, validp);
665 if (quirk_no_way_out) 654 if (quirk_no_way_out)
666 quirk_no_way_out(i, m, regs); 655 quirk_no_way_out(i, m, regs);
667 } 656 }
668 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) 657 if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY)
669 ret = 1; 658 ret = 1;
670 } 659 }
671 return ret; 660 return ret;
@@ -696,11 +685,11 @@ static int mce_timed_out(u64 *t)
696 rmb(); 685 rmb();
697 if (atomic_read(&mce_paniced)) 686 if (atomic_read(&mce_paniced))
698 wait_for_panic(); 687 wait_for_panic();
699 if (!monarch_timeout) 688 if (!mca_cfg.monarch_timeout)
700 goto out; 689 goto out;
701 if ((s64)*t < SPINUNIT) { 690 if ((s64)*t < SPINUNIT) {
702 /* CHECKME: Make panic default for 1 too? */ 691 /* CHECKME: Make panic default for 1 too? */
703 if (tolerant < 1) 692 if (mca_cfg.tolerant < 1)
704 mce_panic("Timeout synchronizing machine check over CPUs", 693 mce_panic("Timeout synchronizing machine check over CPUs",
705 NULL, NULL); 694 NULL, NULL);
706 cpu_missing = 1; 695 cpu_missing = 1;
@@ -750,7 +739,8 @@ static void mce_reign(void)
750 * Grade the severity of the errors of all the CPUs. 739 * Grade the severity of the errors of all the CPUs.
751 */ 740 */
752 for_each_possible_cpu(cpu) { 741 for_each_possible_cpu(cpu) {
753 int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant, 742 int severity = mce_severity(&per_cpu(mces_seen, cpu),
743 mca_cfg.tolerant,
754 &nmsg); 744 &nmsg);
755 if (severity > global_worst) { 745 if (severity > global_worst) {
756 msg = nmsg; 746 msg = nmsg;
@@ -764,7 +754,7 @@ static void mce_reign(void)
764 * This dumps all the mces in the log buffer and stops the 754 * This dumps all the mces in the log buffer and stops the
765 * other CPUs. 755 * other CPUs.
766 */ 756 */
767 if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3) 757 if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
768 mce_panic("Fatal Machine check", m, msg); 758 mce_panic("Fatal Machine check", m, msg);
769 759
770 /* 760 /*
@@ -777,7 +767,7 @@ static void mce_reign(void)
777 * No machine check event found. Must be some external 767 * No machine check event found. Must be some external
778 * source or one CPU is hung. Panic. 768 * source or one CPU is hung. Panic.
779 */ 769 */
780 if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3) 770 if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
781 mce_panic("Machine check from unknown source", NULL, NULL); 771 mce_panic("Machine check from unknown source", NULL, NULL);
782 772
783 /* 773 /*
@@ -801,7 +791,7 @@ static int mce_start(int *no_way_out)
801{ 791{
802 int order; 792 int order;
803 int cpus = num_online_cpus(); 793 int cpus = num_online_cpus();
804 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; 794 u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
805 795
806 if (!timeout) 796 if (!timeout)
807 return -1; 797 return -1;
@@ -865,7 +855,7 @@ static int mce_start(int *no_way_out)
865static int mce_end(int order) 855static int mce_end(int order)
866{ 856{
867 int ret = -1; 857 int ret = -1;
868 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; 858 u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
869 859
870 if (!timeout) 860 if (!timeout)
871 goto reset; 861 goto reset;
@@ -946,7 +936,7 @@ static void mce_clear_state(unsigned long *toclear)
946{ 936{
947 int i; 937 int i;
948 938
949 for (i = 0; i < banks; i++) { 939 for (i = 0; i < mca_cfg.banks; i++) {
950 if (test_bit(i, toclear)) 940 if (test_bit(i, toclear))
951 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); 941 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
952 } 942 }
@@ -1011,6 +1001,7 @@ static void mce_clear_info(struct mce_info *mi)
1011 */ 1001 */
1012void do_machine_check(struct pt_regs *regs, long error_code) 1002void do_machine_check(struct pt_regs *regs, long error_code)
1013{ 1003{
1004 struct mca_config *cfg = &mca_cfg;
1014 struct mce m, *final; 1005 struct mce m, *final;
1015 int i; 1006 int i;
1016 int worst = 0; 1007 int worst = 0;
@@ -1022,7 +1013,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1022 int order; 1013 int order;
1023 /* 1014 /*
1024 * If no_way_out gets set, there is no safe way to recover from this 1015 * If no_way_out gets set, there is no safe way to recover from this
1025 * MCE. If tolerant is cranked up, we'll try anyway. 1016 * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway.
1026 */ 1017 */
1027 int no_way_out = 0; 1018 int no_way_out = 0;
1028 /* 1019 /*
@@ -1038,7 +1029,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1038 1029
1039 this_cpu_inc(mce_exception_count); 1030 this_cpu_inc(mce_exception_count);
1040 1031
1041 if (!banks) 1032 if (!cfg->banks)
1042 goto out; 1033 goto out;
1043 1034
1044 mce_gather_info(&m, regs); 1035 mce_gather_info(&m, regs);
@@ -1065,7 +1056,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1065 * because the first one to see it will clear it. 1056 * because the first one to see it will clear it.
1066 */ 1057 */
1067 order = mce_start(&no_way_out); 1058 order = mce_start(&no_way_out);
1068 for (i = 0; i < banks; i++) { 1059 for (i = 0; i < cfg->banks; i++) {
1069 __clear_bit(i, toclear); 1060 __clear_bit(i, toclear);
1070 if (!test_bit(i, valid_banks)) 1061 if (!test_bit(i, valid_banks))
1071 continue; 1062 continue;
@@ -1084,16 +1075,16 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1084 * Non uncorrected or non signaled errors are handled by 1075 * Non uncorrected or non signaled errors are handled by
1085 * machine_check_poll. Leave them alone, unless this panics. 1076 * machine_check_poll. Leave them alone, unless this panics.
1086 */ 1077 */
1087 if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) && 1078 if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1088 !no_way_out) 1079 !no_way_out)
1089 continue; 1080 continue;
1090 1081
1091 /* 1082 /*
1092 * Set taint even when machine check was not enabled. 1083 * Set taint even when machine check was not enabled.
1093 */ 1084 */
1094 add_taint(TAINT_MACHINE_CHECK); 1085 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
1095 1086
1096 severity = mce_severity(&m, tolerant, NULL); 1087 severity = mce_severity(&m, cfg->tolerant, NULL);
1097 1088
1098 /* 1089 /*
1099 * When machine check was for corrected handler don't touch, 1090 * When machine check was for corrected handler don't touch,
@@ -1117,7 +1108,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1117 * When the ring overflows we just ignore the AO error. 1108 * When the ring overflows we just ignore the AO error.
1118 * RED-PEN add some logging mechanism when 1109 * RED-PEN add some logging mechanism when
1119 * usable_address or mce_add_ring fails. 1110 * usable_address or mce_add_ring fails.
1120 * RED-PEN don't ignore overflow for tolerant == 0 1111 * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0
1121 */ 1112 */
1122 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) 1113 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
1123 mce_ring_add(m.addr >> PAGE_SHIFT); 1114 mce_ring_add(m.addr >> PAGE_SHIFT);
@@ -1149,7 +1140,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1149 * issues we try to recover, or limit damage to the current 1140 * issues we try to recover, or limit damage to the current
1150 * process. 1141 * process.
1151 */ 1142 */
1152 if (tolerant < 3) { 1143 if (cfg->tolerant < 3) {
1153 if (no_way_out) 1144 if (no_way_out)
1154 mce_panic("Fatal machine check on current CPU", &m, msg); 1145 mce_panic("Fatal machine check on current CPU", &m, msg);
1155 if (worst == MCE_AR_SEVERITY) { 1146 if (worst == MCE_AR_SEVERITY) {
@@ -1357,12 +1348,7 @@ int mce_notify_irq(void)
1357 /* wake processes polling /dev/mcelog */ 1348 /* wake processes polling /dev/mcelog */
1358 wake_up_interruptible(&mce_chrdev_wait); 1349 wake_up_interruptible(&mce_chrdev_wait);
1359 1350
1360 /* 1351 if (mce_helper[0])
1361 * There is no risk of missing notifications because
1362 * work_pending is always cleared before the function is
1363 * executed.
1364 */
1365 if (mce_helper[0] && !work_pending(&mce_trigger_work))
1366 schedule_work(&mce_trigger_work); 1352 schedule_work(&mce_trigger_work);
1367 1353
1368 if (__ratelimit(&ratelimit)) 1354 if (__ratelimit(&ratelimit))
@@ -1377,11 +1363,13 @@ EXPORT_SYMBOL_GPL(mce_notify_irq);
1377static int __cpuinit __mcheck_cpu_mce_banks_init(void) 1363static int __cpuinit __mcheck_cpu_mce_banks_init(void)
1378{ 1364{
1379 int i; 1365 int i;
1366 u8 num_banks = mca_cfg.banks;
1380 1367
1381 mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL); 1368 mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL);
1382 if (!mce_banks) 1369 if (!mce_banks)
1383 return -ENOMEM; 1370 return -ENOMEM;
1384 for (i = 0; i < banks; i++) { 1371
1372 for (i = 0; i < num_banks; i++) {
1385 struct mce_bank *b = &mce_banks[i]; 1373 struct mce_bank *b = &mce_banks[i];
1386 1374
1387 b->ctl = -1ULL; 1375 b->ctl = -1ULL;
@@ -1401,7 +1389,7 @@ static int __cpuinit __mcheck_cpu_cap_init(void)
1401 rdmsrl(MSR_IA32_MCG_CAP, cap); 1389 rdmsrl(MSR_IA32_MCG_CAP, cap);
1402 1390
1403 b = cap & MCG_BANKCNT_MASK; 1391 b = cap & MCG_BANKCNT_MASK;
1404 if (!banks) 1392 if (!mca_cfg.banks)
1405 pr_info("CPU supports %d MCE banks\n", b); 1393 pr_info("CPU supports %d MCE banks\n", b);
1406 1394
1407 if (b > MAX_NR_BANKS) { 1395 if (b > MAX_NR_BANKS) {
@@ -1411,8 +1399,9 @@ static int __cpuinit __mcheck_cpu_cap_init(void)
1411 } 1399 }
1412 1400
1413 /* Don't support asymmetric configurations today */ 1401 /* Don't support asymmetric configurations today */
1414 WARN_ON(banks != 0 && b != banks); 1402 WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
1415 banks = b; 1403 mca_cfg.banks = b;
1404
1416 if (!mce_banks) { 1405 if (!mce_banks) {
1417 int err = __mcheck_cpu_mce_banks_init(); 1406 int err = __mcheck_cpu_mce_banks_init();
1418 1407
@@ -1422,25 +1411,29 @@ static int __cpuinit __mcheck_cpu_cap_init(void)
1422 1411
1423 /* Use accurate RIP reporting if available. */ 1412 /* Use accurate RIP reporting if available. */
1424 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) 1413 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1425 rip_msr = MSR_IA32_MCG_EIP; 1414 mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
1426 1415
1427 if (cap & MCG_SER_P) 1416 if (cap & MCG_SER_P)
1428 mce_ser = 1; 1417 mca_cfg.ser = true;
1429 1418
1430 return 0; 1419 return 0;
1431} 1420}
1432 1421
1433static void __mcheck_cpu_init_generic(void) 1422static void __mcheck_cpu_init_generic(void)
1434{ 1423{
1424 enum mcp_flags m_fl = 0;
1435 mce_banks_t all_banks; 1425 mce_banks_t all_banks;
1436 u64 cap; 1426 u64 cap;
1437 int i; 1427 int i;
1438 1428
1429 if (!mca_cfg.bootlog)
1430 m_fl = MCP_DONTLOG;
1431
1439 /* 1432 /*
1440 * Log the machine checks left over from the previous reset. 1433 * Log the machine checks left over from the previous reset.
1441 */ 1434 */
1442 bitmap_fill(all_banks, MAX_NR_BANKS); 1435 bitmap_fill(all_banks, MAX_NR_BANKS);
1443 machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); 1436 machine_check_poll(MCP_UC | m_fl, &all_banks);
1444 1437
1445 set_in_cr4(X86_CR4_MCE); 1438 set_in_cr4(X86_CR4_MCE);
1446 1439
@@ -1448,7 +1441,7 @@ static void __mcheck_cpu_init_generic(void)
1448 if (cap & MCG_CTL_P) 1441 if (cap & MCG_CTL_P)
1449 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 1442 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1450 1443
1451 for (i = 0; i < banks; i++) { 1444 for (i = 0; i < mca_cfg.banks; i++) {
1452 struct mce_bank *b = &mce_banks[i]; 1445 struct mce_bank *b = &mce_banks[i];
1453 1446
1454 if (!b->init) 1447 if (!b->init)
@@ -1489,6 +1482,8 @@ static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
1489/* Add per CPU specific workarounds here */ 1482/* Add per CPU specific workarounds here */
1490static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) 1483static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1491{ 1484{
1485 struct mca_config *cfg = &mca_cfg;
1486
1492 if (c->x86_vendor == X86_VENDOR_UNKNOWN) { 1487 if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1493 pr_info("unknown CPU type - not enabling MCE support\n"); 1488 pr_info("unknown CPU type - not enabling MCE support\n");
1494 return -EOPNOTSUPP; 1489 return -EOPNOTSUPP;
@@ -1496,7 +1491,7 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1496 1491
1497 /* This should be disabled by the BIOS, but isn't always */ 1492 /* This should be disabled by the BIOS, but isn't always */
1498 if (c->x86_vendor == X86_VENDOR_AMD) { 1493 if (c->x86_vendor == X86_VENDOR_AMD) {
1499 if (c->x86 == 15 && banks > 4) { 1494 if (c->x86 == 15 && cfg->banks > 4) {
1500 /* 1495 /*
1501 * disable GART TBL walk error reporting, which 1496 * disable GART TBL walk error reporting, which
1502 * trips off incorrectly with the IOMMU & 3ware 1497 * trips off incorrectly with the IOMMU & 3ware
@@ -1504,18 +1499,18 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1504 */ 1499 */
1505 clear_bit(10, (unsigned long *)&mce_banks[4].ctl); 1500 clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1506 } 1501 }
1507 if (c->x86 <= 17 && mce_bootlog < 0) { 1502 if (c->x86 <= 17 && cfg->bootlog < 0) {
1508 /* 1503 /*
1509 * Lots of broken BIOS around that don't clear them 1504 * Lots of broken BIOS around that don't clear them
1510 * by default and leave crap in there. Don't log: 1505 * by default and leave crap in there. Don't log:
1511 */ 1506 */
1512 mce_bootlog = 0; 1507 cfg->bootlog = 0;
1513 } 1508 }
1514 /* 1509 /*
1515 * Various K7s with broken bank 0 around. Always disable 1510 * Various K7s with broken bank 0 around. Always disable
1516 * by default. 1511 * by default.
1517 */ 1512 */
1518 if (c->x86 == 6 && banks > 0) 1513 if (c->x86 == 6 && cfg->banks > 0)
1519 mce_banks[0].ctl = 0; 1514 mce_banks[0].ctl = 0;
1520 1515
1521 /* 1516 /*
@@ -1566,7 +1561,7 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1566 * valid event later, merely don't write CTL0. 1561 * valid event later, merely don't write CTL0.
1567 */ 1562 */
1568 1563
1569 if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0) 1564 if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
1570 mce_banks[0].init = 0; 1565 mce_banks[0].init = 0;
1571 1566
1572 /* 1567 /*
@@ -1574,23 +1569,23 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1574 * synchronization with a one second timeout. 1569 * synchronization with a one second timeout.
1575 */ 1570 */
1576 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && 1571 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1577 monarch_timeout < 0) 1572 cfg->monarch_timeout < 0)
1578 monarch_timeout = USEC_PER_SEC; 1573 cfg->monarch_timeout = USEC_PER_SEC;
1579 1574
1580 /* 1575 /*
1581 * There are also broken BIOSes on some Pentium M and 1576 * There are also broken BIOSes on some Pentium M and
1582 * earlier systems: 1577 * earlier systems:
1583 */ 1578 */
1584 if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0) 1579 if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
1585 mce_bootlog = 0; 1580 cfg->bootlog = 0;
1586 1581
1587 if (c->x86 == 6 && c->x86_model == 45) 1582 if (c->x86 == 6 && c->x86_model == 45)
1588 quirk_no_way_out = quirk_sandybridge_ifu; 1583 quirk_no_way_out = quirk_sandybridge_ifu;
1589 } 1584 }
1590 if (monarch_timeout < 0) 1585 if (cfg->monarch_timeout < 0)
1591 monarch_timeout = 0; 1586 cfg->monarch_timeout = 0;
1592 if (mce_bootlog != 0) 1587 if (cfg->bootlog != 0)
1593 mce_panic_timeout = 30; 1588 cfg->panic_timeout = 30;
1594 1589
1595 return 0; 1590 return 0;
1596} 1591}
@@ -1635,7 +1630,7 @@ static void mce_start_timer(unsigned int cpu, struct timer_list *t)
1635 1630
1636 __this_cpu_write(mce_next_interval, iv); 1631 __this_cpu_write(mce_next_interval, iv);
1637 1632
1638 if (mce_ignore_ce || !iv) 1633 if (mca_cfg.ignore_ce || !iv)
1639 return; 1634 return;
1640 1635
1641 t->expires = round_jiffies(jiffies + iv); 1636 t->expires = round_jiffies(jiffies + iv);
@@ -1668,7 +1663,7 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) =
1668 */ 1663 */
1669void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) 1664void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
1670{ 1665{
1671 if (mce_disabled) 1666 if (mca_cfg.disabled)
1672 return; 1667 return;
1673 1668
1674 if (__mcheck_cpu_ancient_init(c)) 1669 if (__mcheck_cpu_ancient_init(c))
@@ -1678,7 +1673,7 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
1678 return; 1673 return;
1679 1674
1680 if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { 1675 if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1681 mce_disabled = 1; 1676 mca_cfg.disabled = true;
1682 return; 1677 return;
1683 } 1678 }
1684 1679
@@ -1951,6 +1946,8 @@ static struct miscdevice mce_chrdev_device = {
1951 */ 1946 */
1952static int __init mcheck_enable(char *str) 1947static int __init mcheck_enable(char *str)
1953{ 1948{
1949 struct mca_config *cfg = &mca_cfg;
1950
1954 if (*str == 0) { 1951 if (*str == 0) {
1955 enable_p5_mce(); 1952 enable_p5_mce();
1956 return 1; 1953 return 1;
@@ -1958,22 +1955,22 @@ static int __init mcheck_enable(char *str)
1958 if (*str == '=') 1955 if (*str == '=')
1959 str++; 1956 str++;
1960 if (!strcmp(str, "off")) 1957 if (!strcmp(str, "off"))
1961 mce_disabled = 1; 1958 cfg->disabled = true;
1962 else if (!strcmp(str, "no_cmci")) 1959 else if (!strcmp(str, "no_cmci"))
1963 mce_cmci_disabled = 1; 1960 cfg->cmci_disabled = true;
1964 else if (!strcmp(str, "dont_log_ce")) 1961 else if (!strcmp(str, "dont_log_ce"))
1965 mce_dont_log_ce = 1; 1962 cfg->dont_log_ce = true;
1966 else if (!strcmp(str, "ignore_ce")) 1963 else if (!strcmp(str, "ignore_ce"))
1967 mce_ignore_ce = 1; 1964 cfg->ignore_ce = true;
1968 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) 1965 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
1969 mce_bootlog = (str[0] == 'b'); 1966 cfg->bootlog = (str[0] == 'b');
1970 else if (!strcmp(str, "bios_cmci_threshold")) 1967 else if (!strcmp(str, "bios_cmci_threshold"))
1971 mce_bios_cmci_threshold = 1; 1968 cfg->bios_cmci_threshold = true;
1972 else if (isdigit(str[0])) { 1969 else if (isdigit(str[0])) {
1973 get_option(&str, &tolerant); 1970 get_option(&str, &(cfg->tolerant));
1974 if (*str == ',') { 1971 if (*str == ',') {
1975 ++str; 1972 ++str;
1976 get_option(&str, &monarch_timeout); 1973 get_option(&str, &(cfg->monarch_timeout));
1977 } 1974 }
1978 } else { 1975 } else {
1979 pr_info("mce argument %s ignored. Please use /sys\n", str); 1976 pr_info("mce argument %s ignored. Please use /sys\n", str);
@@ -2002,7 +1999,7 @@ static int mce_disable_error_reporting(void)
2002{ 1999{
2003 int i; 2000 int i;
2004 2001
2005 for (i = 0; i < banks; i++) { 2002 for (i = 0; i < mca_cfg.banks; i++) {
2006 struct mce_bank *b = &mce_banks[i]; 2003 struct mce_bank *b = &mce_banks[i];
2007 2004
2008 if (b->init) 2005 if (b->init)
@@ -2142,15 +2139,15 @@ static ssize_t set_ignore_ce(struct device *s,
2142 if (strict_strtoull(buf, 0, &new) < 0) 2139 if (strict_strtoull(buf, 0, &new) < 0)
2143 return -EINVAL; 2140 return -EINVAL;
2144 2141
2145 if (mce_ignore_ce ^ !!new) { 2142 if (mca_cfg.ignore_ce ^ !!new) {
2146 if (new) { 2143 if (new) {
2147 /* disable ce features */ 2144 /* disable ce features */
2148 mce_timer_delete_all(); 2145 mce_timer_delete_all();
2149 on_each_cpu(mce_disable_cmci, NULL, 1); 2146 on_each_cpu(mce_disable_cmci, NULL, 1);
2150 mce_ignore_ce = 1; 2147 mca_cfg.ignore_ce = true;
2151 } else { 2148 } else {
2152 /* enable ce features */ 2149 /* enable ce features */
2153 mce_ignore_ce = 0; 2150 mca_cfg.ignore_ce = false;
2154 on_each_cpu(mce_enable_ce, (void *)1, 1); 2151 on_each_cpu(mce_enable_ce, (void *)1, 1);
2155 } 2152 }
2156 } 2153 }
@@ -2166,14 +2163,14 @@ static ssize_t set_cmci_disabled(struct device *s,
2166 if (strict_strtoull(buf, 0, &new) < 0) 2163 if (strict_strtoull(buf, 0, &new) < 0)
2167 return -EINVAL; 2164 return -EINVAL;
2168 2165
2169 if (mce_cmci_disabled ^ !!new) { 2166 if (mca_cfg.cmci_disabled ^ !!new) {
2170 if (new) { 2167 if (new) {
2171 /* disable cmci */ 2168 /* disable cmci */
2172 on_each_cpu(mce_disable_cmci, NULL, 1); 2169 on_each_cpu(mce_disable_cmci, NULL, 1);
2173 mce_cmci_disabled = 1; 2170 mca_cfg.cmci_disabled = true;
2174 } else { 2171 } else {
2175 /* enable cmci */ 2172 /* enable cmci */
2176 mce_cmci_disabled = 0; 2173 mca_cfg.cmci_disabled = false;
2177 on_each_cpu(mce_enable_ce, NULL, 1); 2174 on_each_cpu(mce_enable_ce, NULL, 1);
2178 } 2175 }
2179 } 2176 }
@@ -2190,9 +2187,9 @@ static ssize_t store_int_with_restart(struct device *s,
2190} 2187}
2191 2188
2192static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); 2189static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
2193static DEVICE_INT_ATTR(tolerant, 0644, tolerant); 2190static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
2194static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout); 2191static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
2195static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); 2192static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
2196 2193
2197static struct dev_ext_attribute dev_attr_check_interval = { 2194static struct dev_ext_attribute dev_attr_check_interval = {
2198 __ATTR(check_interval, 0644, device_show_int, store_int_with_restart), 2195 __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
@@ -2200,13 +2197,13 @@ static struct dev_ext_attribute dev_attr_check_interval = {
2200}; 2197};
2201 2198
2202static struct dev_ext_attribute dev_attr_ignore_ce = { 2199static struct dev_ext_attribute dev_attr_ignore_ce = {
2203 __ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce), 2200 __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
2204 &mce_ignore_ce 2201 &mca_cfg.ignore_ce
2205}; 2202};
2206 2203
2207static struct dev_ext_attribute dev_attr_cmci_disabled = { 2204static struct dev_ext_attribute dev_attr_cmci_disabled = {
2208 __ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled), 2205 __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
2209 &mce_cmci_disabled 2206 &mca_cfg.cmci_disabled
2210}; 2207};
2211 2208
2212static struct device_attribute *mce_device_attrs[] = { 2209static struct device_attribute *mce_device_attrs[] = {
@@ -2253,7 +2250,7 @@ static __cpuinit int mce_device_create(unsigned int cpu)
2253 if (err) 2250 if (err)
2254 goto error; 2251 goto error;
2255 } 2252 }
2256 for (j = 0; j < banks; j++) { 2253 for (j = 0; j < mca_cfg.banks; j++) {
2257 err = device_create_file(dev, &mce_banks[j].attr); 2254 err = device_create_file(dev, &mce_banks[j].attr);
2258 if (err) 2255 if (err)
2259 goto error2; 2256 goto error2;
@@ -2285,7 +2282,7 @@ static __cpuinit void mce_device_remove(unsigned int cpu)
2285 for (i = 0; mce_device_attrs[i]; i++) 2282 for (i = 0; mce_device_attrs[i]; i++)
2286 device_remove_file(dev, mce_device_attrs[i]); 2283 device_remove_file(dev, mce_device_attrs[i]);
2287 2284
2288 for (i = 0; i < banks; i++) 2285 for (i = 0; i < mca_cfg.banks; i++)
2289 device_remove_file(dev, &mce_banks[i].attr); 2286 device_remove_file(dev, &mce_banks[i].attr);
2290 2287
2291 device_unregister(dev); 2288 device_unregister(dev);
@@ -2304,7 +2301,7 @@ static void __cpuinit mce_disable_cpu(void *h)
2304 2301
2305 if (!(action & CPU_TASKS_FROZEN)) 2302 if (!(action & CPU_TASKS_FROZEN))
2306 cmci_clear(); 2303 cmci_clear();
2307 for (i = 0; i < banks; i++) { 2304 for (i = 0; i < mca_cfg.banks; i++) {
2308 struct mce_bank *b = &mce_banks[i]; 2305 struct mce_bank *b = &mce_banks[i];
2309 2306
2310 if (b->init) 2307 if (b->init)
@@ -2322,7 +2319,7 @@ static void __cpuinit mce_reenable_cpu(void *h)
2322 2319
2323 if (!(action & CPU_TASKS_FROZEN)) 2320 if (!(action & CPU_TASKS_FROZEN))
2324 cmci_reenable(); 2321 cmci_reenable();
2325 for (i = 0; i < banks; i++) { 2322 for (i = 0; i < mca_cfg.banks; i++) {
2326 struct mce_bank *b = &mce_banks[i]; 2323 struct mce_bank *b = &mce_banks[i];
2327 2324
2328 if (b->init) 2325 if (b->init)
@@ -2375,7 +2372,7 @@ static __init void mce_init_banks(void)
2375{ 2372{
2376 int i; 2373 int i;
2377 2374
2378 for (i = 0; i < banks; i++) { 2375 for (i = 0; i < mca_cfg.banks; i++) {
2379 struct mce_bank *b = &mce_banks[i]; 2376 struct mce_bank *b = &mce_banks[i];
2380 struct device_attribute *a = &b->attr; 2377 struct device_attribute *a = &b->attr;
2381 2378
@@ -2426,7 +2423,7 @@ device_initcall_sync(mcheck_init_device);
2426 */ 2423 */
2427static int __init mcheck_disable(char *str) 2424static int __init mcheck_disable(char *str)
2428{ 2425{
2429 mce_disabled = 1; 2426 mca_cfg.disabled = true;
2430 return 1; 2427 return 1;
2431} 2428}
2432__setup("nomce", mcheck_disable); 2429__setup("nomce", mcheck_disable);
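
The mce.c changes above fold the scattered file-scope knobs (tolerant, banks, rip_msr, mce_bootlog, mce_ser and friends) into one struct mca_config mca_cfg with designated-initializer defaults, and every user now goes through mca_cfg.field or a local cfg pointer. The stand-alone sketch below illustrates that consolidation pattern only; the field set is abbreviated and is not the kernel's full mca_config.

    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative only: scattered globals replaced by one config struct,
     * as the patch does with mca_cfg.  Field set is abbreviated. */
    struct demo_cfg {
            int  tolerant;
            int  monarch_timeout;
            int  bootlog;
            bool ser;
            bool dont_log_ce;
    };

    static struct demo_cfg demo_cfg = {
            /* non-zero defaults live in one place instead of per variable */
            .tolerant        = 1,
            .monarch_timeout = -1,
            .bootlog         = -1,
    };

    static void demo_apply_quirks(int cpu_family)
    {
            struct demo_cfg *cfg = &demo_cfg;  /* local alias, like cfg in the patch */

            if (cpu_family <= 17 && cfg->bootlog < 0)
                    cfg->bootlog = 0;
            if (cfg->monarch_timeout < 0)
                    cfg->monarch_timeout = 0;
    }

    int main(void)
    {
            demo_apply_quirks(16);
            printf("tolerant=%d bootlog=%d\n", demo_cfg.tolerant, demo_cfg.bootlog);
            return 0;
    }

Keeping the defaults in a single initializer makes it easy to see which knobs deviate from zero, which is much of the readability gain the conversion is after.
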
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 4f9a3cbfc4a3..402c454fbff0 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -53,7 +53,7 @@ static int cmci_supported(int *banks)
53{ 53{
54 u64 cap; 54 u64 cap;
55 55
56 if (mce_cmci_disabled || mce_ignore_ce) 56 if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce)
57 return 0; 57 return 0;
58 58
59 /* 59 /*
@@ -200,7 +200,7 @@ static void cmci_discover(int banks)
200 continue; 200 continue;
201 } 201 }
202 202
203 if (!mce_bios_cmci_threshold) { 203 if (!mca_cfg.bios_cmci_threshold) {
204 val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; 204 val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
205 val |= CMCI_THRESHOLD; 205 val |= CMCI_THRESHOLD;
206 } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) { 206 } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
@@ -227,7 +227,7 @@ static void cmci_discover(int banks)
227 * set the thresholds properly or does not work with 227 * set the thresholds properly or does not work with
228 * this boot option. Note down now and report later. 228 * this boot option. Note down now and report later.
229 */ 229 */
230 if (mce_bios_cmci_threshold && bios_zero_thresh && 230 if (mca_cfg.bios_cmci_threshold && bios_zero_thresh &&
231 (val & MCI_CTL2_CMCI_THRESHOLD_MASK)) 231 (val & MCI_CTL2_CMCI_THRESHOLD_MASK))
232 bios_wrong_thresh = 1; 232 bios_wrong_thresh = 1;
233 } else { 233 } else {
@@ -235,7 +235,7 @@ static void cmci_discover(int banks)
235 } 235 }
236 } 236 }
237 raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); 237 raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
238 if (mce_bios_cmci_threshold && bios_wrong_thresh) { 238 if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) {
239 pr_info_once( 239 pr_info_once(
240 "bios_cmci_threshold: Some banks do not have valid thresholds set\n"); 240 "bios_cmci_threshold: Some banks do not have valid thresholds set\n");
241 pr_info_once( 241 pr_info_once(
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index 2d5454cd2c4f..1c044b1ccc59 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -33,7 +33,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code)
33 smp_processor_id()); 33 smp_processor_id());
34 } 34 }
35 35
36 add_taint(TAINT_MACHINE_CHECK); 36 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
37} 37}
38 38
39/* Set up machine check reporting for processors with Intel style MCE: */ 39/* Set up machine check reporting for processors with Intel style MCE: */
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 2d7998fb628c..e9a701aecaa1 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -15,7 +15,7 @@
15static void winchip_machine_check(struct pt_regs *regs, long error_code) 15static void winchip_machine_check(struct pt_regs *regs, long error_code)
16{ 16{
17 printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); 17 printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
18 add_taint(TAINT_MACHINE_CHECK); 18 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
19} 19}
20 20
21/* Set up machine check reporting on the Winchip C6 series */ 21/* Set up machine check reporting on the Winchip C6 series */
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 0a630dd4b620..a7d26d83fb70 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -14,10 +14,15 @@
14#include <linux/time.h> 14#include <linux/time.h>
15#include <linux/clocksource.h> 15#include <linux/clocksource.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/hardirq.h>
18#include <linux/interrupt.h>
17#include <asm/processor.h> 19#include <asm/processor.h>
18#include <asm/hypervisor.h> 20#include <asm/hypervisor.h>
19#include <asm/hyperv.h> 21#include <asm/hyperv.h>
20#include <asm/mshyperv.h> 22#include <asm/mshyperv.h>
23#include <asm/desc.h>
24#include <asm/idle.h>
25#include <asm/irq_regs.h>
21 26
22struct ms_hyperv_info ms_hyperv; 27struct ms_hyperv_info ms_hyperv;
23EXPORT_SYMBOL_GPL(ms_hyperv); 28EXPORT_SYMBOL_GPL(ms_hyperv);
@@ -30,6 +35,13 @@ static bool __init ms_hyperv_platform(void)
30 if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) 35 if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
31 return false; 36 return false;
32 37
38 /*
39 * Xen emulates Hyper-V to support enlightened Windows.
40 * Check to see first if we are on a Xen Hypervisor.
41 */
42 if (xen_cpuid_base())
43 return false;
44
33 cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS, 45 cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS,
34 &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]); 46 &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]);
35 47
@@ -68,7 +80,14 @@ static void __init ms_hyperv_init_platform(void)
68 printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n", 80 printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
69 ms_hyperv.features, ms_hyperv.hints); 81 ms_hyperv.features, ms_hyperv.hints);
70 82
71 clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100); 83 if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE)
84 clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100);
85#if IS_ENABLED(CONFIG_HYPERV)
86 /*
87 * Setup the IDT for hypervisor callback.
88 */
89 alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector);
90#endif
72} 91}
73 92
74const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { 93const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
@@ -77,3 +96,36 @@ const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
77 .init_platform = ms_hyperv_init_platform, 96 .init_platform = ms_hyperv_init_platform,
78}; 97};
79EXPORT_SYMBOL(x86_hyper_ms_hyperv); 98EXPORT_SYMBOL(x86_hyper_ms_hyperv);
99
100#if IS_ENABLED(CONFIG_HYPERV)
101static int vmbus_irq = -1;
102static irq_handler_t vmbus_isr;
103
104void hv_register_vmbus_handler(int irq, irq_handler_t handler)
105{
106 vmbus_irq = irq;
107 vmbus_isr = handler;
108}
109
110void hyperv_vector_handler(struct pt_regs *regs)
111{
112 struct pt_regs *old_regs = set_irq_regs(regs);
113 struct irq_desc *desc;
114
115 irq_enter();
116 exit_idle();
117
118 desc = irq_to_desc(vmbus_irq);
119
120 if (desc)
121 generic_handle_irq_desc(vmbus_irq, desc);
122
123 irq_exit();
124 set_irq_regs(old_regs);
125}
126#else
127void hv_register_vmbus_handler(int irq, irq_handler_t handler)
128{
129}
130#endif
131EXPORT_SYMBOL_GPL(hv_register_vmbus_handler);
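
The handler plumbing above is only half of the picture: a VMBus driver is expected to hand its interrupt routine to hv_register_vmbus_handler() so that hyperv_vector_handler() can route the hypervisor callback vector to it. The sketch below is illustrative only and not part of this commit; the IRQ number, handler name and init function are invented placeholders, and <linux/interrupt.h> is assumed for irqreturn_t/IRQ_HANDLED.

static irqreturn_t example_vmbus_isr(int irq, void *dev_id)
{
        /* a real handler would walk the VMBus event page here */
        return IRQ_HANDLED;
}

static int __init example_vmbus_setup(void)
{
        int irq = 7;    /* placeholder: IRQ chosen by the VMBus driver */

        hv_register_vmbus_handler(irq, example_vmbus_isr);
        return 0;
}

When CONFIG_HYPERV is disabled the registration call lands in the empty stub above, so callers do not need an #ifdef of their own.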
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index e9fe907cd249..fa72a39e5d46 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -542,7 +542,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
542 542
543 if (tmp != mask_lo) { 543 if (tmp != mask_lo) {
544 printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n"); 544 printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
545 add_taint(TAINT_FIRMWARE_WORKAROUND); 545 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
546 mask_lo = tmp; 546 mask_lo = tmp;
547 } 547 }
548 } 548 }
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 4428fd178bce..bf0f01aea994 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -340,9 +340,6 @@ int x86_setup_perfctr(struct perf_event *event)
340 /* BTS is currently only allowed for user-mode. */ 340 /* BTS is currently only allowed for user-mode. */
341 if (!attr->exclude_kernel) 341 if (!attr->exclude_kernel)
342 return -EOPNOTSUPP; 342 return -EOPNOTSUPP;
343
344 if (!attr->exclude_guest)
345 return -EOPNOTSUPP;
346 } 343 }
347 344
348 hwc->config |= config; 345 hwc->config |= config;
@@ -385,9 +382,6 @@ int x86_pmu_hw_config(struct perf_event *event)
385 if (event->attr.precise_ip) { 382 if (event->attr.precise_ip) {
386 int precise = 0; 383 int precise = 0;
387 384
388 if (!event->attr.exclude_guest)
389 return -EOPNOTSUPP;
390
391 /* Support for constant skid */ 385 /* Support for constant skid */
392 if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { 386 if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
393 precise++; 387 precise++;
@@ -835,7 +829,7 @@ static inline void x86_assign_hw_event(struct perf_event *event,
835 } else { 829 } else {
836 hwc->config_base = x86_pmu_config_addr(hwc->idx); 830 hwc->config_base = x86_pmu_config_addr(hwc->idx);
837 hwc->event_base = x86_pmu_event_addr(hwc->idx); 831 hwc->event_base = x86_pmu_event_addr(hwc->idx);
838 hwc->event_base_rdpmc = hwc->idx; 832 hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
839 } 833 }
840} 834}
841 835
@@ -1316,11 +1310,6 @@ static struct attribute_group x86_pmu_format_group = {
1316 .attrs = NULL, 1310 .attrs = NULL,
1317}; 1311};
1318 1312
1319struct perf_pmu_events_attr {
1320 struct device_attribute attr;
1321 u64 id;
1322};
1323
1324/* 1313/*
1325 * Remove all undefined events (x86_pmu.event_map(id) == 0) 1314 * Remove all undefined events (x86_pmu.event_map(id) == 0)
1326 * out of events_attr attributes. 1315 * out of events_attr attributes.
@@ -1354,11 +1343,9 @@ static ssize_t events_sysfs_show(struct device *dev, struct device_attribute *at
1354#define EVENT_VAR(_id) event_attr_##_id 1343#define EVENT_VAR(_id) event_attr_##_id
1355#define EVENT_PTR(_id) &event_attr_##_id.attr.attr 1344#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
1356 1345
1357#define EVENT_ATTR(_name, _id) \ 1346#define EVENT_ATTR(_name, _id) \
1358static struct perf_pmu_events_attr EVENT_VAR(_id) = { \ 1347 PMU_EVENT_ATTR(_name, EVENT_VAR(_id), PERF_COUNT_HW_##_id, \
1359 .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \ 1348 events_sysfs_show)
1360 .id = PERF_COUNT_HW_##_id, \
1361};
1362 1349
1363EVENT_ATTR(cpu-cycles, CPU_CYCLES ); 1350EVENT_ATTR(cpu-cycles, CPU_CYCLES );
1364EVENT_ATTR(instructions, INSTRUCTIONS ); 1351EVENT_ATTR(instructions, INSTRUCTIONS );
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 115c1ea97746..7f5c75c2afdd 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -325,6 +325,8 @@ struct x86_pmu {
325 int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); 325 int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
326 unsigned eventsel; 326 unsigned eventsel;
327 unsigned perfctr; 327 unsigned perfctr;
328 int (*addr_offset)(int index, bool eventsel);
329 int (*rdpmc_index)(int index);
328 u64 (*event_map)(int); 330 u64 (*event_map)(int);
329 int max_events; 331 int max_events;
330 int num_counters; 332 int num_counters;
@@ -446,28 +448,21 @@ extern u64 __read_mostly hw_cache_extra_regs
446 448
447u64 x86_perf_event_update(struct perf_event *event); 449u64 x86_perf_event_update(struct perf_event *event);
448 450
449static inline int x86_pmu_addr_offset(int index) 451static inline unsigned int x86_pmu_config_addr(int index)
450{ 452{
451 int offset; 453 return x86_pmu.eventsel + (x86_pmu.addr_offset ?
452 454 x86_pmu.addr_offset(index, true) : index);
453 /* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */
454 alternative_io(ASM_NOP2,
455 "shll $1, %%eax",
456 X86_FEATURE_PERFCTR_CORE,
457 "=a" (offset),
458 "a" (index));
459
460 return offset;
461} 455}
462 456
463static inline unsigned int x86_pmu_config_addr(int index) 457static inline unsigned int x86_pmu_event_addr(int index)
464{ 458{
465 return x86_pmu.eventsel + x86_pmu_addr_offset(index); 459 return x86_pmu.perfctr + (x86_pmu.addr_offset ?
460 x86_pmu.addr_offset(index, false) : index);
466} 461}
467 462
468static inline unsigned int x86_pmu_event_addr(int index) 463static inline int x86_pmu_rdpmc_index(int index)
469{ 464{
470 return x86_pmu.perfctr + x86_pmu_addr_offset(index); 465 return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index;
471} 466}
472 467
473int x86_setup_perfctr(struct perf_event *event); 468int x86_setup_perfctr(struct perf_event *event);
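
The rewritten helpers fall back to a flat base-plus-index layout whenever a PMU supplies neither ->addr_offset() nor ->rdpmc_index(). A minimal user-space model of that fallback, using the classic AMD K7 MSR bases purely as example numbers (a real PMU provides the bases and hooks through struct x86_pmu):

#include <stdio.h>

static unsigned int eventsel_base = 0xc0010000;      /* MSR_K7_EVNTSEL0 */
static unsigned int perfctr_base  = 0xc0010004;      /* MSR_K7_PERFCTR0 */

/* Without an ->addr_offset() hook the offset is simply the counter index. */
static unsigned int config_addr(int index)
{
        return eventsel_base + index;
}

static unsigned int event_addr(int index)
{
        return perfctr_base + index;
}

int main(void)
{
        int i;

        for (i = 0; i < 4; i++)
                printf("counter %d: ctl %#x, ctr %#x, rdpmc %d\n",
                       i, config_addr(i), event_addr(i), i);
        return 0;
}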
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index c93bc4e813a0..dfdab42aed27 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -132,21 +132,102 @@ static u64 amd_pmu_event_map(int hw_event)
132 return amd_perfmon_event_map[hw_event]; 132 return amd_perfmon_event_map[hw_event];
133} 133}
134 134
135static int amd_pmu_hw_config(struct perf_event *event) 135static struct event_constraint *amd_nb_event_constraint;
136
137/*
138 * Previously calculated offsets
139 */
140static unsigned int event_offsets[X86_PMC_IDX_MAX] __read_mostly;
141static unsigned int count_offsets[X86_PMC_IDX_MAX] __read_mostly;
142static unsigned int rdpmc_indexes[X86_PMC_IDX_MAX] __read_mostly;
143
144/*
145 * Legacy CPUs:
146 * 4 counters starting at 0xc0010000 each offset by 1
147 *
148 * CPUs with core performance counter extensions:
149 * 6 counters starting at 0xc0010200 each offset by 2
150 *
151 * CPUs with north bridge performance counter extensions:
152 * 4 additional counters starting at 0xc0010240 each offset by 2
153 * (indexed right above either one of the above core counters)
154 */
155static inline int amd_pmu_addr_offset(int index, bool eventsel)
136{ 156{
137 int ret; 157 int offset, first, base;
138 158
139 /* pass precise event sampling to ibs: */ 159 if (!index)
140 if (event->attr.precise_ip && get_ibs_caps()) 160 return index;
141 return -ENOENT; 161
162 if (eventsel)
163 offset = event_offsets[index];
164 else
165 offset = count_offsets[index];
166
167 if (offset)
168 return offset;
169
170 if (amd_nb_event_constraint &&
171 test_bit(index, amd_nb_event_constraint->idxmsk)) {
172 /*
173 * calculate the offset of NB counters with respect to
174 * base eventsel or perfctr
175 */
176
177 first = find_first_bit(amd_nb_event_constraint->idxmsk,
178 X86_PMC_IDX_MAX);
179
180 if (eventsel)
181 base = MSR_F15H_NB_PERF_CTL - x86_pmu.eventsel;
182 else
183 base = MSR_F15H_NB_PERF_CTR - x86_pmu.perfctr;
184
185 offset = base + ((index - first) << 1);
186 } else if (!cpu_has_perfctr_core)
187 offset = index;
188 else
189 offset = index << 1;
190
191 if (eventsel)
192 event_offsets[index] = offset;
193 else
194 count_offsets[index] = offset;
195
196 return offset;
197}
198
199static inline int amd_pmu_rdpmc_index(int index)
200{
201 int ret, first;
202
203 if (!index)
204 return index;
205
206 ret = rdpmc_indexes[index];
142 207
143 ret = x86_pmu_hw_config(event);
144 if (ret) 208 if (ret)
145 return ret; 209 return ret;
146 210
147 if (has_branch_stack(event)) 211 if (amd_nb_event_constraint &&
148 return -EOPNOTSUPP; 212 test_bit(index, amd_nb_event_constraint->idxmsk)) {
213 /*
214 * according to the manual, the ECX value of the NB counters is
215 * the index of the NB counter (0, 1, 2 or 3) plus 6
216 */
217
218 first = find_first_bit(amd_nb_event_constraint->idxmsk,
219 X86_PMC_IDX_MAX);
220 ret = index - first + 6;
221 } else
222 ret = index;
223
224 rdpmc_indexes[index] = ret;
225
226 return ret;
227}
149 228
229static int amd_core_hw_config(struct perf_event *event)
230{
150 if (event->attr.exclude_host && event->attr.exclude_guest) 231 if (event->attr.exclude_host && event->attr.exclude_guest)
151 /* 232 /*
152 * When HO == GO == 1 the hardware treats that as GO == HO == 0 233 * When HO == GO == 1 the hardware treats that as GO == HO == 0
@@ -156,14 +237,37 @@ static int amd_pmu_hw_config(struct perf_event *event)
156 event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR | 237 event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR |
157 ARCH_PERFMON_EVENTSEL_OS); 238 ARCH_PERFMON_EVENTSEL_OS);
158 else if (event->attr.exclude_host) 239 else if (event->attr.exclude_host)
159 event->hw.config |= AMD_PERFMON_EVENTSEL_GUESTONLY; 240 event->hw.config |= AMD64_EVENTSEL_GUESTONLY;
160 else if (event->attr.exclude_guest) 241 else if (event->attr.exclude_guest)
161 event->hw.config |= AMD_PERFMON_EVENTSEL_HOSTONLY; 242 event->hw.config |= AMD64_EVENTSEL_HOSTONLY;
243
244 return 0;
245}
246
247/*
248 * NB counters do not support the following event select bits:
249 * Host/Guest only
250 * Counter mask
251 * Invert counter mask
252 * Edge detect
253 * OS/User mode
254 */
255static int amd_nb_hw_config(struct perf_event *event)
256{
257 /* for NB, we only allow system wide counting mode */
258 if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
259 return -EINVAL;
260
261 if (event->attr.exclude_user || event->attr.exclude_kernel ||
262 event->attr.exclude_host || event->attr.exclude_guest)
263 return -EINVAL;
162 264
163 if (event->attr.type != PERF_TYPE_RAW) 265 event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR |
164 return 0; 266 ARCH_PERFMON_EVENTSEL_OS);
165 267
166 event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK; 268 if (event->hw.config & ~(AMD64_RAW_EVENT_MASK_NB |
269 ARCH_PERFMON_EVENTSEL_INT))
270 return -EINVAL;
167 271
168 return 0; 272 return 0;
169} 273}
@@ -181,6 +285,11 @@ static inline int amd_is_nb_event(struct hw_perf_event *hwc)
181 return (hwc->config & 0xe0) == 0xe0; 285 return (hwc->config & 0xe0) == 0xe0;
182} 286}
183 287
288static inline int amd_is_perfctr_nb_event(struct hw_perf_event *hwc)
289{
290 return amd_nb_event_constraint && amd_is_nb_event(hwc);
291}
292
184static inline int amd_has_nb(struct cpu_hw_events *cpuc) 293static inline int amd_has_nb(struct cpu_hw_events *cpuc)
185{ 294{
186 struct amd_nb *nb = cpuc->amd_nb; 295 struct amd_nb *nb = cpuc->amd_nb;
@@ -188,20 +297,37 @@ static inline int amd_has_nb(struct cpu_hw_events *cpuc)
188 return nb && nb->nb_id != -1; 297 return nb && nb->nb_id != -1;
189} 298}
190 299
191static void amd_put_event_constraints(struct cpu_hw_events *cpuc, 300static int amd_pmu_hw_config(struct perf_event *event)
192 struct perf_event *event) 301{
302 int ret;
303
304 /* pass precise event sampling to ibs: */
305 if (event->attr.precise_ip && get_ibs_caps())
306 return -ENOENT;
307
308 if (has_branch_stack(event))
309 return -EOPNOTSUPP;
310
311 ret = x86_pmu_hw_config(event);
312 if (ret)
313 return ret;
314
315 if (event->attr.type == PERF_TYPE_RAW)
316 event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK;
317
318 if (amd_is_perfctr_nb_event(&event->hw))
319 return amd_nb_hw_config(event);
320
321 return amd_core_hw_config(event);
322}
323
324static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc,
325 struct perf_event *event)
193{ 326{
194 struct hw_perf_event *hwc = &event->hw;
195 struct amd_nb *nb = cpuc->amd_nb; 327 struct amd_nb *nb = cpuc->amd_nb;
196 int i; 328 int i;
197 329
198 /* 330 /*
199 * only care about NB events
200 */
201 if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
202 return;
203
204 /*
205 * need to scan whole list because event may not have 331 * need to scan whole list because event may not have
206 * been assigned during scheduling 332 * been assigned during scheduling
207 * 333 *
@@ -215,6 +341,19 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
215 } 341 }
216} 342}
217 343
344static void amd_nb_interrupt_hw_config(struct hw_perf_event *hwc)
345{
346 int core_id = cpu_data(smp_processor_id()).cpu_core_id;
347
348 /* deliver interrupts only to this core */
349 if (hwc->config & ARCH_PERFMON_EVENTSEL_INT) {
350 hwc->config |= AMD64_EVENTSEL_INT_CORE_ENABLE;
351 hwc->config &= ~AMD64_EVENTSEL_INT_CORE_SEL_MASK;
352 hwc->config |= (u64)(core_id) <<
353 AMD64_EVENTSEL_INT_CORE_SEL_SHIFT;
354 }
355}
356
218 /* 357 /*
219 * AMD64 NorthBridge events need special treatment because 358 * AMD64 NorthBridge events need special treatment because
220 * counter access needs to be synchronized across all cores 359 * counter access needs to be synchronized across all cores
@@ -247,24 +386,24 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
247 * 386 *
248 * Given that resources are allocated (cmpxchg), they must be 387 * Given that resources are allocated (cmpxchg), they must be
249 * eventually freed for others to use. This is accomplished by 388 * eventually freed for others to use. This is accomplished by
250 * calling amd_put_event_constraints(). 389 * calling __amd_put_nb_event_constraints()
251 * 390 *
252 * Non NB events are not impacted by this restriction. 391 * Non NB events are not impacted by this restriction.
253 */ 392 */
254static struct event_constraint * 393static struct event_constraint *
255amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) 394__amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
395 struct event_constraint *c)
256{ 396{
257 struct hw_perf_event *hwc = &event->hw; 397 struct hw_perf_event *hwc = &event->hw;
258 struct amd_nb *nb = cpuc->amd_nb; 398 struct amd_nb *nb = cpuc->amd_nb;
259 struct perf_event *old = NULL; 399 struct perf_event *old;
260 int max = x86_pmu.num_counters; 400 int idx, new = -1;
261 int i, j, k = -1;
262 401
263 /* 402 if (!c)
264 * if not NB event or no NB, then no constraints 403 c = &unconstrained;
265 */ 404
266 if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc))) 405 if (cpuc->is_fake)
267 return &unconstrained; 406 return c;
268 407
269 /* 408 /*
270 * detect if already present, if so reuse 409 * detect if already present, if so reuse
@@ -276,48 +415,36 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
276 * because of successive calls to x86_schedule_events() from 415 * because of successive calls to x86_schedule_events() from
277 * hw_perf_group_sched_in() without hw_perf_enable() 416 * hw_perf_group_sched_in() without hw_perf_enable()
278 */ 417 */
279 for (i = 0; i < max; i++) { 418 for_each_set_bit(idx, c->idxmsk, x86_pmu.num_counters) {
280 /* 419 if (new == -1 || hwc->idx == idx)
281 * keep track of first free slot 420 /* assign free slot, prefer hwc->idx */
282 */ 421 old = cmpxchg(nb->owners + idx, NULL, event);
283 if (k == -1 && !nb->owners[i]) 422 else if (nb->owners[idx] == event)
284 k = i; 423 /* event already present */
424 old = event;
425 else
426 continue;
427
428 if (old && old != event)
429 continue;
430
431 /* reassign to this slot */
432 if (new != -1)
433 cmpxchg(nb->owners + new, event, NULL);
434 new = idx;
285 435
286 /* already present, reuse */ 436 /* already present, reuse */
287 if (nb->owners[i] == event) 437 if (old == event)
288 goto done;
289 }
290 /*
291 * not present, so grab a new slot
292 * starting either at:
293 */
294 if (hwc->idx != -1) {
295 /* previous assignment */
296 i = hwc->idx;
297 } else if (k != -1) {
298 /* start from free slot found */
299 i = k;
300 } else {
301 /*
302 * event not found, no slot found in
303 * first pass, try again from the
304 * beginning
305 */
306 i = 0;
307 }
308 j = i;
309 do {
310 old = cmpxchg(nb->owners+i, NULL, event);
311 if (!old)
312 break; 438 break;
313 if (++i == max) 439 }
314 i = 0; 440
315 } while (i != j); 441 if (new == -1)
316done: 442 return &emptyconstraint;
317 if (!old) 443
318 return &nb->event_constraints[i]; 444 if (amd_is_perfctr_nb_event(hwc))
319 445 amd_nb_interrupt_hw_config(hwc);
320 return &emptyconstraint; 446
447 return &nb->event_constraints[new];
321} 448}
322 449
323static struct amd_nb *amd_alloc_nb(int cpu) 450static struct amd_nb *amd_alloc_nb(int cpu)
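
The loop above condenses the old two-pass search into a single pass: it claims an owner slot with cmpxchg(), prefers the counter the event used last time, gives back any provisional claim once a better slot turns up, and falls through to emptyconstraint when nothing is free. A stand-alone user-space sketch of the same claim pattern, with GCC's __sync_val_compare_and_swap() standing in for the kernel's cmpxchg() and an invented owners[] array:

#include <stdio.h>

#define NUM_SLOTS 4

static void *owners[NUM_SLOTS];

/* Claim a slot for 'event', preferring 'prev' if it is still free. */
static int claim_slot(void *event, int prev)
{
        int idx, new = -1;
        void *old;

        for (idx = 0; idx < NUM_SLOTS; idx++) {
                if (new == -1 || idx == prev)
                        old = __sync_val_compare_and_swap(&owners[idx],
                                                          NULL, event);
                else if (owners[idx] == event)
                        old = event;            /* already present */
                else
                        continue;

                if (old && old != event)
                        continue;               /* owned by someone else */

                if (new != -1)                  /* release the earlier claim */
                        __sync_val_compare_and_swap(&owners[new], event, NULL);
                new = idx;

                if (old == event)               /* reused an existing slot */
                        break;
        }
        return new;                             /* -1 means "no slot" */
}

int main(void)
{
        int ev1, ev2;

        printf("ev1 -> slot %d\n", claim_slot(&ev1, -1));
        printf("ev2 -> slot %d\n", claim_slot(&ev2, -1));
        return 0;
}

In the kernel the same path additionally reprograms NB interrupt routing via amd_nb_interrupt_hw_config() before handing back the per-slot constraint.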
@@ -364,7 +491,7 @@ static void amd_pmu_cpu_starting(int cpu)
364 struct amd_nb *nb; 491 struct amd_nb *nb;
365 int i, nb_id; 492 int i, nb_id;
366 493
367 cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY; 494 cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
368 495
369 if (boot_cpu_data.x86_max_cores < 2) 496 if (boot_cpu_data.x86_max_cores < 2)
370 return; 497 return;
@@ -407,6 +534,26 @@ static void amd_pmu_cpu_dead(int cpu)
407 } 534 }
408} 535}
409 536
537static struct event_constraint *
538amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
539{
540 /*
541 * if not NB event or no NB, then no constraints
542 */
543 if (!(amd_has_nb(cpuc) && amd_is_nb_event(&event->hw)))
544 return &unconstrained;
545
546 return __amd_get_nb_event_constraints(cpuc, event,
547 amd_nb_event_constraint);
548}
549
550static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
551 struct perf_event *event)
552{
553 if (amd_has_nb(cpuc) && amd_is_nb_event(&event->hw))
554 __amd_put_nb_event_constraints(cpuc, event);
555}
556
410PMU_FORMAT_ATTR(event, "config:0-7,32-35"); 557PMU_FORMAT_ATTR(event, "config:0-7,32-35");
411PMU_FORMAT_ATTR(umask, "config:8-15" ); 558PMU_FORMAT_ATTR(umask, "config:8-15" );
412PMU_FORMAT_ATTR(edge, "config:18" ); 559PMU_FORMAT_ATTR(edge, "config:18" );
@@ -496,6 +643,9 @@ static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09,
496static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0); 643static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
497static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); 644static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
498 645
646static struct event_constraint amd_NBPMC96 = EVENT_CONSTRAINT(0, 0x3C0, 0);
647static struct event_constraint amd_NBPMC74 = EVENT_CONSTRAINT(0, 0xF0, 0);
648
499static struct event_constraint * 649static struct event_constraint *
500amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event) 650amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event)
501{ 651{
@@ -561,8 +711,8 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *ev
561 return &amd_f15_PMC20; 711 return &amd_f15_PMC20;
562 } 712 }
563 case AMD_EVENT_NB: 713 case AMD_EVENT_NB:
564 /* not yet implemented */ 714 return __amd_get_nb_event_constraints(cpuc, event,
565 return &emptyconstraint; 715 amd_nb_event_constraint);
566 default: 716 default:
567 return &emptyconstraint; 717 return &emptyconstraint;
568 } 718 }
@@ -587,6 +737,8 @@ static __initconst const struct x86_pmu amd_pmu = {
587 .schedule_events = x86_schedule_events, 737 .schedule_events = x86_schedule_events,
588 .eventsel = MSR_K7_EVNTSEL0, 738 .eventsel = MSR_K7_EVNTSEL0,
589 .perfctr = MSR_K7_PERFCTR0, 739 .perfctr = MSR_K7_PERFCTR0,
740 .addr_offset = amd_pmu_addr_offset,
741 .rdpmc_index = amd_pmu_rdpmc_index,
590 .event_map = amd_pmu_event_map, 742 .event_map = amd_pmu_event_map,
591 .max_events = ARRAY_SIZE(amd_perfmon_event_map), 743 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
592 .num_counters = AMD64_NUM_COUNTERS, 744 .num_counters = AMD64_NUM_COUNTERS,
@@ -608,7 +760,7 @@ static __initconst const struct x86_pmu amd_pmu = {
608 760
609static int setup_event_constraints(void) 761static int setup_event_constraints(void)
610{ 762{
611 if (boot_cpu_data.x86 >= 0x15) 763 if (boot_cpu_data.x86 == 0x15)
612 x86_pmu.get_event_constraints = amd_get_event_constraints_f15h; 764 x86_pmu.get_event_constraints = amd_get_event_constraints_f15h;
613 return 0; 765 return 0;
614} 766}
@@ -638,6 +790,23 @@ static int setup_perfctr_core(void)
638 return 0; 790 return 0;
639} 791}
640 792
793static int setup_perfctr_nb(void)
794{
795 if (!cpu_has_perfctr_nb)
796 return -ENODEV;
797
798 x86_pmu.num_counters += AMD64_NUM_COUNTERS_NB;
799
800 if (cpu_has_perfctr_core)
801 amd_nb_event_constraint = &amd_NBPMC96;
802 else
803 amd_nb_event_constraint = &amd_NBPMC74;
804
805 printk(KERN_INFO "perf: AMD northbridge performance counters detected\n");
806
807 return 0;
808}
809
641__init int amd_pmu_init(void) 810__init int amd_pmu_init(void)
642{ 811{
643 /* Performance-monitoring supported from K7 and later: */ 812 /* Performance-monitoring supported from K7 and later: */
@@ -648,6 +817,7 @@ __init int amd_pmu_init(void)
648 817
649 setup_event_constraints(); 818 setup_event_constraints();
650 setup_perfctr_core(); 819 setup_perfctr_core();
820 setup_perfctr_nb();
651 821
652 /* Events are common for all AMDs */ 822 /* Events are common for all AMDs */
653 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, 823 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
@@ -678,7 +848,7 @@ void amd_pmu_disable_virt(void)
678 * SVM is disabled the Guest-only bits still gets set and the counter 848 * SVM is disabled the Guest-only bits still gets set and the counter
679 * will not count anything. 849 * will not count anything.
680 */ 850 */
681 cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY; 851 cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
682 852
683 /* Reload all events */ 853 /* Reload all events */
684 x86_pmu_disable_all(); 854 x86_pmu_disable_all();
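
A rough worked example of the new offset math (not from the patch itself; it assumes a family 15h part with both the core and NB counter extensions, so the bases are the MSR_F15H_* ones): amd_pmu_addr_offset() strides the core counters by two MSRs and places the NB counters at MSR_F15H_NB_PERF_CTL/CTR, while amd_pmu_rdpmc_index() maps NB counter n to rdpmc index n + 6, matching the comments in the patch. A small user-space model:

#include <stdio.h>

#define EVNTSEL_BASE    0xc0010200      /* MSR_F15H_PERF_CTL */
#define PERFCTR_BASE    0xc0010201      /* MSR_F15H_PERF_CTR */
#define NB_EVNTSEL_BASE 0xc0010240      /* MSR_F15H_NB_PERF_CTL */
#define NB_PERFCTR_BASE 0xc0010241      /* MSR_F15H_NB_PERF_CTR */
#define NUM_CORE        6
#define NUM_NB          4

int main(void)
{
        int i;

        /* core counters: stride 2 from the core base, rdpmc index = i */
        for (i = 0; i < NUM_CORE; i++)
                printf("core %d: ctl %#x ctr %#x rdpmc %d\n", i,
                       EVNTSEL_BASE + (i << 1), PERFCTR_BASE + (i << 1), i);

        /* NB counters sit right above the core ones, rdpmc index = i + 6 */
        for (i = 0; i < NUM_NB; i++)
                printf("nb %d:   ctl %#x ctr %#x rdpmc %d\n", i,
                       NB_EVNTSEL_BASE + (i << 1), NB_PERFCTR_BASE + (i << 1),
                       i + 6);

        return 0;
}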
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index 6336bcbd0618..5f0581e713c2 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -528,7 +528,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
528 if (!test_bit(IBS_STARTED, pcpu->state)) { 528 if (!test_bit(IBS_STARTED, pcpu->state)) {
529 /* 529 /*
530 * Catch spurious interrupts after stopping IBS: After 530 * Catch spurious interrupts after stopping IBS: After
531 * disabling IBS there could be still incomming NMIs 531 * disabling IBS there could be still incoming NMIs
532 * with samples that even have the valid bit cleared. 532 * with samples that even have the valid bit cleared.
533 * Mark all this NMIs as handled. 533 * Mark all this NMIs as handled.
534 */ 534 */
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 93b9e1181f83..529c8931fc02 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -107,6 +107,27 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
107 EVENT_CONSTRAINT_END 107 EVENT_CONSTRAINT_END
108}; 108};
109 109
110static struct event_constraint intel_ivb_event_constraints[] __read_mostly =
111{
112 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
113 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
114 FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
115 INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */
116 INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMPTY */
117 INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */
118 INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
119 INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */
120 INTEL_UEVENT_CONSTRAINT(0x06a3, 0xf), /* CYCLE_ACTIVITY.STALLS_LDM_PENDING */
121 INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
122 INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
123 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
124 INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
125 INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
126 INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
127 INTEL_EVENT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
128 EVENT_CONSTRAINT_END
129};
130
110static struct extra_reg intel_westmere_extra_regs[] __read_mostly = 131static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
111{ 132{
112 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), 133 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
@@ -2019,7 +2040,10 @@ __init int intel_pmu_init(void)
2019 break; 2040 break;
2020 2041
2021 case 28: /* Atom */ 2042 case 28: /* Atom */
2022 case 54: /* Cedariew */ 2043 case 38: /* Lincroft */
2044 case 39: /* Penwell */
2045 case 53: /* Cloverview */
2046 case 54: /* Cedarview */
2023 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, 2047 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
2024 sizeof(hw_cache_event_ids)); 2048 sizeof(hw_cache_event_ids));
2025 2049
@@ -2084,6 +2108,7 @@ __init int intel_pmu_init(void)
2084 pr_cont("SandyBridge events, "); 2108 pr_cont("SandyBridge events, ");
2085 break; 2109 break;
2086 case 58: /* IvyBridge */ 2110 case 58: /* IvyBridge */
2111 case 62: /* IvyBridge EP */
2087 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, 2112 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
2088 sizeof(hw_cache_event_ids)); 2113 sizeof(hw_cache_event_ids));
2089 memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, 2114 memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
@@ -2091,7 +2116,7 @@ __init int intel_pmu_init(void)
2091 2116
2092 intel_pmu_lbr_init_snb(); 2117 intel_pmu_lbr_init_snb();
2093 2118
2094 x86_pmu.event_constraints = intel_snb_event_constraints; 2119 x86_pmu.event_constraints = intel_ivb_event_constraints;
2095 x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints; 2120 x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints;
2096 x86_pmu.pebs_aliases = intel_pebs_aliases_snb; 2121 x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
2097 x86_pmu.extra_regs = intel_snb_extra_regs; 2122 x86_pmu.extra_regs = intel_snb_extra_regs;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 3cf3d97cce3a..b43200dbfe7e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -2500,7 +2500,7 @@ static bool pcidrv_registered;
2500/* 2500/*
2501 * add a pci uncore device 2501 * add a pci uncore device
2502 */ 2502 */
2503static int __devinit uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev) 2503static int uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev)
2504{ 2504{
2505 struct intel_uncore_pmu *pmu; 2505 struct intel_uncore_pmu *pmu;
2506 struct intel_uncore_box *box; 2506 struct intel_uncore_box *box;
@@ -2571,8 +2571,8 @@ static void uncore_pci_remove(struct pci_dev *pdev)
2571 kfree(box); 2571 kfree(box);
2572} 2572}
2573 2573
2574static int __devinit uncore_pci_probe(struct pci_dev *pdev, 2574static int uncore_pci_probe(struct pci_dev *pdev,
2575 const struct pci_device_id *id) 2575 const struct pci_device_id *id)
2576{ 2576{
2577 struct intel_uncore_type *type; 2577 struct intel_uncore_type *type;
2578 2578
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index f2af39f5dc3d..4820c232a0b9 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -19,7 +19,7 @@ static const u64 p6_perfmon_event_map[] =
19 19
20}; 20};
21 21
22static __initconst u64 p6_hw_cache_event_ids 22static u64 p6_hw_cache_event_ids
23 [PERF_COUNT_HW_CACHE_MAX] 23 [PERF_COUNT_HW_CACHE_MAX]
24 [PERF_COUNT_HW_CACHE_OP_MAX] 24 [PERF_COUNT_HW_CACHE_OP_MAX]
25 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 25 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index fbd895562292..e280253f6f94 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -26,14 +26,8 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
26#ifdef CONFIG_X86_32 26#ifdef CONFIG_X86_32
27static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) 27static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
28{ 28{
29 /*
30 * We use exception 16 if we have hardware math and we've either seen
31 * it or the CPU claims it is internal
32 */
33 int fpu_exception = c->hard_math && (ignore_fpu_irq || cpu_has_fpu);
34 seq_printf(m, 29 seq_printf(m,
35 "fdiv_bug\t: %s\n" 30 "fdiv_bug\t: %s\n"
36 "hlt_bug\t\t: %s\n"
37 "f00f_bug\t: %s\n" 31 "f00f_bug\t: %s\n"
38 "coma_bug\t: %s\n" 32 "coma_bug\t: %s\n"
39 "fpu\t\t: %s\n" 33 "fpu\t\t: %s\n"
@@ -41,11 +35,10 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
41 "cpuid level\t: %d\n" 35 "cpuid level\t: %d\n"
42 "wp\t\t: %s\n", 36 "wp\t\t: %s\n",
43 c->fdiv_bug ? "yes" : "no", 37 c->fdiv_bug ? "yes" : "no",
44 c->hlt_works_ok ? "no" : "yes",
45 c->f00f_bug ? "yes" : "no", 38 c->f00f_bug ? "yes" : "no",
46 c->coma_bug ? "yes" : "no", 39 c->coma_bug ? "yes" : "no",
47 c->hard_math ? "yes" : "no", 40 c->hard_math ? "yes" : "no",
48 fpu_exception ? "yes" : "no", 41 c->hard_math ? "yes" : "no",
49 c->cpuid_level, 42 c->cpuid_level,
50 c->wp_works_ok ? "yes" : "no"); 43 c->wp_works_ok ? "yes" : "no");
51} 44}
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index d22d0c4edcfd..03a36321ec54 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -33,6 +33,9 @@
33 33
34#define VMWARE_PORT_CMD_GETVERSION 10 34#define VMWARE_PORT_CMD_GETVERSION 10
35#define VMWARE_PORT_CMD_GETHZ 45 35#define VMWARE_PORT_CMD_GETHZ 45
36#define VMWARE_PORT_CMD_GETVCPU_INFO 68
37#define VMWARE_PORT_CMD_LEGACY_X2APIC 3
38#define VMWARE_PORT_CMD_VCPU_RESERVED 31
36 39
37#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \ 40#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \
38 __asm__("inl (%%dx)" : \ 41 __asm__("inl (%%dx)" : \
@@ -125,10 +128,20 @@ static void __cpuinit vmware_set_cpu_features(struct cpuinfo_x86 *c)
125 set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); 128 set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
126} 129}
127 130
131/* Checks if hypervisor supports x2apic without VT-D interrupt remapping. */
132static bool __init vmware_legacy_x2apic_available(void)
133{
134 uint32_t eax, ebx, ecx, edx;
135 VMWARE_PORT(GETVCPU_INFO, eax, ebx, ecx, edx);
136 return (eax & (1 << VMWARE_PORT_CMD_VCPU_RESERVED)) == 0 &&
137 (eax & (1 << VMWARE_PORT_CMD_LEGACY_X2APIC)) != 0;
138}
139
128const __refconst struct hypervisor_x86 x86_hyper_vmware = { 140const __refconst struct hypervisor_x86 x86_hyper_vmware = {
129 .name = "VMware", 141 .name = "VMware",
130 .detect = vmware_platform, 142 .detect = vmware_platform,
131 .set_cpu_features = vmware_set_cpu_features, 143 .set_cpu_features = vmware_set_cpu_features,
132 .init_platform = vmware_platform_setup, 144 .init_platform = vmware_platform_setup,
145 .x2apic_available = vmware_legacy_x2apic_available,
133}; 146};
134EXPORT_SYMBOL(x86_hyper_vmware); 147EXPORT_SYMBOL(x86_hyper_vmware);
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 60c78917190c..1e4dbcfe6d31 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -85,7 +85,7 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
85{ 85{
86 char __user *tmp = buf; 86 char __user *tmp = buf;
87 struct cpuid_regs cmd; 87 struct cpuid_regs cmd;
88 int cpu = iminor(file->f_path.dentry->d_inode); 88 int cpu = iminor(file_inode(file));
89 u64 pos = *ppos; 89 u64 pos = *ppos;
90 ssize_t bytes = 0; 90 ssize_t bytes = 0;
91 int err = 0; 91 int err = 0;
@@ -116,7 +116,7 @@ static int cpuid_open(struct inode *inode, struct file *file)
116 unsigned int cpu; 116 unsigned int cpu;
117 struct cpuinfo_x86 *c; 117 struct cpuinfo_x86 *c;
118 118
119 cpu = iminor(file->f_path.dentry->d_inode); 119 cpu = iminor(file_inode(file));
120 if (cpu >= nr_cpu_ids || !cpu_online(cpu)) 120 if (cpu >= nr_cpu_ids || !cpu_online(cpu))
121 return -ENXIO; /* No such CPU */ 121 return -ENXIO; /* No such CPU */
122 122
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index ae42418bc50f..c8797d55b245 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -232,7 +232,7 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
232 232
233 bust_spinlocks(0); 233 bust_spinlocks(0);
234 die_owner = -1; 234 die_owner = -1;
235 add_taint(TAINT_DIE); 235 add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
236 die_nest_count--; 236 die_nest_count--;
237 if (!die_nest_count) 237 if (!die_nest_count)
238 /* Nest count reaches zero, release the lock. */ 238 /* Nest count reaches zero, release the lock. */
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index df06ade26bef..d32abeabbda5 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -835,7 +835,7 @@ static int __init parse_memopt(char *p)
835} 835}
836early_param("mem", parse_memopt); 836early_param("mem", parse_memopt);
837 837
838static int __init parse_memmap_opt(char *p) 838static int __init parse_memmap_one(char *p)
839{ 839{
840 char *oldp; 840 char *oldp;
841 u64 start_at, mem_size; 841 u64 start_at, mem_size;
@@ -877,6 +877,20 @@ static int __init parse_memmap_opt(char *p)
877 877
878 return *p == '\0' ? 0 : -EINVAL; 878 return *p == '\0' ? 0 : -EINVAL;
879} 879}
880static int __init parse_memmap_opt(char *str)
881{
882 while (str) {
883 char *k = strchr(str, ',');
884
885 if (k)
886 *k++ = 0;
887
888 parse_memmap_one(str);
889 str = k;
890 }
891
892 return 0;
893}
880early_param("memmap", parse_memmap_opt); 894early_param("memmap", parse_memmap_opt);
881 895
882void __init finish_e820_parsing(void) 896void __init finish_e820_parsing(void)
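
With the wrapper above, a single memmap= option can now carry several comma-separated regions, each handed to parse_memmap_one() in turn; before this change every region needed its own memmap= parameter. A hypothetical command line using the long-documented nn[KMG]$ss "reserve this range" form (the addresses are made up for the example):

    memmap=64K$0x18690000,64K$0x18790000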
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c763116c5359..8f3e2dec1df3 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -699,52 +699,6 @@ END(syscall_badsys)
699 */ 699 */
700 .popsection 700 .popsection
701 701
702/*
703 * System calls that need a pt_regs pointer.
704 */
705#define PTREGSCALL0(name) \
706ENTRY(ptregs_##name) ; \
707 leal 4(%esp),%eax; \
708 jmp sys_##name; \
709ENDPROC(ptregs_##name)
710
711#define PTREGSCALL1(name) \
712ENTRY(ptregs_##name) ; \
713 leal 4(%esp),%edx; \
714 movl (PT_EBX+4)(%esp),%eax; \
715 jmp sys_##name; \
716ENDPROC(ptregs_##name)
717
718#define PTREGSCALL2(name) \
719ENTRY(ptregs_##name) ; \
720 leal 4(%esp),%ecx; \
721 movl (PT_ECX+4)(%esp),%edx; \
722 movl (PT_EBX+4)(%esp),%eax; \
723 jmp sys_##name; \
724ENDPROC(ptregs_##name)
725
726#define PTREGSCALL3(name) \
727ENTRY(ptregs_##name) ; \
728 CFI_STARTPROC; \
729 leal 4(%esp),%eax; \
730 pushl_cfi %eax; \
731 movl PT_EDX(%eax),%ecx; \
732 movl PT_ECX(%eax),%edx; \
733 movl PT_EBX(%eax),%eax; \
734 call sys_##name; \
735 addl $4,%esp; \
736 CFI_ADJUST_CFA_OFFSET -4; \
737 ret; \
738 CFI_ENDPROC; \
739ENDPROC(ptregs_##name)
740
741PTREGSCALL1(iopl)
742PTREGSCALL2(sigaltstack)
743PTREGSCALL0(sigreturn)
744PTREGSCALL0(rt_sigreturn)
745PTREGSCALL2(vm86)
746PTREGSCALL1(vm86old)
747
748.macro FIXUP_ESPFIX_STACK 702.macro FIXUP_ESPFIX_STACK
749/* 703/*
750 * Switch back for ESPFIX stack to the normal zerobased stack 704 * Switch back for ESPFIX stack to the normal zerobased stack
@@ -1066,7 +1020,6 @@ ENTRY(xen_failsafe_callback)
1066 lea 16(%esp),%esp 1020 lea 16(%esp),%esp
1067 CFI_ADJUST_CFA_OFFSET -16 1021 CFI_ADJUST_CFA_OFFSET -16
1068 jz 5f 1022 jz 5f
1069 addl $16,%esp
1070 jmp iret_exc 1023 jmp iret_exc
10715: pushl_cfi $-1 /* orig_ax = -1 => not a system call */ 10245: pushl_cfi $-1 /* orig_ax = -1 => not a system call */
1072 SAVE_ALL 1025 SAVE_ALL
@@ -1093,11 +1046,18 @@ ENTRY(xen_failsafe_callback)
1093 _ASM_EXTABLE(4b,9b) 1046 _ASM_EXTABLE(4b,9b)
1094ENDPROC(xen_failsafe_callback) 1047ENDPROC(xen_failsafe_callback)
1095 1048
1096BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK, 1049BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
1097 xen_evtchn_do_upcall) 1050 xen_evtchn_do_upcall)
1098 1051
1099#endif /* CONFIG_XEN */ 1052#endif /* CONFIG_XEN */
1100 1053
1054#if IS_ENABLED(CONFIG_HYPERV)
1055
1056BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
1057 hyperv_vector_handler)
1058
1059#endif /* CONFIG_HYPERV */
1060
1101#ifdef CONFIG_FUNCTION_TRACER 1061#ifdef CONFIG_FUNCTION_TRACER
1102#ifdef CONFIG_DYNAMIC_FTRACE 1062#ifdef CONFIG_DYNAMIC_FTRACE
1103 1063
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 70641aff0c25..c1d01e6ca790 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -828,23 +828,6 @@ int_restore_rest:
828 CFI_ENDPROC 828 CFI_ENDPROC
829END(system_call) 829END(system_call)
830 830
831/*
832 * Certain special system calls that need to save a complete full stack frame.
833 */
834 .macro PTREGSCALL label,func,arg
835ENTRY(\label)
836 PARTIAL_FRAME 1 8 /* offset 8: return address */
837 subq $REST_SKIP, %rsp
838 CFI_ADJUST_CFA_OFFSET REST_SKIP
839 call save_rest
840 DEFAULT_FRAME 0 8 /* offset 8: return address */
841 leaq 8(%rsp), \arg /* pt_regs pointer */
842 call \func
843 jmp ptregscall_common
844 CFI_ENDPROC
845END(\label)
846 .endm
847
848 .macro FORK_LIKE func 831 .macro FORK_LIKE func
849ENTRY(stub_\func) 832ENTRY(stub_\func)
850 CFI_STARTPROC 833 CFI_STARTPROC
@@ -861,11 +844,22 @@ ENTRY(stub_\func)
861END(stub_\func) 844END(stub_\func)
862 .endm 845 .endm
863 846
847 .macro FIXED_FRAME label,func
848ENTRY(\label)
849 CFI_STARTPROC
850 PARTIAL_FRAME 0 8 /* offset 8: return address */
851 FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET
852 call \func
853 RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET
854 ret
855 CFI_ENDPROC
856END(\label)
857 .endm
858
864 FORK_LIKE clone 859 FORK_LIKE clone
865 FORK_LIKE fork 860 FORK_LIKE fork
866 FORK_LIKE vfork 861 FORK_LIKE vfork
867 PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx 862 FIXED_FRAME stub_iopl, sys_iopl
868 PTREGSCALL stub_iopl, sys_iopl, %rsi
869 863
870ENTRY(ptregscall_common) 864ENTRY(ptregscall_common)
871 DEFAULT_FRAME 1 8 /* offset 8: return address */ 865 DEFAULT_FRAME 1 8 /* offset 8: return address */
@@ -887,7 +881,6 @@ ENTRY(stub_execve)
887 SAVE_REST 881 SAVE_REST
888 FIXUP_TOP_OF_STACK %r11 882 FIXUP_TOP_OF_STACK %r11
889 call sys_execve 883 call sys_execve
890 RESTORE_TOP_OF_STACK %r11
891 movq %rax,RAX(%rsp) 884 movq %rax,RAX(%rsp)
892 RESTORE_REST 885 RESTORE_REST
893 jmp int_ret_from_sys_call 886 jmp int_ret_from_sys_call
@@ -903,7 +896,6 @@ ENTRY(stub_rt_sigreturn)
903 addq $8, %rsp 896 addq $8, %rsp
904 PARTIAL_FRAME 0 897 PARTIAL_FRAME 0
905 SAVE_REST 898 SAVE_REST
906 movq %rsp,%rdi
907 FIXUP_TOP_OF_STACK %r11 899 FIXUP_TOP_OF_STACK %r11
908 call sys_rt_sigreturn 900 call sys_rt_sigreturn
909 movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer 901 movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
@@ -913,14 +905,11 @@ ENTRY(stub_rt_sigreturn)
913END(stub_rt_sigreturn) 905END(stub_rt_sigreturn)
914 906
915#ifdef CONFIG_X86_X32_ABI 907#ifdef CONFIG_X86_X32_ABI
916 PTREGSCALL stub_x32_sigaltstack, sys32_sigaltstack, %rdx
917
918ENTRY(stub_x32_rt_sigreturn) 908ENTRY(stub_x32_rt_sigreturn)
919 CFI_STARTPROC 909 CFI_STARTPROC
920 addq $8, %rsp 910 addq $8, %rsp
921 PARTIAL_FRAME 0 911 PARTIAL_FRAME 0
922 SAVE_REST 912 SAVE_REST
923 movq %rsp,%rdi
924 FIXUP_TOP_OF_STACK %r11 913 FIXUP_TOP_OF_STACK %r11
925 call sys32_x32_rt_sigreturn 914 call sys32_x32_rt_sigreturn
926 movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer 915 movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
@@ -1457,11 +1446,16 @@ ENTRY(xen_failsafe_callback)
1457 CFI_ENDPROC 1446 CFI_ENDPROC
1458END(xen_failsafe_callback) 1447END(xen_failsafe_callback)
1459 1448
1460apicinterrupt XEN_HVM_EVTCHN_CALLBACK \ 1449apicinterrupt HYPERVISOR_CALLBACK_VECTOR \
1461 xen_hvm_callback_vector xen_evtchn_do_upcall 1450 xen_hvm_callback_vector xen_evtchn_do_upcall
1462 1451
1463#endif /* CONFIG_XEN */ 1452#endif /* CONFIG_XEN */
1464 1453
1454#if IS_ENABLED(CONFIG_HYPERV)
1455apicinterrupt HYPERVISOR_CALLBACK_VECTOR \
1456 hyperv_callback_vector hyperv_vector_handler
1457#endif /* CONFIG_HYPERV */
1458
1465/* 1459/*
1466 * Some functions should be protected against kprobes 1460 * Some functions should be protected against kprobes
1467 */ 1461 */
@@ -1784,6 +1778,7 @@ first_nmi:
1784 * Leave room for the "copied" frame 1778 * Leave room for the "copied" frame
1785 */ 1779 */
1786 subq $(5*8), %rsp 1780 subq $(5*8), %rsp
1781 CFI_ADJUST_CFA_OFFSET 5*8
1787 1782
1788 /* Copy the stack frame to the Saved frame */ 1783 /* Copy the stack frame to the Saved frame */
1789 .rept 5 1784 .rept 5
@@ -1866,10 +1861,8 @@ end_repeat_nmi:
1866nmi_swapgs: 1861nmi_swapgs:
1867 SWAPGS_UNSAFE_STACK 1862 SWAPGS_UNSAFE_STACK
1868nmi_restore: 1863nmi_restore:
1869 RESTORE_ALL 8 1864 /* Pop the extra iret frame at once */
1870 1865 RESTORE_ALL 6*8
1871 /* Pop the extra iret frame */
1872 addq $(5*8), %rsp
1873 1866
1874 /* Clear the NMI executing stack variable */ 1867 /* Clear the NMI executing stack variable */
1875 movq $0, 5*8(%rsp) 1868 movq $0, 5*8(%rsp)
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 1d414029f1d8..42a392a9fd02 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -89,7 +89,7 @@ do_ftrace_mod_code(unsigned long ip, const void *new_code)
89 * kernel identity mapping to modify code. 89 * kernel identity mapping to modify code.
90 */ 90 */
91 if (within(ip, (unsigned long)_text, (unsigned long)_etext)) 91 if (within(ip, (unsigned long)_text, (unsigned long)_etext))
92 ip = (unsigned long)__va(__pa(ip)); 92 ip = (unsigned long)__va(__pa_symbol(ip));
93 93
94 return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE); 94 return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE);
95} 95}
@@ -279,7 +279,7 @@ static int ftrace_write(unsigned long ip, const char *val, int size)
279 * kernel identity mapping to modify code. 279 * kernel identity mapping to modify code.
280 */ 280 */
281 if (within(ip, (unsigned long)_text, (unsigned long)_etext)) 281 if (within(ip, (unsigned long)_text, (unsigned long)_etext))
282 ip = (unsigned long)__va(__pa(ip)); 282 ip = (unsigned long)__va(__pa_symbol(ip));
283 283
284 return probe_kernel_write((void *)ip, val, size); 284 return probe_kernel_write((void *)ip, val, size);
285} 285}
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
index 48d9d4ea1020..992f442ca155 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/head.c
@@ -5,8 +5,6 @@
5#include <asm/setup.h> 5#include <asm/setup.h>
6#include <asm/bios_ebda.h> 6#include <asm/bios_ebda.h>
7 7
8#define BIOS_LOWMEM_KILOBYTES 0x413
9
10/* 8/*
11 * The BIOS places the EBDA/XBDA at the top of conventional 9 * The BIOS places the EBDA/XBDA at the top of conventional
12 * memory, and usually decreases the reported amount of 10 * memory, and usually decreases the reported amount of
@@ -16,17 +14,30 @@
16 * chipset: reserve a page before VGA to prevent PCI prefetch 14 * chipset: reserve a page before VGA to prevent PCI prefetch
17 * into it (errata #56). Usually the page is reserved anyways, 15 * into it (errata #56). Usually the page is reserved anyways,
18 * unless you have no PS/2 mouse plugged in. 16 * unless you have no PS/2 mouse plugged in.
17 *
18 * This function is deliberately very conservative. Losing
19 * memory in the bottom megabyte is rarely a problem, as long
20 * as we have enough memory to install the trampoline. Using
21 * memory that is in use by the BIOS or by some DMA device
22 * the BIOS didn't shut down *is* a big problem.
19 */ 23 */
24
25#define BIOS_LOWMEM_KILOBYTES 0x413
26#define LOWMEM_CAP 0x9f000U /* Absolute maximum */
27#define INSANE_CUTOFF 0x20000U /* Less than this = insane */
28
20void __init reserve_ebda_region(void) 29void __init reserve_ebda_region(void)
21{ 30{
22 unsigned int lowmem, ebda_addr; 31 unsigned int lowmem, ebda_addr;
23 32
24 /* To determine the position of the EBDA and the */ 33 /*
25 /* end of conventional memory, we need to look at */ 34 * To determine the position of the EBDA and the
26 /* the BIOS data area. In a paravirtual environment */ 35 * end of conventional memory, we need to look at
27 /* that area is absent. We'll just have to assume */ 36 * the BIOS data area. In a paravirtual environment
28 /* that the paravirt case can handle memory setup */ 37 * that area is absent. We'll just have to assume
29 /* correctly, without our help. */ 38 * that the paravirt case can handle memory setup
39 * correctly, without our help.
40 */
30 if (paravirt_enabled()) 41 if (paravirt_enabled())
31 return; 42 return;
32 43
@@ -37,19 +48,23 @@ void __init reserve_ebda_region(void)
37 /* start of EBDA area */ 48 /* start of EBDA area */
38 ebda_addr = get_bios_ebda(); 49 ebda_addr = get_bios_ebda();
39 50
40 /* Fixup: bios puts an EBDA in the top 64K segment */ 51 /*
41 /* of conventional memory, but does not adjust lowmem. */ 52 * Note: some old Dells seem to need 4k EBDA without
42 if ((lowmem - ebda_addr) <= 0x10000) 53 * reporting so, so just consider the memory above 0x9f000
43 lowmem = ebda_addr; 54 * to be off limits (bugzilla 2990).
55 */
56
57 /* If the EBDA address is below 128K, assume it is bogus */
58 if (ebda_addr < INSANE_CUTOFF)
59 ebda_addr = LOWMEM_CAP;
44 60
45 /* Fixup: bios does not report an EBDA at all. */ 61 /* If lowmem is less than 128K, assume it is bogus */
46 /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */ 62 if (lowmem < INSANE_CUTOFF)
47 if ((ebda_addr == 0) && (lowmem >= 0x9f000)) 63 lowmem = LOWMEM_CAP;
48 lowmem = 0x9f000;
49 64
50 /* Paranoia: should never happen, but... */ 65 /* Use the lower of the lowmem and EBDA markers as the cutoff */
51 if ((lowmem == 0) || (lowmem >= 0x100000)) 66 lowmem = min(lowmem, ebda_addr);
52 lowmem = 0x9f000; 67 lowmem = min(lowmem, LOWMEM_CAP); /* Absolute cap */
53 68
54 /* reserve all memory between lowmem and the 1MB mark */ 69 /* reserve all memory between lowmem and the 1MB mark */
55 memblock_reserve(lowmem, 0x100000 - lowmem); 70 memblock_reserve(lowmem, 0x100000 - lowmem);
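
To make the new clamping concrete, here is a small user-space model with example inputs only: a sane BIOS reporting 639 KiB of conventional memory and an EBDA at 0x9fc00 ends up reserving everything from the 0x9f000 cap up to 1 MiB, and a bogus EBDA pointer of zero falls back to that same cap instead of being trusted.

#include <stdio.h>

#define LOWMEM_CAP      0x9f000U
#define INSANE_CUTOFF   0x20000U

static unsigned int clamp_lowmem(unsigned int lowmem, unsigned int ebda_addr)
{
        if (ebda_addr < INSANE_CUTOFF)
                ebda_addr = LOWMEM_CAP;
        if (lowmem < INSANE_CUTOFF)
                lowmem = LOWMEM_CAP;

        lowmem = lowmem < ebda_addr ? lowmem : ebda_addr;
        return lowmem < LOWMEM_CAP ? lowmem : LOWMEM_CAP;
}

int main(void)
{
        /* 639 KiB from the BDA word at 0x413, EBDA reported at 0x9fc00 */
        printf("sane BIOS:  reserve from %#x\n", clamp_lowmem(639 << 10, 0x9fc00));
        /* a zero EBDA pointer is treated as bogus */
        printf("bogus EBDA: reserve from %#x\n", clamp_lowmem(639 << 10, 0));
        return 0;
}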
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index c18f59d10101..138463a24877 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -18,6 +18,7 @@
18#include <asm/io_apic.h> 18#include <asm/io_apic.h>
19#include <asm/bios_ebda.h> 19#include <asm/bios_ebda.h>
20#include <asm/tlbflush.h> 20#include <asm/tlbflush.h>
21#include <asm/bootparam_utils.h>
21 22
22static void __init i386_default_early_setup(void) 23static void __init i386_default_early_setup(void)
23{ 24{
@@ -30,19 +31,7 @@ static void __init i386_default_early_setup(void)
30 31
31void __init i386_start_kernel(void) 32void __init i386_start_kernel(void)
32{ 33{
33 memblock_reserve(__pa_symbol(&_text), 34 sanitize_boot_params(&boot_params);
34 __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
35
36#ifdef CONFIG_BLK_DEV_INITRD
37 /* Reserve INITRD */
38 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
39 /* Assume only end is not page aligned */
40 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
41 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
42 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
43 memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
44 }
45#endif
46 35
47 /* Call the subarch specific early setup function */ 36 /* Call the subarch specific early setup function */
48 switch (boot_params.hdr.hardware_subarch) { 37 switch (boot_params.hdr.hardware_subarch) {
@@ -57,11 +46,5 @@ void __init i386_start_kernel(void)
57 break; 46 break;
58 } 47 }
59 48
60 /*
61 * At this point everything still needed from the boot loader
62 * or BIOS or kernel text should be early reserved or marked not
63 * RAM in e820. All other memory is free game.
64 */
65
66 start_kernel(); 49 start_kernel();
67} 50}
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 037df57a99ac..c5e403f6d869 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -25,12 +25,84 @@
25#include <asm/kdebug.h> 25#include <asm/kdebug.h>
26#include <asm/e820.h> 26#include <asm/e820.h>
27#include <asm/bios_ebda.h> 27#include <asm/bios_ebda.h>
28#include <asm/bootparam_utils.h>
29#include <asm/microcode.h>
28 30
29static void __init zap_identity_mappings(void) 31/*
32 * Manage page tables very early on.
33 */
34extern pgd_t early_level4_pgt[PTRS_PER_PGD];
35extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
36static unsigned int __initdata next_early_pgt = 2;
37
38/* Wipe all early page tables except for the kernel symbol map */
39static void __init reset_early_page_tables(void)
30{ 40{
31 pgd_t *pgd = pgd_offset_k(0UL); 41 unsigned long i;
32 pgd_clear(pgd); 42
33 __flush_tlb_all(); 43 for (i = 0; i < PTRS_PER_PGD-1; i++)
44 early_level4_pgt[i].pgd = 0;
45
46 next_early_pgt = 0;
47
48 write_cr3(__pa(early_level4_pgt));
49}
50
51/* Create a new PMD entry */
52int __init early_make_pgtable(unsigned long address)
53{
54 unsigned long physaddr = address - __PAGE_OFFSET;
55 unsigned long i;
56 pgdval_t pgd, *pgd_p;
57 pudval_t pud, *pud_p;
58 pmdval_t pmd, *pmd_p;
59
60 /* Invalid address or early pgt is done ? */
61 if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt))
62 return -1;
63
64again:
65 pgd_p = &early_level4_pgt[pgd_index(address)].pgd;
66 pgd = *pgd_p;
67
68 /*
69 * The use of __START_KERNEL_map rather than __PAGE_OFFSET here is
70 * critical -- __PAGE_OFFSET would point us back into the dynamic
71 * range and we might end up looping forever...
72 */
73 if (pgd)
74 pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
75 else {
76 if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
77 reset_early_page_tables();
78 goto again;
79 }
80
81 pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
82 for (i = 0; i < PTRS_PER_PUD; i++)
83 pud_p[i] = 0;
84 *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
85 }
86 pud_p += pud_index(address);
87 pud = *pud_p;
88
89 if (pud)
90 pmd_p = (pmdval_t *)((pud & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
91 else {
92 if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
93 reset_early_page_tables();
94 goto again;
95 }
96
97 pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
98 for (i = 0; i < PTRS_PER_PMD; i++)
99 pmd_p[i] = 0;
100 *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
101 }
102 pmd = (physaddr & PMD_MASK) + (__PAGE_KERNEL_LARGE & ~_PAGE_GLOBAL);
103 pmd_p[pmd_index(address)] = pmd;
104
105 return 0;
34} 106}
35 107
36/* Don't add a printk in there. printk relies on the PDA which is not initialized 108/* Don't add a printk in there. printk relies on the PDA which is not initialized
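
In effect, every early page fault on an unmapped direct-map address while early_level4_pgt is live installs one 2 MiB large-page mapping. For instance (addresses invented for illustration), a fault at __PAGE_OFFSET + 0x12345678 gives physaddr = 0x12345678 and writes a PMD entry covering physical 0x12200000-0x123fffff, so the whole surrounding 2 MiB window becomes reachable at once; and if the small pool of early_dynamic_pgts pages is exhausted, reset_early_page_tables() wipes everything but the kernel mapping and the walk simply starts over.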
@@ -41,13 +113,25 @@ static void __init clear_bss(void)
41 (unsigned long) __bss_stop - (unsigned long) __bss_start); 113 (unsigned long) __bss_stop - (unsigned long) __bss_start);
42} 114}
43 115
116static unsigned long get_cmd_line_ptr(void)
117{
118 unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
119
120 cmd_line_ptr |= (u64)boot_params.ext_cmd_line_ptr << 32;
121
122 return cmd_line_ptr;
123}
124
44static void __init copy_bootdata(char *real_mode_data) 125static void __init copy_bootdata(char *real_mode_data)
45{ 126{
46 char * command_line; 127 char * command_line;
128 unsigned long cmd_line_ptr;
47 129
48 memcpy(&boot_params, real_mode_data, sizeof boot_params); 130 memcpy(&boot_params, real_mode_data, sizeof boot_params);
49 if (boot_params.hdr.cmd_line_ptr) { 131 sanitize_boot_params(&boot_params);
50 command_line = __va(boot_params.hdr.cmd_line_ptr); 132 cmd_line_ptr = get_cmd_line_ptr();
133 if (cmd_line_ptr) {
134 command_line = __va(cmd_line_ptr);
51 memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); 135 memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
52 } 136 }
53} 137}
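
The two-part pointer assembly above is what lets a boot loader place the command line above 4 GiB: for example (values invented), hdr.cmd_line_ptr = 0x0009d000 combined with ext_cmd_line_ptr = 0x1 makes get_cmd_line_ptr() return 0x10009d000, which copy_bootdata() then maps with __va() as before.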
@@ -70,54 +154,40 @@ void __init x86_64_start_kernel(char * real_mode_data)
70 (__START_KERNEL & PGDIR_MASK))); 154 (__START_KERNEL & PGDIR_MASK)));
71 BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); 155 BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
72 156
157 /* Kill off the identity-map trampoline */
158 reset_early_page_tables();
159
73 /* clear bss before set_intr_gate with early_idt_handler */ 160 /* clear bss before set_intr_gate with early_idt_handler */
74 clear_bss(); 161 clear_bss();
75 162
76 /* Make NULL pointers segfault */ 163 for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
77 zap_identity_mappings();
78
79 max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
80
81 for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
82#ifdef CONFIG_EARLY_PRINTK
83 set_intr_gate(i, &early_idt_handlers[i]); 164 set_intr_gate(i, &early_idt_handlers[i]);
84#else
85 set_intr_gate(i, early_idt_handler);
86#endif
87 }
88 load_idt((const struct desc_ptr *)&idt_descr); 165 load_idt((const struct desc_ptr *)&idt_descr);
89 166
167 copy_bootdata(__va(real_mode_data));
168
169 /*
170 * Load microcode early on BSP.
171 */
172 load_ucode_bsp();
173
90 if (console_loglevel == 10) 174 if (console_loglevel == 10)
91 early_printk("Kernel alive\n"); 175 early_printk("Kernel alive\n");
92 176
177 clear_page(init_level4_pgt);
178 /* set init_level4_pgt kernel high mapping*/
179 init_level4_pgt[511] = early_level4_pgt[511];
180
93 x86_64_start_reservations(real_mode_data); 181 x86_64_start_reservations(real_mode_data);
94} 182}
95 183
96void __init x86_64_start_reservations(char *real_mode_data) 184void __init x86_64_start_reservations(char *real_mode_data)
97{ 185{
98 copy_bootdata(__va(real_mode_data)); 186 /* version is always not zero if it is copied */
99 187 if (!boot_params.hdr.version)
100 memblock_reserve(__pa_symbol(&_text), 188 copy_bootdata(__va(real_mode_data));
101 __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
102
103#ifdef CONFIG_BLK_DEV_INITRD
104 /* Reserve INITRD */
105 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
106 /* Assume only end is not page aligned */
107 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
108 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
109 unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
110 memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
111 }
112#endif
113 189
114 reserve_ebda_region(); 190 reserve_ebda_region();
115 191
116 /*
117 * At this point everything still needed from the boot loader
118 * or BIOS or kernel text should be early reserved or marked not
119 * RAM in e820. All other memory is free game.
120 */
121
122 start_kernel(); 192 start_kernel();
123} 193}
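
The early_make_pgtable() path added above resolves a faulting address into PGD/PUD/PMD slots and installs a 2 MiB mapping on demand, so only EARLY_DYNAMIC_PAGE_TABLES spare pages are needed instead of a prebuilt ident map. A minimal user-space C sketch of the index arithmetic it depends on; the shift values are the standard x86-64 4-level constants, written out here only for illustration (the kernel uses pgd_index()/pud_index()/pmd_index()), and the address is a made-up placeholder.

    #include <stdint.h>
    #include <stdio.h>

    #define PMD_SHIFT   21               /* one PMD entry maps 2 MiB      */
    #define PUD_SHIFT   30
    #define PGDIR_SHIFT 39
    #define PTRS_MASK   0x1ff            /* 512 entries per table level   */

    int main(void)
    {
        uint64_t addr = 0xffff880123456789ull;   /* hypothetical faulting address */

        unsigned pgd = (addr >> PGDIR_SHIFT) & PTRS_MASK;
        unsigned pud = (addr >> PUD_SHIFT)   & PTRS_MASK;
        unsigned pmd = (addr >> PMD_SHIFT)   & PTRS_MASK;

        /* The installed 2 MiB mapping covers the fault rounded down to a
         * PMD boundary, as in the pmd = (physaddr & PMD_MASK) line above. */
        uint64_t pmd_base = addr & ~((1ull << PMD_SHIFT) - 1);

        printf("pgd=%u pud=%u pmd=%u 2MiB base=%#llx\n",
               pgd, pud, pmd, (unsigned long long)pmd_base);
        return 0;
    }
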
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 8e7f6556028f..73afd11799ca 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -144,6 +144,11 @@ ENTRY(startup_32)
144 movl %eax, pa(olpc_ofw_pgd) 144 movl %eax, pa(olpc_ofw_pgd)
145#endif 145#endif
146 146
147#ifdef CONFIG_MICROCODE_EARLY
148 /* Early load ucode on BSP. */
149 call load_ucode_bsp
150#endif
151
147/* 152/*
148 * Initialize page tables. This creates a PDE and a set of page 153 * Initialize page tables. This creates a PDE and a set of page
149 * tables, which are located immediately beyond __brk_base. The variable 154 * tables, which are located immediately beyond __brk_base. The variable
@@ -299,38 +304,59 @@ ENTRY(startup_32_smp)
299 movl %eax,%ss 304 movl %eax,%ss
300 leal -__PAGE_OFFSET(%ecx),%esp 305 leal -__PAGE_OFFSET(%ecx),%esp
301 306
307#ifdef CONFIG_MICROCODE_EARLY
308 /* Early load ucode on AP. */
309 call load_ucode_ap
310#endif
311
312
302default_entry: 313default_entry:
314#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
315 X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
316 X86_CR0_PG)
317 movl $(CR0_STATE & ~X86_CR0_PG),%eax
318 movl %eax,%cr0
319
303/* 320/*
304 * New page tables may be in 4Mbyte page mode and may 321 * We want to start out with EFLAGS unambiguously cleared. Some BIOSes leave
305 * be using the global pages. 322 * bits like NT set. This would confuse the debugger if this code is traced. So
323 * initialize them properly now before switching to protected mode. That means
324 * DF in particular (even though we have cleared it earlier after copying the
325 * command line) because GCC expects it.
326 */
327 pushl $0
328 popfl
329
330/*
331 * New page tables may be in 4Mbyte page mode and may be using the global pages.
306 * 332 *
307 * NOTE! If we are on a 486 we may have no cr4 at all! 333 * NOTE! If we are on a 486 we may have no cr4 at all! Specifically, cr4 exists
308 * Specifically, cr4 exists if and only if CPUID exists 334 * if and only if CPUID exists and has flags other than the FPU flag set.
309 * and has flags other than the FPU flag set.
310 */ 335 */
336 movl $-1,pa(X86_CPUID) # preset CPUID level
311 movl $X86_EFLAGS_ID,%ecx 337 movl $X86_EFLAGS_ID,%ecx
312 pushl %ecx 338 pushl %ecx
313 popfl 339 popfl # set EFLAGS=ID
314 pushfl
315 popl %eax
316 pushl $0
317 popfl
318 pushfl 340 pushfl
319 popl %edx 341 popl %eax # get EFLAGS
320 xorl %edx,%eax 342 testl $X86_EFLAGS_ID,%eax # did EFLAGS.ID remained set?
321 testl %ecx,%eax 343 jz enable_paging # hw disallowed setting of ID bit
322 jz 6f # No ID flag = no CPUID = no CR4 344 # which means no CPUID and no CR4
345
346 xorl %eax,%eax
347 cpuid
348 movl %eax,pa(X86_CPUID) # save largest std CPUID function
323 349
324 movl $1,%eax 350 movl $1,%eax
325 cpuid 351 cpuid
326 andl $~1,%edx # Ignore CPUID.FPU 352 andl $~1,%edx # Ignore CPUID.FPU
327 jz 6f # No flags or only CPUID.FPU = no CR4 353 jz enable_paging # No flags or only CPUID.FPU = no CR4
328 354
329 movl pa(mmu_cr4_features),%eax 355 movl pa(mmu_cr4_features),%eax
330 movl %eax,%cr4 356 movl %eax,%cr4
331 357
332 testb $X86_CR4_PAE, %al # check if PAE is enabled 358 testb $X86_CR4_PAE, %al # check if PAE is enabled
333 jz 6f 359 jz enable_paging
334 360
335 /* Check if extended functions are implemented */ 361 /* Check if extended functions are implemented */
336 movl $0x80000000, %eax 362 movl $0x80000000, %eax
@@ -338,7 +364,7 @@ default_entry:
338 /* Value must be in the range 0x80000001 to 0x8000ffff */ 364 /* Value must be in the range 0x80000001 to 0x8000ffff */
339 subl $0x80000001, %eax 365 subl $0x80000001, %eax
340 cmpl $(0x8000ffff-0x80000001), %eax 366 cmpl $(0x8000ffff-0x80000001), %eax
341 ja 6f 367 ja enable_paging
342 368
343 /* Clear bogus XD_DISABLE bits */ 369 /* Clear bogus XD_DISABLE bits */
344 call verify_cpu 370 call verify_cpu
@@ -347,7 +373,7 @@ default_entry:
347 cpuid 373 cpuid
348 /* Execute Disable bit supported? */ 374 /* Execute Disable bit supported? */
349 btl $(X86_FEATURE_NX & 31), %edx 375 btl $(X86_FEATURE_NX & 31), %edx
350 jnc 6f 376 jnc enable_paging
351 377
352 /* Setup EFER (Extended Feature Enable Register) */ 378 /* Setup EFER (Extended Feature Enable Register) */
353 movl $MSR_EFER, %ecx 379 movl $MSR_EFER, %ecx
@@ -357,15 +383,14 @@ default_entry:
357 /* Make changes effective */ 383 /* Make changes effective */
358 wrmsr 384 wrmsr
359 385
3606: 386enable_paging:
361 387
362/* 388/*
363 * Enable paging 389 * Enable paging
364 */ 390 */
365 movl $pa(initial_page_table), %eax 391 movl $pa(initial_page_table), %eax
366 movl %eax,%cr3 /* set the page table pointer.. */ 392 movl %eax,%cr3 /* set the page table pointer.. */
367 movl %cr0,%eax 393 movl $CR0_STATE,%eax
368 orl $X86_CR0_PG,%eax
369 movl %eax,%cr0 /* ..and set paging (PG) bit */ 394 movl %eax,%cr0 /* ..and set paging (PG) bit */
370 ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */ 395 ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */
3711: 3961:
@@ -373,14 +398,6 @@ default_entry:
373 addl $__PAGE_OFFSET, %esp 398 addl $__PAGE_OFFSET, %esp
374 399
375/* 400/*
376 * Initialize eflags. Some BIOS's leave bits like NT set. This would
377 * confuse the debugger if this code is traced.
378 * XXX - best to initialize before switching to protected mode.
379 */
380 pushl $0
381 popfl
382
383/*
384 * start system 32-bit setup. We need to re-do some of the things done 401 * start system 32-bit setup. We need to re-do some of the things done
385 * in 16-bit mode for the "real" operations. 402 * in 16-bit mode for the "real" operations.
386 */ 403 */
@@ -389,31 +406,11 @@ default_entry:
389 jz 1f # Did we do this already? 406 jz 1f # Did we do this already?
390 call *%eax 407 call *%eax
3911: 4081:
392 409
393/* check if it is 486 or 386. */
394/* 410/*
395 * XXX - this does a lot of unnecessary setup. Alignment checks don't 411 * Check if it is 486
396 * apply at our cpl of 0 and the stack ought to be aligned already, and
397 * we don't need to preserve eflags.
398 */ 412 */
399 movl $-1,X86_CPUID # -1 for no CPUID initially 413 cmpl $-1,X86_CPUID
400 movb $3,X86 # at least 386
401 pushfl # push EFLAGS
402 popl %eax # get EFLAGS
403 movl %eax,%ecx # save original EFLAGS
404 xorl $0x240000,%eax # flip AC and ID bits in EFLAGS
405 pushl %eax # copy to EFLAGS
406 popfl # set EFLAGS
407 pushfl # get new EFLAGS
408 popl %eax # put it in eax
409 xorl %ecx,%eax # change in flags
410 pushl %ecx # restore original EFLAGS
411 popfl
412 testl $0x40000,%eax # check if AC bit changed
413 je is386
414
415 movb $4,X86 # at least 486
416 testl $0x200000,%eax # check if ID bit changed
417 je is486 414 je is486
418 415
419 /* get vendor info */ 416 /* get vendor info */
@@ -439,11 +436,10 @@ default_entry:
439 movb %cl,X86_MASK 436 movb %cl,X86_MASK
440 movl %edx,X86_CAPABILITY 437 movl %edx,X86_CAPABILITY
441 438
442is486: movl $0x50022,%ecx # set AM, WP, NE and MP 439is486:
443 jmp 2f 440 movb $4,X86
444 441 movl $0x50022,%ecx # set AM, WP, NE and MP
445is386: movl $2,%ecx # set MP 442 movl %cr0,%eax
4462: movl %cr0,%eax
447 andl $0x80000011,%eax # Save PG,PE,ET 443 andl $0x80000011,%eax # Save PG,PE,ET
448 orl %ecx,%eax 444 orl %ecx,%eax
449 movl %eax,%cr0 445 movl %eax,%cr0
@@ -468,7 +464,6 @@ is386: movl $2,%ecx # set MP
468 xorl %eax,%eax # Clear LDT 464 xorl %eax,%eax # Clear LDT
469 lldt %ax 465 lldt %ax
470 466
471 cld # gcc2 wants the direction flag cleared at all times
472 pushl $0 # fake return address for unwinder 467 pushl $0 # fake return address for unwinder
473 jmp *(initial_code) 468 jmp *(initial_code)
474 469
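
The rewritten default_entry path above decides whether CPUID (and therefore CR4) exists by testing whether software can toggle EFLAGS.ID. A hedged user-space C sketch of the same probe, similar in spirit to the kernel's 32-bit flag_is_changeable_p() helper; bit 21 (0x00200000) is EFLAGS.ID, and the function name is only illustrative. Note that push/pop inside inline asm can clash with the red zone in ordinary user code; the kernel builds with -mno-red-zone.

    /* Non-zero if EFLAGS.ID (bit 21) can be flipped, i.e. CPUID exists. */
    static int cpuid_supported(void)
    {
        unsigned long before, after;

        asm volatile("pushf\n\t"
                     "pop %0\n\t"          /* before = EFLAGS        */
                     "mov %0, %1\n\t"
                     "xor %2, %1\n\t"      /* flip the ID bit        */
                     "push %1\n\t"
                     "popf\n\t"            /* try to write it back   */
                     "pushf\n\t"
                     "pop %1"              /* after = EFLAGS         */
                     : "=&r" (before), "=&r" (after)
                     : "ri" (1ul << 21));

        return (before ^ after) & (1ul << 21);
    }
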
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 980053c4b9cc..6859e9626442 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -47,14 +47,13 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map)
47 .code64 47 .code64
48 .globl startup_64 48 .globl startup_64
49startup_64: 49startup_64:
50
51 /* 50 /*
52 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, 51 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
53 * and someone has loaded an identity mapped page table 52 * and someone has loaded an identity mapped page table
54 * for us. These identity mapped page tables map all of the 53 * for us. These identity mapped page tables map all of the
55 * kernel pages and possibly all of memory. 54 * kernel pages and possibly all of memory.
56 * 55 *
57 * %esi holds a physical pointer to real_mode_data. 56 * %rsi holds a physical pointer to real_mode_data.
58 * 57 *
59 * We come here either directly from a 64bit bootloader, or from 58 * We come here either directly from a 64bit bootloader, or from
60 * arch/x86_64/boot/compressed/head.S. 59 * arch/x86_64/boot/compressed/head.S.
@@ -66,7 +65,8 @@ startup_64:
66 * tables and then reload them. 65 * tables and then reload them.
67 */ 66 */
68 67
69 /* Compute the delta between the address I am compiled to run at and the 68 /*
69 * Compute the delta between the address I am compiled to run at and the
70 * address I am actually running at. 70 * address I am actually running at.
71 */ 71 */
72 leaq _text(%rip), %rbp 72 leaq _text(%rip), %rbp
@@ -78,45 +78,62 @@ startup_64:
78 testl %eax, %eax 78 testl %eax, %eax
79 jnz bad_address 79 jnz bad_address
80 80
81 /* Is the address too large? */ 81 /*
82 leaq _text(%rip), %rdx 82 * Is the address too large?
83 movq $PGDIR_SIZE, %rax
84 cmpq %rax, %rdx
85 jae bad_address
86
87 /* Fixup the physical addresses in the page table
88 */ 83 */
89 addq %rbp, init_level4_pgt + 0(%rip) 84 leaq _text(%rip), %rax
90 addq %rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip) 85 shrq $MAX_PHYSMEM_BITS, %rax
91 addq %rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip) 86 jnz bad_address
92 87
93 addq %rbp, level3_ident_pgt + 0(%rip) 88 /*
89 * Fixup the physical addresses in the page table
90 */
91 addq %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip)
94 92
95 addq %rbp, level3_kernel_pgt + (510*8)(%rip) 93 addq %rbp, level3_kernel_pgt + (510*8)(%rip)
96 addq %rbp, level3_kernel_pgt + (511*8)(%rip) 94 addq %rbp, level3_kernel_pgt + (511*8)(%rip)
97 95
98 addq %rbp, level2_fixmap_pgt + (506*8)(%rip) 96 addq %rbp, level2_fixmap_pgt + (506*8)(%rip)
99 97
100 /* Add an Identity mapping if I am above 1G */ 98 /*
99 * Set up the identity mapping for the switchover. These
100 * entries should *NOT* have the global bit set! This also
101 * creates a bunch of nonsense entries but that is fine --
102 * it avoids problems around wraparound.
103 */
101 leaq _text(%rip), %rdi 104 leaq _text(%rip), %rdi
102 andq $PMD_PAGE_MASK, %rdi 105 leaq early_level4_pgt(%rip), %rbx
103 106
104 movq %rdi, %rax 107 movq %rdi, %rax
105 shrq $PUD_SHIFT, %rax 108 shrq $PGDIR_SHIFT, %rax
106 andq $(PTRS_PER_PUD - 1), %rax
107 jz ident_complete
108 109
109 leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx 110 leaq (4096 + _KERNPG_TABLE)(%rbx), %rdx
110 leaq level3_ident_pgt(%rip), %rbx 111 movq %rdx, 0(%rbx,%rax,8)
111 movq %rdx, 0(%rbx, %rax, 8) 112 movq %rdx, 8(%rbx,%rax,8)
112 113
114 addq $4096, %rdx
113 movq %rdi, %rax 115 movq %rdi, %rax
114 shrq $PMD_SHIFT, %rax 116 shrq $PUD_SHIFT, %rax
115 andq $(PTRS_PER_PMD - 1), %rax 117 andl $(PTRS_PER_PUD-1), %eax
116 leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx 118 movq %rdx, (4096+0)(%rbx,%rax,8)
117 leaq level2_spare_pgt(%rip), %rbx 119 movq %rdx, (4096+8)(%rbx,%rax,8)
118 movq %rdx, 0(%rbx, %rax, 8) 120
119ident_complete: 121 addq $8192, %rbx
122 movq %rdi, %rax
123 shrq $PMD_SHIFT, %rdi
124 addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax
125 leaq (_end - 1)(%rip), %rcx
126 shrq $PMD_SHIFT, %rcx
127 subq %rdi, %rcx
128 incl %ecx
129
1301:
131 andq $(PTRS_PER_PMD - 1), %rdi
132 movq %rax, (%rbx,%rdi,8)
133 incq %rdi
134 addq $PMD_SIZE, %rax
135 decl %ecx
136 jnz 1b
120 137
121 /* 138 /*
122 * Fixup the kernel text+data virtual addresses. Note that 139 * Fixup the kernel text+data virtual addresses. Note that
@@ -124,7 +141,6 @@ ident_complete:
124 * cleanup_highmap() fixes this up along with the mappings 141 * cleanup_highmap() fixes this up along with the mappings
125 * beyond _end. 142 * beyond _end.
126 */ 143 */
127
128 leaq level2_kernel_pgt(%rip), %rdi 144 leaq level2_kernel_pgt(%rip), %rdi
129 leaq 4096(%rdi), %r8 145 leaq 4096(%rdi), %r8
130 /* See if it is a valid page table entry */ 146 /* See if it is a valid page table entry */
@@ -139,17 +155,14 @@ ident_complete:
139 /* Fixup phys_base */ 155 /* Fixup phys_base */
140 addq %rbp, phys_base(%rip) 156 addq %rbp, phys_base(%rip)
141 157
142 /* Due to ENTRY(), sometimes the empty space gets filled with 158 movq $(early_level4_pgt - __START_KERNEL_map), %rax
143 * zeros. Better take a jmp than relying on empty space being 159 jmp 1f
144 * filled with 0x90 (nop)
145 */
146 jmp secondary_startup_64
147ENTRY(secondary_startup_64) 160ENTRY(secondary_startup_64)
148 /* 161 /*
149 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, 162 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
150 * and someone has loaded a mapped page table. 163 * and someone has loaded a mapped page table.
151 * 164 *
152 * %esi holds a physical pointer to real_mode_data. 165 * %rsi holds a physical pointer to real_mode_data.
153 * 166 *
154 * We come here either from startup_64 (using physical addresses) 167 * We come here either from startup_64 (using physical addresses)
155 * or from trampoline.S (using virtual addresses). 168 * or from trampoline.S (using virtual addresses).
@@ -159,12 +172,14 @@ ENTRY(secondary_startup_64)
159 * after the boot processor executes this code. 172 * after the boot processor executes this code.
160 */ 173 */
161 174
175 movq $(init_level4_pgt - __START_KERNEL_map), %rax
1761:
177
162 /* Enable PAE mode and PGE */ 178 /* Enable PAE mode and PGE */
163 movl $(X86_CR4_PAE | X86_CR4_PGE), %eax 179 movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
164 movq %rax, %cr4 180 movq %rcx, %cr4
165 181
166 /* Setup early boot stage 4 level pagetables. */ 182 /* Setup early boot stage 4 level pagetables. */
167 movq $(init_level4_pgt - __START_KERNEL_map), %rax
168 addq phys_base(%rip), %rax 183 addq phys_base(%rip), %rax
169 movq %rax, %cr3 184 movq %rax, %cr3
170 185
@@ -196,7 +211,7 @@ ENTRY(secondary_startup_64)
196 movq %rax, %cr0 211 movq %rax, %cr0
197 212
198 /* Setup a boot time stack */ 213 /* Setup a boot time stack */
199 movq stack_start(%rip),%rsp 214 movq stack_start(%rip), %rsp
200 215
201 /* zero EFLAGS after setting rsp */ 216 /* zero EFLAGS after setting rsp */
202 pushq $0 217 pushq $0
@@ -236,15 +251,33 @@ ENTRY(secondary_startup_64)
236 movl initial_gs+4(%rip),%edx 251 movl initial_gs+4(%rip),%edx
237 wrmsr 252 wrmsr
238 253
239 /* esi is pointer to real mode structure with interesting info. 254 /* rsi is pointer to real mode structure with interesting info.
240 pass it to C */ 255 pass it to C */
241 movl %esi, %edi 256 movq %rsi, %rdi
242 257
243 /* Finally jump to run C code and to be on real kernel address 258 /* Finally jump to run C code and to be on real kernel address
244 * Since we are running on identity-mapped space we have to jump 259 * Since we are running on identity-mapped space we have to jump
245 * to the full 64bit address, this is only possible as indirect 260 * to the full 64bit address, this is only possible as indirect
246 * jump. In addition we need to ensure %cs is set so we make this 261 * jump. In addition we need to ensure %cs is set so we make this
247 * a far return. 262 * a far return.
263 *
264 * Note: do not change to far jump indirect with 64bit offset.
265 *
266 * AMD does not support far jump indirect with 64bit offset.
267 * AMD64 Architecture Programmer's Manual, Volume 3: states only
268 * JMP FAR mem16:16 FF /5 Far jump indirect,
269 * with the target specified by a far pointer in memory.
270 * JMP FAR mem16:32 FF /5 Far jump indirect,
271 * with the target specified by a far pointer in memory.
272 *
273 * Intel64 does support 64bit offset.
274 * Software Developer Manual Vol 2: states:
275 * FF /5 JMP m16:16 Jump far, absolute indirect,
276 * address given in m16:16
277 * FF /5 JMP m16:32 Jump far, absolute indirect,
278 * address given in m16:32.
279 * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect,
280 * address given in m16:64.
248 */ 281 */
249 movq initial_code(%rip),%rax 282 movq initial_code(%rip),%rax
250 pushq $0 # fake return address to stop unwinder 283 pushq $0 # fake return address to stop unwinder
@@ -270,13 +303,13 @@ ENDPROC(start_cpu0)
270 303
271 /* SMP bootup changes these two */ 304 /* SMP bootup changes these two */
272 __REFDATA 305 __REFDATA
273 .align 8 306 .balign 8
274 ENTRY(initial_code) 307 GLOBAL(initial_code)
275 .quad x86_64_start_kernel 308 .quad x86_64_start_kernel
276 ENTRY(initial_gs) 309 GLOBAL(initial_gs)
277 .quad INIT_PER_CPU_VAR(irq_stack_union) 310 .quad INIT_PER_CPU_VAR(irq_stack_union)
278 311
279 ENTRY(stack_start) 312 GLOBAL(stack_start)
280 .quad init_thread_union+THREAD_SIZE-8 313 .quad init_thread_union+THREAD_SIZE-8
281 .word 0 314 .word 0
282 __FINITDATA 315 __FINITDATA
@@ -284,7 +317,7 @@ ENDPROC(start_cpu0)
284bad_address: 317bad_address:
285 jmp bad_address 318 jmp bad_address
286 319
287 .section ".init.text","ax" 320 __INIT
288 .globl early_idt_handlers 321 .globl early_idt_handlers
289early_idt_handlers: 322early_idt_handlers:
290 # 104(%rsp) %rflags 323 # 104(%rsp) %rflags
@@ -303,6 +336,7 @@ early_idt_handlers:
303 i = i + 1 336 i = i + 1
304 .endr 337 .endr
305 338
339/* This is global to keep gas from relaxing the jumps */
306ENTRY(early_idt_handler) 340ENTRY(early_idt_handler)
307 cld 341 cld
308 342
@@ -321,14 +355,22 @@ ENTRY(early_idt_handler)
321 pushq %r11 # 0(%rsp) 355 pushq %r11 # 0(%rsp)
322 356
323 cmpl $__KERNEL_CS,96(%rsp) 357 cmpl $__KERNEL_CS,96(%rsp)
324 jne 10f 358 jne 11f
359
360 cmpl $14,72(%rsp) # Page fault?
361 jnz 10f
362 GET_CR2_INTO(%rdi) # can clobber any volatile register if pv
363 call early_make_pgtable
364 andl %eax,%eax
365 jz 20f # All good
325 366
36710:
326 leaq 88(%rsp),%rdi # Pointer to %rip 368 leaq 88(%rsp),%rdi # Pointer to %rip
327 call early_fixup_exception 369 call early_fixup_exception
328 andl %eax,%eax 370 andl %eax,%eax
329 jnz 20f # Found an exception entry 371 jnz 20f # Found an exception entry
330 372
33110: 37311:
332#ifdef CONFIG_EARLY_PRINTK 374#ifdef CONFIG_EARLY_PRINTK
333 GET_CR2_INTO(%r9) # can clobber any volatile register if pv 375 GET_CR2_INTO(%r9) # can clobber any volatile register if pv
334 movl 80(%rsp),%r8d # error code 376 movl 80(%rsp),%r8d # error code
@@ -350,7 +392,7 @@ ENTRY(early_idt_handler)
3501: hlt 3921: hlt
351 jmp 1b 393 jmp 1b
352 394
35320: # Exception table entry found 39520: # Exception table entry found or page table generated
354 popq %r11 396 popq %r11
355 popq %r10 397 popq %r10
356 popq %r9 398 popq %r9
@@ -363,6 +405,9 @@ ENTRY(early_idt_handler)
363 addq $16,%rsp # drop vector number and error code 405 addq $16,%rsp # drop vector number and error code
364 decl early_recursion_flag(%rip) 406 decl early_recursion_flag(%rip)
365 INTERRUPT_RETURN 407 INTERRUPT_RETURN
408ENDPROC(early_idt_handler)
409
410 __INITDATA
366 411
367 .balign 4 412 .balign 4
368early_recursion_flag: 413early_recursion_flag:
@@ -374,11 +419,10 @@ early_idt_msg:
374early_idt_ripmsg: 419early_idt_ripmsg:
375 .asciz "RIP %s\n" 420 .asciz "RIP %s\n"
376#endif /* CONFIG_EARLY_PRINTK */ 421#endif /* CONFIG_EARLY_PRINTK */
377 .previous
378 422
379#define NEXT_PAGE(name) \ 423#define NEXT_PAGE(name) \
380 .balign PAGE_SIZE; \ 424 .balign PAGE_SIZE; \
381ENTRY(name) 425GLOBAL(name)
382 426
383/* Automate the creation of 1 to 1 mapping pmd entries */ 427/* Automate the creation of 1 to 1 mapping pmd entries */
384#define PMDS(START, PERM, COUNT) \ 428#define PMDS(START, PERM, COUNT) \
@@ -388,24 +432,37 @@ ENTRY(name)
388 i = i + 1 ; \ 432 i = i + 1 ; \
389 .endr 433 .endr
390 434
435 __INITDATA
436NEXT_PAGE(early_level4_pgt)
437 .fill 511,8,0
438 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
439
440NEXT_PAGE(early_dynamic_pgts)
441 .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
442
391 .data 443 .data
392 /* 444
393 * This default setting generates an ident mapping at address 0x100000 445#ifndef CONFIG_XEN
394 * and a mapping for the kernel that precisely maps virtual address
395 * 0xffffffff80000000 to physical address 0x000000. (always using
396 * 2Mbyte large pages provided by PAE mode)
397 */
398NEXT_PAGE(init_level4_pgt) 446NEXT_PAGE(init_level4_pgt)
399 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 447 .fill 512,8,0
400 .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 448#else
401 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 449NEXT_PAGE(init_level4_pgt)
402 .org init_level4_pgt + L4_START_KERNEL*8, 0 450 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
451 .org init_level4_pgt + L4_PAGE_OFFSET*8, 0
452 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
453 .org init_level4_pgt + L4_START_KERNEL*8, 0
403 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ 454 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
404 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE 455 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
405 456
406NEXT_PAGE(level3_ident_pgt) 457NEXT_PAGE(level3_ident_pgt)
407 .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 458 .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
408 .fill 511,8,0 459 .fill 511, 8, 0
460NEXT_PAGE(level2_ident_pgt)
461 /* Since I easily can, map the first 1G.
462 * Don't set NX because code runs from these pages.
463 */
464 PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
465#endif
409 466
410NEXT_PAGE(level3_kernel_pgt) 467NEXT_PAGE(level3_kernel_pgt)
411 .fill L3_START_KERNEL,8,0 468 .fill L3_START_KERNEL,8,0
@@ -413,21 +470,6 @@ NEXT_PAGE(level3_kernel_pgt)
413 .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE 470 .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
414 .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE 471 .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
415 472
416NEXT_PAGE(level2_fixmap_pgt)
417 .fill 506,8,0
418 .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
419 /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
420 .fill 5,8,0
421
422NEXT_PAGE(level1_fixmap_pgt)
423 .fill 512,8,0
424
425NEXT_PAGE(level2_ident_pgt)
426 /* Since I easily can, map the first 1G.
427 * Don't set NX because code runs from these pages.
428 */
429 PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
430
431NEXT_PAGE(level2_kernel_pgt) 473NEXT_PAGE(level2_kernel_pgt)
432 /* 474 /*
433 * 512 MB kernel mapping. We spend a full page on this pagetable 475 * 512 MB kernel mapping. We spend a full page on this pagetable
@@ -442,11 +484,16 @@ NEXT_PAGE(level2_kernel_pgt)
442 PMDS(0, __PAGE_KERNEL_LARGE_EXEC, 484 PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
443 KERNEL_IMAGE_SIZE/PMD_SIZE) 485 KERNEL_IMAGE_SIZE/PMD_SIZE)
444 486
445NEXT_PAGE(level2_spare_pgt) 487NEXT_PAGE(level2_fixmap_pgt)
446 .fill 512, 8, 0 488 .fill 506,8,0
489 .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
490 /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
491 .fill 5,8,0
492
493NEXT_PAGE(level1_fixmap_pgt)
494 .fill 512,8,0
447 495
448#undef PMDS 496#undef PMDS
449#undef NEXT_PAGE
450 497
451 .data 498 .data
452 .align 16 499 .align 16
@@ -472,6 +519,5 @@ ENTRY(nmi_idt_table)
472 .skip IDT_ENTRIES * 16 519 .skip IDT_ENTRIES * 16
473 520
474 __PAGE_ALIGNED_BSS 521 __PAGE_ALIGNED_BSS
475 .align PAGE_SIZE 522NEXT_PAGE(empty_zero_page)
476ENTRY(empty_zero_page)
477 .skip PAGE_SIZE 523 .skip PAGE_SIZE
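
The new identity-mapping loop in startup_64 above fills one PMD entry per 2 MiB spanned by [_text, _end], using the spare pages that follow early_level4_pgt. A small C sketch of the entry count it computes (the shrq $PMD_SHIFT / subq / incl sequence); the load address and image size are made-up placeholders.

    #include <stdint.h>
    #include <stdio.h>

    #define PMD_SHIFT 21                    /* each PMD entry maps 2 MiB */

    int main(void)
    {
        uint64_t text_phys = 0x01000000;              /* hypothetical load address   */
        uint64_t end_phys  = text_phys + 0x01800000;  /* hypothetical _end (+24 MiB) */

        uint64_t first = text_phys >> PMD_SHIFT;
        uint64_t last  = (end_phys - 1) >> PMD_SHIFT;
        uint64_t count = last - first + 1;            /* loop bound, as in incl %ecx */

        printf("%llu PMD entries of 2 MiB each\n", (unsigned long long)count);
        return 0;
    }
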
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index e28670f9a589..da85a8e830a1 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -478,7 +478,7 @@ static int hpet_msi_next_event(unsigned long delta,
478 478
479static int hpet_setup_msi_irq(unsigned int irq) 479static int hpet_setup_msi_irq(unsigned int irq)
480{ 480{
481 if (arch_setup_hpet_msi(irq, hpet_blockid)) { 481 if (x86_msi.setup_hpet_msi(irq, hpet_blockid)) {
482 destroy_irq(irq); 482 destroy_irq(irq);
483 return -EINVAL; 483 return -EINVAL;
484 } 484 }
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index 9c3bd4a2050e..0fa69127209a 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -26,6 +26,7 @@ EXPORT_SYMBOL(csum_partial_copy_generic);
26EXPORT_SYMBOL(__get_user_1); 26EXPORT_SYMBOL(__get_user_1);
27EXPORT_SYMBOL(__get_user_2); 27EXPORT_SYMBOL(__get_user_2);
28EXPORT_SYMBOL(__get_user_4); 28EXPORT_SYMBOL(__get_user_4);
29EXPORT_SYMBOL(__get_user_8);
29 30
30EXPORT_SYMBOL(__put_user_1); 31EXPORT_SYMBOL(__put_user_1);
31EXPORT_SYMBOL(__put_user_2); 32EXPORT_SYMBOL(__put_user_2);
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 8c968974253d..4ddaf66ea35f 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -93,8 +93,9 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
93 * on system-call entry - see also fork() and the signal handling 93 * on system-call entry - see also fork() and the signal handling
94 * code. 94 * code.
95 */ 95 */
96long sys_iopl(unsigned int level, struct pt_regs *regs) 96SYSCALL_DEFINE1(iopl, unsigned int, level)
97{ 97{
98 struct pt_regs *regs = current_pt_regs();
98 unsigned int old = (regs->flags >> 12) & 3; 99 unsigned int old = (regs->flags >> 12) & 3;
99 struct thread_struct *t = &current->thread; 100 struct thread_struct *t = &current->thread;
100 101
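
The sys_iopl change above is purely about how the handler obtains its registers: SYSCALL_DEFINE1 plus current_pt_regs() replaces the explicit struct pt_regs * argument, and the user-visible interface stays the same. For reference, a minimal user-space caller of that interface via the glibc wrapper (needs root or CAP_SYS_RAWIO to succeed):

    #include <stdio.h>
    #include <sys/io.h>        /* glibc wrapper around the iopl syscall */

    int main(void)
    {
        /* Raise the I/O privilege level so port instructions are allowed. */
        if (iopl(3)) {
            perror("iopl");
            return 1;
        }
        printf("IOPL raised to 3\n");
        return 0;
    }
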
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 6e03b0d69138..7dc4e459c2b3 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -42,39 +42,6 @@
42 * (these are usually mapped into the 0x30-0xff vector range) 42 * (these are usually mapped into the 0x30-0xff vector range)
43 */ 43 */
44 44
45#ifdef CONFIG_X86_32
46/*
47 * Note that on a 486, we don't want to do a SIGFPE on an irq13
48 * as the irq is unreliable, and exception 16 works correctly
49 * (ie as explained in the intel literature). On a 386, you
50 * can't use exception 16 due to bad IBM design, so we have to
51 * rely on the less exact irq13.
52 *
53 * Careful.. Not only is IRQ13 unreliable, but it is also
54 * leads to races. IBM designers who came up with it should
55 * be shot.
56 */
57
58static irqreturn_t math_error_irq(int cpl, void *dev_id)
59{
60 outb(0, 0xF0);
61 if (ignore_fpu_irq || !boot_cpu_data.hard_math)
62 return IRQ_NONE;
63 math_error(get_irq_regs(), 0, X86_TRAP_MF);
64 return IRQ_HANDLED;
65}
66
67/*
68 * New motherboards sometimes make IRQ 13 be a PCI interrupt,
69 * so allow interrupt sharing.
70 */
71static struct irqaction fpu_irq = {
72 .handler = math_error_irq,
73 .name = "fpu",
74 .flags = IRQF_NO_THREAD,
75};
76#endif
77
78/* 45/*
79 * IRQ2 is cascade interrupt to second interrupt controller 46 * IRQ2 is cascade interrupt to second interrupt controller
80 */ 47 */
@@ -242,13 +209,6 @@ void __init native_init_IRQ(void)
242 setup_irq(2, &irq2); 209 setup_irq(2, &irq2);
243 210
244#ifdef CONFIG_X86_32 211#ifdef CONFIG_X86_32
245 /*
246 * External FPU? Set up irq13 if so, for
247 * original braindamaged IBM FERR coupling.
248 */
249 if (boot_cpu_data.hard_math && !cpu_has_fpu)
250 setup_irq(FPU_IRQ, &fpu_irq);
251
252 irq_ctx_init(smp_processor_id()); 212 irq_ctx_init(smp_processor_id());
253#endif 213#endif
254} 214}
diff --git a/arch/x86/kernel/kprobes/Makefile b/arch/x86/kernel/kprobes/Makefile
new file mode 100644
index 000000000000..0d33169cc1a2
--- /dev/null
+++ b/arch/x86/kernel/kprobes/Makefile
@@ -0,0 +1,7 @@
1#
2# Makefile for kernel probes
3#
4
5obj-$(CONFIG_KPROBES) += core.o
6obj-$(CONFIG_OPTPROBES) += opt.o
7obj-$(CONFIG_KPROBES_ON_FTRACE) += ftrace.o
diff --git a/arch/x86/kernel/kprobes-common.h b/arch/x86/kernel/kprobes/common.h
index 3230b68ef29a..2e9d4b5af036 100644
--- a/arch/x86/kernel/kprobes-common.h
+++ b/arch/x86/kernel/kprobes/common.h
@@ -99,4 +99,15 @@ static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsig
99 return addr; 99 return addr;
100} 100}
101#endif 101#endif
102
103#ifdef CONFIG_KPROBES_ON_FTRACE
104extern int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
105 struct kprobe_ctlblk *kcb);
106#else
107static inline int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
108 struct kprobe_ctlblk *kcb)
109{
110 return 0;
111}
112#endif
102#endif 113#endif
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes/core.c
index 57916c0d3cf6..3f06e6149981 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -58,7 +58,7 @@
58#include <asm/insn.h> 58#include <asm/insn.h>
59#include <asm/debugreg.h> 59#include <asm/debugreg.h>
60 60
61#include "kprobes-common.h" 61#include "common.h"
62 62
63void jprobe_return_end(void); 63void jprobe_return_end(void);
64 64
@@ -78,7 +78,7 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
78 * Groups, and some special opcodes can not boost. 78 * Groups, and some special opcodes can not boost.
79 * This is non-const and volatile to keep gcc from statically 79 * This is non-const and volatile to keep gcc from statically
80 * optimizing it out, as variable_test_bit makes gcc think only 80 * optimizing it out, as variable_test_bit makes gcc think only
81 * *(unsigned long*) is used. 81 * *(unsigned long*) is used.
82 */ 82 */
83static volatile u32 twobyte_is_boostable[256 / 32] = { 83static volatile u32 twobyte_is_boostable[256 / 32] = {
84 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 84 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
@@ -117,7 +117,7 @@ static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
117 struct __arch_relative_insn { 117 struct __arch_relative_insn {
118 u8 op; 118 u8 op;
119 s32 raddr; 119 s32 raddr;
120 } __attribute__((packed)) *insn; 120 } __packed *insn;
121 121
122 insn = (struct __arch_relative_insn *)from; 122 insn = (struct __arch_relative_insn *)from;
123 insn->raddr = (s32)((long)(to) - ((long)(from) + 5)); 123 insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
@@ -541,23 +541,6 @@ reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb
541 return 1; 541 return 1;
542} 542}
543 543
544#ifdef KPROBES_CAN_USE_FTRACE
545static void __kprobes skip_singlestep(struct kprobe *p, struct pt_regs *regs,
546 struct kprobe_ctlblk *kcb)
547{
548 /*
549 * Emulate singlestep (and also recover regs->ip)
550 * as if there is a 5byte nop
551 */
552 regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE;
553 if (unlikely(p->post_handler)) {
554 kcb->kprobe_status = KPROBE_HIT_SSDONE;
555 p->post_handler(p, regs, 0);
556 }
557 __this_cpu_write(current_kprobe, NULL);
558}
559#endif
560
561/* 544/*
562 * Interrupts are disabled on entry as trap3 is an interrupt gate and they 545 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
563 * remain disabled throughout this function. 546 * remain disabled throughout this function.
@@ -616,13 +599,8 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
616 } else if (kprobe_running()) { 599 } else if (kprobe_running()) {
617 p = __this_cpu_read(current_kprobe); 600 p = __this_cpu_read(current_kprobe);
618 if (p->break_handler && p->break_handler(p, regs)) { 601 if (p->break_handler && p->break_handler(p, regs)) {
619#ifdef KPROBES_CAN_USE_FTRACE 602 if (!skip_singlestep(p, regs, kcb))
620 if (kprobe_ftrace(p)) { 603 setup_singlestep(p, regs, kcb, 0);
621 skip_singlestep(p, regs, kcb);
622 return 1;
623 }
624#endif
625 setup_singlestep(p, regs, kcb, 0);
626 return 1; 604 return 1;
627 } 605 }
628 } /* else: not a kprobe fault; let the kernel handle it */ 606 } /* else: not a kprobe fault; let the kernel handle it */
@@ -674,7 +652,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
674{ 652{
675 struct kretprobe_instance *ri = NULL; 653 struct kretprobe_instance *ri = NULL;
676 struct hlist_head *head, empty_rp; 654 struct hlist_head *head, empty_rp;
677 struct hlist_node *node, *tmp; 655 struct hlist_node *tmp;
678 unsigned long flags, orig_ret_address = 0; 656 unsigned long flags, orig_ret_address = 0;
679 unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; 657 unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
680 kprobe_opcode_t *correct_ret_addr = NULL; 658 kprobe_opcode_t *correct_ret_addr = NULL;
@@ -704,7 +682,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
704 * will be the real return address, and all the rest will 682 * will be the real return address, and all the rest will
705 * point to kretprobe_trampoline. 683 * point to kretprobe_trampoline.
706 */ 684 */
707 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { 685 hlist_for_each_entry_safe(ri, tmp, head, hlist) {
708 if (ri->task != current) 686 if (ri->task != current)
709 /* another task is sharing our hash bucket */ 687 /* another task is sharing our hash bucket */
710 continue; 688 continue;
@@ -723,7 +701,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
723 kretprobe_assert(ri, orig_ret_address, trampoline_address); 701 kretprobe_assert(ri, orig_ret_address, trampoline_address);
724 702
725 correct_ret_addr = ri->ret_addr; 703 correct_ret_addr = ri->ret_addr;
726 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { 704 hlist_for_each_entry_safe(ri, tmp, head, hlist) {
727 if (ri->task != current) 705 if (ri->task != current)
728 /* another task is sharing our hash bucket */ 706 /* another task is sharing our hash bucket */
729 continue; 707 continue;
@@ -750,7 +728,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
750 728
751 kretprobe_hash_unlock(current, &flags); 729 kretprobe_hash_unlock(current, &flags);
752 730
753 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { 731 hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
754 hlist_del(&ri->hlist); 732 hlist_del(&ri->hlist);
755 kfree(ri); 733 kfree(ri);
756 } 734 }
@@ -1075,50 +1053,6 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
1075 return 0; 1053 return 0;
1076} 1054}
1077 1055
1078#ifdef KPROBES_CAN_USE_FTRACE
1079/* Ftrace callback handler for kprobes */
1080void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
1081 struct ftrace_ops *ops, struct pt_regs *regs)
1082{
1083 struct kprobe *p;
1084 struct kprobe_ctlblk *kcb;
1085 unsigned long flags;
1086
1087 /* Disable irq for emulating a breakpoint and avoiding preempt */
1088 local_irq_save(flags);
1089
1090 p = get_kprobe((kprobe_opcode_t *)ip);
1091 if (unlikely(!p) || kprobe_disabled(p))
1092 goto end;
1093
1094 kcb = get_kprobe_ctlblk();
1095 if (kprobe_running()) {
1096 kprobes_inc_nmissed_count(p);
1097 } else {
1098 /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */
1099 regs->ip = ip + sizeof(kprobe_opcode_t);
1100
1101 __this_cpu_write(current_kprobe, p);
1102 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
1103 if (!p->pre_handler || !p->pre_handler(p, regs))
1104 skip_singlestep(p, regs, kcb);
1105 /*
1106 * If pre_handler returns !0, it sets regs->ip and
1107 * resets current kprobe.
1108 */
1109 }
1110end:
1111 local_irq_restore(flags);
1112}
1113
1114int __kprobes arch_prepare_kprobe_ftrace(struct kprobe *p)
1115{
1116 p->ainsn.insn = NULL;
1117 p->ainsn.boostable = -1;
1118 return 0;
1119}
1120#endif
1121
1122int __init arch_init_kprobes(void) 1056int __init arch_init_kprobes(void)
1123{ 1057{
1124 return arch_init_optprobes(); 1058 return arch_init_optprobes();
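
Part of the churn in trampoline_handler() above comes from the 3.9 hlist iterator change: hlist_for_each_entry_safe() dropped its separate struct hlist_node * cursor and now derives the entry pointer itself. A kernel-style fragment of the new calling convention, mirroring the loops in the diff (not a complete, buildable unit):

    struct kretprobe_instance *ri;
    struct hlist_node *tmp;      /* still needed: the "safe" next pointer */

    /* Old form (pre-3.9): hlist_for_each_entry_safe(ri, node, tmp, head, hlist)
     * New form derives the entry from the node embedded in *ri:            */
    hlist_for_each_entry_safe(ri, tmp, head, hlist) {
        if (ri->task != current)
            continue;            /* another task shares this hash bucket */
        hlist_del(&ri->hlist);
        kfree(ri);
    }
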
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
new file mode 100644
index 000000000000..23ef5c556f06
--- /dev/null
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -0,0 +1,93 @@
1/*
2 * Dynamic Ftrace based Kprobes Optimization
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) Hitachi Ltd., 2012
19 */
20#include <linux/kprobes.h>
21#include <linux/ptrace.h>
22#include <linux/hardirq.h>
23#include <linux/preempt.h>
24#include <linux/ftrace.h>
25
26#include "common.h"
27
28static int __skip_singlestep(struct kprobe *p, struct pt_regs *regs,
29 struct kprobe_ctlblk *kcb)
30{
31 /*
32 * Emulate singlestep (and also recover regs->ip)
33 * as if there is a 5byte nop
34 */
35 regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE;
36 if (unlikely(p->post_handler)) {
37 kcb->kprobe_status = KPROBE_HIT_SSDONE;
38 p->post_handler(p, regs, 0);
39 }
40 __this_cpu_write(current_kprobe, NULL);
41 return 1;
42}
43
44int __kprobes skip_singlestep(struct kprobe *p, struct pt_regs *regs,
45 struct kprobe_ctlblk *kcb)
46{
47 if (kprobe_ftrace(p))
48 return __skip_singlestep(p, regs, kcb);
49 else
50 return 0;
51}
52
53/* Ftrace callback handler for kprobes */
54void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
55 struct ftrace_ops *ops, struct pt_regs *regs)
56{
57 struct kprobe *p;
58 struct kprobe_ctlblk *kcb;
59 unsigned long flags;
60
61 /* Disable irq for emulating a breakpoint and avoiding preempt */
62 local_irq_save(flags);
63
64 p = get_kprobe((kprobe_opcode_t *)ip);
65 if (unlikely(!p) || kprobe_disabled(p))
66 goto end;
67
68 kcb = get_kprobe_ctlblk();
69 if (kprobe_running()) {
70 kprobes_inc_nmissed_count(p);
71 } else {
72 /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */
73 regs->ip = ip + sizeof(kprobe_opcode_t);
74
75 __this_cpu_write(current_kprobe, p);
76 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
77 if (!p->pre_handler || !p->pre_handler(p, regs))
78 __skip_singlestep(p, regs, kcb);
79 /*
80 * If pre_handler returns !0, it sets regs->ip and
81 * resets current kprobe.
82 */
83 }
84end:
85 local_irq_restore(flags);
86}
87
88int __kprobes arch_prepare_kprobe_ftrace(struct kprobe *p)
89{
90 p->ainsn.insn = NULL;
91 p->ainsn.boostable = -1;
92 return 0;
93}
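
kprobes/ftrace.c above provides the handler used when a kprobe lands on an ftrace-patched function entry, so no breakpoint or single-step is needed. For context, a minimal, hypothetical module showing how such a probe is registered through the ordinary kprobes API; the target symbol is a placeholder, and whether the ftrace path is actually taken depends on CONFIG_KPROBES_ON_FTRACE and the probe address:

    #include <linux/kprobes.h>
    #include <linux/module.h>

    static int pre_handler(struct kprobe *p, struct pt_regs *regs)
    {
        pr_info("kprobe hit at %pS\n", (void *)regs->ip);
        return 0;    /* 0: let the normal (or ftrace-emulated) single-step run */
    }

    static struct kprobe kp = {
        .symbol_name = "do_fork",    /* hypothetical probe target */
        .pre_handler = pre_handler,
    };

    static int __init kp_init(void)
    {
        return register_kprobe(&kp);
    }

    static void __exit kp_exit(void)
    {
        unregister_kprobe(&kp);
    }

    module_init(kp_init);
    module_exit(kp_exit);
    MODULE_LICENSE("GPL");
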
diff --git a/arch/x86/kernel/kprobes-opt.c b/arch/x86/kernel/kprobes/opt.c
index c5e410eed403..76dc6f095724 100644
--- a/arch/x86/kernel/kprobes-opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -37,7 +37,7 @@
37#include <asm/insn.h> 37#include <asm/insn.h>
38#include <asm/debugreg.h> 38#include <asm/debugreg.h>
39 39
40#include "kprobes-common.h" 40#include "common.h"
41 41
42unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr) 42unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
43{ 43{
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 08b973f64032..b686a904d7c3 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -43,6 +43,7 @@
43#include <asm/apicdef.h> 43#include <asm/apicdef.h>
44#include <asm/hypervisor.h> 44#include <asm/hypervisor.h>
45#include <asm/kvm_guest.h> 45#include <asm/kvm_guest.h>
46#include <asm/context_tracking.h>
46 47
47static int kvmapf = 1; 48static int kvmapf = 1;
48 49
@@ -121,6 +122,8 @@ void kvm_async_pf_task_wait(u32 token)
121 struct kvm_task_sleep_node n, *e; 122 struct kvm_task_sleep_node n, *e;
122 DEFINE_WAIT(wait); 123 DEFINE_WAIT(wait);
123 124
125 rcu_irq_enter();
126
124 spin_lock(&b->lock); 127 spin_lock(&b->lock);
125 e = _find_apf_task(b, token); 128 e = _find_apf_task(b, token);
126 if (e) { 129 if (e) {
@@ -128,6 +131,8 @@ void kvm_async_pf_task_wait(u32 token)
128 hlist_del(&e->link); 131 hlist_del(&e->link);
129 kfree(e); 132 kfree(e);
130 spin_unlock(&b->lock); 133 spin_unlock(&b->lock);
134
135 rcu_irq_exit();
131 return; 136 return;
132 } 137 }
133 138
@@ -152,13 +157,16 @@ void kvm_async_pf_task_wait(u32 token)
152 /* 157 /*
153 * We cannot reschedule. So halt. 158 * We cannot reschedule. So halt.
154 */ 159 */
160 rcu_irq_exit();
155 native_safe_halt(); 161 native_safe_halt();
162 rcu_irq_enter();
156 local_irq_disable(); 163 local_irq_disable();
157 } 164 }
158 } 165 }
159 if (!n.halted) 166 if (!n.halted)
160 finish_wait(&n.wq, &wait); 167 finish_wait(&n.wq, &wait);
161 168
169 rcu_irq_exit();
162 return; 170 return;
163} 171}
164EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait); 172EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
@@ -252,10 +260,10 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
252 break; 260 break;
253 case KVM_PV_REASON_PAGE_NOT_PRESENT: 261 case KVM_PV_REASON_PAGE_NOT_PRESENT:
254 /* page is swapped out by the host. */ 262 /* page is swapped out by the host. */
255 rcu_irq_enter(); 263 exception_enter(regs);
256 exit_idle(); 264 exit_idle();
257 kvm_async_pf_task_wait((u32)read_cr2()); 265 kvm_async_pf_task_wait((u32)read_cr2());
258 rcu_irq_exit(); 266 exception_exit(regs);
259 break; 267 break;
260 case KVM_PV_REASON_PAGE_READY: 268 case KVM_PV_REASON_PAGE_READY:
261 rcu_irq_enter(); 269 rcu_irq_enter();
@@ -289,9 +297,9 @@ static void kvm_register_steal_time(void)
289 297
290 memset(st, 0, sizeof(*st)); 298 memset(st, 0, sizeof(*st));
291 299
292 wrmsrl(MSR_KVM_STEAL_TIME, (__pa(st) | KVM_MSR_ENABLED)); 300 wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
293 printk(KERN_INFO "kvm-stealtime: cpu %d, msr %lx\n", 301 pr_info("kvm-stealtime: cpu %d, msr %llx\n",
294 cpu, __pa(st)); 302 cpu, (unsigned long long) slow_virt_to_phys(st));
295} 303}
296 304
297static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; 305static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
@@ -316,7 +324,7 @@ void __cpuinit kvm_guest_cpu_init(void)
316 return; 324 return;
317 325
318 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { 326 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
319 u64 pa = __pa(&__get_cpu_var(apf_reason)); 327 u64 pa = slow_virt_to_phys(&__get_cpu_var(apf_reason));
320 328
321#ifdef CONFIG_PREEMPT 329#ifdef CONFIG_PREEMPT
322 pa |= KVM_ASYNC_PF_SEND_ALWAYS; 330 pa |= KVM_ASYNC_PF_SEND_ALWAYS;
@@ -332,7 +340,8 @@ void __cpuinit kvm_guest_cpu_init(void)
332 /* Size alignment is implied but just to make it explicit. */ 340 /* Size alignment is implied but just to make it explicit. */
333 BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); 341 BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
334 __get_cpu_var(kvm_apic_eoi) = 0; 342 __get_cpu_var(kvm_apic_eoi) = 0;
335 pa = __pa(&__get_cpu_var(kvm_apic_eoi)) | KVM_MSR_ENABLED; 343 pa = slow_virt_to_phys(&__get_cpu_var(kvm_apic_eoi))
344 | KVM_MSR_ENABLED;
336 wrmsrl(MSR_KVM_PV_EOI_EN, pa); 345 wrmsrl(MSR_KVM_PV_EOI_EN, pa);
337 } 346 }
338 347
@@ -497,6 +506,7 @@ static bool __init kvm_detect(void)
497const struct hypervisor_x86 x86_hyper_kvm __refconst = { 506const struct hypervisor_x86 x86_hyper_kvm __refconst = {
498 .name = "KVM", 507 .name = "KVM",
499 .detect = kvm_detect, 508 .detect = kvm_detect,
509 .x2apic_available = kvm_para_available,
500}; 510};
501EXPORT_SYMBOL_GPL(x86_hyper_kvm); 511EXPORT_SYMBOL_GPL(x86_hyper_kvm);
502 512
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index b730efad6fe9..d2c381280e3c 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -166,8 +166,8 @@ int kvm_register_clock(char *txt)
166 return 0; 166 return 0;
167 167
168 src = &hv_clock[cpu].pvti; 168 src = &hv_clock[cpu].pvti;
169 low = (int)__pa(src) | 1; 169 low = (int)slow_virt_to_phys(src) | 1;
170 high = ((u64)__pa(src) >> 32); 170 high = ((u64)slow_virt_to_phys(src) >> 32);
171 ret = native_write_msr_safe(msr_kvm_system_time, low, high); 171 ret = native_write_msr_safe(msr_kvm_system_time, low, high);
172 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", 172 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
173 cpu, high, low, txt); 173 cpu, high, low, txt);
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index b3ea9db39db6..4eabc160696f 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -16,125 +16,12 @@
16#include <linux/io.h> 16#include <linux/io.h>
17#include <linux/suspend.h> 17#include <linux/suspend.h>
18 18
19#include <asm/init.h>
19#include <asm/pgtable.h> 20#include <asm/pgtable.h>
20#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
21#include <asm/mmu_context.h> 22#include <asm/mmu_context.h>
22#include <asm/debugreg.h> 23#include <asm/debugreg.h>
23 24
24static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
25 unsigned long addr)
26{
27 pud_t *pud;
28 pmd_t *pmd;
29 struct page *page;
30 int result = -ENOMEM;
31
32 addr &= PMD_MASK;
33 pgd += pgd_index(addr);
34 if (!pgd_present(*pgd)) {
35 page = kimage_alloc_control_pages(image, 0);
36 if (!page)
37 goto out;
38 pud = (pud_t *)page_address(page);
39 clear_page(pud);
40 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
41 }
42 pud = pud_offset(pgd, addr);
43 if (!pud_present(*pud)) {
44 page = kimage_alloc_control_pages(image, 0);
45 if (!page)
46 goto out;
47 pmd = (pmd_t *)page_address(page);
48 clear_page(pmd);
49 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
50 }
51 pmd = pmd_offset(pud, addr);
52 if (!pmd_present(*pmd))
53 set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
54 result = 0;
55out:
56 return result;
57}
58
59static void init_level2_page(pmd_t *level2p, unsigned long addr)
60{
61 unsigned long end_addr;
62
63 addr &= PAGE_MASK;
64 end_addr = addr + PUD_SIZE;
65 while (addr < end_addr) {
66 set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
67 addr += PMD_SIZE;
68 }
69}
70
71static int init_level3_page(struct kimage *image, pud_t *level3p,
72 unsigned long addr, unsigned long last_addr)
73{
74 unsigned long end_addr;
75 int result;
76
77 result = 0;
78 addr &= PAGE_MASK;
79 end_addr = addr + PGDIR_SIZE;
80 while ((addr < last_addr) && (addr < end_addr)) {
81 struct page *page;
82 pmd_t *level2p;
83
84 page = kimage_alloc_control_pages(image, 0);
85 if (!page) {
86 result = -ENOMEM;
87 goto out;
88 }
89 level2p = (pmd_t *)page_address(page);
90 init_level2_page(level2p, addr);
91 set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
92 addr += PUD_SIZE;
93 }
94 /* clear the unused entries */
95 while (addr < end_addr) {
96 pud_clear(level3p++);
97 addr += PUD_SIZE;
98 }
99out:
100 return result;
101}
102
103
104static int init_level4_page(struct kimage *image, pgd_t *level4p,
105 unsigned long addr, unsigned long last_addr)
106{
107 unsigned long end_addr;
108 int result;
109
110 result = 0;
111 addr &= PAGE_MASK;
112 end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE);
113 while ((addr < last_addr) && (addr < end_addr)) {
114 struct page *page;
115 pud_t *level3p;
116
117 page = kimage_alloc_control_pages(image, 0);
118 if (!page) {
119 result = -ENOMEM;
120 goto out;
121 }
122 level3p = (pud_t *)page_address(page);
123 result = init_level3_page(image, level3p, addr, last_addr);
124 if (result)
125 goto out;
126 set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
127 addr += PGDIR_SIZE;
128 }
129 /* clear the unused entries */
130 while (addr < end_addr) {
131 pgd_clear(level4p++);
132 addr += PGDIR_SIZE;
133 }
134out:
135 return result;
136}
137
138static void free_transition_pgtable(struct kimage *image) 25static void free_transition_pgtable(struct kimage *image)
139{ 26{
140 free_page((unsigned long)image->arch.pud); 27 free_page((unsigned long)image->arch.pud);
@@ -184,22 +71,62 @@ err:
184 return result; 71 return result;
185} 72}
186 73
74static void *alloc_pgt_page(void *data)
75{
76 struct kimage *image = (struct kimage *)data;
77 struct page *page;
78 void *p = NULL;
79
80 page = kimage_alloc_control_pages(image, 0);
81 if (page) {
82 p = page_address(page);
83 clear_page(p);
84 }
85
86 return p;
87}
187 88
188static int init_pgtable(struct kimage *image, unsigned long start_pgtable) 89static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
189{ 90{
91 struct x86_mapping_info info = {
92 .alloc_pgt_page = alloc_pgt_page,
93 .context = image,
94 .pmd_flag = __PAGE_KERNEL_LARGE_EXEC,
95 };
96 unsigned long mstart, mend;
190 pgd_t *level4p; 97 pgd_t *level4p;
191 int result; 98 int result;
99 int i;
100
192 level4p = (pgd_t *)__va(start_pgtable); 101 level4p = (pgd_t *)__va(start_pgtable);
193 result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); 102 clear_page(level4p);
194 if (result) 103 for (i = 0; i < nr_pfn_mapped; i++) {
195 return result; 104 mstart = pfn_mapped[i].start << PAGE_SHIFT;
105 mend = pfn_mapped[i].end << PAGE_SHIFT;
106
107 result = kernel_ident_mapping_init(&info,
108 level4p, mstart, mend);
109 if (result)
110 return result;
111 }
112
196 /* 113 /*
197 * image->start may be outside 0 ~ max_pfn, for example when 114 * segments's mem ranges could be outside 0 ~ max_pfn,
198 * jump back to original kernel from kexeced kernel 115 * for example when jump back to original kernel from kexeced kernel.
116 * or first kernel is booted with user mem map, and second kernel
117 * could be loaded out of that range.
199 */ 118 */
200 result = init_one_level2_page(image, level4p, image->start); 119 for (i = 0; i < image->nr_segments; i++) {
201 if (result) 120 mstart = image->segment[i].mem;
202 return result; 121 mend = mstart + image->segment[i].memsz;
122
123 result = kernel_ident_mapping_init(&info,
124 level4p, mstart, mend);
125
126 if (result)
127 return result;
128 }
129
203 return init_transition_pgtable(image, level4p); 130 return init_transition_pgtable(image, level4p);
204} 131}
205 132
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 3a04b224d0c0..22db92bbdf1a 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -364,10 +364,7 @@ static struct attribute_group mc_attr_group = {
364 364
365static void microcode_fini_cpu(int cpu) 365static void microcode_fini_cpu(int cpu)
366{ 366{
367 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
368
369 microcode_ops->microcode_fini_cpu(cpu); 367 microcode_ops->microcode_fini_cpu(cpu);
370 uci->valid = 0;
371} 368}
372 369
373static enum ucode_state microcode_resume_cpu(int cpu) 370static enum ucode_state microcode_resume_cpu(int cpu)
@@ -383,6 +380,10 @@ static enum ucode_state microcode_resume_cpu(int cpu)
383static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw) 380static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw)
384{ 381{
385 enum ucode_state ustate; 382 enum ucode_state ustate;
383 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
384
385 if (uci && uci->valid)
386 return UCODE_OK;
386 387
387 if (collect_cpu_info(cpu)) 388 if (collect_cpu_info(cpu))
388 return UCODE_ERROR; 389 return UCODE_ERROR;
diff --git a/arch/x86/kernel/microcode_core_early.c b/arch/x86/kernel/microcode_core_early.c
new file mode 100644
index 000000000000..577db8417d15
--- /dev/null
+++ b/arch/x86/kernel/microcode_core_early.c
@@ -0,0 +1,76 @@
1/*
2 * X86 CPU microcode early update for Linux
3 *
4 * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
5 * H Peter Anvin" <hpa@zytor.com>
6 *
7 * This driver allows to early upgrade microcode on Intel processors
8 * belonging to IA-32 family - PentiumPro, Pentium II,
9 * Pentium III, Xeon, Pentium 4, etc.
10 *
11 * Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture
12 * Software Developer's Manual.
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version
17 * 2 of the License, or (at your option) any later version.
18 */
19#include <linux/module.h>
20#include <asm/microcode_intel.h>
21#include <asm/processor.h>
22
23#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
24#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
25#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I')
26#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l')
27#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h')
28#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i')
29#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D')
30
31#define CPUID_IS(a, b, c, ebx, ecx, edx) \
32 (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c))))
33
34/*
35 * In early loading microcode phase on BSP, boot_cpu_data is not set up yet.
36 * x86_vendor() gets vendor id for BSP.
37 *
38 * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify
39 * coding, we still use x86_vendor() to get vendor id for AP.
40 *
41 * x86_vendor() gets vendor information directly through cpuid.
42 */
43static int __cpuinit x86_vendor(void)
44{
45 u32 eax = 0x00000000;
46 u32 ebx, ecx = 0, edx;
47
48 if (!have_cpuid_p())
49 return X86_VENDOR_UNKNOWN;
50
51 native_cpuid(&eax, &ebx, &ecx, &edx);
52
53 if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx))
54 return X86_VENDOR_INTEL;
55
56 if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx))
57 return X86_VENDOR_AMD;
58
59 return X86_VENDOR_UNKNOWN;
60}
61
62void __init load_ucode_bsp(void)
63{
64 int vendor = x86_vendor();
65
66 if (vendor == X86_VENDOR_INTEL)
67 load_ucode_intel_bsp();
68}
69
70void __cpuinit load_ucode_ap(void)
71{
72 int vendor = x86_vendor();
73
74 if (vendor == X86_VENDOR_INTEL)
75 load_ucode_intel_ap();
76}
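
The QCHAR()/CPUID_IS() vendor check above works because CPUID leaf 0 returns the 12-byte vendor string in EBX, EDX, ECX, four little-endian characters per register. A user-space C sketch of the same packing and comparison; __get_cpuid() is GCC's helper from <cpuid.h>, not a kernel interface:

    #include <cpuid.h>      /* GCC's __get_cpuid() */
    #include <stdio.h>

    #define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(0, &eax, &ebx, &ecx, &edx))
            return 1;

        /* "GenuineIntel" splits as EBX="Genu", EDX="ineI", ECX="ntel". */
        if (ebx == QCHAR('G', 'e', 'n', 'u') &&
            edx == QCHAR('i', 'n', 'e', 'I') &&
            ecx == QCHAR('n', 't', 'e', 'l'))
            printf("Intel CPU\n");
        else if (ebx == QCHAR('A', 'u', 't', 'h') &&
                 edx == QCHAR('e', 'n', 't', 'i') &&
                 ecx == QCHAR('c', 'A', 'M', 'D'))
            printf("AMD CPU\n");
        else
            printf("other vendor\n");
        return 0;
    }
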
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 3544aed39338..5fb2cebf556b 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -79,7 +79,7 @@
79#include <linux/module.h> 79#include <linux/module.h>
80#include <linux/vmalloc.h> 80#include <linux/vmalloc.h>
81 81
82#include <asm/microcode.h> 82#include <asm/microcode_intel.h>
83#include <asm/processor.h> 83#include <asm/processor.h>
84#include <asm/msr.h> 84#include <asm/msr.h>
85 85
@@ -87,59 +87,6 @@ MODULE_DESCRIPTION("Microcode Update Driver");
87MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); 87MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
88MODULE_LICENSE("GPL"); 88MODULE_LICENSE("GPL");
89 89
90struct microcode_header_intel {
91 unsigned int hdrver;
92 unsigned int rev;
93 unsigned int date;
94 unsigned int sig;
95 unsigned int cksum;
96 unsigned int ldrver;
97 unsigned int pf;
98 unsigned int datasize;
99 unsigned int totalsize;
100 unsigned int reserved[3];
101};
102
103struct microcode_intel {
104 struct microcode_header_intel hdr;
105 unsigned int bits[0];
106};
107
108/* microcode format is extended from prescott processors */
109struct extended_signature {
110 unsigned int sig;
111 unsigned int pf;
112 unsigned int cksum;
113};
114
115struct extended_sigtable {
116 unsigned int count;
117 unsigned int cksum;
118 unsigned int reserved[3];
119 struct extended_signature sigs[0];
120};
121
122#define DEFAULT_UCODE_DATASIZE (2000)
123#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel))
124#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
125#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable))
126#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature))
127#define DWSIZE (sizeof(u32))
128
129#define get_totalsize(mc) \
130 (((struct microcode_intel *)mc)->hdr.totalsize ? \
131 ((struct microcode_intel *)mc)->hdr.totalsize : \
132 DEFAULT_UCODE_TOTALSIZE)
133
134#define get_datasize(mc) \
135 (((struct microcode_intel *)mc)->hdr.datasize ? \
136 ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
137
138#define sigmatch(s1, s2, p1, p2) \
139 (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
140
141#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
142
143static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) 90static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
144{ 91{
145 struct cpuinfo_x86 *c = &cpu_data(cpu_num); 92 struct cpuinfo_x86 *c = &cpu_data(cpu_num);
@@ -162,128 +109,25 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
162 return 0; 109 return 0;
163} 110}
164 111
165static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf)
166{
167 return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1;
168}
169
170static inline int
171update_match_revision(struct microcode_header_intel *mc_header, int rev)
172{
173 return (mc_header->rev <= rev) ? 0 : 1;
174}
175
176static int microcode_sanity_check(void *mc)
177{
178 unsigned long total_size, data_size, ext_table_size;
179 struct microcode_header_intel *mc_header = mc;
180 struct extended_sigtable *ext_header = NULL;
181 int sum, orig_sum, ext_sigcount = 0, i;
182 struct extended_signature *ext_sig;
183
184 total_size = get_totalsize(mc_header);
185 data_size = get_datasize(mc_header);
186
187 if (data_size + MC_HEADER_SIZE > total_size) {
188 pr_err("error! Bad data size in microcode data file\n");
189 return -EINVAL;
190 }
191
192 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
193 pr_err("error! Unknown microcode update format\n");
194 return -EINVAL;
195 }
196 ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
197 if (ext_table_size) {
198 if ((ext_table_size < EXT_HEADER_SIZE)
199 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
200 pr_err("error! Small exttable size in microcode data file\n");
201 return -EINVAL;
202 }
203 ext_header = mc + MC_HEADER_SIZE + data_size;
204 if (ext_table_size != exttable_size(ext_header)) {
205 pr_err("error! Bad exttable size in microcode data file\n");
206 return -EFAULT;
207 }
208 ext_sigcount = ext_header->count;
209 }
210
211 /* check extended table checksum */
212 if (ext_table_size) {
213 int ext_table_sum = 0;
214 int *ext_tablep = (int *)ext_header;
215
216 i = ext_table_size / DWSIZE;
217 while (i--)
218 ext_table_sum += ext_tablep[i];
219 if (ext_table_sum) {
220 pr_warning("aborting, bad extended signature table checksum\n");
221 return -EINVAL;
222 }
223 }
224
225 /* calculate the checksum */
226 orig_sum = 0;
227 i = (MC_HEADER_SIZE + data_size) / DWSIZE;
228 while (i--)
229 orig_sum += ((int *)mc)[i];
230 if (orig_sum) {
231 pr_err("aborting, bad checksum\n");
232 return -EINVAL;
233 }
234 if (!ext_table_size)
235 return 0;
236 /* check extended signature checksum */
237 for (i = 0; i < ext_sigcount; i++) {
238 ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
239 EXT_SIGNATURE_SIZE * i;
240 sum = orig_sum
241 - (mc_header->sig + mc_header->pf + mc_header->cksum)
242 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
243 if (sum) {
244 pr_err("aborting, bad checksum\n");
245 return -EINVAL;
246 }
247 }
248 return 0;
249}
250
251/* 112/*
252 * return 0 - no update found 113 * return 0 - no update found
253 * return 1 - found update 114 * return 1 - found update
254 */ 115 */
255static int 116static int get_matching_mc(struct microcode_intel *mc_intel, int cpu)
256get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
257{ 117{
258 struct microcode_header_intel *mc_header = mc; 118 struct cpu_signature cpu_sig;
259 struct extended_sigtable *ext_header; 119 unsigned int csig, cpf, crev;
260 unsigned long total_size = get_totalsize(mc_header);
261 int ext_sigcount, i;
262 struct extended_signature *ext_sig;
263
264 if (!update_match_revision(mc_header, rev))
265 return 0;
266
267 if (update_match_cpu(cpu_sig, mc_header->sig, mc_header->pf))
268 return 1;
269 120
270 /* Look for ext. headers: */ 121 collect_cpu_info(cpu, &cpu_sig);
271 if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
272 return 0;
273 122
274 ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE; 123 csig = cpu_sig.sig;
275 ext_sigcount = ext_header->count; 124 cpf = cpu_sig.pf;
276 ext_sig = (void *)ext_header + EXT_HEADER_SIZE; 125 crev = cpu_sig.rev;
277 126
278 for (i = 0; i < ext_sigcount; i++) { 127 return get_matching_microcode(csig, cpf, mc_intel, crev);
279 if (update_match_cpu(cpu_sig, ext_sig->sig, ext_sig->pf))
280 return 1;
281 ext_sig++;
282 }
283 return 0;
284} 128}
285 129
286static int apply_microcode(int cpu) 130int apply_microcode(int cpu)
287{ 131{
288 struct microcode_intel *mc_intel; 132 struct microcode_intel *mc_intel;
289 struct ucode_cpu_info *uci; 133 struct ucode_cpu_info *uci;
@@ -300,6 +144,14 @@ static int apply_microcode(int cpu)
300 if (mc_intel == NULL) 144 if (mc_intel == NULL)
301 return 0; 145 return 0;
302 146
147 /*
148 * Microcode on this CPU could have been updated earlier. Only apply the
149 * microcode patch in mc_intel when it is newer than the one on this
150 * CPU.
151 */
152 if (get_matching_mc(mc_intel, cpu) == 0)
153 return 0;
154
303 /* write microcode via MSR 0x79 */ 155 /* write microcode via MSR 0x79 */
304 wrmsr(MSR_IA32_UCODE_WRITE, 156 wrmsr(MSR_IA32_UCODE_WRITE,
305 (unsigned long) mc_intel->bits, 157 (unsigned long) mc_intel->bits,
@@ -338,6 +190,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
338 unsigned int leftover = size; 190 unsigned int leftover = size;
339 enum ucode_state state = UCODE_OK; 191 enum ucode_state state = UCODE_OK;
340 unsigned int curr_mc_size = 0; 192 unsigned int curr_mc_size = 0;
193 unsigned int csig, cpf;
341 194
342 while (leftover) { 195 while (leftover) {
343 struct microcode_header_intel mc_header; 196 struct microcode_header_intel mc_header;
@@ -362,11 +215,13 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
362 } 215 }
363 216
364 if (get_ucode_data(mc, ucode_ptr, mc_size) || 217 if (get_ucode_data(mc, ucode_ptr, mc_size) ||
365 microcode_sanity_check(mc) < 0) { 218 microcode_sanity_check(mc, 1) < 0) {
366 break; 219 break;
367 } 220 }
368 221
369 if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) { 222 csig = uci->cpu_sig.sig;
223 cpf = uci->cpu_sig.pf;
224 if (get_matching_microcode(csig, cpf, mc, new_rev)) {
370 vfree(new_mc); 225 vfree(new_mc);
371 new_rev = mc_header.rev; 226 new_rev = mc_header.rev;
372 new_mc = mc; 227 new_mc = mc;
@@ -393,6 +248,13 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
393 vfree(uci->mc); 248 vfree(uci->mc);
394 uci->mc = (struct microcode_intel *)new_mc; 249 uci->mc = (struct microcode_intel *)new_mc;
395 250
251 /*
252 * If early loading of microcode is supported, save this mc into permanent
253 * memory so that it will be loaded early when a CPU is hot-added or
254 * resumes.
255 */
256 save_mc_for_early(new_mc);
257
396 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", 258 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
397 cpu, new_rev, uci->cpu_sig.rev); 259 cpu, new_rev, uci->cpu_sig.rev);
398out: 260out:
diff --git a/arch/x86/kernel/microcode_intel_early.c b/arch/x86/kernel/microcode_intel_early.c
new file mode 100644
index 000000000000..7890bc838952
--- /dev/null
+++ b/arch/x86/kernel/microcode_intel_early.c
@@ -0,0 +1,796 @@
1/*
2 * Intel CPU microcode early update for Linux
3 *
4 * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
5 * H Peter Anvin <hpa@zytor.com>
6 *
7 * This allows early upgrading of microcode on Intel processors
8 * belonging to the IA-32 family - PentiumPro, Pentium II,
9 * Pentium III, Xeon, Pentium 4, etc.
10 *
11 * Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture
12 * Software Developer's Manual.
13 *
14 * This program is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU General Public License
16 * as published by the Free Software Foundation; either version
17 * 2 of the License, or (at your option) any later version.
18 */
19#include <linux/module.h>
20#include <linux/mm.h>
21#include <linux/slab.h>
22#include <linux/earlycpio.h>
23#include <linux/initrd.h>
24#include <linux/cpu.h>
25#include <asm/msr.h>
26#include <asm/microcode_intel.h>
27#include <asm/processor.h>
28#include <asm/tlbflush.h>
29#include <asm/setup.h>
30
31unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
32struct mc_saved_data {
33 unsigned int mc_saved_count;
34 struct microcode_intel **mc_saved;
35} mc_saved_data;
36
37static enum ucode_state __cpuinit
38generic_load_microcode_early(struct microcode_intel **mc_saved_p,
39 unsigned int mc_saved_count,
40 struct ucode_cpu_info *uci)
41{
42 struct microcode_intel *ucode_ptr, *new_mc = NULL;
43 int new_rev = uci->cpu_sig.rev;
44 enum ucode_state state = UCODE_OK;
45 unsigned int mc_size;
46 struct microcode_header_intel *mc_header;
47 unsigned int csig = uci->cpu_sig.sig;
48 unsigned int cpf = uci->cpu_sig.pf;
49 int i;
50
51 for (i = 0; i < mc_saved_count; i++) {
52 ucode_ptr = mc_saved_p[i];
53
54 mc_header = (struct microcode_header_intel *)ucode_ptr;
55 mc_size = get_totalsize(mc_header);
56 if (get_matching_microcode(csig, cpf, ucode_ptr, new_rev)) {
57 new_rev = mc_header->rev;
58 new_mc = ucode_ptr;
59 }
60 }
61
62 if (!new_mc) {
63 state = UCODE_NFOUND;
64 goto out;
65 }
66
67 uci->mc = (struct microcode_intel *)new_mc;
68out:
69 return state;
70}
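
A worked example of the loop above (not part of the patch): new_rev starts at the CPU's current revision and is raised every time a matching patch is accepted, so the routine ends up with the newest matching patch.

/*
 * E.g. with a CPU at revision 0x12 and saved patches of revision 0x15,
 * 0x17 and 0x13 that all match the CPU's sig/pf:
 *   - 0x15 > 0x12  -> accepted, new_rev = 0x15
 *   - 0x17 > 0x15  -> accepted, new_rev = 0x17
 *   - 0x13 <= 0x17 -> rejected
 * leaving new_mc pointing at the revision 0x17 patch.
 */
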
71
72static void __cpuinit
73microcode_pointer(struct microcode_intel **mc_saved,
74 unsigned long *mc_saved_in_initrd,
75 unsigned long initrd_start, int mc_saved_count)
76{
77 int i;
78
79 for (i = 0; i < mc_saved_count; i++)
80 mc_saved[i] = (struct microcode_intel *)
81 (mc_saved_in_initrd[i] + initrd_start);
82}
83
84#ifdef CONFIG_X86_32
85static void __cpuinit
86microcode_phys(struct microcode_intel **mc_saved_tmp,
87 struct mc_saved_data *mc_saved_data)
88{
89 int i;
90 struct microcode_intel ***mc_saved;
91
92 mc_saved = (struct microcode_intel ***)
93 __pa_symbol(&mc_saved_data->mc_saved);
94 for (i = 0; i < mc_saved_data->mc_saved_count; i++) {
95 struct microcode_intel *p;
96
97 p = *(struct microcode_intel **)
98 __pa(mc_saved_data->mc_saved + i);
99 mc_saved_tmp[i] = (struct microcode_intel *)__pa(p);
100 }
101}
102#endif
103
104static enum ucode_state __cpuinit
105load_microcode(struct mc_saved_data *mc_saved_data,
106 unsigned long *mc_saved_in_initrd,
107 unsigned long initrd_start,
108 struct ucode_cpu_info *uci)
109{
110 struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
111 unsigned int count = mc_saved_data->mc_saved_count;
112
113 if (!mc_saved_data->mc_saved) {
114 microcode_pointer(mc_saved_tmp, mc_saved_in_initrd,
115 initrd_start, count);
116
117 return generic_load_microcode_early(mc_saved_tmp, count, uci);
118 } else {
119#ifdef CONFIG_X86_32
120 microcode_phys(mc_saved_tmp, mc_saved_data);
121 return generic_load_microcode_early(mc_saved_tmp, count, uci);
122#else
123 return generic_load_microcode_early(mc_saved_data->mc_saved,
124 count, uci);
125#endif
126 }
127}
128
129static u8 get_x86_family(unsigned long sig)
130{
131 u8 x86;
132
133 x86 = (sig >> 8) & 0xf;
134
135 if (x86 == 0xf)
136 x86 += (sig >> 20) & 0xff;
137
138 return x86;
139}
140
141static u8 get_x86_model(unsigned long sig)
142{
143 u8 x86, x86_model;
144
145 x86 = get_x86_family(sig);
146 x86_model = (sig >> 4) & 0xf;
147
148 if (x86 == 0x6 || x86 == 0xf)
149 x86_model += ((sig >> 16) & 0xf) << 4;
150
151 return x86_model;
152}
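
As a worked example of the two helpers above (not part of the patch), take the hypothetical signature 0x000306a9 (an Ivy Bridge part):

/*
 *   family = (sig >> 8) & 0xf           = 0x6  (not 0xf, so no extended family)
 *   model  = (sig >> 4) & 0xf           = 0xa
 *   family is 0x6, so the extended model bits are folded in:
 *   model += ((sig >> 16) & 0xf) << 4   -> 0x3a  (i.e. family 6, model 58)
 */
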
153
154/*
155 * Given a CPU signature and a microcode patch, this function checks whether
156 * the microcode patch has the same family and model as the CPU.
157 */
158static enum ucode_state
159matching_model_microcode(struct microcode_header_intel *mc_header,
160 unsigned long sig)
161{
162 u8 x86, x86_model;
163 u8 x86_ucode, x86_model_ucode;
164 struct extended_sigtable *ext_header;
165 unsigned long total_size = get_totalsize(mc_header);
166 unsigned long data_size = get_datasize(mc_header);
167 int ext_sigcount, i;
168 struct extended_signature *ext_sig;
169
170 x86 = get_x86_family(sig);
171 x86_model = get_x86_model(sig);
172
173 x86_ucode = get_x86_family(mc_header->sig);
174 x86_model_ucode = get_x86_model(mc_header->sig);
175
176 if (x86 == x86_ucode && x86_model == x86_model_ucode)
177 return UCODE_OK;
178
179 /* Look for ext. headers: */
180 if (total_size <= data_size + MC_HEADER_SIZE)
181 return UCODE_NFOUND;
182
183 ext_header = (struct extended_sigtable *)
184 mc_header + data_size + MC_HEADER_SIZE;
185 ext_sigcount = ext_header->count;
186 ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
187
188 for (i = 0; i < ext_sigcount; i++) {
189 x86_ucode = get_x86_family(ext_sig->sig);
190 x86_model_ucode = get_x86_model(ext_sig->sig);
191
192 if (x86 == x86_ucode && x86_model == x86_model_ucode)
193 return UCODE_OK;
194
195 ext_sig++;
196 }
197
198 return UCODE_NFOUND;
199}
200
201static int
202save_microcode(struct mc_saved_data *mc_saved_data,
203 struct microcode_intel **mc_saved_src,
204 unsigned int mc_saved_count)
205{
206 int i, j;
207 struct microcode_intel **mc_saved_p;
208 int ret;
209
210 if (!mc_saved_count)
211 return -EINVAL;
212
213 /*
214 * Copy new microcode data.
215 */
216 mc_saved_p = kmalloc(mc_saved_count*sizeof(struct microcode_intel *),
217 GFP_KERNEL);
218 if (!mc_saved_p)
219 return -ENOMEM;
220
221 for (i = 0; i < mc_saved_count; i++) {
222 struct microcode_intel *mc = mc_saved_src[i];
223 struct microcode_header_intel *mc_header = &mc->hdr;
224 unsigned long mc_size = get_totalsize(mc_header);
225 mc_saved_p[i] = kmalloc(mc_size, GFP_KERNEL);
226 if (!mc_saved_p[i]) {
227 ret = -ENOMEM;
228 goto err;
229 }
230 if (!mc_saved_src[i]) {
231 ret = -EINVAL;
232 goto err;
233 }
234 memcpy(mc_saved_p[i], mc, mc_size);
235 }
236
237 /*
238 * Point to newly saved microcode.
239 */
240 mc_saved_data->mc_saved = mc_saved_p;
241 mc_saved_data->mc_saved_count = mc_saved_count;
242
243 return 0;
244
245err:
246 for (j = 0; j <= i; j++)
247 kfree(mc_saved_p[j]);
248 kfree(mc_saved_p);
249
250 return ret;
251}
252
253/*
254 * A microcode patch in ucode_ptr is saved into mc_saved
255 * - if it has a matching signature and a newer revision than an existing
256 * patch in mc_saved,
257 * - or if it is a newly discovered microcode patch.
258 *
259 * The microcode patch must have a matching model with the CPU.
260 */
261static void _save_mc(struct microcode_intel **mc_saved, u8 *ucode_ptr,
262 unsigned int *mc_saved_count_p)
263{
264 int i;
265 int found = 0;
266 unsigned int mc_saved_count = *mc_saved_count_p;
267 struct microcode_header_intel *mc_header;
268
269 mc_header = (struct microcode_header_intel *)ucode_ptr;
270 for (i = 0; i < mc_saved_count; i++) {
271 unsigned int sig, pf;
272 unsigned int new_rev;
273 struct microcode_header_intel *mc_saved_header =
274 (struct microcode_header_intel *)mc_saved[i];
275 sig = mc_saved_header->sig;
276 pf = mc_saved_header->pf;
277 new_rev = mc_header->rev;
278
279 if (get_matching_sig(sig, pf, ucode_ptr, new_rev)) {
280 found = 1;
281 if (update_match_revision(mc_header, new_rev)) {
282 /*
283 * Found an older ucode saved before.
284 * Replace the older one with this newer
285 * one.
286 */
287 mc_saved[i] =
288 (struct microcode_intel *)ucode_ptr;
289 break;
290 }
291 }
292 }
293 if (i >= mc_saved_count && !found)
294 /*
295 * This ucode is discovered in the ucode file for the
296 * first time. Save it to memory.
297 */
298 mc_saved[mc_saved_count++] =
299 (struct microcode_intel *)ucode_ptr;
300
301 *mc_saved_count_p = mc_saved_count;
302}
303
304/*
305 * Get microcode matching the BSP's model. Only CPUs with the same model as
306 * the BSP can be present in the platform.
307 */
308static enum ucode_state __init
309get_matching_model_microcode(int cpu, unsigned long start,
310 void *data, size_t size,
311 struct mc_saved_data *mc_saved_data,
312 unsigned long *mc_saved_in_initrd,
313 struct ucode_cpu_info *uci)
314{
315 u8 *ucode_ptr = data;
316 unsigned int leftover = size;
317 enum ucode_state state = UCODE_OK;
318 unsigned int mc_size;
319 struct microcode_header_intel *mc_header;
320 struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
321 unsigned int mc_saved_count = mc_saved_data->mc_saved_count;
322 int i;
323
324 while (leftover) {
325 mc_header = (struct microcode_header_intel *)ucode_ptr;
326
327 mc_size = get_totalsize(mc_header);
328 if (!mc_size || mc_size > leftover ||
329 microcode_sanity_check(ucode_ptr, 0) < 0)
330 break;
331
332 leftover -= mc_size;
333
334 /*
335 * Since APs with the same family and model as the BSP may boot in
336 * the platform, we need to find and save microcode patches
337 * with the same family and model as the BSP.
338 */
339 if (matching_model_microcode(mc_header, uci->cpu_sig.sig) !=
340 UCODE_OK) {
341 ucode_ptr += mc_size;
342 continue;
343 }
344
345 _save_mc(mc_saved_tmp, ucode_ptr, &mc_saved_count);
346
347 ucode_ptr += mc_size;
348 }
349
350 if (leftover) {
351 state = UCODE_ERROR;
352 goto out;
353 }
354
355 if (mc_saved_count == 0) {
356 state = UCODE_NFOUND;
357 goto out;
358 }
359
360 for (i = 0; i < mc_saved_count; i++)
361 mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start;
362
363 mc_saved_data->mc_saved_count = mc_saved_count;
364out:
365 return state;
366}
367
368#define native_rdmsr(msr, val1, val2) \
369do { \
370 u64 __val = native_read_msr((msr)); \
371 (void)((val1) = (u32)__val); \
372 (void)((val2) = (u32)(__val >> 32)); \
373} while (0)
374
375#define native_wrmsr(msr, low, high) \
376 native_write_msr(msr, low, high);
377
378static int __cpuinit collect_cpu_info_early(struct ucode_cpu_info *uci)
379{
380 unsigned int val[2];
381 u8 x86, x86_model;
382 struct cpu_signature csig;
383 unsigned int eax, ebx, ecx, edx;
384
385 csig.sig = 0;
386 csig.pf = 0;
387 csig.rev = 0;
388
389 memset(uci, 0, sizeof(*uci));
390
391 eax = 0x00000001;
392 ecx = 0;
393 native_cpuid(&eax, &ebx, &ecx, &edx);
394 csig.sig = eax;
395
396 x86 = get_x86_family(csig.sig);
397 x86_model = get_x86_model(csig.sig);
398
399 if ((x86_model >= 5) || (x86 > 6)) {
400 /* get processor flags from MSR 0x17 */
401 native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
402 csig.pf = 1 << ((val[1] >> 18) & 7);
403 }
404 native_wrmsr(MSR_IA32_UCODE_REV, 0, 0);
405
406 /* As documented in the SDM: Do a CPUID 1 here */
407 sync_core();
408
409 /* get the current revision from MSR 0x8B */
410 native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
411
412 csig.rev = val[1];
413
414 uci->cpu_sig = csig;
415 uci->valid = 1;
416
417 return 0;
418}
419
420#ifdef DEBUG
421static void __ref show_saved_mc(void)
422{
423 int i, j;
424 unsigned int sig, pf, rev, total_size, data_size, date;
425 struct ucode_cpu_info uci;
426
427 if (mc_saved_data.mc_saved_count == 0) {
428 pr_debug("no microcode data saved.\n");
429 return;
430 }
431 pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count);
432
433 collect_cpu_info_early(&uci);
434
435 sig = uci.cpu_sig.sig;
436 pf = uci.cpu_sig.pf;
437 rev = uci.cpu_sig.rev;
438 pr_debug("CPU%d: sig=0x%x, pf=0x%x, rev=0x%x\n",
439 smp_processor_id(), sig, pf, rev);
440
441 for (i = 0; i < mc_saved_data.mc_saved_count; i++) {
442 struct microcode_header_intel *mc_saved_header;
443 struct extended_sigtable *ext_header;
444 int ext_sigcount;
445 struct extended_signature *ext_sig;
446
447 mc_saved_header = (struct microcode_header_intel *)
448 mc_saved_data.mc_saved[i];
449 sig = mc_saved_header->sig;
450 pf = mc_saved_header->pf;
451 rev = mc_saved_header->rev;
452 total_size = get_totalsize(mc_saved_header);
453 data_size = get_datasize(mc_saved_header);
454 date = mc_saved_header->date;
455
456 pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, total size=0x%x, date = %04x-%02x-%02x\n",
457 i, sig, pf, rev, total_size,
458 date & 0xffff,
459 date >> 24,
460 (date >> 16) & 0xff);
461
462 /* Look for ext. headers: */
463 if (total_size <= data_size + MC_HEADER_SIZE)
464 continue;
465
466 ext_header = (struct extended_sigtable *)
467 mc_saved_header + data_size + MC_HEADER_SIZE;
468 ext_sigcount = ext_header->count;
469 ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
470
471 for (j = 0; j < ext_sigcount; j++) {
472 sig = ext_sig->sig;
473 pf = ext_sig->pf;
474
475 pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n",
476 j, sig, pf);
477
478 ext_sig++;
479 }
480
481 }
482}
483#else
484static inline void show_saved_mc(void)
485{
486}
487#endif
488
489#if defined(CONFIG_MICROCODE_INTEL_EARLY) && defined(CONFIG_HOTPLUG_CPU)
490/*
491 * Save this mc into mc_saved_data so that it will be loaded early when a CPU
492 * is hot-added or resumes.
493 *
494 * Make sure this mc is a valid microcode patch before calling this
495 * function.
496 */
497int save_mc_for_early(u8 *mc)
498{
499 struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
500 unsigned int mc_saved_count_init;
501 unsigned int mc_saved_count;
502 struct microcode_intel **mc_saved;
503 int ret = 0;
504 int i;
505
506 /*
507 * Hold hotplug lock so mc_saved_data is not accessed by a CPU in
508 * hotplug.
509 */
510 cpu_hotplug_driver_lock();
511
512 mc_saved_count_init = mc_saved_data.mc_saved_count;
513 mc_saved_count = mc_saved_data.mc_saved_count;
514 mc_saved = mc_saved_data.mc_saved;
515
516 if (mc_saved && mc_saved_count)
517 memcpy(mc_saved_tmp, mc_saved,
518 mc_saved_count * sizeof(struct microcode_intel *));
519 /*
520 * Save the microcode patch mc in the mc_saved_tmp array if it's a newer
521 * version.
522 */
523
524 _save_mc(mc_saved_tmp, mc, &mc_saved_count);
525
526 /*
527 * Save mc_saved_tmp in the global mc_saved_data.
528 */
529 ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count);
530 if (ret) {
531 pr_err("Cannot save microcode patch.\n");
532 goto out;
533 }
534
535 show_saved_mc();
536
537 /*
538 * Free old saved microcode data.
539 */
540 if (mc_saved) {
541 for (i = 0; i < mc_saved_count_init; i++)
542 kfree(mc_saved[i]);
543 kfree(mc_saved);
544 }
545
546out:
547 cpu_hotplug_driver_unlock();
548
549 return ret;
550}
551EXPORT_SYMBOL_GPL(save_mc_for_early);
552#endif
553
554static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
555static __init enum ucode_state
556scan_microcode(unsigned long start, unsigned long end,
557 struct mc_saved_data *mc_saved_data,
558 unsigned long *mc_saved_in_initrd,
559 struct ucode_cpu_info *uci)
560{
561 unsigned int size = end - start + 1;
562 struct cpio_data cd;
563 long offset = 0;
564#ifdef CONFIG_X86_32
565 char *p = (char *)__pa_symbol(ucode_name);
566#else
567 char *p = ucode_name;
568#endif
569
570 cd.data = NULL;
571 cd.size = 0;
572
573 cd = find_cpio_data(p, (void *)start, size, &offset);
574 if (!cd.data)
575 return UCODE_ERROR;
576
577
578 return get_matching_model_microcode(0, start, cd.data, cd.size,
579 mc_saved_data, mc_saved_in_initrd,
580 uci);
581}
582
583/*
584 * Print ucode update info.
585 */
586static void __cpuinit
587print_ucode_info(struct ucode_cpu_info *uci, unsigned int date)
588{
589 int cpu = smp_processor_id();
590
591 pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n",
592 cpu,
593 uci->cpu_sig.rev,
594 date & 0xffff,
595 date >> 24,
596 (date >> 16) & 0xff);
597}
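
Worked example for the date decoding above (not part of the patch): the header stores the date as packed hex, month in bits 31:24, day in bits 23:16 and year in bits 15:0.

/*
 * With date = 0x04112012 the pr_info() above prints
 * "... date = 2012-04-11", i.e. year-month-day.
 */
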
598
599#ifdef CONFIG_X86_32
600
601static int delay_ucode_info;
602static int current_mc_date;
603
604/*
605 * Print early updated ucode info after printk works. This is a delayed info dump.
606 */
607void __cpuinit show_ucode_info_early(void)
608{
609 struct ucode_cpu_info uci;
610
611 if (delay_ucode_info) {
612 collect_cpu_info_early(&uci);
613 print_ucode_info(&uci, current_mc_date);
614 delay_ucode_info = 0;
615 }
616}
617
618/*
619 * At this point, we cannot call printk() yet. Keep microcode patch number in
620 * mc_saved_data.mc_saved and delay printing microcode info in
621 * show_ucode_info_early() until printk() works.
622 */
623static void __cpuinit print_ucode(struct ucode_cpu_info *uci)
624{
625 struct microcode_intel *mc_intel;
626 int *delay_ucode_info_p;
627 int *current_mc_date_p;
628
629 mc_intel = uci->mc;
630 if (mc_intel == NULL)
631 return;
632
633 delay_ucode_info_p = (int *)__pa_symbol(&delay_ucode_info);
634 current_mc_date_p = (int *)__pa_symbol(&current_mc_date);
635
636 *delay_ucode_info_p = 1;
637 *current_mc_date_p = mc_intel->hdr.date;
638}
639#else
640
641/*
642 * Flush global tlb. We only do this in x86_64 where paging has been enabled
643 * already and PGE should be enabled as well.
644 */
645static inline void __cpuinit flush_tlb_early(void)
646{
647 __native_flush_tlb_global_irq_disabled();
648}
649
650static inline void __cpuinit print_ucode(struct ucode_cpu_info *uci)
651{
652 struct microcode_intel *mc_intel;
653
654 mc_intel = uci->mc;
655 if (mc_intel == NULL)
656 return;
657
658 print_ucode_info(uci, mc_intel->hdr.date);
659}
660#endif
661
662static int apply_microcode_early(struct mc_saved_data *mc_saved_data,
663 struct ucode_cpu_info *uci)
664{
665 struct microcode_intel *mc_intel;
666 unsigned int val[2];
667
668 mc_intel = uci->mc;
669 if (mc_intel == NULL)
670 return 0;
671
672 /* write microcode via MSR 0x79 */
673 native_wrmsr(MSR_IA32_UCODE_WRITE,
674 (unsigned long) mc_intel->bits,
675 (unsigned long) mc_intel->bits >> 16 >> 16);
676 native_wrmsr(MSR_IA32_UCODE_REV, 0, 0);
677
678 /* As documented in the SDM: Do a CPUID 1 here */
679 sync_core();
680
681 /* get the current revision from MSR 0x8B */
682 native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
683 if (val[1] != mc_intel->hdr.rev)
684 return -1;
685
686#ifdef CONFIG_X86_64
687 /* Flush the global TLB. This is a precaution. */
688 flush_tlb_early();
689#endif
690 uci->cpu_sig.rev = val[1];
691
692 print_ucode(uci);
693
694 return 0;
695}
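
One note on the MSR write above (an editorial illustration, not part of the patch): the update address is passed as a 64-bit value split across EAX (low half) and EDX (high half).

/*
 * "(unsigned long)mc_intel->bits >> 16 >> 16" is the usual kernel idiom
 * for the high half: on 64-bit it yields bits 63:32 of the linear
 * address, while on 32-bit, where a single shift by 32 would be
 * undefined, it simply evaluates to 0.
 */
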
696
697/*
698 * This function converts microcode patch offsets previously stored in
699 * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data.
700 */
701int __init save_microcode_in_initrd(void)
702{
703 unsigned int count = mc_saved_data.mc_saved_count;
704 struct microcode_intel *mc_saved[MAX_UCODE_COUNT];
705 int ret = 0;
706
707 if (count == 0)
708 return ret;
709
710 microcode_pointer(mc_saved, mc_saved_in_initrd, initrd_start, count);
711 ret = save_microcode(&mc_saved_data, mc_saved, count);
712 if (ret)
713 pr_err("Cannot save microcode patches from initrd\n");
714
715 show_saved_mc();
716
717 return ret;
718}
719
720static void __init
721_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data,
722 unsigned long *mc_saved_in_initrd,
723 unsigned long initrd_start_early,
724 unsigned long initrd_end_early,
725 struct ucode_cpu_info *uci)
726{
727 collect_cpu_info_early(uci);
728 scan_microcode(initrd_start_early, initrd_end_early, mc_saved_data,
729 mc_saved_in_initrd, uci);
730 load_microcode(mc_saved_data, mc_saved_in_initrd,
731 initrd_start_early, uci);
732 apply_microcode_early(mc_saved_data, uci);
733}
734
735void __init
736load_ucode_intel_bsp(void)
737{
738 u64 ramdisk_image, ramdisk_size;
739 unsigned long initrd_start_early, initrd_end_early;
740 struct ucode_cpu_info uci;
741#ifdef CONFIG_X86_32
742 struct boot_params *boot_params_p;
743
744 boot_params_p = (struct boot_params *)__pa_symbol(&boot_params);
745 ramdisk_image = boot_params_p->hdr.ramdisk_image;
746 ramdisk_size = boot_params_p->hdr.ramdisk_size;
747 initrd_start_early = ramdisk_image;
748 initrd_end_early = initrd_start_early + ramdisk_size;
749
750 _load_ucode_intel_bsp(
751 (struct mc_saved_data *)__pa_symbol(&mc_saved_data),
752 (unsigned long *)__pa_symbol(&mc_saved_in_initrd),
753 initrd_start_early, initrd_end_early, &uci);
754#else
755 ramdisk_image = boot_params.hdr.ramdisk_image;
756 ramdisk_size = boot_params.hdr.ramdisk_size;
757 initrd_start_early = ramdisk_image + PAGE_OFFSET;
758 initrd_end_early = initrd_start_early + ramdisk_size;
759
760 _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd,
761 initrd_start_early, initrd_end_early, &uci);
762#endif
763}
764
765void __cpuinit load_ucode_intel_ap(void)
766{
767 struct mc_saved_data *mc_saved_data_p;
768 struct ucode_cpu_info uci;
769 unsigned long *mc_saved_in_initrd_p;
770 unsigned long initrd_start_addr;
771#ifdef CONFIG_X86_32
772 unsigned long *initrd_start_p;
773
774 mc_saved_in_initrd_p =
775 (unsigned long *)__pa_symbol(mc_saved_in_initrd);
776 mc_saved_data_p = (struct mc_saved_data *)__pa_symbol(&mc_saved_data);
777 initrd_start_p = (unsigned long *)__pa_symbol(&initrd_start);
778 initrd_start_addr = (unsigned long)__pa_symbol(*initrd_start_p);
779#else
780 mc_saved_data_p = &mc_saved_data;
781 mc_saved_in_initrd_p = mc_saved_in_initrd;
782 initrd_start_addr = initrd_start;
783#endif
784
785 /*
786 * If there is no valid ucode previously saved in memory, no need to
787 * update ucode on this AP.
788 */
789 if (mc_saved_data_p->mc_saved_count == 0)
790 return;
791
792 collect_cpu_info_early(&uci);
793 load_microcode(mc_saved_data_p, mc_saved_in_initrd_p,
794 initrd_start_addr, &uci);
795 apply_microcode_early(mc_saved_data_p, &uci);
796}
diff --git a/arch/x86/kernel/microcode_intel_lib.c b/arch/x86/kernel/microcode_intel_lib.c
new file mode 100644
index 000000000000..ce69320d0179
--- /dev/null
+++ b/arch/x86/kernel/microcode_intel_lib.c
@@ -0,0 +1,174 @@
1/*
2 * Intel CPU Microcode Update Driver for Linux
3 *
4 * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
5 * H Peter Anvin <hpa@zytor.com>
6 *
7 * This driver allows upgrading of microcode on Intel processors
8 * belonging to the IA-32 family - PentiumPro, Pentium II,
9 * Pentium III, Xeon, Pentium 4, etc.
10 *
11 * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
12 * Software Developer's Manual
13 * Order Number 253668 or free download from:
14 *
15 * http://developer.intel.com/Assets/PDF/manual/253668.pdf
16 *
17 * For more information, go to http://www.urbanmyth.org/microcode
18 *
19 * This program is free software; you can redistribute it and/or
20 * modify it under the terms of the GNU General Public License
21 * as published by the Free Software Foundation; either version
22 * 2 of the License, or (at your option) any later version.
23 *
24 */
25#include <linux/firmware.h>
26#include <linux/uaccess.h>
27#include <linux/kernel.h>
28#include <linux/module.h>
29
30#include <asm/microcode_intel.h>
31#include <asm/processor.h>
32#include <asm/msr.h>
33
34static inline int
35update_match_cpu(unsigned int csig, unsigned int cpf,
36 unsigned int sig, unsigned int pf)
37{
38 return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1;
39}
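
A worked example of the sigmatch()-based test above (not part of the patch): pf is a bitmask of platform ids (the CPU side sets csig.pf to 1 << platform_id from MSR_IA32_PLATFORM_ID), so a patch matches when the signatures are equal and the two masks share at least one bit, or both are zero.

/*
 *   csig = 0x000306a9, cpf = 0x10   (CPU reports platform id 4)
 *   sig  = 0x000306a9, pf  = 0x12   (patch valid for platforms 1 and 4)
 *   -> update_match_cpu() returns 1 because 0x12 & 0x10 != 0.
 */
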
40
41int
42update_match_revision(struct microcode_header_intel *mc_header, int rev)
43{
44 return (mc_header->rev <= rev) ? 0 : 1;
45}
46
47int microcode_sanity_check(void *mc, int print_err)
48{
49 unsigned long total_size, data_size, ext_table_size;
50 struct microcode_header_intel *mc_header = mc;
51 struct extended_sigtable *ext_header = NULL;
52 int sum, orig_sum, ext_sigcount = 0, i;
53 struct extended_signature *ext_sig;
54
55 total_size = get_totalsize(mc_header);
56 data_size = get_datasize(mc_header);
57
58 if (data_size + MC_HEADER_SIZE > total_size) {
59 if (print_err)
60 pr_err("error! Bad data size in microcode data file\n");
61 return -EINVAL;
62 }
63
64 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
65 if (print_err)
66 pr_err("error! Unknown microcode update format\n");
67 return -EINVAL;
68 }
69 ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
70 if (ext_table_size) {
71 if ((ext_table_size < EXT_HEADER_SIZE)
72 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
73 if (print_err)
74 pr_err("error! Small exttable size in microcode data file\n");
75 return -EINVAL;
76 }
77 ext_header = mc + MC_HEADER_SIZE + data_size;
78 if (ext_table_size != exttable_size(ext_header)) {
79 if (print_err)
80 pr_err("error! Bad exttable size in microcode data file\n");
81 return -EFAULT;
82 }
83 ext_sigcount = ext_header->count;
84 }
85
86 /* check extended table checksum */
87 if (ext_table_size) {
88 int ext_table_sum = 0;
89 int *ext_tablep = (int *)ext_header;
90
91 i = ext_table_size / DWSIZE;
92 while (i--)
93 ext_table_sum += ext_tablep[i];
94 if (ext_table_sum) {
95 if (print_err)
96 pr_warn("aborting, bad extended signature table checksum\n");
97 return -EINVAL;
98 }
99 }
100
101 /* calculate the checksum */
102 orig_sum = 0;
103 i = (MC_HEADER_SIZE + data_size) / DWSIZE;
104 while (i--)
105 orig_sum += ((int *)mc)[i];
106 if (orig_sum) {
107 if (print_err)
108 pr_err("aborting, bad checksum\n");
109 return -EINVAL;
110 }
111 if (!ext_table_size)
112 return 0;
113 /* check extended signature checksum */
114 for (i = 0; i < ext_sigcount; i++) {
115 ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
116 EXT_SIGNATURE_SIZE * i;
117 sum = orig_sum
118 - (mc_header->sig + mc_header->pf + mc_header->cksum)
119 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
120 if (sum) {
121 if (print_err)
122 pr_err("aborting, bad checksum\n");
123 return -EINVAL;
124 }
125 }
126 return 0;
127}
128EXPORT_SYMBOL_GPL(microcode_sanity_check);
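
The checksum scheme that microcode_sanity_check() relies on can be sketched outside the kernel. This is a minimal stand-alone illustration, assuming only what the loop above already checks: the update's cksum field is chosen so that the 32-bit wrap-around sum of the header plus data area is zero (mc_checksum_ok() is a hypothetical helper, not a kernel API):

#include <stdint.h>
#include <stddef.h>

/* Returns 1 when the 32-bit sum of every dword in the first 'bytes'
 * bytes of the update (header + data) wraps to zero, mirroring the
 * "calculate the checksum" loop in microcode_sanity_check(). */
int mc_checksum_ok(const void *mc, size_t bytes)
{
	const uint32_t *p = mc;
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i < bytes / sizeof(*p); i++)
		sum += p[i];	/* unsigned overflow wraps, as intended */

	return sum == 0;
}
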
129
130/*
131 * return 0 - no update found
132 * return 1 - found update
133 */
134int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev)
135{
136 struct microcode_header_intel *mc_header = mc;
137 struct extended_sigtable *ext_header;
138 unsigned long total_size = get_totalsize(mc_header);
139 int ext_sigcount, i;
140 struct extended_signature *ext_sig;
141
142 if (update_match_cpu(csig, cpf, mc_header->sig, mc_header->pf))
143 return 1;
144
145 /* Look for ext. headers: */
146 if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
147 return 0;
148
149 ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
150 ext_sigcount = ext_header->count;
151 ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
152
153 for (i = 0; i < ext_sigcount; i++) {
154 if (update_match_cpu(csig, cpf, ext_sig->sig, ext_sig->pf))
155 return 1;
156 ext_sig++;
157 }
158 return 0;
159}
160
161/*
162 * return 0 - no update found
163 * return 1 - found update
164 */
165int get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev)
166{
167 struct microcode_header_intel *mc_header = mc;
168
169 if (!update_match_revision(mc_header, rev))
170 return 0;
171
172 return get_matching_sig(csig, cpf, mc, rev);
173}
174EXPORT_SYMBOL_GPL(get_matching_microcode);
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index a7c5661f8496..ce130493b802 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -71,7 +71,7 @@ static ssize_t msr_read(struct file *file, char __user *buf,
71 u32 __user *tmp = (u32 __user *) buf; 71 u32 __user *tmp = (u32 __user *) buf;
72 u32 data[2]; 72 u32 data[2];
73 u32 reg = *ppos; 73 u32 reg = *ppos;
74 int cpu = iminor(file->f_path.dentry->d_inode); 74 int cpu = iminor(file_inode(file));
75 int err = 0; 75 int err = 0;
76 ssize_t bytes = 0; 76 ssize_t bytes = 0;
77 77
@@ -99,7 +99,7 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
99 const u32 __user *tmp = (const u32 __user *)buf; 99 const u32 __user *tmp = (const u32 __user *)buf;
100 u32 data[2]; 100 u32 data[2];
101 u32 reg = *ppos; 101 u32 reg = *ppos;
102 int cpu = iminor(file->f_path.dentry->d_inode); 102 int cpu = iminor(file_inode(file));
103 int err = 0; 103 int err = 0;
104 ssize_t bytes = 0; 104 ssize_t bytes = 0;
105 105
@@ -125,7 +125,7 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
125{ 125{
126 u32 __user *uregs = (u32 __user *)arg; 126 u32 __user *uregs = (u32 __user *)arg;
127 u32 regs[8]; 127 u32 regs[8];
128 int cpu = iminor(file->f_path.dentry->d_inode); 128 int cpu = iminor(file_inode(file));
129 int err; 129 int err;
130 130
131 switch (ioc) { 131 switch (ioc) {
@@ -171,10 +171,12 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
171 171
172static int msr_open(struct inode *inode, struct file *file) 172static int msr_open(struct inode *inode, struct file *file)
173{ 173{
174 unsigned int cpu; 174 unsigned int cpu = iminor(file_inode(file));
175 struct cpuinfo_x86 *c; 175 struct cpuinfo_x86 *c;
176 176
177 cpu = iminor(file->f_path.dentry->d_inode); 177 if (!capable(CAP_SYS_RAWIO))
178 return -EPERM;
179
178 if (cpu >= nr_cpu_ids || !cpu_online(cpu)) 180 if (cpu >= nr_cpu_ids || !cpu_online(cpu))
179 return -ENXIO; /* No such CPU */ 181 return -ENXIO; /* No such CPU */
180 182
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index f84f5c57de35..60308053fdb2 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -509,3 +509,4 @@ void local_touch_nmi(void)
509{ 509{
510 __this_cpu_write(last_nmi_rip, 0); 510 __this_cpu_write(last_nmi_rip, 0);
511} 511}
512EXPORT_SYMBOL_GPL(local_touch_nmi);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index de2b7ad70273..872079a67e4d 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -56,7 +56,7 @@ struct device x86_dma_fallback_dev = {
56EXPORT_SYMBOL(x86_dma_fallback_dev); 56EXPORT_SYMBOL(x86_dma_fallback_dev);
57 57
58/* Number of entries preallocated for DMA-API debugging */ 58/* Number of entries preallocated for DMA-API debugging */
59#define PREALLOC_DMA_DEBUG_ENTRIES 32768 59#define PREALLOC_DMA_DEBUG_ENTRIES 65536
60 60
61int dma_set_mask(struct device *dev, u64 mask) 61int dma_set_mask(struct device *dev, u64 mask)
62{ 62{
@@ -265,7 +265,7 @@ rootfs_initcall(pci_iommu_init);
265#ifdef CONFIG_PCI 265#ifdef CONFIG_PCI
266/* Many VIA bridges seem to corrupt data for DAC. Disable it here */ 266/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
267 267
268static __devinit void via_no_dac(struct pci_dev *dev) 268static void via_no_dac(struct pci_dev *dev)
269{ 269{
270 if (forbid_dac == 0) { 270 if (forbid_dac == 0) {
271 dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n"); 271 dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n");
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 2ed787f15bf0..14ae10031ff0 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -268,13 +268,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
268unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE; 268unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
269EXPORT_SYMBOL(boot_option_idle_override); 269EXPORT_SYMBOL(boot_option_idle_override);
270 270
271/* 271static void (*x86_idle)(void);
272 * Powermanagement idle function, if any..
273 */
274void (*pm_idle)(void);
275#ifdef CONFIG_APM_MODULE
276EXPORT_SYMBOL(pm_idle);
277#endif
278 272
279#ifndef CONFIG_SMP 273#ifndef CONFIG_SMP
280static inline void play_dead(void) 274static inline void play_dead(void)
@@ -351,7 +345,7 @@ void cpu_idle(void)
351 rcu_idle_enter(); 345 rcu_idle_enter();
352 346
353 if (cpuidle_idle_call()) 347 if (cpuidle_idle_call())
354 pm_idle(); 348 x86_idle();
355 349
356 rcu_idle_exit(); 350 rcu_idle_exit();
357 start_critical_timings(); 351 start_critical_timings();
@@ -375,7 +369,6 @@ void cpu_idle(void)
375 */ 369 */
376void default_idle(void) 370void default_idle(void)
377{ 371{
378 trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id());
379 trace_cpu_idle_rcuidle(1, smp_processor_id()); 372 trace_cpu_idle_rcuidle(1, smp_processor_id());
380 current_thread_info()->status &= ~TS_POLLING; 373 current_thread_info()->status &= ~TS_POLLING;
381 /* 374 /*
@@ -389,21 +382,22 @@ void default_idle(void)
389 else 382 else
390 local_irq_enable(); 383 local_irq_enable();
391 current_thread_info()->status |= TS_POLLING; 384 current_thread_info()->status |= TS_POLLING;
392 trace_power_end_rcuidle(smp_processor_id());
393 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); 385 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
394} 386}
395#ifdef CONFIG_APM_MODULE 387#ifdef CONFIG_APM_MODULE
396EXPORT_SYMBOL(default_idle); 388EXPORT_SYMBOL(default_idle);
397#endif 389#endif
398 390
399bool set_pm_idle_to_default(void) 391#ifdef CONFIG_XEN
392bool xen_set_default_idle(void)
400{ 393{
401 bool ret = !!pm_idle; 394 bool ret = !!x86_idle;
402 395
403 pm_idle = default_idle; 396 x86_idle = default_idle;
404 397
405 return ret; 398 return ret;
406} 399}
400#endif
407void stop_this_cpu(void *dummy) 401void stop_this_cpu(void *dummy)
408{ 402{
409 local_irq_disable(); 403 local_irq_disable();
@@ -413,31 +407,8 @@ void stop_this_cpu(void *dummy)
413 set_cpu_online(smp_processor_id(), false); 407 set_cpu_online(smp_processor_id(), false);
414 disable_local_APIC(); 408 disable_local_APIC();
415 409
416 for (;;) { 410 for (;;)
417 if (hlt_works(smp_processor_id())) 411 halt();
418 halt();
419 }
420}
421
422/* Default MONITOR/MWAIT with no hints, used for default C1 state */
423static void mwait_idle(void)
424{
425 if (!need_resched()) {
426 trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id());
427 trace_cpu_idle_rcuidle(1, smp_processor_id());
428 if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
429 clflush((void *)&current_thread_info()->flags);
430
431 __monitor((void *)&current_thread_info()->flags, 0, 0);
432 smp_mb();
433 if (!need_resched())
434 __sti_mwait(0, 0);
435 else
436 local_irq_enable();
437 trace_power_end_rcuidle(smp_processor_id());
438 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
439 } else
440 local_irq_enable();
441} 412}
442 413
443/* 414/*
@@ -447,62 +418,13 @@ static void mwait_idle(void)
447 */ 418 */
448static void poll_idle(void) 419static void poll_idle(void)
449{ 420{
450 trace_power_start_rcuidle(POWER_CSTATE, 0, smp_processor_id());
451 trace_cpu_idle_rcuidle(0, smp_processor_id()); 421 trace_cpu_idle_rcuidle(0, smp_processor_id());
452 local_irq_enable(); 422 local_irq_enable();
453 while (!need_resched()) 423 while (!need_resched())
454 cpu_relax(); 424 cpu_relax();
455 trace_power_end_rcuidle(smp_processor_id());
456 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); 425 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
457} 426}
458 427
459/*
460 * mwait selection logic:
461 *
462 * It depends on the CPU. For AMD CPUs that support MWAIT this is
463 * wrong. Family 0x10 and 0x11 CPUs will enter C1 on HLT. Powersavings
464 * then depend on a clock divisor and current Pstate of the core. If
465 * all cores of a processor are in halt state (C1) the processor can
466 * enter the C1E (C1 enhanced) state. If mwait is used this will never
467 * happen.
468 *
469 * idle=mwait overrides this decision and forces the usage of mwait.
470 */
471
472#define MWAIT_INFO 0x05
473#define MWAIT_ECX_EXTENDED_INFO 0x01
474#define MWAIT_EDX_C1 0xf0
475
476int mwait_usable(const struct cpuinfo_x86 *c)
477{
478 u32 eax, ebx, ecx, edx;
479
480 /* Use mwait if idle=mwait boot option is given */
481 if (boot_option_idle_override == IDLE_FORCE_MWAIT)
482 return 1;
483
484 /*
485 * Any idle= boot option other than idle=mwait means that we must not
486 * use mwait. Eg: idle=halt or idle=poll or idle=nomwait
487 */
488 if (boot_option_idle_override != IDLE_NO_OVERRIDE)
489 return 0;
490
491 if (c->cpuid_level < MWAIT_INFO)
492 return 0;
493
494 cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx);
495 /* Check, whether EDX has extended info about MWAIT */
496 if (!(ecx & MWAIT_ECX_EXTENDED_INFO))
497 return 1;
498
499 /*
500 * edx enumeratios MONITOR/MWAIT extensions. Check, whether
501 * C1 supports MWAIT
502 */
503 return (edx & MWAIT_EDX_C1);
504}
505
506bool amd_e400_c1e_detected; 428bool amd_e400_c1e_detected;
507EXPORT_SYMBOL(amd_e400_c1e_detected); 429EXPORT_SYMBOL(amd_e400_c1e_detected);
508 430
@@ -567,31 +489,24 @@ static void amd_e400_idle(void)
567void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) 489void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
568{ 490{
569#ifdef CONFIG_SMP 491#ifdef CONFIG_SMP
570 if (pm_idle == poll_idle && smp_num_siblings > 1) { 492 if (x86_idle == poll_idle && smp_num_siblings > 1)
571 pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n"); 493 pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
572 }
573#endif 494#endif
574 if (pm_idle) 495 if (x86_idle)
575 return; 496 return;
576 497
577 if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) { 498 if (cpu_has_amd_erratum(amd_erratum_400)) {
578 /*
579 * One CPU supports mwait => All CPUs supports mwait
580 */
581 pr_info("using mwait in idle threads\n");
582 pm_idle = mwait_idle;
583 } else if (cpu_has_amd_erratum(amd_erratum_400)) {
584 /* E400: APIC timer interrupt does not wake up CPU from C1e */ 499 /* E400: APIC timer interrupt does not wake up CPU from C1e */
585 pr_info("using AMD E400 aware idle routine\n"); 500 pr_info("using AMD E400 aware idle routine\n");
586 pm_idle = amd_e400_idle; 501 x86_idle = amd_e400_idle;
587 } else 502 } else
588 pm_idle = default_idle; 503 x86_idle = default_idle;
589} 504}
590 505
591void __init init_amd_e400_c1e_mask(void) 506void __init init_amd_e400_c1e_mask(void)
592{ 507{
593 /* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */ 508 /* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
594 if (pm_idle == amd_e400_idle) 509 if (x86_idle == amd_e400_idle)
595 zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL); 510 zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);
596} 511}
597 512
@@ -602,11 +517,8 @@ static int __init idle_setup(char *str)
602 517
603 if (!strcmp(str, "poll")) { 518 if (!strcmp(str, "poll")) {
604 pr_info("using polling idle threads\n"); 519 pr_info("using polling idle threads\n");
605 pm_idle = poll_idle; 520 x86_idle = poll_idle;
606 boot_option_idle_override = IDLE_POLL; 521 boot_option_idle_override = IDLE_POLL;
607 } else if (!strcmp(str, "mwait")) {
608 boot_option_idle_override = IDLE_FORCE_MWAIT;
609 WARN_ONCE(1, "\"idle=mwait\" will be removed in 2012\n");
610 } else if (!strcmp(str, "halt")) { 522 } else if (!strcmp(str, "halt")) {
611 /* 523 /*
612 * When the boot option of idle=halt is added, halt is 524 * When the boot option of idle=halt is added, halt is
@@ -615,7 +527,7 @@ static int __init idle_setup(char *str)
615 * To continue to load the CPU idle driver, don't touch 527 * To continue to load the CPU idle driver, don't touch
616 * the boot_option_idle_override. 528 * the boot_option_idle_override.
617 */ 529 */
618 pm_idle = default_idle; 530 x86_idle = default_idle;
619 boot_option_idle_override = IDLE_HALT; 531 boot_option_idle_override = IDLE_HALT;
620 } else if (!strcmp(str, "nomwait")) { 532 } else if (!strcmp(str, "nomwait")) {
621 /* 533 /*
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 6e68a6194965..0f49677da51e 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -117,7 +117,7 @@ void release_thread(struct task_struct *dead_task)
117{ 117{
118 if (dead_task->mm) { 118 if (dead_task->mm) {
119 if (dead_task->mm->context.size) { 119 if (dead_task->mm->context.size) {
120 pr_warn("WARNING: dead process %8s still has LDT? <%p/%d>\n", 120 pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
121 dead_task->comm, 121 dead_task->comm,
122 dead_task->mm->context.ldt, 122 dead_task->mm->context.ldt,
123 dead_task->mm->context.size); 123 dead_task->mm->context.size);
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index b629bbe0d9bd..29a8120e6fe8 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -22,7 +22,7 @@
22#include <linux/perf_event.h> 22#include <linux/perf_event.h>
23#include <linux/hw_breakpoint.h> 23#include <linux/hw_breakpoint.h>
24#include <linux/rcupdate.h> 24#include <linux/rcupdate.h>
25#include <linux/module.h> 25#include <linux/export.h>
26#include <linux/context_tracking.h> 26#include <linux/context_tracking.h>
27 27
28#include <asm/uaccess.h> 28#include <asm/uaccess.h>
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 85c39590c1a4..2cb9470ea85b 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -185,7 +185,7 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
185 185
186 for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { 186 for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
187 __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, 187 __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
188 __pa_symbol(i) + (idx*PAGE_SIZE), 188 __pa(i) + (idx*PAGE_SIZE),
189 PAGE_KERNEL_VVAR); 189 PAGE_KERNEL_VVAR);
190 } 190 }
191 191
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 1b27de563561..26ee48a33dc4 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -8,7 +8,7 @@
8 8
9#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) 9#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
10 10
11static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) 11static void quirk_intel_irqbalance(struct pci_dev *dev)
12{ 12{
13 u8 config; 13 u8 config;
14 u16 word; 14 u16 word;
@@ -512,7 +512,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
512 512
513#if defined(CONFIG_PCI) && defined(CONFIG_NUMA) 513#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
514/* Set correct numa_node information for AMD NB functions */ 514/* Set correct numa_node information for AMD NB functions */
515static void __devinit quirk_amd_nb_node(struct pci_dev *dev) 515static void quirk_amd_nb_node(struct pci_dev *dev)
516{ 516{
517 struct pci_dev *nb_ht; 517 struct pci_dev *nb_ht;
518 unsigned int devfn; 518 unsigned int devfn;
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 4e8ba39eaf0f..76fa1e9a2b39 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -584,7 +584,7 @@ static void native_machine_emergency_restart(void)
584 break; 584 break;
585 585
586 case BOOT_EFI: 586 case BOOT_EFI:
587 if (efi_enabled) 587 if (efi_enabled(EFI_RUNTIME_SERVICES))
588 efi.reset_system(reboot_mode ? 588 efi.reset_system(reboot_mode ?
589 EFI_RESET_WARM : 589 EFI_RESET_WARM :
590 EFI_RESET_COLD, 590 EFI_RESET_COLD,
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 801602b5d745..2e8f3d3b5641 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -149,7 +149,6 @@ unsigned long mach_get_cmos_time(void)
149 if (century) { 149 if (century) {
150 century = bcd2bin(century); 150 century = bcd2bin(century);
151 year += century * 100; 151 year += century * 100;
152 printk(KERN_INFO "Extended CMOS year: %d\n", century * 100);
153 } else 152 } else
154 year += CMOS_YEARS_OFFS; 153 year += CMOS_YEARS_OFFS;
155 154
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index c228322ca180..84d32855f65c 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -108,17 +108,16 @@
108#include <asm/topology.h> 108#include <asm/topology.h>
109#include <asm/apicdef.h> 109#include <asm/apicdef.h>
110#include <asm/amd_nb.h> 110#include <asm/amd_nb.h>
111#ifdef CONFIG_X86_64
112#include <asm/numa_64.h>
113#endif
114#include <asm/mce.h> 111#include <asm/mce.h>
115#include <asm/alternative.h> 112#include <asm/alternative.h>
116#include <asm/prom.h> 113#include <asm/prom.h>
117 114
118/* 115/*
119 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 116 * max_low_pfn_mapped: highest direct mapped pfn under 4GB
120 * The direct mapping extends to max_pfn_mapped, so that we can directly access 117 * max_pfn_mapped: highest direct mapped pfn over 4GB
121 * apertures, ACPI and other tables without having to play with fixmaps. 118 *
119 * The direct mapping only covers E820_RAM regions, so the ranges and gaps are
120 * represented by pfn_mapped
122 */ 121 */
123unsigned long max_low_pfn_mapped; 122unsigned long max_low_pfn_mapped;
124unsigned long max_pfn_mapped; 123unsigned long max_pfn_mapped;
@@ -276,18 +275,7 @@ void * __init extend_brk(size_t size, size_t align)
276 return ret; 275 return ret;
277} 276}
278 277
279#ifdef CONFIG_X86_64 278#ifdef CONFIG_X86_32
280static void __init init_gbpages(void)
281{
282 if (direct_gbpages && cpu_has_gbpages)
283 printk(KERN_INFO "Using GB pages for direct mapping\n");
284 else
285 direct_gbpages = 0;
286}
287#else
288static inline void init_gbpages(void)
289{
290}
291static void __init cleanup_highmap(void) 279static void __init cleanup_highmap(void)
292{ 280{
293} 281}
@@ -296,8 +284,8 @@ static void __init cleanup_highmap(void)
296static void __init reserve_brk(void) 284static void __init reserve_brk(void)
297{ 285{
298 if (_brk_end > _brk_start) 286 if (_brk_end > _brk_start)
299 memblock_reserve(__pa(_brk_start), 287 memblock_reserve(__pa_symbol(_brk_start),
300 __pa(_brk_end) - __pa(_brk_start)); 288 _brk_end - _brk_start);
301 289
302 /* Mark brk area as locked down and no longer taking any 290 /* Mark brk area as locked down and no longer taking any
303 new allocations */ 291 new allocations */
@@ -306,27 +294,43 @@ static void __init reserve_brk(void)
306 294
307#ifdef CONFIG_BLK_DEV_INITRD 295#ifdef CONFIG_BLK_DEV_INITRD
308 296
297static u64 __init get_ramdisk_image(void)
298{
299 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
300
301 ramdisk_image |= (u64)boot_params.ext_ramdisk_image << 32;
302
303 return ramdisk_image;
304}
305static u64 __init get_ramdisk_size(void)
306{
307 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
308
309 ramdisk_size |= (u64)boot_params.ext_ramdisk_size << 32;
310
311 return ramdisk_size;
312}
313
309#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) 314#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
310static void __init relocate_initrd(void) 315static void __init relocate_initrd(void)
311{ 316{
312 /* Assume only end is not page aligned */ 317 /* Assume only end is not page aligned */
313 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 318 u64 ramdisk_image = get_ramdisk_image();
314 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 319 u64 ramdisk_size = get_ramdisk_size();
315 u64 area_size = PAGE_ALIGN(ramdisk_size); 320 u64 area_size = PAGE_ALIGN(ramdisk_size);
316 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
317 u64 ramdisk_here; 321 u64 ramdisk_here;
318 unsigned long slop, clen, mapaddr; 322 unsigned long slop, clen, mapaddr;
319 char *p, *q; 323 char *p, *q;
320 324
321 /* We need to move the initrd down into lowmem */ 325 /* We need to move the initrd down into directly mapped mem */
322 ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size, 326 ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
323 PAGE_SIZE); 327 area_size, PAGE_SIZE);
324 328
325 if (!ramdisk_here) 329 if (!ramdisk_here)
326 panic("Cannot find place for new RAMDISK of size %lld\n", 330 panic("Cannot find place for new RAMDISK of size %lld\n",
327 ramdisk_size); 331 ramdisk_size);
328 332
329 /* Note: this includes all the lowmem currently occupied by 333 /* Note: this includes all the mem currently occupied by
330 the initrd, we rely on that fact to keep the data intact. */ 334 the initrd, we rely on that fact to keep the data intact. */
331 memblock_reserve(ramdisk_here, area_size); 335 memblock_reserve(ramdisk_here, area_size);
332 initrd_start = ramdisk_here + PAGE_OFFSET; 336 initrd_start = ramdisk_here + PAGE_OFFSET;
@@ -336,17 +340,7 @@ static void __init relocate_initrd(void)
336 340
337 q = (char *)initrd_start; 341 q = (char *)initrd_start;
338 342
339 /* Copy any lowmem portion of the initrd */ 343 /* Copy the initrd */
340 if (ramdisk_image < end_of_lowmem) {
341 clen = end_of_lowmem - ramdisk_image;
342 p = (char *)__va(ramdisk_image);
343 memcpy(q, p, clen);
344 q += clen;
345 ramdisk_image += clen;
346 ramdisk_size -= clen;
347 }
348
349 /* Copy the highmem portion of the initrd */
350 while (ramdisk_size) { 344 while (ramdisk_size) {
351 slop = ramdisk_image & ~PAGE_MASK; 345 slop = ramdisk_image & ~PAGE_MASK;
352 clen = ramdisk_size; 346 clen = ramdisk_size;
@@ -360,22 +354,35 @@ static void __init relocate_initrd(void)
360 ramdisk_image += clen; 354 ramdisk_image += clen;
361 ramdisk_size -= clen; 355 ramdisk_size -= clen;
362 } 356 }
363 /* high pages is not converted by early_res_to_bootmem */ 357
364 ramdisk_image = boot_params.hdr.ramdisk_image; 358 ramdisk_image = get_ramdisk_image();
365 ramdisk_size = boot_params.hdr.ramdisk_size; 359 ramdisk_size = get_ramdisk_size();
366 printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" 360 printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
367 " [mem %#010llx-%#010llx]\n", 361 " [mem %#010llx-%#010llx]\n",
368 ramdisk_image, ramdisk_image + ramdisk_size - 1, 362 ramdisk_image, ramdisk_image + ramdisk_size - 1,
369 ramdisk_here, ramdisk_here + ramdisk_size - 1); 363 ramdisk_here, ramdisk_here + ramdisk_size - 1);
370} 364}
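
With the lowmem/highmem split gone, relocate_initrd() copies the whole image in page-sized chunks, carving off the sub-page offset ("slop") of the source address first. Below is a hedged sketch of the same chunking arithmetic over ordinary buffers; there is no early_memremap() here and the constants are chosen arbitrarily.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define PAGE_SIZE     4096UL
#define PAGE_MASK     (~(PAGE_SIZE - 1))
#define MAX_MAP_CHUNK (16 * PAGE_SIZE)   /* stand-in for NR_FIX_BTMAPS << PAGE_SHIFT */

/* Copy 'size' bytes starting at source offset 'src', chunked so that no
 * single copy exceeds MAX_MAP_CHUNK minus the first page's slop. */
static void chunked_copy(char *dst, const char *base, uint64_t src, uint64_t size)
{
        while (size) {
                uint64_t slop = src & ~PAGE_MASK;   /* offset within first page */
                uint64_t clen = size;

                if (clen > MAX_MAP_CHUNK - slop)
                        clen = MAX_MAP_CHUNK - slop;
                memcpy(dst, base + src, clen);
                dst  += clen;
                src  += clen;
                size -= clen;
        }
}

int main(void)
{
        static char src[200000], dst[200000];

        for (size_t i = 0; i < sizeof(src); i++)
                src[i] = (char)i;
        chunked_copy(dst, src, 100, sizeof(src) - 100);
        printf("copy %s\n",
               memcmp(dst, src + 100, sizeof(src) - 100) ? "differs" : "matches");
        return 0;
}
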
371 365
366static void __init early_reserve_initrd(void)
367{
368 /* Assume only end is not page aligned */
369 u64 ramdisk_image = get_ramdisk_image();
370 u64 ramdisk_size = get_ramdisk_size();
371 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
372
373 if (!boot_params.hdr.type_of_loader ||
374 !ramdisk_image || !ramdisk_size)
375 return; /* No initrd provided by bootloader */
376
377 memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
378}
372static void __init reserve_initrd(void) 379static void __init reserve_initrd(void)
373{ 380{
374 /* Assume only end is not page aligned */ 381 /* Assume only end is not page aligned */
375 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 382 u64 ramdisk_image = get_ramdisk_image();
376 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 383 u64 ramdisk_size = get_ramdisk_size();
377 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); 384 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
378 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; 385 u64 mapped_size;
379 386
380 if (!boot_params.hdr.type_of_loader || 387 if (!boot_params.hdr.type_of_loader ||
381 !ramdisk_image || !ramdisk_size) 388 !ramdisk_image || !ramdisk_size)
@@ -383,22 +390,18 @@ static void __init reserve_initrd(void)
383 390
384 initrd_start = 0; 391 initrd_start = 0;
385 392
386 if (ramdisk_size >= (end_of_lowmem>>1)) { 393 mapped_size = memblock_mem_size(max_pfn_mapped);
394 if (ramdisk_size >= (mapped_size>>1))
387 panic("initrd too large to handle, " 395 panic("initrd too large to handle, "
388 "disabling initrd (%lld needed, %lld available)\n", 396 "disabling initrd (%lld needed, %lld available)\n",
389 ramdisk_size, end_of_lowmem>>1); 397 ramdisk_size, mapped_size>>1);
390 }
391 398
392 printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image, 399 printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image,
393 ramdisk_end - 1); 400 ramdisk_end - 1);
394 401
395 402 if (pfn_range_is_mapped(PFN_DOWN(ramdisk_image),
396 if (ramdisk_end <= end_of_lowmem) { 403 PFN_DOWN(ramdisk_end))) {
397 /* All in lowmem, easy case */ 404 /* All are mapped, easy case */
398 /*
399 * don't need to reserve again, already reserved early
400 * in i386_start_kernel
401 */
402 initrd_start = ramdisk_image + PAGE_OFFSET; 405 initrd_start = ramdisk_image + PAGE_OFFSET;
403 initrd_end = initrd_start + ramdisk_size; 406 initrd_end = initrd_start + ramdisk_size;
404 return; 407 return;
@@ -409,6 +412,9 @@ static void __init reserve_initrd(void)
409 memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); 412 memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
410} 413}
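
reserve_initrd() now sizes its "too large" check against memblock_mem_size(max_pfn_mapped) instead of the old lowmem limit. A rough sketch of the page alignment and the half-of-available guard follows; PAGE_ALIGN is redefined locally as an assumption about the kernel macro.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE     4096ULL
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

/* Mirrors the "initrd too large" guard: refuse anything that would eat
 * more than half of the currently usable memory. */
static int initrd_fits(uint64_t ramdisk_size, uint64_t mapped_size)
{
        return ramdisk_size < (mapped_size >> 1);
}

int main(void)
{
        uint64_t image = 0x37000123, size = 0x800000;
        uint64_t end = PAGE_ALIGN(image + size);

        printf("reserve [%#llx-%#llx]\n",
               (unsigned long long)image, (unsigned long long)end - 1);
        printf("fits in 64 MiB of mapped memory: %s\n",
               initrd_fits(size, 64ULL << 20) ? "yes" : "no");
        return 0;
}
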
411#else 414#else
415static void __init early_reserve_initrd(void)
416{
417}
412static void __init reserve_initrd(void) 418static void __init reserve_initrd(void)
413{ 419{
414} 420}
@@ -419,8 +425,6 @@ static void __init parse_setup_data(void)
419 struct setup_data *data; 425 struct setup_data *data;
420 u64 pa_data; 426 u64 pa_data;
421 427
422 if (boot_params.hdr.version < 0x0209)
423 return;
424 pa_data = boot_params.hdr.setup_data; 428 pa_data = boot_params.hdr.setup_data;
425 while (pa_data) { 429 while (pa_data) {
426 u32 data_len, map_len; 430 u32 data_len, map_len;
@@ -456,8 +460,6 @@ static void __init e820_reserve_setup_data(void)
456 u64 pa_data; 460 u64 pa_data;
457 int found = 0; 461 int found = 0;
458 462
459 if (boot_params.hdr.version < 0x0209)
460 return;
461 pa_data = boot_params.hdr.setup_data; 463 pa_data = boot_params.hdr.setup_data;
462 while (pa_data) { 464 while (pa_data) {
463 data = early_memremap(pa_data, sizeof(*data)); 465 data = early_memremap(pa_data, sizeof(*data));
@@ -481,8 +483,6 @@ static void __init memblock_x86_reserve_range_setup_data(void)
481 struct setup_data *data; 483 struct setup_data *data;
482 u64 pa_data; 484 u64 pa_data;
483 485
484 if (boot_params.hdr.version < 0x0209)
485 return;
486 pa_data = boot_params.hdr.setup_data; 486 pa_data = boot_params.hdr.setup_data;
487 while (pa_data) { 487 while (pa_data) {
488 data = early_memremap(pa_data, sizeof(*data)); 488 data = early_memremap(pa_data, sizeof(*data));
@@ -501,17 +501,51 @@ static void __init memblock_x86_reserve_range_setup_data(void)
501/* 501/*
502 * Keep the crash kernel below this limit. On 32 bits earlier kernels 502 * Keep the crash kernel below this limit. On 32 bits earlier kernels
503 * would limit the kernel to the low 512 MiB due to mapping restrictions. 503 * would limit the kernel to the low 512 MiB due to mapping restrictions.
504 * On 64 bits, kexec-tools currently limits us to 896 MiB; increase this
505 * limit once kexec-tools are fixed.
506 */ 504 */
507#ifdef CONFIG_X86_32 505#ifdef CONFIG_X86_32
508# define CRASH_KERNEL_ADDR_MAX (512 << 20) 506# define CRASH_KERNEL_ADDR_MAX (512 << 20)
509#else 507#else
510# define CRASH_KERNEL_ADDR_MAX (896 << 20) 508# define CRASH_KERNEL_ADDR_MAX MAXMEM
511#endif 509#endif
512 510
511static void __init reserve_crashkernel_low(void)
512{
513#ifdef CONFIG_X86_64
514 const unsigned long long alignment = 16<<20; /* 16M */
515 unsigned long long low_base = 0, low_size = 0;
516 unsigned long total_low_mem;
517 unsigned long long base;
518 int ret;
519
520 total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT));
521 ret = parse_crashkernel_low(boot_command_line, total_low_mem,
522 &low_size, &base);
523 if (ret != 0 || low_size <= 0)
524 return;
525
526 low_base = memblock_find_in_range(low_size, (1ULL<<32),
527 low_size, alignment);
528
529 if (!low_base) {
530 pr_info("crashkernel low reservation failed - No suitable area found.\n");
531
532 return;
533 }
534
535 memblock_reserve(low_base, low_size);
536 pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n",
537 (unsigned long)(low_size >> 20),
538 (unsigned long)(low_base >> 20),
539 (unsigned long)(total_low_mem >> 20));
540 crashk_low_res.start = low_base;
541 crashk_low_res.end = low_base + low_size - 1;
542 insert_resource(&iomem_resource, &crashk_low_res);
543#endif
544}
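
reserve_crashkernel_low() asks memblock for a 16 MiB-aligned block that sits entirely below 4 GiB. Here is a toy stand-in for that search over a hand-made free list; top-down placement is an assumption of the sketch, not a claim about memblock's allocation policy.

#include <stdint.h>
#include <stdio.h>

#define ALIGN_DOWN(x, a) ((x) & ~((uint64_t)(a) - 1))

struct range { uint64_t start, end; };        /* a free region [start, end) */

/* Toy stand-in for memblock_find_in_range(): pick the highest aligned
 * block of 'size' bytes that fits inside [lo, hi) within a free region. */
static uint64_t find_in_range(const struct range *avail, int n,
                              uint64_t lo, uint64_t hi,
                              uint64_t size, uint64_t align)
{
        uint64_t best = 0;

        for (int i = 0; i < n; i++) {
                uint64_t start = avail[i].start > lo ? avail[i].start : lo;
                uint64_t end   = avail[i].end   < hi ? avail[i].end   : hi;
                if (end < start + size)
                        continue;
                uint64_t cand = ALIGN_DOWN(end - size, align);
                if (cand >= start && cand > best)
                        best = cand;
        }
        return best;        /* 0 means "no suitable area found" */
}

int main(void)
{
        struct range avail[] = {
                { 0x01000000,  0x40000000 },     /* 16 MiB .. 1 GiB */
                { 0x100000000, 0x140000000 },    /* above 4 GiB     */
        };
        uint64_t low_size = 72ULL << 20;         /* e.g. crashkernel_low=72M */

        /* Like reserve_crashkernel_low(): stay below 4 GiB, 16 MiB aligned. */
        uint64_t base = find_in_range(avail, 2, low_size, 1ULL << 32,
                                      low_size, 16 << 20);
        printf("crashkernel low at %#llx\n", (unsigned long long)base);
        return 0;
}
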
545
513static void __init reserve_crashkernel(void) 546static void __init reserve_crashkernel(void)
514{ 547{
548 const unsigned long long alignment = 16<<20; /* 16M */
515 unsigned long long total_mem; 549 unsigned long long total_mem;
516 unsigned long long crash_size, crash_base; 550 unsigned long long crash_size, crash_base;
517 int ret; 551 int ret;
@@ -525,8 +559,6 @@ static void __init reserve_crashkernel(void)
525 559
526 /* 0 means: find the address automatically */ 560 /* 0 means: find the address automatically */
527 if (crash_base <= 0) { 561 if (crash_base <= 0) {
528 const unsigned long long alignment = 16<<20; /* 16M */
529
530 /* 562 /*
531 * kexec wants the bzImage below CRASH_KERNEL_ADDR_MAX 563 * kexec wants the bzImage below CRASH_KERNEL_ADDR_MAX
532 */ 564 */
@@ -537,6 +569,7 @@ static void __init reserve_crashkernel(void)
537 pr_info("crashkernel reservation failed - No suitable area found.\n"); 569 pr_info("crashkernel reservation failed - No suitable area found.\n");
538 return; 570 return;
539 } 571 }
572
540 } else { 573 } else {
541 unsigned long long start; 574 unsigned long long start;
542 575
@@ -558,6 +591,9 @@ static void __init reserve_crashkernel(void)
558 crashk_res.start = crash_base; 591 crashk_res.start = crash_base;
559 crashk_res.end = crash_base + crash_size - 1; 592 crashk_res.end = crash_base + crash_size - 1;
560 insert_resource(&iomem_resource, &crashk_res); 593 insert_resource(&iomem_resource, &crashk_res);
594
595 if (crash_base >= (1ULL<<32))
596 reserve_crashkernel_low();
561} 597}
562#else 598#else
563static void __init reserve_crashkernel(void) 599static void __init reserve_crashkernel(void)
@@ -608,7 +644,82 @@ static __init void reserve_ibft_region(void)
608 memblock_reserve(addr, size); 644 memblock_reserve(addr, size);
609} 645}
610 646
611static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; 647static bool __init snb_gfx_workaround_needed(void)
648{
649#ifdef CONFIG_PCI
650 int i;
651 u16 vendor, devid;
652 static const __initconst u16 snb_ids[] = {
653 0x0102,
654 0x0112,
655 0x0122,
656 0x0106,
657 0x0116,
658 0x0126,
659 0x010a,
660 };
661
662 /* Assume no if something weird is going on with PCI */
663 if (!early_pci_allowed())
664 return false;
665
666 vendor = read_pci_config_16(0, 2, 0, PCI_VENDOR_ID);
667 if (vendor != 0x8086)
668 return false;
669
670 devid = read_pci_config_16(0, 2, 0, PCI_DEVICE_ID);
671 for (i = 0; i < ARRAY_SIZE(snb_ids); i++)
672 if (devid == snb_ids[i])
673 return true;
674#endif
675
676 return false;
677}
678
679/*
680 * Sandy Bridge graphics has trouble with certain ranges; exclude
681 * them from allocation.
682 */
683static void __init trim_snb_memory(void)
684{
685 static const __initconst unsigned long bad_pages[] = {
686 0x20050000,
687 0x20110000,
688 0x20130000,
689 0x20138000,
690 0x40004000,
691 };
692 int i;
693
694 if (!snb_gfx_workaround_needed())
695 return;
696
697 printk(KERN_DEBUG "reserving inaccessible SNB gfx pages\n");
698
699 /*
700 * Reserve all memory below the 1 MB mark that has not
701 * already been reserved.
702 */
703 memblock_reserve(0, 1<<20);
704
705 for (i = 0; i < ARRAY_SIZE(bad_pages); i++) {
706 if (memblock_reserve(bad_pages[i], PAGE_SIZE))
707 printk(KERN_WARNING "failed to reserve 0x%08lx\n",
708 bad_pages[i]);
709 }
710}
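
A userspace sketch of the two workaround helpers above: match the graphics device ID against the small table, then reserve the known-bad pages. reserve() is a stub standing in for memblock_reserve(); the IDs and page addresses are copied from the hunk.

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

#define PAGE_SIZE 4096UL

/* Device IDs the workaround applies to. */
static const uint16_t snb_ids[] = {
        0x0102, 0x0112, 0x0122, 0x0106, 0x0116, 0x0126, 0x010a,
};

static bool is_snb_gfx(uint16_t vendor, uint16_t devid)
{
        if (vendor != 0x8086)                    /* Intel only */
                return false;
        for (size_t i = 0; i < sizeof(snb_ids) / sizeof(snb_ids[0]); i++)
                if (devid == snb_ids[i])
                        return true;
        return false;
}

/* Stub standing in for memblock_reserve(); returns 0 on success. */
static int reserve(uint64_t base, uint64_t size)
{
        printf("reserving [%#llx-%#llx]\n",
               (unsigned long long)base, (unsigned long long)(base + size - 1));
        return 0;
}

int main(void)
{
        static const uint64_t bad_pages[] = {
                0x20050000, 0x20110000, 0x20130000, 0x20138000, 0x40004000,
        };

        if (!is_snb_gfx(0x8086, 0x0116))
                return 0;
        reserve(0, 1 << 20);                     /* everything below 1 MiB */
        for (size_t i = 0; i < sizeof(bad_pages) / sizeof(bad_pages[0]); i++)
                if (reserve(bad_pages[i], PAGE_SIZE))
                        fprintf(stderr, "failed to reserve %#llx\n",
                                (unsigned long long)bad_pages[i]);
        return 0;
}
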
711
712/*
713 * Here we put platform-specific memory range workarounds, i.e.
714 * memory known to be corrupt or otherwise in need of reservation on
715 * specific platforms.
716 *
717 * If this gets used more widely it could use a real dispatch mechanism.
718 */
719static void __init trim_platform_memory_ranges(void)
720{
721 trim_snb_memory();
722}
612 723
613static void __init trim_bios_range(void) 724static void __init trim_bios_range(void)
614{ 725{
@@ -621,8 +732,7 @@ static void __init trim_bios_range(void)
621 * since some BIOSes are known to corrupt low memory. See the 732 * since some BIOSes are known to corrupt low memory. See the
622 * Kconfig help text for X86_RESERVE_LOW. 733 * Kconfig help text for X86_RESERVE_LOW.
623 */ 734 */
624 e820_update_range(0, ALIGN(reserve_low, PAGE_SIZE), 735 e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED);
625 E820_RAM, E820_RESERVED);
626 736
627 /* 737 /*
628 * special case: Some BIOSen report the PC BIOS 738 * special case: Some BIOSen report the PC BIOS
@@ -630,9 +740,33 @@ static void __init trim_bios_range(void)
630 * take them out. 740 * take them out.
631 */ 741 */
632 e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1); 742 e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1);
743
633 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 744 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
634} 745}
635 746
747/* called before trim_bios_range() to spare extra sanitize */
748static void __init e820_add_kernel_range(void)
749{
750 u64 start = __pa_symbol(_text);
751 u64 size = __pa_symbol(_end) - start;
752
753 /*
754 * Complain if .text .data and .bss are not marked as E820_RAM and
755 * attempt to fix it by adding the range. We may have a confused BIOS,
756 * or the user may have used memmap=exactmap or memmap=xxM$yyM to
757 * exclude kernel range. If we really are running on top non-RAM,
758 * we will crash later anyways.
759 */
760 if (e820_all_mapped(start, start + size, E820_RAM))
761 return;
762
763 pr_warn(".text .data .bss are not marked as E820_RAM!\n");
764 e820_remove_range(start, size, E820_RAM, 0);
765 e820_add_region(start, size, E820_RAM);
766}
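
e820_add_kernel_range() only rewrites the map when the kernel image is not already fully covered by RAM entries. Below is a simplified coverage check over a sorted, non-overlapping region table, which is what sanitize_e820_map() guarantees for the real e820 map; the region values are made up.

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

struct region { uint64_t start, end; };       /* RAM region [start, end) */

/* Rough analogue of e820_all_mapped(..., E820_RAM): is [start, end)
 * fully covered by the RAM regions in the table? */
static bool all_mapped(const struct region *ram, int n,
                       uint64_t start, uint64_t end)
{
        for (int i = 0; i < n && start < end; i++) {
                if (ram[i].end <= start || ram[i].start > start)
                        continue;
                start = ram[i].end;           /* covered up to here */
        }
        return start >= end;
}

int main(void)
{
        struct region ram[] = {
                { 0x00100000, 0x01000000 },   /* 1 MiB .. 16 MiB     */
                { 0x02000000, 0x40000000 },   /* hole at 16..32 MiB  */
        };
        uint64_t text = 0x01000000, end = 0x01800000;   /* kernel image range */

        if (!all_mapped(ram, 2, text, end))
                printf(".text .data .bss not marked as RAM, re-adding [%#llx-%#llx)\n",
                       (unsigned long long)text, (unsigned long long)end);
        return 0;
}
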
767
768static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
769
636static int __init parse_reservelow(char *p) 770static int __init parse_reservelow(char *p)
637{ 771{
638 unsigned long long size; 772 unsigned long long size;
@@ -655,6 +789,11 @@ static int __init parse_reservelow(char *p)
655 789
656early_param("reservelow", parse_reservelow); 790early_param("reservelow", parse_reservelow);
657 791
792static void __init trim_low_memory_range(void)
793{
794 memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE));
795}
796
658/* 797/*
659 * Determine if we were loaded by an EFI loader. If so, then we have also been 798 * Determine if we were loaded by an EFI loader. If so, then we have also been
660 * passed the efi memmap, systab, etc., so we should use these data structures 799 * passed the efi memmap, systab, etc., so we should use these data structures
@@ -670,6 +809,17 @@ early_param("reservelow", parse_reservelow);
670 809
671void __init setup_arch(char **cmdline_p) 810void __init setup_arch(char **cmdline_p)
672{ 811{
812 memblock_reserve(__pa_symbol(_text),
813 (unsigned long)__bss_stop - (unsigned long)_text);
814
815 early_reserve_initrd();
816
817 /*
818 * At this point everything still needed from the boot loader
819 * or BIOS or kernel text should be early reserved or marked not
820 * RAM in e820. All other memory is free game.
821 */
822
673#ifdef CONFIG_X86_32 823#ifdef CONFIG_X86_32
674 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); 824 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
675 visws_early_detect(); 825 visws_early_detect();
@@ -729,15 +879,15 @@ void __init setup_arch(char **cmdline_p)
729#ifdef CONFIG_EFI 879#ifdef CONFIG_EFI
730 if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, 880 if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
731 "EL32", 4)) { 881 "EL32", 4)) {
732 efi_enabled = 1; 882 set_bit(EFI_BOOT, &x86_efi_facility);
733 efi_64bit = false;
734 } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, 883 } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
735 "EL64", 4)) { 884 "EL64", 4)) {
736 efi_enabled = 1; 885 set_bit(EFI_BOOT, &x86_efi_facility);
737 efi_64bit = true; 886 set_bit(EFI_64BIT, &x86_efi_facility);
738 } 887 }
739 if (efi_enabled && efi_memblock_x86_reserve_range()) 888
740 efi_enabled = 0; 889 if (efi_enabled(EFI_BOOT))
890 efi_memblock_x86_reserve_range();
741#endif 891#endif
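
The EFI state moves from a pair of globals (efi_enabled/efi_64bit) to a facility bitmask queried with efi_enabled(bit). A small userspace mock of that pattern follows; set_facility() is a local helper here, not the kernel's set_bit().

#include <stdio.h>
#include <string.h>

/* Facility bits replacing the old efi_enabled/efi_64bit booleans. */
enum { EFI_BOOT, EFI_64BIT, EFI_MEMMAP };

static unsigned long efi_facility;

static void set_facility(int bit) { efi_facility |= 1UL << bit; }
static int  efi_enabled(int bit)  { return !!(efi_facility & (1UL << bit)); }

int main(void)
{
        const char sig[4] = { 'E', 'L', '6', '4' };   /* loader signature */

        if (!memcmp(sig, "EL32", 4)) {
                set_facility(EFI_BOOT);
        } else if (!memcmp(sig, "EL64", 4)) {
                set_facility(EFI_BOOT);
                set_facility(EFI_64BIT);
        }

        printf("EFI boot: %d, 64-bit: %d, memmap: %d\n",
               efi_enabled(EFI_BOOT), efi_enabled(EFI_64BIT),
               efi_enabled(EFI_MEMMAP));
        return 0;
}
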
742 892
743 x86_init.oem.arch_setup(); 893 x86_init.oem.arch_setup();
@@ -757,12 +907,12 @@ void __init setup_arch(char **cmdline_p)
757 init_mm.end_data = (unsigned long) _edata; 907 init_mm.end_data = (unsigned long) _edata;
758 init_mm.brk = _brk_end; 908 init_mm.brk = _brk_end;
759 909
760 code_resource.start = virt_to_phys(_text); 910 code_resource.start = __pa_symbol(_text);
761 code_resource.end = virt_to_phys(_etext)-1; 911 code_resource.end = __pa_symbol(_etext)-1;
762 data_resource.start = virt_to_phys(_etext); 912 data_resource.start = __pa_symbol(_etext);
763 data_resource.end = virt_to_phys(_edata)-1; 913 data_resource.end = __pa_symbol(_edata)-1;
764 bss_resource.start = virt_to_phys(&__bss_start); 914 bss_resource.start = __pa_symbol(__bss_start);
765 bss_resource.end = virt_to_phys(&__bss_stop)-1; 915 bss_resource.end = __pa_symbol(__bss_stop)-1;
766 916
767#ifdef CONFIG_CMDLINE_BOOL 917#ifdef CONFIG_CMDLINE_BOOL
768#ifdef CONFIG_CMDLINE_OVERRIDE 918#ifdef CONFIG_CMDLINE_OVERRIDE
@@ -810,7 +960,7 @@ void __init setup_arch(char **cmdline_p)
810 960
811 finish_e820_parsing(); 961 finish_e820_parsing();
812 962
813 if (efi_enabled) 963 if (efi_enabled(EFI_BOOT))
814 efi_init(); 964 efi_init();
815 965
816 dmi_scan_machine(); 966 dmi_scan_machine();
@@ -828,6 +978,7 @@ void __init setup_arch(char **cmdline_p)
828 insert_resource(&iomem_resource, &data_resource); 978 insert_resource(&iomem_resource, &data_resource);
829 insert_resource(&iomem_resource, &bss_resource); 979 insert_resource(&iomem_resource, &bss_resource);
830 980
981 e820_add_kernel_range();
831 trim_bios_range(); 982 trim_bios_range();
832#ifdef CONFIG_X86_32 983#ifdef CONFIG_X86_32
833 if (ppro_with_ram_bug()) { 984 if (ppro_with_ram_bug()) {
@@ -877,6 +1028,8 @@ void __init setup_arch(char **cmdline_p)
877 1028
878 reserve_ibft_region(); 1029 reserve_ibft_region();
879 1030
1031 early_alloc_pgt_buf();
1032
880 /* 1033 /*
881 * Need to conclude brk, before memblock_x86_fill() 1034 * Need to conclude brk, before memblock_x86_fill()
882 * it could use memblock_find_in_range, could overlap with 1035 * it could use memblock_find_in_range, could overlap with
@@ -886,14 +1039,14 @@ void __init setup_arch(char **cmdline_p)
886 1039
887 cleanup_highmap(); 1040 cleanup_highmap();
888 1041
889 memblock.current_limit = get_max_mapped(); 1042 memblock.current_limit = ISA_END_ADDRESS;
890 memblock_x86_fill(); 1043 memblock_x86_fill();
891 1044
892 /* 1045 /*
893 * The EFI specification says that boot service code won't be called 1046 * The EFI specification says that boot service code won't be called
894 * after ExitBootServices(). This is, in fact, a lie. 1047 * after ExitBootServices(). This is, in fact, a lie.
895 */ 1048 */
896 if (efi_enabled) 1049 if (efi_enabled(EFI_MEMMAP))
897 efi_reserve_boot_services(); 1050 efi_reserve_boot_services();
898 1051
899 /* preallocate 4k for mptable mpc */ 1052 /* preallocate 4k for mptable mpc */
@@ -903,39 +1056,22 @@ void __init setup_arch(char **cmdline_p)
903 setup_bios_corruption_check(); 1056 setup_bios_corruption_check();
904#endif 1057#endif
905 1058
1059#ifdef CONFIG_X86_32
906 printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n", 1060 printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n",
907 (max_pfn_mapped<<PAGE_SHIFT) - 1); 1061 (max_pfn_mapped<<PAGE_SHIFT) - 1);
1062#endif
908 1063
909 setup_real_mode(); 1064 reserve_real_mode();
910
911 init_gbpages();
912
913 /* max_pfn_mapped is updated here */
914 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
915 max_pfn_mapped = max_low_pfn_mapped;
916 1065
917#ifdef CONFIG_X86_64 1066 trim_platform_memory_ranges();
918 if (max_pfn > max_low_pfn) { 1067 trim_low_memory_range();
919 int i;
920 unsigned long start, end;
921 unsigned long start_pfn, end_pfn;
922 1068
923 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, 1069 init_mem_mapping();
924 NULL) {
925 1070
926 end = PFN_PHYS(end_pfn); 1071 early_trap_pf_init();
927 if (end <= (1UL<<32))
928 continue;
929 1072
930 start = PFN_PHYS(start_pfn); 1073 setup_real_mode();
931 max_pfn_mapped = init_memory_mapping(
932 max((1UL<<32), start), end);
933 }
934 1074
935 /* can we preserve max_low_pfn? */
936 max_low_pfn = max_pfn;
937 }
938#endif
939 memblock.current_limit = get_max_mapped(); 1075 memblock.current_limit = get_max_mapped();
940 dma_contiguous_reserve(0); 1076 dma_contiguous_reserve(0);
941 1077
@@ -952,6 +1088,10 @@ void __init setup_arch(char **cmdline_p)
952 1088
953 reserve_initrd(); 1089 reserve_initrd();
954 1090
1091#if defined(CONFIG_ACPI) && defined(CONFIG_BLK_DEV_INITRD)
1092 acpi_initrd_override((void *)initrd_start, initrd_end - initrd_start);
1093#endif
1094
955 reserve_crashkernel(); 1095 reserve_crashkernel();
956 1096
957 vsmp_init(); 1097 vsmp_init();
@@ -1030,7 +1170,7 @@ void __init setup_arch(char **cmdline_p)
1030 1170
1031#ifdef CONFIG_VT 1171#ifdef CONFIG_VT
1032#if defined(CONFIG_VGA_CONSOLE) 1172#if defined(CONFIG_VGA_CONSOLE)
1033 if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY)) 1173 if (!efi_enabled(EFI_BOOT) || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
1034 conswitchp = &vga_con; 1174 conswitchp = &vga_con;
1035#elif defined(CONFIG_DUMMY_CONSOLE) 1175#elif defined(CONFIG_DUMMY_CONSOLE)
1036 conswitchp = &dummy_con; 1176 conswitchp = &dummy_con;
@@ -1047,14 +1187,13 @@ void __init setup_arch(char **cmdline_p)
1047 register_refined_jiffies(CLOCK_TICK_RATE); 1187 register_refined_jiffies(CLOCK_TICK_RATE);
1048 1188
1049#ifdef CONFIG_EFI 1189#ifdef CONFIG_EFI
1050 /* Once setup is done above, disable efi_enabled on mismatched 1190 /* Once setup is done above, unmap the EFI memory map on
1051 * firmware/kernel architectures since there is no support for 1191 * mismatched firmware/kernel architectures since there is no
1052 * runtime services. 1192 * support for runtime services.
1053 */ 1193 */
1054 if (efi_enabled && IS_ENABLED(CONFIG_X86_64) != efi_64bit) { 1194 if (efi_enabled(EFI_BOOT) && !efi_is_native()) {
1055 pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n"); 1195 pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n");
1056 efi_unmap_memmap(); 1196 efi_unmap_memmap();
1057 efi_enabled = 0;
1058 } 1197 }
1059#endif 1198#endif
1060} 1199}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index fbbb604313a2..69562992e457 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -278,7 +278,7 @@ static const struct {
278}; 278};
279 279
280static int 280static int
281__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, 281__setup_frame(int sig, struct ksignal *ksig, sigset_t *set,
282 struct pt_regs *regs) 282 struct pt_regs *regs)
283{ 283{
284 struct sigframe __user *frame; 284 struct sigframe __user *frame;
@@ -286,7 +286,7 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
286 int err = 0; 286 int err = 0;
287 void __user *fpstate = NULL; 287 void __user *fpstate = NULL;
288 288
289 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); 289 frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate);
290 290
291 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 291 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
292 return -EFAULT; 292 return -EFAULT;
@@ -307,8 +307,8 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
307 restorer = VDSO32_SYMBOL(current->mm->context.vdso, sigreturn); 307 restorer = VDSO32_SYMBOL(current->mm->context.vdso, sigreturn);
308 else 308 else
309 restorer = &frame->retcode; 309 restorer = &frame->retcode;
310 if (ka->sa.sa_flags & SA_RESTORER) 310 if (ksig->ka.sa.sa_flags & SA_RESTORER)
311 restorer = ka->sa.sa_restorer; 311 restorer = ksig->ka.sa.sa_restorer;
312 312
313 /* Set up to return from userspace. */ 313 /* Set up to return from userspace. */
314 err |= __put_user(restorer, &frame->pretcode); 314 err |= __put_user(restorer, &frame->pretcode);
@@ -327,7 +327,7 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
327 327
328 /* Set up registers for signal handler */ 328 /* Set up registers for signal handler */
329 regs->sp = (unsigned long)frame; 329 regs->sp = (unsigned long)frame;
330 regs->ip = (unsigned long)ka->sa.sa_handler; 330 regs->ip = (unsigned long)ksig->ka.sa.sa_handler;
331 regs->ax = (unsigned long)sig; 331 regs->ax = (unsigned long)sig;
332 regs->dx = 0; 332 regs->dx = 0;
333 regs->cx = 0; 333 regs->cx = 0;
@@ -340,7 +340,7 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
340 return 0; 340 return 0;
341} 341}
342 342
343static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 343static int __setup_rt_frame(int sig, struct ksignal *ksig,
344 sigset_t *set, struct pt_regs *regs) 344 sigset_t *set, struct pt_regs *regs)
345{ 345{
346 struct rt_sigframe __user *frame; 346 struct rt_sigframe __user *frame;
@@ -348,7 +348,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
348 int err = 0; 348 int err = 0;
349 void __user *fpstate = NULL; 349 void __user *fpstate = NULL;
350 350
351 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); 351 frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate);
352 352
353 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 353 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
354 return -EFAULT; 354 return -EFAULT;
@@ -364,15 +364,12 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
364 else 364 else
365 put_user_ex(0, &frame->uc.uc_flags); 365 put_user_ex(0, &frame->uc.uc_flags);
366 put_user_ex(0, &frame->uc.uc_link); 366 put_user_ex(0, &frame->uc.uc_link);
367 put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 367 err |= __save_altstack(&frame->uc.uc_stack, regs->sp);
368 put_user_ex(sas_ss_flags(regs->sp),
369 &frame->uc.uc_stack.ss_flags);
370 put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
371 368
372 /* Set up to return from userspace. */ 369 /* Set up to return from userspace. */
373 restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); 370 restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
374 if (ka->sa.sa_flags & SA_RESTORER) 371 if (ksig->ka.sa.sa_flags & SA_RESTORER)
375 restorer = ka->sa.sa_restorer; 372 restorer = ksig->ka.sa.sa_restorer;
376 put_user_ex(restorer, &frame->pretcode); 373 put_user_ex(restorer, &frame->pretcode);
377 374
378 /* 375 /*
@@ -385,7 +382,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
385 put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode); 382 put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode);
386 } put_user_catch(err); 383 } put_user_catch(err);
387 384
388 err |= copy_siginfo_to_user(&frame->info, info); 385 err |= copy_siginfo_to_user(&frame->info, &ksig->info);
389 err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, 386 err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
390 regs, set->sig[0]); 387 regs, set->sig[0]);
391 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); 388 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
@@ -395,7 +392,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
395 392
396 /* Set up registers for signal handler */ 393 /* Set up registers for signal handler */
397 regs->sp = (unsigned long)frame; 394 regs->sp = (unsigned long)frame;
398 regs->ip = (unsigned long)ka->sa.sa_handler; 395 regs->ip = (unsigned long)ksig->ka.sa.sa_handler;
399 regs->ax = (unsigned long)sig; 396 regs->ax = (unsigned long)sig;
400 regs->dx = (unsigned long)&frame->info; 397 regs->dx = (unsigned long)&frame->info;
401 regs->cx = (unsigned long)&frame->uc; 398 regs->cx = (unsigned long)&frame->uc;
@@ -408,21 +405,20 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
408 return 0; 405 return 0;
409} 406}
410#else /* !CONFIG_X86_32 */ 407#else /* !CONFIG_X86_32 */
411static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 408static int __setup_rt_frame(int sig, struct ksignal *ksig,
412 sigset_t *set, struct pt_regs *regs) 409 sigset_t *set, struct pt_regs *regs)
413{ 410{
414 struct rt_sigframe __user *frame; 411 struct rt_sigframe __user *frame;
415 void __user *fp = NULL; 412 void __user *fp = NULL;
416 int err = 0; 413 int err = 0;
417 struct task_struct *me = current;
418 414
419 frame = get_sigframe(ka, regs, sizeof(struct rt_sigframe), &fp); 415 frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp);
420 416
421 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 417 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
422 return -EFAULT; 418 return -EFAULT;
423 419
424 if (ka->sa.sa_flags & SA_SIGINFO) { 420 if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
425 if (copy_siginfo_to_user(&frame->info, info)) 421 if (copy_siginfo_to_user(&frame->info, &ksig->info))
426 return -EFAULT; 422 return -EFAULT;
427 } 423 }
428 424
@@ -433,16 +429,13 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
433 else 429 else
434 put_user_ex(0, &frame->uc.uc_flags); 430 put_user_ex(0, &frame->uc.uc_flags);
435 put_user_ex(0, &frame->uc.uc_link); 431 put_user_ex(0, &frame->uc.uc_link);
436 put_user_ex(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 432 err |= __save_altstack(&frame->uc.uc_stack, regs->sp);
437 put_user_ex(sas_ss_flags(regs->sp),
438 &frame->uc.uc_stack.ss_flags);
439 put_user_ex(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
440 433
441 /* Set up to return from userspace. If provided, use a stub 434 /* Set up to return from userspace. If provided, use a stub
442 already in userspace. */ 435 already in userspace. */
443 /* x86-64 should always use SA_RESTORER. */ 436 /* x86-64 should always use SA_RESTORER. */
444 if (ka->sa.sa_flags & SA_RESTORER) { 437 if (ksig->ka.sa.sa_flags & SA_RESTORER) {
445 put_user_ex(ka->sa.sa_restorer, &frame->pretcode); 438 put_user_ex(ksig->ka.sa.sa_restorer, &frame->pretcode);
446 } else { 439 } else {
447 /* could use a vstub here */ 440 /* could use a vstub here */
448 err |= -EFAULT; 441 err |= -EFAULT;
@@ -464,7 +457,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
464 next argument after the signal number on the stack. */ 457 next argument after the signal number on the stack. */
465 regs->si = (unsigned long)&frame->info; 458 regs->si = (unsigned long)&frame->info;
466 regs->dx = (unsigned long)&frame->uc; 459 regs->dx = (unsigned long)&frame->uc;
467 regs->ip = (unsigned long) ka->sa.sa_handler; 460 regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
468 461
469 regs->sp = (unsigned long)frame; 462 regs->sp = (unsigned long)frame;
470 463
@@ -476,8 +469,8 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
476} 469}
477#endif /* CONFIG_X86_32 */ 470#endif /* CONFIG_X86_32 */
478 471
479static int x32_setup_rt_frame(int sig, struct k_sigaction *ka, 472static int x32_setup_rt_frame(struct ksignal *ksig,
480 siginfo_t *info, compat_sigset_t *set, 473 compat_sigset_t *set,
481 struct pt_regs *regs) 474 struct pt_regs *regs)
482{ 475{
483#ifdef CONFIG_X86_X32_ABI 476#ifdef CONFIG_X86_X32_ABI
@@ -486,13 +479,13 @@ static int x32_setup_rt_frame(int sig, struct k_sigaction *ka,
486 int err = 0; 479 int err = 0;
487 void __user *fpstate = NULL; 480 void __user *fpstate = NULL;
488 481
489 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); 482 frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate);
490 483
491 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 484 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
492 return -EFAULT; 485 return -EFAULT;
493 486
494 if (ka->sa.sa_flags & SA_SIGINFO) { 487 if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
495 if (copy_siginfo_to_user32(&frame->info, info)) 488 if (copy_siginfo_to_user32(&frame->info, &ksig->info))
496 return -EFAULT; 489 return -EFAULT;
497 } 490 }
498 491
@@ -503,14 +496,11 @@ static int x32_setup_rt_frame(int sig, struct k_sigaction *ka,
503 else 496 else
504 put_user_ex(0, &frame->uc.uc_flags); 497 put_user_ex(0, &frame->uc.uc_flags);
505 put_user_ex(0, &frame->uc.uc_link); 498 put_user_ex(0, &frame->uc.uc_link);
506 put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 499 err |= __compat_save_altstack(&frame->uc.uc_stack, regs->sp);
507 put_user_ex(sas_ss_flags(regs->sp),
508 &frame->uc.uc_stack.ss_flags);
509 put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
510 put_user_ex(0, &frame->uc.uc__pad0); 500 put_user_ex(0, &frame->uc.uc__pad0);
511 501
512 if (ka->sa.sa_flags & SA_RESTORER) { 502 if (ksig->ka.sa.sa_flags & SA_RESTORER) {
513 restorer = ka->sa.sa_restorer; 503 restorer = ksig->ka.sa.sa_restorer;
514 } else { 504 } else {
515 /* could use a vstub here */ 505 /* could use a vstub here */
516 restorer = NULL; 506 restorer = NULL;
@@ -528,10 +518,10 @@ static int x32_setup_rt_frame(int sig, struct k_sigaction *ka,
528 518
529 /* Set up registers for signal handler */ 519 /* Set up registers for signal handler */
530 regs->sp = (unsigned long) frame; 520 regs->sp = (unsigned long) frame;
531 regs->ip = (unsigned long) ka->sa.sa_handler; 521 regs->ip = (unsigned long) ksig->ka.sa.sa_handler;
532 522
533 /* We use the x32 calling convention here... */ 523 /* We use the x32 calling convention here... */
534 regs->di = sig; 524 regs->di = ksig->sig;
535 regs->si = (unsigned long) &frame->info; 525 regs->si = (unsigned long) &frame->info;
536 regs->dx = (unsigned long) &frame->uc; 526 regs->dx = (unsigned long) &frame->uc;
537 527
@@ -545,77 +535,13 @@ static int x32_setup_rt_frame(int sig, struct k_sigaction *ka,
545 return 0; 535 return 0;
546} 536}
547 537
548#ifdef CONFIG_X86_32
549/*
550 * Atomically swap in the new signal mask, and wait for a signal.
551 */
552asmlinkage int
553sys_sigsuspend(int history0, int history1, old_sigset_t mask)
554{
555 sigset_t blocked;
556 siginitset(&blocked, mask);
557 return sigsuspend(&blocked);
558}
559
560asmlinkage int
561sys_sigaction(int sig, const struct old_sigaction __user *act,
562 struct old_sigaction __user *oact)
563{
564 struct k_sigaction new_ka, old_ka;
565 int ret = 0;
566
567 if (act) {
568 old_sigset_t mask;
569
570 if (!access_ok(VERIFY_READ, act, sizeof(*act)))
571 return -EFAULT;
572
573 get_user_try {
574 get_user_ex(new_ka.sa.sa_handler, &act->sa_handler);
575 get_user_ex(new_ka.sa.sa_flags, &act->sa_flags);
576 get_user_ex(mask, &act->sa_mask);
577 get_user_ex(new_ka.sa.sa_restorer, &act->sa_restorer);
578 } get_user_catch(ret);
579
580 if (ret)
581 return -EFAULT;
582 siginitset(&new_ka.sa.sa_mask, mask);
583 }
584
585 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
586
587 if (!ret && oact) {
588 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)))
589 return -EFAULT;
590
591 put_user_try {
592 put_user_ex(old_ka.sa.sa_handler, &oact->sa_handler);
593 put_user_ex(old_ka.sa.sa_flags, &oact->sa_flags);
594 put_user_ex(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
595 put_user_ex(old_ka.sa.sa_restorer, &oact->sa_restorer);
596 } put_user_catch(ret);
597
598 if (ret)
599 return -EFAULT;
600 }
601
602 return ret;
603}
604#endif /* CONFIG_X86_32 */
605
606long
607sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
608 struct pt_regs *regs)
609{
610 return do_sigaltstack(uss, uoss, regs->sp);
611}
612
613/* 538/*
614 * Do a signal return; undo the signal stack. 539 * Do a signal return; undo the signal stack.
615 */ 540 */
616#ifdef CONFIG_X86_32 541#ifdef CONFIG_X86_32
617unsigned long sys_sigreturn(struct pt_regs *regs) 542unsigned long sys_sigreturn(void)
618{ 543{
544 struct pt_regs *regs = current_pt_regs();
619 struct sigframe __user *frame; 545 struct sigframe __user *frame;
620 unsigned long ax; 546 unsigned long ax;
621 sigset_t set; 547 sigset_t set;
@@ -642,8 +568,9 @@ badframe:
642} 568}
643#endif /* CONFIG_X86_32 */ 569#endif /* CONFIG_X86_32 */
644 570
645long sys_rt_sigreturn(struct pt_regs *regs) 571long sys_rt_sigreturn(void)
646{ 572{
573 struct pt_regs *regs = current_pt_regs();
647 struct rt_sigframe __user *frame; 574 struct rt_sigframe __user *frame;
648 unsigned long ax; 575 unsigned long ax;
649 sigset_t set; 576 sigset_t set;
@@ -659,7 +586,7 @@ long sys_rt_sigreturn(struct pt_regs *regs)
659 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 586 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
660 goto badframe; 587 goto badframe;
661 588
662 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT) 589 if (restore_altstack(&frame->uc.uc_stack))
663 goto badframe; 590 goto badframe;
664 591
665 return ax; 592 return ax;
@@ -684,30 +611,29 @@ static int signr_convert(int sig)
684} 611}
685 612
686static int 613static int
687setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 614setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
688 struct pt_regs *regs)
689{ 615{
690 int usig = signr_convert(sig); 616 int usig = signr_convert(ksig->sig);
691 sigset_t *set = sigmask_to_save(); 617 sigset_t *set = sigmask_to_save();
692 compat_sigset_t *cset = (compat_sigset_t *) set; 618 compat_sigset_t *cset = (compat_sigset_t *) set;
693 619
694 /* Set up the stack frame */ 620 /* Set up the stack frame */
695 if (is_ia32_frame()) { 621 if (is_ia32_frame()) {
696 if (ka->sa.sa_flags & SA_SIGINFO) 622 if (ksig->ka.sa.sa_flags & SA_SIGINFO)
697 return ia32_setup_rt_frame(usig, ka, info, cset, regs); 623 return ia32_setup_rt_frame(usig, ksig, cset, regs);
698 else 624 else
699 return ia32_setup_frame(usig, ka, cset, regs); 625 return ia32_setup_frame(usig, ksig, cset, regs);
700 } else if (is_x32_frame()) { 626 } else if (is_x32_frame()) {
701 return x32_setup_rt_frame(usig, ka, info, cset, regs); 627 return x32_setup_rt_frame(ksig, cset, regs);
702 } else { 628 } else {
703 return __setup_rt_frame(sig, ka, info, set, regs); 629 return __setup_rt_frame(ksig->sig, ksig, set, regs);
704 } 630 }
705} 631}
706 632
707static void 633static void
708handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, 634handle_signal(struct ksignal *ksig, struct pt_regs *regs)
709 struct pt_regs *regs)
710{ 635{
636 bool failed;
711 /* Are we from a system call? */ 637 /* Are we from a system call? */
712 if (syscall_get_nr(current, regs) >= 0) { 638 if (syscall_get_nr(current, regs) >= 0) {
713 /* If so, check system call restarting.. */ 639 /* If so, check system call restarting.. */
@@ -718,7 +644,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
718 break; 644 break;
719 645
720 case -ERESTARTSYS: 646 case -ERESTARTSYS:
721 if (!(ka->sa.sa_flags & SA_RESTART)) { 647 if (!(ksig->ka.sa.sa_flags & SA_RESTART)) {
722 regs->ax = -EINTR; 648 regs->ax = -EINTR;
723 break; 649 break;
724 } 650 }
@@ -738,26 +664,21 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
738 likely(test_and_clear_thread_flag(TIF_FORCED_TF))) 664 likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
739 regs->flags &= ~X86_EFLAGS_TF; 665 regs->flags &= ~X86_EFLAGS_TF;
740 666
741 if (setup_rt_frame(sig, ka, info, regs) < 0) { 667 failed = (setup_rt_frame(ksig, regs) < 0);
742 force_sigsegv(sig, current); 668 if (!failed) {
743 return; 669 /*
670 * Clear the direction flag as per the ABI for function entry.
671 */
672 regs->flags &= ~X86_EFLAGS_DF;
673 /*
674 * Clear TF when entering the signal handler, but
675 * notify any tracer that was single-stepping it.
676 * The tracer may want to single-step inside the
677 * handler too.
678 */
679 regs->flags &= ~X86_EFLAGS_TF;
744 } 680 }
745 681 signal_setup_done(failed, ksig, test_thread_flag(TIF_SINGLESTEP));
746 /*
747 * Clear the direction flag as per the ABI for function entry.
748 */
749 regs->flags &= ~X86_EFLAGS_DF;
750
751 /*
752 * Clear TF when entering the signal handler, but
753 * notify any tracer that was single-stepping it.
754 * The tracer may want to single-step inside the
755 * handler too.
756 */
757 regs->flags &= ~X86_EFLAGS_TF;
758
759 signal_delivered(sig, info, ka, regs,
760 test_thread_flag(TIF_SINGLESTEP));
761} 682}
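
The signal path above replaces the (sig, info, ka) argument triple with one struct ksignal and funnels both the failure and success cases through signal_setup_done(). Here is a toy sketch of that consolidation with stubbed types; the real structures and helpers live in the generic signal code.

#include <stdio.h>
#include <stdbool.h>

/* Toy stand-ins for the kernel types bundled into struct ksignal. */
struct k_sigaction { int sa_flags; };
struct siginfo     { int si_signo; };

struct ksignal {
        struct k_sigaction ka;
        struct siginfo     info;
        int                sig;
};

/* One completion helper instead of force_sigsegv()/signal_delivered()
 * calls scattered through handle_signal(). */
static void signal_setup_done(bool failed, const struct ksignal *ksig,
                              bool stepping)
{
        if (failed)
                printf("sig %d: frame setup failed, forcing SIGSEGV\n", ksig->sig);
        else
                printf("sig %d delivered (stepping=%d)\n", ksig->sig, stepping);
}

static int setup_rt_frame(const struct ksignal *ksig)
{
        /* pretend the user-space frame was written successfully */
        return ksig->sig > 0 ? 0 : -1;
}

static void handle_signal(const struct ksignal *ksig)
{
        bool failed = setup_rt_frame(ksig) < 0;

        signal_setup_done(failed, ksig, false);
}

int main(void)
{
        struct ksignal ksig = { .ka = { 0 }, .info = { 11 }, .sig = 11 };

        handle_signal(&ksig);
        return 0;
}
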
762 683
763#ifdef CONFIG_X86_32 684#ifdef CONFIG_X86_32
@@ -774,14 +695,11 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
774 */ 695 */
775static void do_signal(struct pt_regs *regs) 696static void do_signal(struct pt_regs *regs)
776{ 697{
777 struct k_sigaction ka; 698 struct ksignal ksig;
778 siginfo_t info;
779 int signr;
780 699
781 signr = get_signal_to_deliver(&info, &ka, regs, NULL); 700 if (get_signal(&ksig)) {
782 if (signr > 0) {
783 /* Whee! Actually deliver the signal. */ 701 /* Whee! Actually deliver the signal. */
784 handle_signal(signr, &info, &ka, regs); 702 handle_signal(&ksig, regs);
785 return; 703 return;
786 } 704 }
787 705
@@ -860,12 +778,12 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
860} 778}
861 779
862#ifdef CONFIG_X86_X32_ABI 780#ifdef CONFIG_X86_X32_ABI
863asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs) 781asmlinkage long sys32_x32_rt_sigreturn(void)
864{ 782{
783 struct pt_regs *regs = current_pt_regs();
865 struct rt_sigframe_x32 __user *frame; 784 struct rt_sigframe_x32 __user *frame;
866 sigset_t set; 785 sigset_t set;
867 unsigned long ax; 786 unsigned long ax;
868 struct pt_regs tregs;
869 787
870 frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); 788 frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8);
871 789
@@ -879,8 +797,7 @@ asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs)
879 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 797 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
880 goto badframe; 798 goto badframe;
881 799
882 tregs = *regs; 800 if (compat_restore_altstack(&frame->uc.uc_stack))
883 if (sys32_sigaltstack(&frame->uc.uc_stack, NULL, &tregs) == -EFAULT)
884 goto badframe; 801 goto badframe;
885 802
886 return ax; 803 return ax;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ed0fe385289d..a6ceaedc396a 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1369,7 +1369,7 @@ static inline void mwait_play_dead(void)
1369 void *mwait_ptr; 1369 void *mwait_ptr;
1370 struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info); 1370 struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info);
1371 1371
1372 if (!(this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c))) 1372 if (!this_cpu_has(X86_FEATURE_MWAIT))
1373 return; 1373 return;
1374 if (!this_cpu_has(X86_FEATURE_CLFLSH)) 1374 if (!this_cpu_has(X86_FEATURE_CLFLSH))
1375 return; 1375 return;
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index cd3b2438a980..9b4d51d0c0d0 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -165,10 +165,11 @@ void set_task_blockstep(struct task_struct *task, bool on)
165 * Ensure irq/preemption can't change debugctl in between. 165 * Ensure irq/preemption can't change debugctl in between.
166 * Note also that both TIF_BLOCKSTEP and debugctl should 166 * Note also that both TIF_BLOCKSTEP and debugctl should
167 * be changed atomically wrt preemption. 167 * be changed atomically wrt preemption.
168 * FIXME: this means that set/clear TIF_BLOCKSTEP is simply 168 *
169 * wrong if task != current, SIGKILL can wakeup the stopped 169 * NOTE: this means that set/clear TIF_BLOCKSTEP is only safe if
170 * tracee and set/clear can play with the running task, this 170 * task is current or it can't be running, otherwise we can race
171 * can confuse the next __switch_to_xtra(). 171 * with __switch_to_xtra(). We rely on ptrace_freeze_traced() but
172 * PTRACE_KILL is not safe.
172 */ 173 */
173 local_irq_disable(); 174 local_irq_disable();
174 debugctl = get_debugctlmsr(); 175 debugctl = get_debugctlmsr();
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 97ef74b88e0f..dbded5aedb81 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -157,7 +157,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
157 if (flags & MAP_FIXED) 157 if (flags & MAP_FIXED)
158 return addr; 158 return addr;
159 159
160 /* for MAP_32BIT mappings we force the legact mmap base */ 160 /* for MAP_32BIT mappings we force the legacy mmap base */
161 if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) 161 if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT))
162 goto bottomup; 162 goto bottomup;
163 163
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index eb8586693e0b..68bda7a84159 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -69,9 +69,6 @@
69 69
70asmlinkage int system_call(void); 70asmlinkage int system_call(void);
71 71
72/* Do we ignore FPU interrupts ? */
73char ignore_fpu_irq;
74
75/* 72/*
76 * The IDT has to be page-aligned to simplify the Pentium 73 * The IDT has to be page-aligned to simplify the Pentium
77 * F0 0F bug workaround. 74 * F0 0F bug workaround.
@@ -564,9 +561,6 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr)
564 561
565dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) 562dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
566{ 563{
567#ifdef CONFIG_X86_32
568 ignore_fpu_irq = 1;
569#endif
570 exception_enter(regs); 564 exception_enter(regs);
571 math_error(regs, error_code, X86_TRAP_MF); 565 math_error(regs, error_code, X86_TRAP_MF);
572 exception_exit(regs); 566 exception_exit(regs);
@@ -694,10 +688,19 @@ void __init early_trap_init(void)
694 set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); 688 set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
695 /* int3 can be called from all */ 689 /* int3 can be called from all */
696 set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); 690 set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
691#ifdef CONFIG_X86_32
697 set_intr_gate(X86_TRAP_PF, &page_fault); 692 set_intr_gate(X86_TRAP_PF, &page_fault);
693#endif
698 load_idt(&idt_descr); 694 load_idt(&idt_descr);
699} 695}
700 696
697void __init early_trap_pf_init(void)
698{
699#ifdef CONFIG_X86_64
700 set_intr_gate(X86_TRAP_PF, &page_fault);
701#endif
702}
703
701void __init trap_init(void) 704void __init trap_init(void)
702{ 705{
703 int i; 706 int i;
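
early_trap_init() no longer installs the 64-bit #PF gate; early_trap_pf_init() does that later, once init_mem_mapping() has built page tables the handler can rely on. A toy phased-init sketch using a plain function-pointer table in place of the IDT:

#include <stdio.h>

#define X86_TRAP_DB  1
#define X86_TRAP_BP  3
#define X86_TRAP_PF 14

typedef void (*gate_fn)(void);

static gate_fn idt[32];                  /* toy IDT: one handler per vector */

static void debug_handler(void)      { puts("#DB"); }
static void int3_handler(void)       { puts("#BP"); }
static void page_fault_handler(void) { puts("#PF"); }

static void set_intr_gate(int vec, gate_fn fn) { idt[vec] = fn; }

/* Early setup: only the gates that are safe before the direct mapping
 * exists; the #PF gate now waits for a second phase. */
static void early_trap_init(void)
{
        set_intr_gate(X86_TRAP_DB, debug_handler);
        set_intr_gate(X86_TRAP_BP, int3_handler);
}

static void early_trap_pf_init(void)
{
        set_intr_gate(X86_TRAP_PF, page_fault_handler);
}

int main(void)
{
        early_trap_init();
        /* ... init_mem_mapping() would run here ... */
        early_trap_pf_init();

        printf("#PF gate installed: %s\n", idt[X86_TRAP_PF] ? "yes" : "no");
        return 0;
}
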
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 06ccb5073a3f..4b9ea101fe3b 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -623,7 +623,8 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
623 ns_now = __cycles_2_ns(tsc_now); 623 ns_now = __cycles_2_ns(tsc_now);
624 624
625 if (cpu_khz) { 625 if (cpu_khz) {
626 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; 626 *scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) +
627 cpu_khz / 2) / cpu_khz;
627 *offset = ns_now - mult_frac(tsc_now, *scale, 628 *offset = ns_now - mult_frac(tsc_now, *scale,
628 (1UL << CYC2NS_SCALE_FACTOR)); 629 (1UL << CYC2NS_SCALE_FACTOR));
629 } 630 }
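
The tsc.c hunk changes a truncating division into a rounded one when deriving the cycles-to-nanoseconds scale. A quick comparison of the two formulas, using 10 for CYC2NS_SCALE_FACTOR and an arbitrary sample frequency:

#include <stdio.h>

#define CYC2NS_SCALE_FACTOR 10
#define NSEC_PER_MSEC       1000000UL

/* Truncating vs. rounded computation of the cycles-to-ns scale factor. */
static unsigned long scale_trunc(unsigned long cpu_khz)
{
        return (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) / cpu_khz;
}

static unsigned long scale_round(unsigned long cpu_khz)
{
        return ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) + cpu_khz / 2) / cpu_khz;
}

int main(void)
{
        unsigned long khz = 2100000;    /* a 2.1 GHz part */

        /* Prints "truncated: 487, rounded: 488" - the old formula always
         * rounded down, understating the scale by up to one unit. */
        printf("truncated: %lu, rounded: %lu\n",
               scale_trunc(khz), scale_round(khz));
        return 0;
}
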
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index c71025b67462..0ba4cfb4f412 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -680,8 +680,10 @@ static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
680 if (auprobe->insn[i] == 0x66) 680 if (auprobe->insn[i] == 0x66)
681 continue; 681 continue;
682 682
683 if (auprobe->insn[i] == 0x90) 683 if (auprobe->insn[i] == 0x90) {
684 regs->ip += i + 1;
684 return true; 685 return true;
686 }
685 687
686 break; 688 break;
687 } 689 }
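
The uprobes fix makes the NOP fast path advance the saved instruction pointer past any 0x66 prefixes plus the 0x90 opcode instead of leaving it untouched. A standalone sketch of that scan; MAX_UINSN_BYTES is assumed to match the kernel's instruction-slot size.

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

#define MAX_UINSN_BYTES 16

/* If the probed instruction is a NOP (0x90), possibly behind operand-size
 * prefixes (0x66), single-stepping can be skipped - but the saved
 * instruction pointer must still be advanced past it. */
static bool skip_if_nop(const uint8_t insn[MAX_UINSN_BYTES], uint64_t *ip)
{
        for (int i = 0; i < MAX_UINSN_BYTES; i++) {
                if (insn[i] == 0x66)          /* prefix: keep scanning */
                        continue;
                if (insn[i] == 0x90) {        /* NOP: consume prefixes + opcode */
                        *ip += i + 1;
                        return true;
                }
                break;                        /* anything else: emulate normally */
        }
        return false;
}

int main(void)
{
        uint8_t nop2[MAX_UINSN_BYTES] = { 0x66, 0x90 };   /* 2-byte NOP */
        uint64_t ip = 0x400000;

        if (skip_if_nop(nop2, &ip))
                printf("skipped, ip now %#llx\n", (unsigned long long)ip);
        return 0;
}
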
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 1dfe69cc78a8..1cf5766dde16 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -202,7 +202,7 @@ out:
202static int do_vm86_irq_handling(int subfunction, int irqnumber); 202static int do_vm86_irq_handling(int subfunction, int irqnumber);
203static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk); 203static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
204 204
205int sys_vm86old(struct vm86_struct __user *v86, struct pt_regs *regs) 205int sys_vm86old(struct vm86_struct __user *v86)
206{ 206{
207 struct kernel_vm86_struct info; /* declare this _on top_, 207 struct kernel_vm86_struct info; /* declare this _on top_,
208 * this avoids wasting of stack space. 208 * this avoids wasting of stack space.
@@ -222,7 +222,7 @@ int sys_vm86old(struct vm86_struct __user *v86, struct pt_regs *regs)
222 if (tmp) 222 if (tmp)
223 goto out; 223 goto out;
224 memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus); 224 memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus);
225 info.regs32 = regs; 225 info.regs32 = current_pt_regs();
226 tsk->thread.vm86_info = v86; 226 tsk->thread.vm86_info = v86;
227 do_sys_vm86(&info, tsk); 227 do_sys_vm86(&info, tsk);
228 ret = 0; /* we never return here */ 228 ret = 0; /* we never return here */
@@ -231,7 +231,7 @@ out:
231} 231}
232 232
233 233
234int sys_vm86(unsigned long cmd, unsigned long arg, struct pt_regs *regs) 234int sys_vm86(unsigned long cmd, unsigned long arg)
235{ 235{
236 struct kernel_vm86_struct info; /* declare this _on top_, 236 struct kernel_vm86_struct info; /* declare this _on top_,
237 * this avoids wasting of stack space. 237 * this avoids wasting of stack space.
@@ -272,7 +272,7 @@ int sys_vm86(unsigned long cmd, unsigned long arg, struct pt_regs *regs)
272 ret = -EFAULT; 272 ret = -EFAULT;
273 if (tmp) 273 if (tmp)
274 goto out; 274 goto out;
275 info.regs32 = regs; 275 info.regs32 = current_pt_regs();
276 info.vm86plus.is_vm86pus = 1; 276 info.vm86plus.is_vm86pus = 1;
277 tsk->thread.vm86_info = (struct vm86_struct __user *)v86; 277 tsk->thread.vm86_info = (struct vm86_struct __user *)v86;
278 do_sys_vm86(&info, tsk); 278 do_sys_vm86(&info, tsk);
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 3a3e8c9e280d..9a907a67be8f 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -145,19 +145,6 @@ static int addr_to_vsyscall_nr(unsigned long addr)
145 return nr; 145 return nr;
146} 146}
147 147
148#ifdef CONFIG_SECCOMP
149static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr)
150{
151 if (!seccomp_mode(&tsk->seccomp))
152 return 0;
153 task_pt_regs(tsk)->orig_ax = syscall_nr;
154 task_pt_regs(tsk)->ax = syscall_nr;
155 return __secure_computing(syscall_nr);
156}
157#else
158#define vsyscall_seccomp(_tsk, _nr) 0
159#endif
160
161static bool write_ok_or_segv(unsigned long ptr, size_t size) 148static bool write_ok_or_segv(unsigned long ptr, size_t size)
162{ 149{
163 /* 150 /*
@@ -190,10 +177,9 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
190{ 177{
191 struct task_struct *tsk; 178 struct task_struct *tsk;
192 unsigned long caller; 179 unsigned long caller;
193 int vsyscall_nr; 180 int vsyscall_nr, syscall_nr, tmp;
194 int prev_sig_on_uaccess_error; 181 int prev_sig_on_uaccess_error;
195 long ret; 182 long ret;
196 int skip;
197 183
198 /* 184 /*
199 * No point in checking CS -- the only way to get here is a user mode 185 * No point in checking CS -- the only way to get here is a user mode
@@ -225,56 +211,84 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
225 } 211 }
226 212
227 tsk = current; 213 tsk = current;
228 /*
229 * With a real vsyscall, page faults cause SIGSEGV. We want to
230 * preserve that behavior to make writing exploits harder.
231 */
232 prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
233 current_thread_info()->sig_on_uaccess_error = 1;
234 214
235 /* 215 /*
216 * Check for access_ok violations and find the syscall nr.
217 *
236 * NULL is a valid user pointer (in the access_ok sense) on 32-bit and 218 * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
237 * 64-bit, so we don't need to special-case it here. For all the 219 * 64-bit, so we don't need to special-case it here. For all the
238 * vsyscalls, NULL means "don't write anything" not "write it at 220 * vsyscalls, NULL means "don't write anything" not "write it at
239 * address 0". 221 * address 0".
240 */ 222 */
241 ret = -EFAULT;
242 skip = 0;
243 switch (vsyscall_nr) { 223 switch (vsyscall_nr) {
244 case 0: 224 case 0:
245 skip = vsyscall_seccomp(tsk, __NR_gettimeofday);
246 if (skip)
247 break;
248
249 if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || 225 if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
250 !write_ok_or_segv(regs->si, sizeof(struct timezone))) 226 !write_ok_or_segv(regs->si, sizeof(struct timezone))) {
251 break; 227 ret = -EFAULT;
228 goto check_fault;
229 }
230
231 syscall_nr = __NR_gettimeofday;
232 break;
233
234 case 1:
235 if (!write_ok_or_segv(regs->di, sizeof(time_t))) {
236 ret = -EFAULT;
237 goto check_fault;
238 }
239
240 syscall_nr = __NR_time;
241 break;
242
243 case 2:
244 if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
245 !write_ok_or_segv(regs->si, sizeof(unsigned))) {
246 ret = -EFAULT;
247 goto check_fault;
248 }
249
250 syscall_nr = __NR_getcpu;
251 break;
252 }
253
254 /*
255 * Handle seccomp. regs->ip must be the original value.
256 * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt.
257 *
258 * We could optimize the seccomp disabled case, but performance
259 * here doesn't matter.
260 */
261 regs->orig_ax = syscall_nr;
262 regs->ax = -ENOSYS;
263 tmp = secure_computing(syscall_nr);
264 if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
265 warn_bad_vsyscall(KERN_DEBUG, regs,
266 "seccomp tried to change syscall nr or ip");
267 do_exit(SIGSYS);
268 }
269 if (tmp)
270 goto do_ret; /* skip requested */
252 271
272 /*
273 * With a real vsyscall, page faults cause SIGSEGV. We want to
274 * preserve that behavior to make writing exploits harder.
275 */
276 prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
277 current_thread_info()->sig_on_uaccess_error = 1;
278
279 ret = -EFAULT;
280 switch (vsyscall_nr) {
281 case 0:
253 ret = sys_gettimeofday( 282 ret = sys_gettimeofday(
254 (struct timeval __user *)regs->di, 283 (struct timeval __user *)regs->di,
255 (struct timezone __user *)regs->si); 284 (struct timezone __user *)regs->si);
256 break; 285 break;
257 286
258 case 1: 287 case 1:
259 skip = vsyscall_seccomp(tsk, __NR_time);
260 if (skip)
261 break;
262
263 if (!write_ok_or_segv(regs->di, sizeof(time_t)))
264 break;
265
266 ret = sys_time((time_t __user *)regs->di); 288 ret = sys_time((time_t __user *)regs->di);
267 break; 289 break;
268 290
269 case 2: 291 case 2:
270 skip = vsyscall_seccomp(tsk, __NR_getcpu);
271 if (skip)
272 break;
273
274 if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
275 !write_ok_or_segv(regs->si, sizeof(unsigned)))
276 break;
277
278 ret = sys_getcpu((unsigned __user *)regs->di, 292 ret = sys_getcpu((unsigned __user *)regs->di,
279 (unsigned __user *)regs->si, 293 (unsigned __user *)regs->si,
280 NULL); 294 NULL);
@@ -283,12 +297,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
283 297
284 current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; 298 current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
285 299
286 if (skip) { 300check_fault:
287 if ((long)regs->ax <= 0L) /* seccomp errno emulation */
288 goto do_ret;
289 goto done; /* seccomp trace/trap */
290 }
291
292 if (ret == -EFAULT) { 301 if (ret == -EFAULT) {
293 /* Bad news -- userspace fed a bad pointer to a vsyscall. */ 302 /* Bad news -- userspace fed a bad pointer to a vsyscall. */
294 warn_bad_vsyscall(KERN_INFO, regs, 303 warn_bad_vsyscall(KERN_INFO, regs,
@@ -311,7 +320,6 @@ do_ret:
311 /* Emulate a ret instruction. */ 320 /* Emulate a ret instruction. */
312 regs->ip = caller; 321 regs->ip = caller;
313 regs->sp += 8; 322 regs->sp += 8;
314done:
315 return true; 323 return true;
316 324
317sigsegv: 325sigsegv:
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 1330dd102950..b014d9414d08 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -59,6 +59,9 @@ EXPORT_SYMBOL(memcpy);
59EXPORT_SYMBOL(__memcpy); 59EXPORT_SYMBOL(__memcpy);
60EXPORT_SYMBOL(memmove); 60EXPORT_SYMBOL(memmove);
61 61
62#ifndef CONFIG_DEBUG_VIRTUAL
63EXPORT_SYMBOL(phys_base);
64#endif
62EXPORT_SYMBOL(empty_zero_page); 65EXPORT_SYMBOL(empty_zero_page);
63#ifndef CONFIG_PARAVIRT 66#ifndef CONFIG_PARAVIRT
64EXPORT_SYMBOL(native_load_gs_index); 67EXPORT_SYMBOL(native_load_gs_index);
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 7a3d075a814a..45a14dbbddaf 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -19,6 +19,7 @@
19#include <asm/time.h> 19#include <asm/time.h>
20#include <asm/irq.h> 20#include <asm/irq.h>
21#include <asm/io_apic.h> 21#include <asm/io_apic.h>
22#include <asm/hpet.h>
22#include <asm/pat.h> 23#include <asm/pat.h>
23#include <asm/tsc.h> 24#include <asm/tsc.h>
24#include <asm/iommu.h> 25#include <asm/iommu.h>
@@ -62,10 +63,6 @@ struct x86_init_ops x86_init __initdata = {
62 .banner = default_banner, 63 .banner = default_banner,
63 }, 64 },
64 65
65 .mapping = {
66 .pagetable_reserve = native_pagetable_reserve,
67 },
68
69 .paging = { 66 .paging = {
70 .pagetable_init = native_pagetable_init, 67 .pagetable_init = native_pagetable_init,
71 }, 68 },
@@ -111,15 +108,22 @@ struct x86_platform_ops x86_platform = {
111 108
112EXPORT_SYMBOL_GPL(x86_platform); 109EXPORT_SYMBOL_GPL(x86_platform);
113struct x86_msi_ops x86_msi = { 110struct x86_msi_ops x86_msi = {
114 .setup_msi_irqs = native_setup_msi_irqs, 111 .setup_msi_irqs = native_setup_msi_irqs,
115 .teardown_msi_irq = native_teardown_msi_irq, 112 .compose_msi_msg = native_compose_msi_msg,
116 .teardown_msi_irqs = default_teardown_msi_irqs, 113 .teardown_msi_irq = native_teardown_msi_irq,
117 .restore_msi_irqs = default_restore_msi_irqs, 114 .teardown_msi_irqs = default_teardown_msi_irqs,
115 .restore_msi_irqs = default_restore_msi_irqs,
116 .setup_hpet_msi = default_setup_hpet_msi,
118}; 117};
119 118
120struct x86_io_apic_ops x86_io_apic_ops = { 119struct x86_io_apic_ops x86_io_apic_ops = {
121 .init = native_io_apic_init_mappings, 120 .init = native_io_apic_init_mappings,
122 .read = native_io_apic_read, 121 .read = native_io_apic_read,
123 .write = native_io_apic_write, 122 .write = native_io_apic_write,
124 .modify = native_io_apic_modify, 123 .modify = native_io_apic_modify,
124 .disable = native_disable_io_apic,
125 .print_entries = native_io_apic_print_entries,
126 .set_affinity = native_ioapic_set_affinity,
127 .setup_entry = native_setup_ioapic_entry,
128 .eoi_ioapic_pin = native_eoi_ioapic_pin,
125}; 129};
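
x86_msi and x86_io_apic_ops are tables of function pointers preloaded with native defaults so that paravirtualized platforms can override individual entries at boot. A compact userspace illustration of that pattern; the names and signatures below are invented for the demo.

#include <stdio.h>

/* An ops table with native defaults; a platform can swap in its own hooks. */
struct msi_ops {
        int  (*setup_msi_irqs)(int nvec);
        void (*teardown_msi_irq)(int irq);
};

static int native_setup_msi_irqs(int nvec)
{
        printf("native: setting up %d MSI irq(s)\n", nvec);
        return 0;
}

static void native_teardown_msi_irq(int irq)
{
        printf("native: tearing down irq %d\n", irq);
}

static struct msi_ops msi = {
        .setup_msi_irqs   = native_setup_msi_irqs,
        .teardown_msi_irq = native_teardown_msi_irq,
};

static int paravirt_setup_msi_irqs(int nvec)
{
        printf("paravirt: setting up %d MSI irq(s)\n", nvec);
        return 0;
}

int main(void)
{
        msi.setup_msi_irqs(2);                 /* default path       */
        msi.setup_msi_irqs = paravirt_setup_msi_irqs;
        msi.setup_msi_irqs(2);                 /* overridden at boot */
        msi.teardown_msi_irq(42);
        return 0;
}
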