Diffstat (limited to 'arch/x86/kernel')
 arch/x86/kernel/Makefile | 8
 arch/x86/kernel/amd_iommu.c | 6
 arch/x86/kernel/amd_iommu_init.c | 8
 arch/x86/kernel/aperture_64.c | 5
 arch/x86/kernel/apic.c | 22
 arch/x86/kernel/asm-offsets_32.c | 2
 arch/x86/kernel/asm-offsets_64.c | 4
 arch/x86/kernel/bios_uv.c | 58
 arch/x86/kernel/check.c | 161
 arch/x86/kernel/cpu/Makefile | 6
 arch/x86/kernel/cpu/addon_cpuid_features.c | 8
 arch/x86/kernel/cpu/amd.c | 9
 arch/x86/kernel/cpu/common.c | 8
 arch/x86/kernel/cpu/hypervisor.c | 58
 arch/x86/kernel/cpu/intel.c | 22
 arch/x86/kernel/cpu/intel_cacheinfo.c | 17
 arch/x86/kernel/cpu/mcheck/mce_64.c | 3
 arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 2
 arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 2
 arch/x86/kernel/cpu/mtrr/main.c | 346
 arch/x86/kernel/cpu/vmware.c | 112
 arch/x86/kernel/ds.c | 899
 arch/x86/kernel/e820.c | 16
 arch/x86/kernel/early-quirks.c | 1
 arch/x86/kernel/early_printk.c | 47
 arch/x86/kernel/entry_32.S | 477
 arch/x86/kernel/entry_64.S | 1417
 arch/x86/kernel/genx2apic_uv_x.c | 111
 arch/x86/kernel/head.c | 1
 arch/x86/kernel/head32.c | 3
 arch/x86/kernel/head64.c | 3
 arch/x86/kernel/hpet.c | 11
 arch/x86/kernel/init_task.c | 1
 arch/x86/kernel/io_apic.c | 5
 arch/x86/kernel/irq_64.c | 27
 arch/x86/kernel/irqinit_32.c | 2
 arch/x86/kernel/irqinit_64.c | 66
 arch/x86/kernel/microcode_amd.c | 232
 arch/x86/kernel/microcode_core.c | 25
 arch/x86/kernel/microcode_intel.c | 8
 arch/x86/kernel/nmi.c | 15
 arch/x86/kernel/pci-dma.c | 24
 arch/x86/kernel/pci-gart_64.c | 4
 arch/x86/kernel/pci-swiotlb_64.c | 29
 arch/x86/kernel/process.c | 3
 arch/x86/kernel/process_32.c | 63
 arch/x86/kernel/process_64.c | 54
 arch/x86/kernel/ptrace.c | 464
 arch/x86/kernel/quirks.c | 2
 arch/x86/kernel/setup.c | 162
 arch/x86/kernel/setup_percpu.c | 8
 arch/x86/kernel/sigframe.h | 42
 arch/x86/kernel/signal.c (renamed from arch/x86/kernel/signal_32.c) | 567
 arch/x86/kernel/signal_64.c | 516
 arch/x86/kernel/smp.c | 18
 arch/x86/kernel/smpboot.c | 19
 arch/x86/kernel/time_32.c | 2
 arch/x86/kernel/time_64.c | 6
 arch/x86/kernel/tlb_32.c | 13
 arch/x86/kernel/tlb_64.c | 2
 arch/x86/kernel/tlb_uv.c | 4
 arch/x86/kernel/trampoline.c | 19
 arch/x86/kernel/traps.c | 38
 arch/x86/kernel/tsc.c | 42
 arch/x86/kernel/tsc_sync.c | 8
 arch/x86/kernel/vmi_32.c | 119
 arch/x86/kernel/vmlinux_32.lds.S | 1
 arch/x86/kernel/vmlinux_64.lds.S | 1
 arch/x86/kernel/vsyscall_64.c | 9
 69 files changed, 3151 insertions(+), 3322 deletions(-)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 1cad9318d217..d364df03c1d6 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -12,6 +12,7 @@ CFLAGS_REMOVE_tsc.o = -pg
 CFLAGS_REMOVE_rtc.o = -pg
 CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
 CFLAGS_REMOVE_ftrace.o = -pg
+CFLAGS_REMOVE_early_printk.o = -pg
 endif
 
 #
@@ -23,7 +24,7 @@ CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
 CFLAGS_hpet.o := $(nostackp)
 CFLAGS_tsc.o := $(nostackp)
 
-obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
+obj-y := process_$(BITS).o signal.o entry_$(BITS).o
 obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o
 obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
@@ -106,6 +107,10 @@ microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
 microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
 obj-$(CONFIG_MICROCODE) += microcode.o
 
+obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
+
+obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o # NB rename without _64
+
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
@@ -119,7 +124,6 @@ ifeq ($(CONFIG_X86_64),y)
         obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
         obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
         obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
-        obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o
 
         obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
 endif
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index a7b6dec6fc3f..2e2da717b350 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -24,6 +24,7 @@
 #include <linux/iommu-helper.h>
 #include <asm/proto.h>
 #include <asm/iommu.h>
+#include <asm/gart.h>
 #include <asm/amd_iommu_types.h>
 #include <asm/amd_iommu.h>
 
@@ -235,8 +236,9 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
         status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
         writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
 
-        if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit()))
-                printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n");
+        if (unlikely(i == EXIT_LOOP_COUNT))
+                panic("AMD IOMMU: Completion wait loop failed\n");
+
 out:
         spin_unlock_irqrestore(&iommu->lock, flags);
 
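
For reference, the loop being hardened above polls the IOMMU MMIO status register until the completion-wait interrupt bit shows up. A minimal sketch of that shape (illustrative only, not the patched driver code; it assumes the MMIO_* constants and EXIT_LOOP_COUNT from amd_iommu_types.h and omits command submission and locking):

static void wait_for_completion_sketch(struct amd_iommu *iommu)
{
        int i = 0;
        u32 status;

        /* spin until the hardware signals COMPLETION_WAIT or we give up */
        do {
                status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
                if (status & MMIO_STATUS_COM_WAIT_INT_MASK)
                        break;
                cpu_relax();
        } while (++i < EXIT_LOOP_COUNT);

        /* with this patch, running out of retries is treated as fatal */
        if (unlikely(i == EXIT_LOOP_COUNT))
                panic("AMD IOMMU: Completion wait loop failed\n");
}
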
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 30ae2701b3df..c625800c55ca 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -28,6 +28,7 @@
 #include <asm/amd_iommu_types.h>
 #include <asm/amd_iommu.h>
 #include <asm/iommu.h>
+#include <asm/gart.h>
 
 /*
  * definitions for the ACPI scanning code
@@ -427,6 +428,10 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
         memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
                         &entry, sizeof(entry));
 
+        /* set head and tail to zero manually */
+        writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
+        writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+
         iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
 
         return cmd_buf;
@@ -1074,7 +1079,8 @@ int __init amd_iommu_init(void)
                 goto free;
 
         /* IOMMU rlookup table - find the IOMMU for a specific device */
-        amd_iommu_rlookup_table = (void *)__get_free_pages(GFP_KERNEL,
+        amd_iommu_rlookup_table = (void *)__get_free_pages(
+                        GFP_KERNEL | __GFP_ZERO,
                         get_order(rlookup_table_size));
         if (amd_iommu_rlookup_table == NULL)
                 goto free;
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 9a32b37ee2ee..676debfc1702 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -1,8 +1,9 @@
 /*
  * Firmware replacement code.
  *
- * Work around broken BIOSes that don't set an aperture or only set the
- * aperture in the AGP bridge.
+ * Work around broken BIOSes that don't set an aperture, only set the
+ * aperture in the AGP bridge, or set too small aperture.
+ *
  * If all fails map the aperture over some low memory. This is cheaper than
  * doing bounce buffering. The memory is lost. This is done at early boot
  * because only the bootmem allocator can allocate 32+MB.
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index b9019271af62..6b7f824db160 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -30,6 +30,7 @@
 #include <linux/module.h>
 #include <linux/dmi.h>
 #include <linux/dmar.h>
+#include <linux/ftrace.h>
 
 #include <asm/atomic.h>
 #include <asm/smp.h>
@@ -775,11 +776,7 @@ static void local_apic_timer_interrupt(void)
         /*
          * the NMI deadlock-detector uses this.
          */
-#ifdef CONFIG_X86_64
-        add_pda(apic_timer_irqs, 1);
-#else
-        per_cpu(irq_stat, cpu).apic_timer_irqs++;
-#endif
+        inc_irq_stat(apic_timer_irqs);
 
         evt->event_handler(evt);
 }
@@ -792,7 +789,7 @@ static void local_apic_timer_interrupt(void)
  * [ if a single-CPU system runs an SMP kernel then we call the local
  *   interrupt as well. Thus we cannot inline the local irq ... ]
  */
-void smp_apic_timer_interrupt(struct pt_regs *regs)
+void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
 {
         struct pt_regs *old_regs = set_irq_regs(regs);
 
@@ -806,9 +803,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs)
          * Besides, if we don't timer interrupts ignore the global
          * interrupt lock, which is the WrongThing (tm) to do.
          */
-#ifdef CONFIG_X86_64
         exit_idle();
-#endif
         irq_enter();
         local_apic_timer_interrupt();
         irq_exit();
@@ -1666,9 +1661,7 @@ void smp_spurious_interrupt(struct pt_regs *regs)
 {
         u32 v;
 
-#ifdef CONFIG_X86_64
         exit_idle();
-#endif
         irq_enter();
         /*
          * Check if this really is a spurious interrupt and ACK it
@@ -1679,14 +1672,11 @@ void smp_spurious_interrupt(struct pt_regs *regs)
         if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
                 ack_APIC_irq();
 
-#ifdef CONFIG_X86_64
-        add_pda(irq_spurious_count, 1);
-#else
+        inc_irq_stat(irq_spurious_count);
+
         /* see sw-dev-man vol 3, chapter 7.4.13.5 */
         pr_info("spurious APIC interrupt on CPU#%d, "
                 "should never happen.\n", smp_processor_id());
-        __get_cpu_var(irq_stat).irq_spurious_count++;
-#endif
         irq_exit();
 }
 
@@ -1697,9 +1687,7 @@ void smp_error_interrupt(struct pt_regs *regs)
 {
         u32 v, v1;
 
-#ifdef CONFIG_X86_64
         exit_idle();
-#endif
         irq_enter();
         /* First tickle the hardware, only then report what went on. -- REW */
         v = apic_read(APIC_ESR);
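
The pattern replaced throughout this series (here and in the mce_amd_64.c and mce_intel_64.c hunks below) is the open-coded #ifdef CONFIG_X86_64 add_pda()/per_cpu() pair for interrupt statistics. A sketch of how the new inc_irq_stat() helper can be provided per word size (roughly what asm/hardirq_64.h and asm/hardirq_32.h supply in this series; shown only for orientation, not verbatim patch text):

/* 64-bit keeps the counters in the per-cpu PDA, 32-bit in irq_stat */
#ifdef CONFIG_X86_64
#define inc_irq_stat(member)        add_pda(member, 1)
#else
#define inc_irq_stat(member)        (__get_cpu_var(irq_stat).member++)
#endif

Either way the call site collapses to a single line, e.g. inc_irq_stat(apic_timer_irqs);.
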
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 6649d09ad88f..ee4df08feee6 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -11,7 +11,7 @@
 #include <linux/suspend.h>
 #include <linux/kbuild.h>
 #include <asm/ucontext.h>
-#include "sigframe.h"
+#include <asm/sigframe.h>
 #include <asm/pgtable.h>
 #include <asm/fixmap.h>
 #include <asm/processor.h>
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 7fcf63d22f8b..1d41d3f1edbc 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -20,6 +20,8 @@
 
 #include <xen/interface/xen.h>
 
+#include <asm/sigframe.h>
+
 #define __NO_STUBS 1
 #undef __SYSCALL
 #undef _ASM_X86_UNISTD_64_H
@@ -87,7 +89,7 @@ int main(void)
         BLANK();
 #undef ENTRY
         DEFINE(IA32_RT_SIGFRAME_sigcontext,
-               offsetof (struct rt_sigframe32, uc.uc_mcontext));
+               offsetof (struct rt_sigframe_ia32, uc.uc_mcontext));
         BLANK();
 #endif
         DEFINE(pbe_address, offsetof(struct pbe, address));
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index f0dfe6f17e7e..2a0a2a3cac26 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -69,10 +69,10 @@ s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
 
 long sn_partition_id;
 EXPORT_SYMBOL_GPL(sn_partition_id);
-long uv_coherency_id;
-EXPORT_SYMBOL_GPL(uv_coherency_id);
-long uv_region_size;
-EXPORT_SYMBOL_GPL(uv_region_size);
+long sn_coherency_id;
+EXPORT_SYMBOL_GPL(sn_coherency_id);
+long sn_region_size;
+EXPORT_SYMBOL_GPL(sn_region_size);
 int uv_type;
 
 
@@ -100,6 +100,56 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
         return ret;
 }
 
+int
+uv_bios_mq_watchlist_alloc(int blade, unsigned long addr, unsigned int mq_size,
+                           unsigned long *intr_mmr_offset)
+{
+        union uv_watchlist_u size_blade;
+        u64 watchlist;
+        s64 ret;
+
+        size_blade.size = mq_size;
+        size_blade.blade = blade;
+
+        /*
+         * bios returns watchlist number or negative error number.
+         */
+        ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr,
+                        size_blade.val, (u64)intr_mmr_offset,
+                        (u64)&watchlist, 0);
+        if (ret < BIOS_STATUS_SUCCESS)
+                return ret;
+
+        return watchlist;
+}
+EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_alloc);
+
+int
+uv_bios_mq_watchlist_free(int blade, int watchlist_num)
+{
+        return (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_FREE,
+                                blade, watchlist_num, 0, 0, 0);
+}
+EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_free);
+
+s64
+uv_bios_change_memprotect(u64 paddr, u64 len, enum uv_memprotect perms)
+{
+        return uv_bios_call_irqsave(UV_BIOS_MEMPROTECT, paddr, len,
+                                        perms, 0, 0);
+}
+EXPORT_SYMBOL_GPL(uv_bios_change_memprotect);
+
+s64
+uv_bios_reserved_page_pa(u64 buf, u64 *cookie, u64 *addr, u64 *len)
+{
+        s64 ret;
+
+        ret = uv_bios_call_irqsave(UV_BIOS_GET_PARTITION_ADDR, (u64)cookie,
+                                    (u64)addr, buf, (u64)len, 0);
+        return ret;
+}
+EXPORT_SYMBOL_GPL(uv_bios_reserved_page_pa);
 
 s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second)
 {
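
The four functions added above wrap SGI UV BIOS calls for message-queue watchlists, memory protection and the reserved page. A caller would use the watchlist pair roughly like this (illustrative sketch only; the real consumers, such as the GRU/XP drivers, are not part of this diff, and example_setup_mq() is a made-up name):

static int example_setup_mq(int blade, unsigned long mq_paddr,
                            unsigned int mq_size)
{
        unsigned long intr_mmr_offset;
        int watchlist;

        watchlist = uv_bios_mq_watchlist_alloc(blade, mq_paddr, mq_size,
                                               &intr_mmr_offset);
        if (watchlist < 0)
                return watchlist;        /* negative BIOS status code */

        /* ... use the queue; the BIOS signals activity via the MMR ... */

        return uv_bios_mq_watchlist_free(blade, watchlist);
}
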
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
new file mode 100644
index 000000000000..2ac0ab71412a
--- /dev/null
+++ b/arch/x86/kernel/check.c
@@ -0,0 +1,161 @@
1#include <linux/module.h>
2#include <linux/sched.h>
3#include <linux/kthread.h>
4#include <linux/workqueue.h>
5#include <asm/e820.h>
6#include <asm/proto.h>
7
8/*
9 * Some BIOSes seem to corrupt the low 64k of memory during events
10 * like suspend/resume and unplugging an HDMI cable. Reserve all
11 * remaining free memory in that area and fill it with a distinct
12 * pattern.
13 */
14#define MAX_SCAN_AREAS 8
15
16static int __read_mostly memory_corruption_check = -1;
17
18static unsigned __read_mostly corruption_check_size = 64*1024;
19static unsigned __read_mostly corruption_check_period = 60; /* seconds */
20
21static struct e820entry scan_areas[MAX_SCAN_AREAS];
22static int num_scan_areas;
23
24
25static __init int set_corruption_check(char *arg)
26{
27 char *end;
28
29 memory_corruption_check = simple_strtol(arg, &end, 10);
30
31 return (*end == 0) ? 0 : -EINVAL;
32}
33early_param("memory_corruption_check", set_corruption_check);
34
35static __init int set_corruption_check_period(char *arg)
36{
37 char *end;
38
39 corruption_check_period = simple_strtoul(arg, &end, 10);
40
41 return (*end == 0) ? 0 : -EINVAL;
42}
43early_param("memory_corruption_check_period", set_corruption_check_period);
44
45static __init int set_corruption_check_size(char *arg)
46{
47 char *end;
48 unsigned size;
49
50 size = memparse(arg, &end);
51
52 if (*end == '\0')
53 corruption_check_size = size;
54
55 return (size == corruption_check_size) ? 0 : -EINVAL;
56}
57early_param("memory_corruption_check_size", set_corruption_check_size);
58
59
60void __init setup_bios_corruption_check(void)
61{
62 u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */
63
64 if (memory_corruption_check == -1) {
65 memory_corruption_check =
66#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
67 1
68#else
69 0
70#endif
71 ;
72 }
73
74 if (corruption_check_size == 0)
75 memory_corruption_check = 0;
76
77 if (!memory_corruption_check)
78 return;
79
80 corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
81
82 while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
83 u64 size;
84 addr = find_e820_area_size(addr, &size, PAGE_SIZE);
85
86 if (addr == 0)
87 break;
88
89 if ((addr + size) > corruption_check_size)
90 size = corruption_check_size - addr;
91
92 if (size == 0)
93 break;
94
95 e820_update_range(addr, size, E820_RAM, E820_RESERVED);
96 scan_areas[num_scan_areas].addr = addr;
97 scan_areas[num_scan_areas].size = size;
98 num_scan_areas++;
99
100 /* Assume we've already mapped this early memory */
101 memset(__va(addr), 0, size);
102
103 addr += size;
104 }
105
106 printk(KERN_INFO "Scanning %d areas for low memory corruption\n",
107 num_scan_areas);
108 update_e820();
109}
110
111
112void check_for_bios_corruption(void)
113{
114 int i;
115 int corruption = 0;
116
117 if (!memory_corruption_check)
118 return;
119
120 for (i = 0; i < num_scan_areas; i++) {
121 unsigned long *addr = __va(scan_areas[i].addr);
122 unsigned long size = scan_areas[i].size;
123
124 for (; size; addr++, size -= sizeof(unsigned long)) {
125 if (!*addr)
126 continue;
127 printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n",
128 addr, __pa(addr), *addr);
129 corruption = 1;
130 *addr = 0;
131 }
132 }
133
134 WARN_ONCE(corruption, KERN_ERR "Memory corruption detected in low memory\n");
135}
136
137static void check_corruption(struct work_struct *dummy);
138static DECLARE_DELAYED_WORK(bios_check_work, check_corruption);
139
140static void check_corruption(struct work_struct *dummy)
141{
142 check_for_bios_corruption();
143 schedule_delayed_work(&bios_check_work,
144 round_jiffies_relative(corruption_check_period*HZ));
145}
146
147static int start_periodic_check_for_corruption(void)
148{
149 if (!memory_corruption_check || corruption_check_period == 0)
150 return 0;
151
152 printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n",
153 corruption_check_period);
154
155 /* First time we run the checks right away */
156 schedule_delayed_work(&bios_check_work, 0);
157 return 0;
158}
159
160module_init(start_periodic_check_for_corruption);
161
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 82ec6075c057..82db7f45e2de 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -2,8 +2,14 @@
 # Makefile for x86-compatible CPU details and quirks
 #
 
+# Don't trace early stages of a secondary CPU boot
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_common.o = -pg
+endif
+
 obj-y := intel_cacheinfo.o addon_cpuid_features.o
 obj-y += proc.o capflags.o powerflags.o common.o
+obj-y += vmware.o hypervisor.o
 
 obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
 obj-$(CONFIG_X86_64) += bugs_64.o
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index ef8f831af823..2cf23634b6d9 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -120,9 +120,17 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
         c->cpu_core_id = phys_pkg_id(c->initial_apicid, ht_mask_width)
                                                  & core_select_mask;
         c->phys_proc_id = phys_pkg_id(c->initial_apicid, core_plus_mask_width);
+        /*
+         * Reinit the apicid, now that we have extended initial_apicid.
+         */
+        c->apicid = phys_pkg_id(c->initial_apicid, 0);
 #else
         c->cpu_core_id = phys_pkg_id(ht_mask_width) & core_select_mask;
         c->phys_proc_id = phys_pkg_id(core_plus_mask_width);
+        /*
+         * Reinit the apicid, now that we have extended initial_apicid.
+         */
+        c->apicid = phys_pkg_id(0);
 #endif
         c->x86_max_cores = (core_level_siblings / smp_num_siblings);
 
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 8f1e31db2ad5..7c878f6aa919 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -283,9 +283,14 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
 {
         early_init_amd_mc(c);
 
-        /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
-        if (c->x86_power & (1<<8))
+        /*
+         * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
+         * with P/T states and does not stop in deep C-states
+         */
+        if (c->x86_power & (1 << 8)) {
                 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+                set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+        }
 
 #ifdef CONFIG_X86_64
         set_cpu_cap(c, X86_FEATURE_SYSCALL32);
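
Both this hunk and the matching intel.c hunk below set X86_FEATURE_NONSTOP_TSC next to X86_FEATURE_CONSTANT_TSC when CPUID 8000_0007 EDX bit 8 is present. A consumer would typically require both bits before trusting the TSC across idle; a small sketch of such a test (the helper name is hypothetical, not something this diff adds):

/* sketch: the TSC rate is constant across P-states and keeps counting in
 * deep C-states, so it can remain the clock source while CPUs idle */
static bool tsc_usable_across_idle(void)
{
        return boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
               boot_cpu_has(X86_FEATURE_NONSTOP_TSC);
}
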
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b9c9ea0217a9..42e0853030cb 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -36,6 +36,7 @@
 #include <asm/proto.h>
 #include <asm/sections.h>
 #include <asm/setup.h>
+#include <asm/hypervisor.h>
 
 #include "cpu.h"
 
@@ -703,6 +704,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
         detect_ht(c);
 #endif
 
+        init_hypervisor(c);
         /*
          * On SMP, boot_cpu_data holds the common feature set between
          * all CPUs; so make sure that we indicate which features are
@@ -862,7 +864,7 @@ EXPORT_SYMBOL(_cpu_pda);
 
 struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
 
-char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
+static char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
 
 void __cpuinit pda_init(int cpu)
 {
@@ -903,8 +905,8 @@ void __cpuinit pda_init(int cpu)
         }
 }
 
-char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
+static char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
         DEBUG_STKSZ] __page_aligned_bss;
 
 extern asmlinkage void ignore_sysret(void);
 
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
new file mode 100644
index 000000000000..fb5b86af0b01
--- /dev/null
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -0,0 +1,58 @@
1/*
2 * Common hypervisor code
3 *
4 * Copyright (C) 2008, VMware, Inc.
5 * Author : Alok N Kataria <akataria@vmware.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
15 * NON INFRINGEMENT. See the GNU General Public License for more
16 * details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
21 *
22 */
23
24#include <asm/processor.h>
25#include <asm/vmware.h>
26#include <asm/hypervisor.h>
27
28static inline void __cpuinit
29detect_hypervisor_vendor(struct cpuinfo_x86 *c)
30{
31 if (vmware_platform()) {
32 c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE;
33 } else {
34 c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE;
35 }
36}
37
38unsigned long get_hypervisor_tsc_freq(void)
39{
40 if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
41 return vmware_get_tsc_khz();
42 return 0;
43}
44
45static inline void __cpuinit
46hypervisor_set_feature_bits(struct cpuinfo_x86 *c)
47{
48 if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) {
49 vmware_set_feature_bits(c);
50 return;
51 }
52}
53
54void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
55{
56 detect_hypervisor_vendor(c);
57 hypervisor_set_feature_bits(c);
58}
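
hypervisor.c gives the rest of the kernel two entry points: init_hypervisor(), wired into identify_cpu() by the common.c hunk above, and get_hypervisor_tsc_freq(). The intent of the latter is to let TSC calibration skip the PIT/HPET dance when the hypervisor already reports the frequency; a sketch of that call site (the tsc.c side of this series, reproduced here only as an illustration, not verbatim):

static unsigned long try_hypervisor_tsc_khz(void)
{
        /* non-zero only when a known hypervisor (VMware here) was detected */
        unsigned long khz = get_hypervisor_tsc_freq();

        if (khz)
                printk(KERN_INFO "TSC: Frequency read from the hypervisor\n");

        return khz;        /* 0 means: fall back to normal calibration */
}
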
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 816f27f289b1..8ea6929e974c 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -11,7 +11,6 @@
 #include <asm/pgtable.h>
 #include <asm/msr.h>
 #include <asm/uaccess.h>
-#include <asm/ptrace.h>
 #include <asm/ds.h>
 #include <asm/bugs.h>
 
@@ -41,6 +40,16 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
         if (c->x86 == 15 && c->x86_cache_alignment == 64)
                 c->x86_cache_alignment = 128;
 #endif
+
+        /*
+         * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate
+         * with P/T states and does not stop in deep C-states
+         */
+        if (c->x86_power & (1 << 8)) {
+                set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+                set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
+        }
+
 }
 
 #ifdef CONFIG_X86_32
@@ -242,6 +251,13 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 
         intel_workarounds(c);
 
+        /*
+         * Detect the extended topology information if available. This
+         * will reinitialise the initial_apicid which will be used
+         * in init_intel_cacheinfo()
+         */
+        detect_extended_topology(c);
+
         l2 = init_intel_cacheinfo(c);
         if (c->cpuid_level > 9) {
                 unsigned eax = cpuid_eax(10);
@@ -309,10 +325,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
         set_cpu_cap(c, X86_FEATURE_P3);
 #endif
 
-        if (cpu_has_bts)
-                ptrace_bts_init_intel(c);
-
-        detect_extended_topology(c);
         if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
                 /*
                  * let's use the legacy cpuid vector 0x1 and 0x4 for topology
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 7bd00a565672..48533d77be78 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -641,20 +641,17 @@ static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf)
         return show_shared_cpu_map_func(leaf, 1, buf);
 }
 
-static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) {
-        switch(this_leaf->eax.split.type) {
-        case CACHE_TYPE_DATA:
-                return sprintf(buf, "Data\n");
-                break;
-        case CACHE_TYPE_INST:
-                return sprintf(buf, "Instruction\n");
-                break;
-        case CACHE_TYPE_UNIFIED:
-                return sprintf(buf, "Unified\n");
-                break;
-        default:
-                return sprintf(buf, "Unknown\n");
-                break;
-        }
+static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
+{
+        switch (this_leaf->eax.split.type) {
+        case CACHE_TYPE_DATA:
+                return sprintf(buf, "Data\n");
+        case CACHE_TYPE_INST:
+                return sprintf(buf, "Instruction\n");
+        case CACHE_TYPE_UNIFIED:
+                return sprintf(buf, "Unified\n");
+        default:
+                return sprintf(buf, "Unknown\n");
+        }
 }
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 4b031a4ac856..1c838032fd37 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -510,12 +510,9 @@ static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
          */
 void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 {
-        static cpumask_t mce_cpus = CPU_MASK_NONE;
-
         mce_cpu_quirks(c);
 
         if (mce_dont_init ||
-            cpu_test_and_set(smp_processor_id(), mce_cpus) ||
             !mce_available(c))
                 return;
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index a1de80f368f1..a5a5e0530370 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -248,7 +248,7 @@ asmlinkage void mce_threshold_interrupt(void)
                 }
         }
 out:
-        add_pda(irq_threshold_count, 1);
+        inc_irq_stat(irq_threshold_count);
         irq_exit();
 }
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index c17eaf5dd6dd..4b48f251fd39 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -26,7 +26,7 @@ asmlinkage void smp_thermal_interrupt(void)
         if (therm_throt_process(msr_val & 1))
                 mce_log_therm_throt_event(smp_processor_id(), msr_val);
 
-        add_pda(irq_thermal_count, 1);
+        inc_irq_stat(irq_thermal_count);
         irq_exit();
 }
 
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index c78c04821ea1..1159e269e596 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -803,6 +803,7 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
803} 803}
804 804
805static struct res_range __initdata range[RANGE_NUM]; 805static struct res_range __initdata range[RANGE_NUM];
806static int __initdata nr_range;
806 807
807#ifdef CONFIG_MTRR_SANITIZER 808#ifdef CONFIG_MTRR_SANITIZER
808 809
@@ -1206,39 +1207,43 @@ struct mtrr_cleanup_result {
1206#define PSHIFT (PAGE_SHIFT - 10) 1207#define PSHIFT (PAGE_SHIFT - 10)
1207 1208
1208static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; 1209static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
1209static struct res_range __initdata range_new[RANGE_NUM];
1210static unsigned long __initdata min_loss_pfn[RANGE_NUM]; 1210static unsigned long __initdata min_loss_pfn[RANGE_NUM];
1211 1211
1212static int __init mtrr_cleanup(unsigned address_bits) 1212static void __init print_out_mtrr_range_state(void)
1213{ 1213{
1214 unsigned long extra_remove_base, extra_remove_size;
1215 unsigned long base, size, def, dummy;
1216 mtrr_type type;
1217 int nr_range, nr_range_new;
1218 u64 chunk_size, gran_size;
1219 unsigned long range_sums, range_sums_new;
1220 int index_good;
1221 int num_reg_good;
1222 int i; 1214 int i;
1215 char start_factor = 'K', size_factor = 'K';
1216 unsigned long start_base, size_base;
1217 mtrr_type type;
1223 1218
1224 /* extra one for all 0 */ 1219 for (i = 0; i < num_var_ranges; i++) {
1225 int num[MTRR_NUM_TYPES + 1];
1226 1220
1227 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) 1221 size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
1228 return 0; 1222 if (!size_base)
1229 rdmsr(MTRRdefType_MSR, def, dummy); 1223 continue;
1230 def &= 0xff;
1231 if (def != MTRR_TYPE_UNCACHABLE)
1232 return 0;
1233 1224
1234 /* get it and store it aside */ 1225 size_base = to_size_factor(size_base, &size_factor),
1235 memset(range_state, 0, sizeof(range_state)); 1226 start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
1236 for (i = 0; i < num_var_ranges; i++) { 1227 start_base = to_size_factor(start_base, &start_factor),
1237 mtrr_if->get(i, &base, &size, &type); 1228 type = range_state[i].type;
1238 range_state[i].base_pfn = base; 1229
1239 range_state[i].size_pfn = size; 1230 printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
1240 range_state[i].type = type; 1231 i, start_base, start_factor,
1232 size_base, size_factor,
1233 (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
1234 ((type == MTRR_TYPE_WRPROT) ? "WP" :
1235 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
1236 );
1241 } 1237 }
1238}
1239
1240static int __init mtrr_need_cleanup(void)
1241{
1242 int i;
1243 mtrr_type type;
1244 unsigned long size;
1245 /* extra one for all 0 */
1246 int num[MTRR_NUM_TYPES + 1];
1242 1247
1243 /* check entries number */ 1248 /* check entries number */
1244 memset(num, 0, sizeof(num)); 1249 memset(num, 0, sizeof(num));
@@ -1263,29 +1268,133 @@ static int __init mtrr_cleanup(unsigned address_bits)
1263 num_var_ranges - num[MTRR_NUM_TYPES]) 1268 num_var_ranges - num[MTRR_NUM_TYPES])
1264 return 0; 1269 return 0;
1265 1270
1266 /* print original var MTRRs at first, for debugging: */ 1271 return 1;
1267 printk(KERN_DEBUG "original variable MTRRs\n"); 1272}
1268 for (i = 0; i < num_var_ranges; i++) {
1269 char start_factor = 'K', size_factor = 'K';
1270 unsigned long start_base, size_base;
1271 1273
1272 size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10); 1274static unsigned long __initdata range_sums;
1273 if (!size_base) 1275static void __init mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
1274 continue; 1276 unsigned long extra_remove_base,
1277 unsigned long extra_remove_size,
1278 int i)
1279{
1280 int num_reg;
1281 static struct res_range range_new[RANGE_NUM];
1282 static int nr_range_new;
1283 unsigned long range_sums_new;
1284
1285 /* convert ranges to var ranges state */
1286 num_reg = x86_setup_var_mtrrs(range, nr_range,
1287 chunk_size, gran_size);
1288
1289 /* we got new setting in range_state, check it */
1290 memset(range_new, 0, sizeof(range_new));
1291 nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
1292 extra_remove_base, extra_remove_size);
1293 range_sums_new = sum_ranges(range_new, nr_range_new);
1294
1295 result[i].chunk_sizek = chunk_size >> 10;
1296 result[i].gran_sizek = gran_size >> 10;
1297 result[i].num_reg = num_reg;
1298 if (range_sums < range_sums_new) {
1299 result[i].lose_cover_sizek =
1300 (range_sums_new - range_sums) << PSHIFT;
1301 result[i].bad = 1;
1302 } else
1303 result[i].lose_cover_sizek =
1304 (range_sums - range_sums_new) << PSHIFT;
1275 1305
1276 size_base = to_size_factor(size_base, &size_factor), 1306 /* double check it */
1277 start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); 1307 if (!result[i].bad && !result[i].lose_cover_sizek) {
1278 start_base = to_size_factor(start_base, &start_factor), 1308 if (nr_range_new != nr_range ||
1279 type = range_state[i].type; 1309 memcmp(range, range_new, sizeof(range)))
1310 result[i].bad = 1;
1311 }
1280 1312
1281 printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n", 1313 if (!result[i].bad && (range_sums - range_sums_new <
1282 i, start_base, start_factor, 1314 min_loss_pfn[num_reg])) {
1283 size_base, size_factor, 1315 min_loss_pfn[num_reg] =
1284 (type == MTRR_TYPE_UNCACHABLE) ? "UC" : 1316 range_sums - range_sums_new;
1285 ((type == MTRR_TYPE_WRPROT) ? "WP" :
1286 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
1287 );
1288 } 1317 }
1318}
1319
1320static void __init mtrr_print_out_one_result(int i)
1321{
1322 char gran_factor, chunk_factor, lose_factor;
1323 unsigned long gran_base, chunk_base, lose_base;
1324
1325 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1326 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1327 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1328 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1329 result[i].bad ? "*BAD*" : " ",
1330 gran_base, gran_factor, chunk_base, chunk_factor);
1331 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1332 result[i].num_reg, result[i].bad ? "-" : "",
1333 lose_base, lose_factor);
1334}
1335
1336static int __init mtrr_search_optimal_index(void)
1337{
1338 int i;
1339 int num_reg_good;
1340 int index_good;
1341
1342 if (nr_mtrr_spare_reg >= num_var_ranges)
1343 nr_mtrr_spare_reg = num_var_ranges - 1;
1344 num_reg_good = -1;
1345 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
1346 if (!min_loss_pfn[i])
1347 num_reg_good = i;
1348 }
1349
1350 index_good = -1;
1351 if (num_reg_good != -1) {
1352 for (i = 0; i < NUM_RESULT; i++) {
1353 if (!result[i].bad &&
1354 result[i].num_reg == num_reg_good &&
1355 !result[i].lose_cover_sizek) {
1356 index_good = i;
1357 break;
1358 }
1359 }
1360 }
1361
1362 return index_good;
1363}
1364
1365
1366static int __init mtrr_cleanup(unsigned address_bits)
1367{
1368 unsigned long extra_remove_base, extra_remove_size;
1369 unsigned long base, size, def, dummy;
1370 mtrr_type type;
1371 u64 chunk_size, gran_size;
1372 int index_good;
1373 int i;
1374
1375 if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
1376 return 0;
1377 rdmsr(MTRRdefType_MSR, def, dummy);
1378 def &= 0xff;
1379 if (def != MTRR_TYPE_UNCACHABLE)
1380 return 0;
1381
1382 /* get it and store it aside */
1383 memset(range_state, 0, sizeof(range_state));
1384 for (i = 0; i < num_var_ranges; i++) {
1385 mtrr_if->get(i, &base, &size, &type);
1386 range_state[i].base_pfn = base;
1387 range_state[i].size_pfn = size;
1388 range_state[i].type = type;
1389 }
1390
1391 /* check if we need handle it and can handle it */
1392 if (!mtrr_need_cleanup())
1393 return 0;
1394
1395 /* print original var MTRRs at first, for debugging: */
1396 printk(KERN_DEBUG "original variable MTRRs\n");
1397 print_out_mtrr_range_state();
1289 1398
1290 memset(range, 0, sizeof(range)); 1399 memset(range, 0, sizeof(range));
1291 extra_remove_size = 0; 1400 extra_remove_size = 0;
@@ -1309,176 +1418,64 @@ static int __init mtrr_cleanup(unsigned address_bits)
1309 range_sums >> (20 - PAGE_SHIFT)); 1418 range_sums >> (20 - PAGE_SHIFT));
1310 1419
1311 if (mtrr_chunk_size && mtrr_gran_size) { 1420 if (mtrr_chunk_size && mtrr_gran_size) {
1312 int num_reg; 1421 i = 0;
1313 char gran_factor, chunk_factor, lose_factor; 1422 mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size,
1314 unsigned long gran_base, chunk_base, lose_base; 1423 extra_remove_base, extra_remove_size, i);
1315
1316 debug_print++;
1317 /* convert ranges to var ranges state */
1318 num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size,
1319 mtrr_gran_size);
1320 1424
1321 /* we got new setting in range_state, check it */ 1425 mtrr_print_out_one_result(i);
1322 memset(range_new, 0, sizeof(range_new));
1323 nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
1324 extra_remove_base,
1325 extra_remove_size);
1326 range_sums_new = sum_ranges(range_new, nr_range_new);
1327 1426
1328 i = 0;
1329 result[i].chunk_sizek = mtrr_chunk_size >> 10;
1330 result[i].gran_sizek = mtrr_gran_size >> 10;
1331 result[i].num_reg = num_reg;
1332 if (range_sums < range_sums_new) {
1333 result[i].lose_cover_sizek =
1334 (range_sums_new - range_sums) << PSHIFT;
1335 result[i].bad = 1;
1336 } else
1337 result[i].lose_cover_sizek =
1338 (range_sums - range_sums_new) << PSHIFT;
1339
1340 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1341 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1342 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1343 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1344 result[i].bad?"*BAD*":" ",
1345 gran_base, gran_factor, chunk_base, chunk_factor);
1346 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1347 result[i].num_reg, result[i].bad?"-":"",
1348 lose_base, lose_factor);
1349 if (!result[i].bad) { 1427 if (!result[i].bad) {
1350 set_var_mtrr_all(address_bits); 1428 set_var_mtrr_all(address_bits);
1351 return 1; 1429 return 1;
1352 } 1430 }
1353 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " 1431 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
1354 "will find optimal one\n"); 1432 "will find optimal one\n");
1355 debug_print--;
1356 memset(result, 0, sizeof(result[0]));
1357 } 1433 }
1358 1434
1359 i = 0; 1435 i = 0;
1360 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); 1436 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
1361 memset(result, 0, sizeof(result)); 1437 memset(result, 0, sizeof(result));
1362 for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) { 1438 for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
1363 char gran_factor;
1364 unsigned long gran_base;
1365
1366 if (debug_print)
1367 gran_base = to_size_factor(gran_size >> 10, &gran_factor);
1368 1439
1369 for (chunk_size = gran_size; chunk_size < (1ULL<<32); 1440 for (chunk_size = gran_size; chunk_size < (1ULL<<32);
1370 chunk_size <<= 1) { 1441 chunk_size <<= 1) {
1371 int num_reg;
1372 1442
1373 if (debug_print) {
1374 char chunk_factor;
1375 unsigned long chunk_base;
1376
1377 chunk_base = to_size_factor(chunk_size>>10, &chunk_factor),
1378 printk(KERN_INFO "\n");
1379 printk(KERN_INFO "gran_size: %ld%c chunk_size: %ld%c \n",
1380 gran_base, gran_factor, chunk_base, chunk_factor);
1381 }
1382 if (i >= NUM_RESULT) 1443 if (i >= NUM_RESULT)
1383 continue; 1444 continue;
1384 1445
1385 /* convert ranges to var ranges state */ 1446 mtrr_calc_range_state(chunk_size, gran_size,
1386 num_reg = x86_setup_var_mtrrs(range, nr_range, 1447 extra_remove_base, extra_remove_size, i);
1387 chunk_size, gran_size); 1448 if (debug_print) {
1388 1449 mtrr_print_out_one_result(i);
1389 /* we got new setting in range_state, check it */ 1450 printk(KERN_INFO "\n");
1390 memset(range_new, 0, sizeof(range_new));
1391 nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
1392 extra_remove_base, extra_remove_size);
1393 range_sums_new = sum_ranges(range_new, nr_range_new);
1394
1395 result[i].chunk_sizek = chunk_size >> 10;
1396 result[i].gran_sizek = gran_size >> 10;
1397 result[i].num_reg = num_reg;
1398 if (range_sums < range_sums_new) {
1399 result[i].lose_cover_sizek =
1400 (range_sums_new - range_sums) << PSHIFT;
1401 result[i].bad = 1;
1402 } else
1403 result[i].lose_cover_sizek =
1404 (range_sums - range_sums_new) << PSHIFT;
1405
1406 /* double check it */
1407 if (!result[i].bad && !result[i].lose_cover_sizek) {
1408 if (nr_range_new != nr_range ||
1409 memcmp(range, range_new, sizeof(range)))
1410 result[i].bad = 1;
1411 } 1451 }
1412 1452
1413 if (!result[i].bad && (range_sums - range_sums_new <
1414 min_loss_pfn[num_reg])) {
1415 min_loss_pfn[num_reg] =
1416 range_sums - range_sums_new;
1417 }
1418 i++; 1453 i++;
1419 } 1454 }
1420 } 1455 }
1421 1456
1422 /* print out all */
1423 for (i = 0; i < NUM_RESULT; i++) {
1424 char gran_factor, chunk_factor, lose_factor;
1425 unsigned long gran_base, chunk_base, lose_base;
1426
1427 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1428 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1429 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1430 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1431 result[i].bad?"*BAD*":" ",
1432 gran_base, gran_factor, chunk_base, chunk_factor);
1433 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1434 result[i].num_reg, result[i].bad?"-":"",
1435 lose_base, lose_factor);
1436 }
1437
1438 /* try to find the optimal index */ 1457 /* try to find the optimal index */
1439 if (nr_mtrr_spare_reg >= num_var_ranges) 1458 index_good = mtrr_search_optimal_index();
1440 nr_mtrr_spare_reg = num_var_ranges - 1;
1441 num_reg_good = -1;
1442 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
1443 if (!min_loss_pfn[i])
1444 num_reg_good = i;
1445 }
1446
1447 index_good = -1;
1448 if (num_reg_good != -1) {
1449 for (i = 0; i < NUM_RESULT; i++) {
1450 if (!result[i].bad &&
1451 result[i].num_reg == num_reg_good &&
1452 !result[i].lose_cover_sizek) {
1453 index_good = i;
1454 break;
1455 }
1456 }
1457 }
1458 1459
1459 if (index_good != -1) { 1460 if (index_good != -1) {
1460 char gran_factor, chunk_factor, lose_factor;
1461 unsigned long gran_base, chunk_base, lose_base;
1462
1463 printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); 1461 printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
1464 i = index_good; 1462 i = index_good;
1465 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), 1463 mtrr_print_out_one_result(i);
1466 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), 1464
1467 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1468 printk(KERN_INFO "gran_size: %ld%c \tchunk_size: %ld%c \t",
1469 gran_base, gran_factor, chunk_base, chunk_factor);
1470 printk(KERN_CONT "num_reg: %d \tlose RAM: %ld%c\n",
1471 result[i].num_reg, lose_base, lose_factor);
1472 /* convert ranges to var ranges state */ 1465 /* convert ranges to var ranges state */
1473 chunk_size = result[i].chunk_sizek; 1466 chunk_size = result[i].chunk_sizek;
1474 chunk_size <<= 10; 1467 chunk_size <<= 10;
1475 gran_size = result[i].gran_sizek; 1468 gran_size = result[i].gran_sizek;
1476 gran_size <<= 10; 1469 gran_size <<= 10;
1477 debug_print++;
1478 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); 1470 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
1479 debug_print--;
1480 set_var_mtrr_all(address_bits); 1471 set_var_mtrr_all(address_bits);
1472 printk(KERN_DEBUG "New variable MTRRs\n");
1473 print_out_mtrr_range_state();
1481 return 1; 1474 return 1;
1475 } else {
1476 /* print out all */
1477 for (i = 0; i < NUM_RESULT; i++)
1478 mtrr_print_out_one_result(i);
1482 } 1479 }
1483 1480
1484 printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n"); 1481 printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
@@ -1562,7 +1559,6 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1562{ 1559{
1563 unsigned long i, base, size, highest_pfn = 0, def, dummy; 1560 unsigned long i, base, size, highest_pfn = 0, def, dummy;
1564 mtrr_type type; 1561 mtrr_type type;
1565 int nr_range;
1566 u64 total_trim_size; 1562 u64 total_trim_size;
1567 1563
1568 /* extra one for all 0 */ 1564 /* extra one for all 0 */
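
The printing helpers factored out above (print_out_mtrr_range_state() and mtrr_print_out_one_result()) lean on to_size_factor(), which is defined elsewhere in this file and is not shown in the hunk. Roughly, it reduces a size given in KiB to the largest unit that divides it evenly and reports that unit through *factorp; a sketch of that existing helper, included here only for readability and possibly differing in detail from the real one:

static unsigned long __init to_size_factor(unsigned long sizek, char *factorp)
{
        unsigned long base = sizek;
        char factor;

        if (base & ((1 << 10) - 1)) {
                /* not a whole number of MiB, print in KiB */
                factor = 'K';
        } else if (base & ((1 << 20) - 1)) {
                /* whole MiB but not whole GiB */
                factor = 'M';
                base >>= 10;
        } else {
                factor = 'G';
                base >>= 20;
        }

        *factorp = factor;
        return base;
}
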
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
new file mode 100644
index 000000000000..284c399e3234
--- /dev/null
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -0,0 +1,112 @@
1/*
2 * VMware Detection code.
3 *
4 * Copyright (C) 2008, VMware, Inc.
5 * Author : Alok N Kataria <akataria@vmware.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
15 * NON INFRINGEMENT. See the GNU General Public License for more
16 * details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
21 *
22 */
23
24#include <linux/dmi.h>
25#include <asm/div64.h>
26#include <asm/vmware.h>
27
28#define CPUID_VMWARE_INFO_LEAF 0x40000000
29#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
30#define VMWARE_HYPERVISOR_PORT 0x5658
31
32#define VMWARE_PORT_CMD_GETVERSION 10
33#define VMWARE_PORT_CMD_GETHZ 45
34
35#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \
36 __asm__("inl (%%dx)" : \
37 "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
38 "0"(VMWARE_HYPERVISOR_MAGIC), \
39 "1"(VMWARE_PORT_CMD_##cmd), \
40 "2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \
41 "memory");
42
43static inline int __vmware_platform(void)
44{
45 uint32_t eax, ebx, ecx, edx;
46 VMWARE_PORT(GETVERSION, eax, ebx, ecx, edx);
47 return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC;
48}
49
50static unsigned long __vmware_get_tsc_khz(void)
51{
52 uint64_t tsc_hz;
53 uint32_t eax, ebx, ecx, edx;
54
55 VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
56
57 if (ebx == UINT_MAX)
58 return 0;
59 tsc_hz = eax | (((uint64_t)ebx) << 32);
60 do_div(tsc_hz, 1000);
61 BUG_ON(tsc_hz >> 32);
62 return tsc_hz;
63}
64
65/*
66 * While checking the dmi string information, just checking the product
67 * serial key should be enough, as this will always have a VMware
68 * specific string when running under VMware hypervisor.
69 */
70int vmware_platform(void)
71{
72 if (cpu_has_hypervisor) {
73 unsigned int eax, ebx, ecx, edx;
74 char hyper_vendor_id[13];
75
76 cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &ebx, &ecx, &edx);
77 memcpy(hyper_vendor_id + 0, &ebx, 4);
78 memcpy(hyper_vendor_id + 4, &ecx, 4);
79 memcpy(hyper_vendor_id + 8, &edx, 4);
80 hyper_vendor_id[12] = '\0';
81 if (!strcmp(hyper_vendor_id, "VMwareVMware"))
82 return 1;
83 } else if (dmi_available && dmi_name_in_serial("VMware") &&
84 __vmware_platform())
85 return 1;
86
87 return 0;
88}
89
90unsigned long vmware_get_tsc_khz(void)
91{
92 BUG_ON(!vmware_platform());
93 return __vmware_get_tsc_khz();
94}
95
96/*
97 * VMware hypervisor takes care of exporting a reliable TSC to the guest.
98 * Still, due to timing difference when running on virtual cpus, the TSC can
99 * be marked as unstable in some cases. For example, the TSC sync check at
100 * bootup can fail due to a marginal offset between vcpus' TSCs (though the
101 * TSCs do not drift from each other). Also, the ACPI PM timer clocksource
102 * is not suitable as a watchdog when running on a hypervisor because the
103 * kernel may miss a wrap of the counter if the vcpu is descheduled for a
104 * long time. To skip these checks at runtime we set these capability bits,
105 * so that the kernel could just trust the hypervisor with providing a
106 * reliable virtual TSC that is suitable for timekeeping.
107 */
108void __cpuinit vmware_set_feature_bits(struct cpuinfo_x86 *c)
109{
110 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
111 set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
112}
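
vmware.c first checks cpu_has_hypervisor before touching the 0x40000000 vendor leaf; that capability corresponds to CPUID.1:ECX bit 31, which hypervisors set to advertise their synthetic leaves. A standalone sketch of that test (for orientation only; the kernel consults the cached cpu_has_hypervisor flag rather than re-executing CPUID):

static int hypervisor_bit_set(void)
{
        unsigned int eax, ebx, ecx, edx;

        /* leaf 1, ECX bit 31: "running under a hypervisor" */
        cpuid(1, &eax, &ebx, &ecx, &edx);
        return (ecx >> 31) & 1;
}
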
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 19a8c2c0389f..da91701a2348 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -6,13 +6,13 @@
6 * precise-event based sampling (PEBS). 6 * precise-event based sampling (PEBS).
7 * 7 *
8 * It manages: 8 * It manages:
9 * - per-thread and per-cpu allocation of BTS and PEBS 9 * - DS and BTS hardware configuration
10 * - buffer overflow handling (to be done) 10 * - buffer overflow handling (to be done)
11 * - buffer access 11 * - buffer access
12 * 12 *
13 * It assumes: 13 * It does not do:
14 * - get_task_struct on all traced tasks 14 * - security checking (is the caller allowed to trace the task)
15 * - current is allowed to trace tasks 15 * - buffer allocation (memory accounting)
16 * 16 *
17 * 17 *
18 * Copyright (C) 2007-2008 Intel Corporation. 18 * Copyright (C) 2007-2008 Intel Corporation.
@@ -34,15 +34,30 @@
34 * The configuration for a particular DS hardware implementation. 34 * The configuration for a particular DS hardware implementation.
35 */ 35 */
36struct ds_configuration { 36struct ds_configuration {
37 /* the size of the DS structure in bytes */ 37 /* the name of the configuration */
38 unsigned char sizeof_ds; 38 const char *name;
39 /* the size of one pointer-typed field in the DS structure in bytes; 39 /* the size of one pointer-typed field in the DS structure and
40 this covers the first 8 fields related to buffer management. */ 40 in the BTS and PEBS buffers in bytes;
41 this covers the first 8 DS fields related to buffer management. */
41 unsigned char sizeof_field; 42 unsigned char sizeof_field;
42 /* the size of a BTS/PEBS record in bytes */ 43 /* the size of a BTS/PEBS record in bytes */
43 unsigned char sizeof_rec[2]; 44 unsigned char sizeof_rec[2];
45 /* a series of bit-masks to control various features indexed
46 * by enum ds_feature */
47 unsigned long ctl[dsf_ctl_max];
44}; 48};
45static struct ds_configuration ds_cfg; 49static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
50
51#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id())
52
53#define MAX_SIZEOF_DS (12 * 8) /* maximal size of a DS configuration */
54#define MAX_SIZEOF_BTS (3 * 8) /* maximal size of a BTS record */
55#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */
56
57#define BTS_CONTROL \
58 (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\
59 ds_cfg.ctl[dsf_bts_overflow])
60
46 61
47/* 62/*
48 * A BTS or PEBS tracer. 63 * A BTS or PEBS tracer.
@@ -61,6 +76,8 @@ struct ds_tracer {
61struct bts_tracer { 76struct bts_tracer {
62 /* the common DS part */ 77 /* the common DS part */
63 struct ds_tracer ds; 78 struct ds_tracer ds;
79 /* the trace including the DS configuration */
80 struct bts_trace trace;
64 /* buffer overflow notification function */ 81 /* buffer overflow notification function */
65 bts_ovfl_callback_t ovfl; 82 bts_ovfl_callback_t ovfl;
66}; 83};
@@ -68,6 +85,8 @@ struct bts_tracer {
68struct pebs_tracer { 85struct pebs_tracer {
69 /* the common DS part */ 86 /* the common DS part */
70 struct ds_tracer ds; 87 struct ds_tracer ds;
88 /* the trace including the DS configuration */
89 struct pebs_trace trace;
71 /* buffer overflow notification function */ 90 /* buffer overflow notification function */
72 pebs_ovfl_callback_t ovfl; 91 pebs_ovfl_callback_t ovfl;
73}; 92};
@@ -134,13 +153,11 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
134 (*(unsigned long *)base) = value; 153 (*(unsigned long *)base) = value;
135} 154}
136 155
137#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */
138
139 156
140/* 157/*
141 * Locking is done only for allocating BTS or PEBS resources. 158 * Locking is done only for allocating BTS or PEBS resources.
142 */ 159 */
143static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock); 160static DEFINE_SPINLOCK(ds_lock);
144 161
145 162
146/* 163/*
@@ -156,27 +173,32 @@ static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
156 * >0 number of per-thread tracers 173 * >0 number of per-thread tracers
157 * <0 number of per-cpu tracers 174 * <0 number of per-cpu tracers
158 * 175 *
159 * The below functions to get and put tracers and to check the
160 * allocation type require the ds_lock to be held by the caller.
161 *
 162 * Tracers essentially give the number of ds contexts for a certain 176 * Tracers essentially give the number of ds contexts for a certain
163 * type of allocation. 177 * type of allocation.
164 */ 178 */
165static long tracers; 179static atomic_t tracers = ATOMIC_INIT(0);
166 180
167static inline void get_tracer(struct task_struct *task) 181static inline void get_tracer(struct task_struct *task)
168{ 182{
169 tracers += (task ? 1 : -1); 183 if (task)
184 atomic_inc(&tracers);
185 else
186 atomic_dec(&tracers);
170} 187}
171 188
172static inline void put_tracer(struct task_struct *task) 189static inline void put_tracer(struct task_struct *task)
173{ 190{
174 tracers -= (task ? 1 : -1); 191 if (task)
192 atomic_dec(&tracers);
193 else
194 atomic_inc(&tracers);
175} 195}
176 196
177static inline int check_tracer(struct task_struct *task) 197static inline int check_tracer(struct task_struct *task)
178{ 198{
179 return (task ? (tracers >= 0) : (tracers <= 0)); 199 return task ?
200 (atomic_read(&tracers) >= 0) :
201 (atomic_read(&tracers) <= 0);
180} 202}
181 203
182 204
@@ -190,46 +212,66 @@ static inline int check_tracer(struct task_struct *task)
190 * Contexts are use-counted. They are allocated on first access and 212 * Contexts are use-counted. They are allocated on first access and
191 * deallocated when the last user puts the context. 213 * deallocated when the last user puts the context.
192 */ 214 */
193static DEFINE_PER_CPU(struct ds_context *, system_context); 215struct ds_context {
216 /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */
217 unsigned char ds[MAX_SIZEOF_DS];
218 /* the owner of the BTS and PEBS configuration, respectively */
219 struct bts_tracer *bts_master;
220 struct pebs_tracer *pebs_master;
221 /* use count */
222 unsigned long count;
223 /* a pointer to the context location inside the thread_struct
224 * or the per_cpu context array */
225 struct ds_context **this;
226 /* a pointer to the task owning this context, or NULL, if the
227 * context is owned by a cpu */
228 struct task_struct *task;
229};
230
231static DEFINE_PER_CPU(struct ds_context *, system_context_array);
232
233#define system_context per_cpu(system_context_array, smp_processor_id())
194 234
195#define this_system_context per_cpu(system_context, smp_processor_id())
196 235
197static inline struct ds_context *ds_get_context(struct task_struct *task) 236static inline struct ds_context *ds_get_context(struct task_struct *task)
198{ 237{
199 struct ds_context **p_context = 238 struct ds_context **p_context =
200 (task ? &task->thread.ds_ctx : &this_system_context); 239 (task ? &task->thread.ds_ctx : &system_context);
201 struct ds_context *context = *p_context; 240 struct ds_context *context = NULL;
241 struct ds_context *new_context = NULL;
202 unsigned long irq; 242 unsigned long irq;
203 243
204 if (!context) { 244 /* Chances are small that we already have a context. */
205 context = kzalloc(sizeof(*context), GFP_KERNEL); 245 new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
206 if (!context) 246 if (!new_context)
207 return NULL; 247 return NULL;
208 248
209 spin_lock_irqsave(&ds_lock, irq); 249 spin_lock_irqsave(&ds_lock, irq);
210 250
211 if (*p_context) { 251 context = *p_context;
212 kfree(context); 252 if (!context) {
253 context = new_context;
213 254
214 context = *p_context; 255 context->this = p_context;
215 } else { 256 context->task = task;
216 *p_context = context; 257 context->count = 0;
217 258
218 context->this = p_context; 259 if (task)
219 context->task = task; 260 set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
220 261
221 if (task) 262 if (!task || (task == current))
222 set_tsk_thread_flag(task, TIF_DS_AREA_MSR); 263 wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds);
223 264
224 if (!task || (task == current)) 265 *p_context = context;
225 wrmsrl(MSR_IA32_DS_AREA,
226 (unsigned long)context->ds);
227 }
228 spin_unlock_irqrestore(&ds_lock, irq);
229 } 266 }
230 267
231 context->count++; 268 context->count++;
232 269
270 spin_unlock_irqrestore(&ds_lock, irq);
271
272 if (context != new_context)
273 kfree(new_context);
274
233 return context; 275 return context;
234} 276}
235 277
@@ -242,8 +284,10 @@ static inline void ds_put_context(struct ds_context *context)
242 284
243 spin_lock_irqsave(&ds_lock, irq); 285 spin_lock_irqsave(&ds_lock, irq);
244 286
245 if (--context->count) 287 if (--context->count) {
246 goto out; 288 spin_unlock_irqrestore(&ds_lock, irq);
289 return;
290 }
247 291
248 *(context->this) = NULL; 292 *(context->this) = NULL;
249 293
@@ -253,14 +297,14 @@ static inline void ds_put_context(struct ds_context *context)
253 if (!context->task || (context->task == current)) 297 if (!context->task || (context->task == current))
254 wrmsrl(MSR_IA32_DS_AREA, 0); 298 wrmsrl(MSR_IA32_DS_AREA, 0);
255 299
256 kfree(context);
257 out:
258 spin_unlock_irqrestore(&ds_lock, irq); 300 spin_unlock_irqrestore(&ds_lock, irq);
301
302 kfree(context);
259} 303}
260 304
261 305
262/* 306/*
263 * Handle a buffer overflow 307 * Call the tracer's callback on a buffer overflow.
264 * 308 *
265 * context: the ds context 309 * context: the ds context
266 * qual: the buffer type 310 * qual: the buffer type
@@ -268,30 +312,247 @@ static inline void ds_put_context(struct ds_context *context)
268static void ds_overflow(struct ds_context *context, enum ds_qualifier qual) 312static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
269{ 313{
270 switch (qual) { 314 switch (qual) {
271 case ds_bts: { 315 case ds_bts:
272 struct bts_tracer *tracer = 316 if (context->bts_master &&
273 container_of(context->owner[qual], 317 context->bts_master->ovfl)
274 struct bts_tracer, ds); 318 context->bts_master->ovfl(context->bts_master);
275 if (tracer->ovfl)
276 tracer->ovfl(tracer);
277 }
278 break; 319 break;
279 case ds_pebs: { 320 case ds_pebs:
280 struct pebs_tracer *tracer = 321 if (context->pebs_master &&
281 container_of(context->owner[qual], 322 context->pebs_master->ovfl)
282 struct pebs_tracer, ds); 323 context->pebs_master->ovfl(context->pebs_master);
283 if (tracer->ovfl) 324 break;
284 tracer->ovfl(tracer); 325 }
326}
327
328
329/*
330 * Write raw data into the BTS or PEBS buffer.
331 *
332 * The remainder of any partially written record is zeroed out.
333 *
334 * context: the DS context
335 * qual: the buffer type
336 * record: the data to write
337 * size: the size of the data
338 */
339static int ds_write(struct ds_context *context, enum ds_qualifier qual,
340 const void *record, size_t size)
341{
342 int bytes_written = 0;
343
344 if (!record)
345 return -EINVAL;
346
347 while (size) {
348 unsigned long base, index, end, write_end, int_th;
349 unsigned long write_size, adj_write_size;
350
351 /*
352 * write as much as possible without producing an
353 * overflow interrupt.
354 *
355 * interrupt_threshold must either be
356 * - bigger than absolute_maximum or
357 * - point to a record between buffer_base and absolute_maximum
358 *
359 * index points to a valid record.
360 */
361 base = ds_get(context->ds, qual, ds_buffer_base);
362 index = ds_get(context->ds, qual, ds_index);
363 end = ds_get(context->ds, qual, ds_absolute_maximum);
364 int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
365
366 write_end = min(end, int_th);
367
368 /* if we are already beyond the interrupt threshold,
369 * we fill the entire buffer */
370 if (write_end <= index)
371 write_end = end;
372
373 if (write_end <= index)
374 break;
375
376 write_size = min((unsigned long) size, write_end - index);
377 memcpy((void *)index, record, write_size);
378
379 record = (const char *)record + write_size;
380 size -= write_size;
381 bytes_written += write_size;
382
383 adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
384 adj_write_size *= ds_cfg.sizeof_rec[qual];
385
386 /* zero out trailing bytes */
387 memset((char *)index + write_size, 0,
388 adj_write_size - write_size);
389 index += adj_write_size;
390
391 if (index >= end)
392 index = base;
393 ds_set(context->ds, qual, ds_index, index);
394
395 if (index >= int_th)
396 ds_overflow(context, qual);
397 }
398
399 return bytes_written;
400}
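
A worked pass through the loop above, assuming 24-byte records, buffer_base = index = 0x1000, absolute_maximum = 0x1048 (three records) and interrupt_threshold = 0x1030: the first whole-record write advances index to 0x1018 and stays below the threshold; the second advances it to 0x1030, which reaches the threshold and fires ds_overflow(); the third finds write_end (= min(end, int_th) = 0x1030) already at or below index, falls back to write_end = absolute_maximum, advances index to 0x1048 and wraps it back to buffer_base.
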
401
402
403/*
404 * Branch Trace Store (BTS) uses the following format. Different
405 * architectures vary in the size of those fields.
406 * - source linear address
407 * - destination linear address
408 * - flags
409 *
410 * Later architectures use 64bit pointers throughout, whereas earlier
411 * architectures use 32bit pointers in 32bit mode.
412 *
413 * We compute the base address for the first 8 fields based on:
414 * - the field size stored in the DS configuration
415 * - the relative field position
416 *
417 * In order to store additional information in the BTS buffer, we use
418 * a special source address to indicate that the record requires
419 * special interpretation.
420 *
 421 * On Netburst, a bit in the flags field indicated whether the branch
 422 * was predicted; this is ignored.
423 *
424 * We use two levels of abstraction:
425 * - the raw data level defined here
426 * - an arch-independent level defined in ds.h
427 */
428
429enum bts_field {
430 bts_from,
431 bts_to,
432 bts_flags,
433
434 bts_qual = bts_from,
435 bts_jiffies = bts_to,
436 bts_pid = bts_flags,
437
438 bts_qual_mask = (bts_qual_max - 1),
439 bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
440};
441
442static inline unsigned long bts_get(const char *base, enum bts_field field)
443{
444 base += (ds_cfg.sizeof_field * field);
445 return *(unsigned long *)base;
446}
447
448static inline void bts_set(char *base, enum bts_field field, unsigned long val)
449{
 450	base += (ds_cfg.sizeof_field * field);
451 (*(unsigned long *)base) = val;
452}
453
454
455/*
456 * The raw BTS data is architecture dependent.
457 *
458 * For higher-level users, we give an arch-independent view.
459 * - ds.h defines struct bts_struct
460 * - bts_read translates one raw bts record into a bts_struct
461 * - bts_write translates one bts_struct into the raw format and
 462 * writes it into the top of the given tracer's buffer.
463 *
464 * return: bytes read/written on success; -Eerrno, otherwise
465 */
466static int bts_read(struct bts_tracer *tracer, const void *at,
467 struct bts_struct *out)
468{
469 if (!tracer)
470 return -EINVAL;
471
472 if (at < tracer->trace.ds.begin)
473 return -EINVAL;
474
475 if (tracer->trace.ds.end < (at + tracer->trace.ds.size))
476 return -EINVAL;
477
478 memset(out, 0, sizeof(*out));
479 if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
480 out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
481 out->variant.timestamp.jiffies = bts_get(at, bts_jiffies);
482 out->variant.timestamp.pid = bts_get(at, bts_pid);
483 } else {
484 out->qualifier = bts_branch;
485 out->variant.lbr.from = bts_get(at, bts_from);
486 out->variant.lbr.to = bts_get(at, bts_to);
487
488 if (!out->variant.lbr.from && !out->variant.lbr.to)
489 out->qualifier = bts_invalid;
285 } 490 }
491
492 return ds_cfg.sizeof_rec[ds_bts];
493}
494
495static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
496{
497 unsigned char raw[MAX_SIZEOF_BTS];
498
499 if (!tracer)
500 return -EINVAL;
501
502 if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts])
503 return -EOVERFLOW;
504
505 switch (in->qualifier) {
506 case bts_invalid:
507 bts_set(raw, bts_from, 0);
508 bts_set(raw, bts_to, 0);
509 bts_set(raw, bts_flags, 0);
510 break;
511 case bts_branch:
512 bts_set(raw, bts_from, in->variant.lbr.from);
513 bts_set(raw, bts_to, in->variant.lbr.to);
514 bts_set(raw, bts_flags, 0);
286 break; 515 break;
516 case bts_task_arrives:
517 case bts_task_departs:
518 bts_set(raw, bts_qual, (bts_escape | in->qualifier));
519 bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies);
520 bts_set(raw, bts_pid, in->variant.timestamp.pid);
521 break;
522 default:
523 return -EINVAL;
287 } 524 }
525
526 return ds_write(tracer->ds.context, ds_bts, raw,
527 ds_cfg.sizeof_rec[ds_bts]);
288} 528}
289 529
290 530
291static void ds_install_ds_config(struct ds_context *context, 531static void ds_write_config(struct ds_context *context,
292 enum ds_qualifier qual, 532 struct ds_trace *cfg, enum ds_qualifier qual)
293 void *base, size_t size, size_t ith)
294{ 533{
534 unsigned char *ds = context->ds;
535
536 ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin);
537 ds_set(ds, qual, ds_index, (unsigned long)cfg->top);
538 ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end);
539 ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith);
540}
541
542static void ds_read_config(struct ds_context *context,
543 struct ds_trace *cfg, enum ds_qualifier qual)
544{
545 unsigned char *ds = context->ds;
546
547 cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base);
548 cfg->top = (void *)ds_get(ds, qual, ds_index);
549 cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum);
550 cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold);
551}
552
553static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
554 void *base, size_t size, size_t ith,
555 unsigned int flags) {
295 unsigned long buffer, adj; 556 unsigned long buffer, adj;
296 557
297 /* adjust the buffer address and size to meet alignment 558 /* adjust the buffer address and size to meet alignment
@@ -308,32 +569,30 @@ static void ds_install_ds_config(struct ds_context *context,
308 buffer += adj; 569 buffer += adj;
309 size -= adj; 570 size -= adj;
310 571
311 size /= ds_cfg.sizeof_rec[qual]; 572 trace->n = size / ds_cfg.sizeof_rec[qual];
312 size *= ds_cfg.sizeof_rec[qual]; 573 trace->size = ds_cfg.sizeof_rec[qual];
313 574
314 ds_set(context->ds, qual, ds_buffer_base, buffer); 575 size = (trace->n * trace->size);
315 ds_set(context->ds, qual, ds_index, buffer);
316 ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
317 576
577 trace->begin = (void *)buffer;
578 trace->top = trace->begin;
579 trace->end = (void *)(buffer + size);
318 /* The value for 'no threshold' is -1, which will set the 580 /* The value for 'no threshold' is -1, which will set the
319 * threshold outside of the buffer, just like we want it. 581 * threshold outside of the buffer, just like we want it.
320 */ 582 */
321 ds_set(context->ds, qual, 583 trace->ith = (void *)(buffer + size - ith);
322 ds_interrupt_threshold, buffer + size - ith); 584
585 trace->flags = flags;
323} 586}
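
A short numeric sketch of the sizing above, assuming a round-up to 8-byte alignment (the exact adjustment expression sits in the elided lines of this hunk) and the 64-bit 24-byte BTS record:

	unsigned long buffer = 0x1003, size = 100;	/* caller-supplied, unaligned        */
	unsigned long rec = 24, ith = 24;		/* sizeof_rec and threshold in bytes */
	unsigned long adj = (8 - (buffer & 7)) & 7;	/* assumed round-up to 8             */
	unsigned long n, end, threshold;

	buffer += adj;			/* 0x1008                                 */
	size   -= adj;			/* 95                                     */
	n    = size / rec;		/* 3 whole records, partial tail dropped  */
	size = n * rec;			/* 72 usable bytes                        */
	end  = buffer + size;		/* 0x1050                                 */
	threshold = end - ith;		/* 0x1038: one record below the end       */
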
324 587
325static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual, 588
326 struct task_struct *task, 589static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
327 void *base, size_t size, size_t th) 590 enum ds_qualifier qual, struct task_struct *task,
591 void *base, size_t size, size_t th, unsigned int flags)
328{ 592{
329 struct ds_context *context; 593 struct ds_context *context;
330 unsigned long irq;
331 int error; 594 int error;
332 595
333 error = -EOPNOTSUPP;
334 if (!ds_cfg.sizeof_ds)
335 goto out;
336
337 error = -EINVAL; 596 error = -EINVAL;
338 if (!base) 597 if (!base)
339 goto out; 598 goto out;
@@ -360,43 +619,26 @@ static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual,
360 goto out; 619 goto out;
361 tracer->context = context; 620 tracer->context = context;
362 621
622 ds_init_ds_trace(trace, qual, base, size, th, flags);
363 623
364 spin_lock_irqsave(&ds_lock, irq); 624 error = 0;
365
366 error = -EPERM;
367 if (!check_tracer(task))
368 goto out_unlock;
369 get_tracer(task);
370
371 error = -EPERM;
372 if (context->owner[qual])
373 goto out_put_tracer;
374 context->owner[qual] = tracer;
375
376 spin_unlock_irqrestore(&ds_lock, irq);
377
378
379 ds_install_ds_config(context, qual, base, size, th);
380
381 return 0;
382
383 out_put_tracer:
384 put_tracer(task);
385 out_unlock:
386 spin_unlock_irqrestore(&ds_lock, irq);
387 ds_put_context(context);
388 tracer->context = NULL;
389 out: 625 out:
390 return error; 626 return error;
391} 627}
392 628
393struct bts_tracer *ds_request_bts(struct task_struct *task, 629struct bts_tracer *ds_request_bts(struct task_struct *task,
394 void *base, size_t size, 630 void *base, size_t size,
395 bts_ovfl_callback_t ovfl, size_t th) 631 bts_ovfl_callback_t ovfl, size_t th,
632 unsigned int flags)
396{ 633{
397 struct bts_tracer *tracer; 634 struct bts_tracer *tracer;
635 unsigned long irq;
398 int error; 636 int error;
399 637
638 error = -EOPNOTSUPP;
639 if (!ds_cfg.ctl[dsf_bts])
640 goto out;
641
400 /* buffer overflow notification is not yet implemented */ 642 /* buffer overflow notification is not yet implemented */
401 error = -EOPNOTSUPP; 643 error = -EOPNOTSUPP;
402 if (ovfl) 644 if (ovfl)
@@ -408,12 +650,40 @@ struct bts_tracer *ds_request_bts(struct task_struct *task,
408 goto out; 650 goto out;
409 tracer->ovfl = ovfl; 651 tracer->ovfl = ovfl;
410 652
411 error = ds_request(&tracer->ds, ds_bts, task, base, size, th); 653 error = ds_request(&tracer->ds, &tracer->trace.ds,
654 ds_bts, task, base, size, th, flags);
412 if (error < 0) 655 if (error < 0)
413 goto out_tracer; 656 goto out_tracer;
414 657
658
659 spin_lock_irqsave(&ds_lock, irq);
660
661 error = -EPERM;
662 if (!check_tracer(task))
663 goto out_unlock;
664 get_tracer(task);
665
666 error = -EPERM;
667 if (tracer->ds.context->bts_master)
668 goto out_put_tracer;
669 tracer->ds.context->bts_master = tracer;
670
671 spin_unlock_irqrestore(&ds_lock, irq);
672
673
674 tracer->trace.read = bts_read;
675 tracer->trace.write = bts_write;
676
677 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
678 ds_resume_bts(tracer);
679
415 return tracer; 680 return tracer;
416 681
682 out_put_tracer:
683 put_tracer(task);
684 out_unlock:
685 spin_unlock_irqrestore(&ds_lock, irq);
686 ds_put_context(tracer->ds.context);
417 out_tracer: 687 out_tracer:
418 kfree(tracer); 688 kfree(tracer);
419 out: 689 out:
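
With the reworked interface a caller owns the buffer, passes the new flags argument, and gets back a tracer whose embedded bts_trace replaces the old index/access/read entry points. A hedged usage sketch for the BTS side, assuming the ds_request_bts()/ds_release_bts() declarations and the BTS_USER/BTS_KERNEL flags this series adds to asm/ds.h, and taking th = (size_t)-1 as "no interrupt threshold" per the comment in ds_init_ds_trace():

	#include <linux/err.h>
	#include <linux/slab.h>
	#include <asm/ds.h>

	static void *bts_buffer;
	static struct bts_tracer *bts;

	static int start_bts_on_current(void)
	{
		bts_buffer = kzalloc(PAGE_SIZE, GFP_KERNEL);	/* the DS layer no longer allocates */
		if (!bts_buffer)
			return -ENOMEM;

		bts = ds_request_bts(current, bts_buffer, PAGE_SIZE,
				     NULL /* no overflow callback */,
				     (size_t)-1 /* no threshold, assumed */,
				     BTS_KERNEL | BTS_USER);
		if (IS_ERR(bts)) {
			int error = PTR_ERR(bts);
			kfree(bts_buffer);
			return error;
		}
		return 0;
	}

	static void stop_bts(void)
	{
		ds_release_bts(bts);	/* suspends tracing and drops the context */
		kfree(bts_buffer);
	}
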
@@ -422,9 +692,11 @@ struct bts_tracer *ds_request_bts(struct task_struct *task,
422 692
423struct pebs_tracer *ds_request_pebs(struct task_struct *task, 693struct pebs_tracer *ds_request_pebs(struct task_struct *task,
424 void *base, size_t size, 694 void *base, size_t size,
425 pebs_ovfl_callback_t ovfl, size_t th) 695 pebs_ovfl_callback_t ovfl, size_t th,
696 unsigned int flags)
426{ 697{
427 struct pebs_tracer *tracer; 698 struct pebs_tracer *tracer;
699 unsigned long irq;
428 int error; 700 int error;
429 701
430 /* buffer overflow notification is not yet implemented */ 702 /* buffer overflow notification is not yet implemented */
@@ -438,300 +710,171 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task,
438 goto out; 710 goto out;
439 tracer->ovfl = ovfl; 711 tracer->ovfl = ovfl;
440 712
441 error = ds_request(&tracer->ds, ds_pebs, task, base, size, th); 713 error = ds_request(&tracer->ds, &tracer->trace.ds,
714 ds_pebs, task, base, size, th, flags);
442 if (error < 0) 715 if (error < 0)
443 goto out_tracer; 716 goto out_tracer;
444 717
718 spin_lock_irqsave(&ds_lock, irq);
719
720 error = -EPERM;
721 if (!check_tracer(task))
722 goto out_unlock;
723 get_tracer(task);
724
725 error = -EPERM;
726 if (tracer->ds.context->pebs_master)
727 goto out_put_tracer;
728 tracer->ds.context->pebs_master = tracer;
729
730 spin_unlock_irqrestore(&ds_lock, irq);
731
 732	ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
733 ds_resume_pebs(tracer);
734
445 return tracer; 735 return tracer;
446 736
737 out_put_tracer:
738 put_tracer(task);
739 out_unlock:
740 spin_unlock_irqrestore(&ds_lock, irq);
741 ds_put_context(tracer->ds.context);
447 out_tracer: 742 out_tracer:
448 kfree(tracer); 743 kfree(tracer);
449 out: 744 out:
450 return ERR_PTR(error); 745 return ERR_PTR(error);
451} 746}
452 747
453static void ds_release(struct ds_tracer *tracer, enum ds_qualifier qual) 748void ds_release_bts(struct bts_tracer *tracer)
454{
455 BUG_ON(tracer->context->owner[qual] != tracer);
456 tracer->context->owner[qual] = NULL;
457
458 put_tracer(tracer->context->task);
459 ds_put_context(tracer->context);
460}
461
462int ds_release_bts(struct bts_tracer *tracer)
463{ 749{
464 if (!tracer) 750 if (!tracer)
465 return -EINVAL; 751 return;
466 752
467 ds_release(&tracer->ds, ds_bts); 753 ds_suspend_bts(tracer);
468 kfree(tracer);
469 754
470 return 0; 755 WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
471} 756 tracer->ds.context->bts_master = NULL;
472 757
473int ds_release_pebs(struct pebs_tracer *tracer) 758 put_tracer(tracer->ds.context->task);
474{ 759 ds_put_context(tracer->ds.context);
475 if (!tracer)
476 return -EINVAL;
477 760
478 ds_release(&tracer->ds, ds_pebs);
479 kfree(tracer); 761 kfree(tracer);
480
481 return 0;
482} 762}
483 763
484static size_t ds_get_index(struct ds_context *context, enum ds_qualifier qual) 764void ds_suspend_bts(struct bts_tracer *tracer)
485{ 765{
486 unsigned long base, index; 766 struct task_struct *task;
487
488 base = ds_get(context->ds, qual, ds_buffer_base);
489 index = ds_get(context->ds, qual, ds_index);
490
491 return (index - base) / ds_cfg.sizeof_rec[qual];
492}
493 767
494int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos)
495{
496 if (!tracer) 768 if (!tracer)
497 return -EINVAL; 769 return;
498
499 if (!pos)
500 return -EINVAL;
501
502 *pos = ds_get_index(tracer->ds.context, ds_bts);
503
504 return 0;
505}
506 770
507int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos) 771 task = tracer->ds.context->task;
508{
509 if (!tracer)
510 return -EINVAL;
511 772
512 if (!pos) 773 if (!task || (task == current))
513 return -EINVAL; 774 update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL);
514 775
515 *pos = ds_get_index(tracer->ds.context, ds_pebs); 776 if (task) {
777 task->thread.debugctlmsr &= ~BTS_CONTROL;
516 778
517 return 0; 779 if (!task->thread.debugctlmsr)
780 clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
781 }
518} 782}
519 783
520static size_t ds_get_end(struct ds_context *context, enum ds_qualifier qual) 784void ds_resume_bts(struct bts_tracer *tracer)
521{ 785{
522 unsigned long base, max; 786 struct task_struct *task;
787 unsigned long control;
523 788
524 base = ds_get(context->ds, qual, ds_buffer_base);
525 max = ds_get(context->ds, qual, ds_absolute_maximum);
526
527 return (max - base) / ds_cfg.sizeof_rec[qual];
528}
529
530int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos)
531{
532 if (!tracer) 789 if (!tracer)
533 return -EINVAL; 790 return;
534
535 if (!pos)
536 return -EINVAL;
537
538 *pos = ds_get_end(tracer->ds.context, ds_bts);
539
540 return 0;
541}
542
543int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos)
544{
545 if (!tracer)
546 return -EINVAL;
547
548 if (!pos)
549 return -EINVAL;
550
551 *pos = ds_get_end(tracer->ds.context, ds_pebs);
552
553 return 0;
554}
555
556static int ds_access(struct ds_context *context, enum ds_qualifier qual,
557 size_t index, const void **record)
558{
559 unsigned long base, idx;
560
561 if (!record)
562 return -EINVAL;
563
564 base = ds_get(context->ds, qual, ds_buffer_base);
565 idx = base + (index * ds_cfg.sizeof_rec[qual]);
566
567 if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
568 return -EINVAL;
569 791
570 *record = (const void *)idx; 792 task = tracer->ds.context->task;
571 793
572 return ds_cfg.sizeof_rec[qual]; 794 control = ds_cfg.ctl[dsf_bts];
573} 795 if (!(tracer->trace.ds.flags & BTS_KERNEL))
796 control |= ds_cfg.ctl[dsf_bts_kernel];
797 if (!(tracer->trace.ds.flags & BTS_USER))
798 control |= ds_cfg.ctl[dsf_bts_user];
574 799
575int ds_access_bts(struct bts_tracer *tracer, size_t index, 800 if (task) {
576 const void **record) 801 task->thread.debugctlmsr |= control;
577{ 802 set_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
578 if (!tracer) 803 }
579 return -EINVAL;
580 804
581 return ds_access(tracer->ds.context, ds_bts, index, record); 805 if (!task || (task == current))
806 update_debugctlmsr(get_debugctlmsr() | control);
582} 807}
583 808
584int ds_access_pebs(struct pebs_tracer *tracer, size_t index, 809void ds_release_pebs(struct pebs_tracer *tracer)
585 const void **record)
586{ 810{
587 if (!tracer) 811 if (!tracer)
588 return -EINVAL; 812 return;
589
590 return ds_access(tracer->ds.context, ds_pebs, index, record);
591}
592
593static int ds_write(struct ds_context *context, enum ds_qualifier qual,
594 const void *record, size_t size)
595{
596 int bytes_written = 0;
597
598 if (!record)
599 return -EINVAL;
600
601 while (size) {
602 unsigned long base, index, end, write_end, int_th;
603 unsigned long write_size, adj_write_size;
604
605 /*
606 * write as much as possible without producing an
607 * overflow interrupt.
608 *
609 * interrupt_threshold must either be
610 * - bigger than absolute_maximum or
611 * - point to a record between buffer_base and absolute_maximum
612 *
613 * index points to a valid record.
614 */
615 base = ds_get(context->ds, qual, ds_buffer_base);
616 index = ds_get(context->ds, qual, ds_index);
617 end = ds_get(context->ds, qual, ds_absolute_maximum);
618 int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
619
620 write_end = min(end, int_th);
621
622 /* if we are already beyond the interrupt threshold,
623 * we fill the entire buffer */
624 if (write_end <= index)
625 write_end = end;
626
627 if (write_end <= index)
628 break;
629
630 write_size = min((unsigned long) size, write_end - index);
631 memcpy((void *)index, record, write_size);
632
633 record = (const char *)record + write_size;
634 size -= write_size;
635 bytes_written += write_size;
636
637 adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
638 adj_write_size *= ds_cfg.sizeof_rec[qual];
639
640 /* zero out trailing bytes */
641 memset((char *)index + write_size, 0,
642 adj_write_size - write_size);
643 index += adj_write_size;
644
645 if (index >= end)
646 index = base;
647 ds_set(context->ds, qual, ds_index, index);
648 813
649 if (index >= int_th) 814 ds_suspend_pebs(tracer);
650 ds_overflow(context, qual);
651 }
652 815
653 return bytes_written; 816 WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
654} 817 tracer->ds.context->pebs_master = NULL;
655 818
656int ds_write_bts(struct bts_tracer *tracer, const void *record, size_t size) 819 put_tracer(tracer->ds.context->task);
657{ 820 ds_put_context(tracer->ds.context);
658 if (!tracer)
659 return -EINVAL;
660 821
661 return ds_write(tracer->ds.context, ds_bts, record, size); 822 kfree(tracer);
662} 823}
663 824
664int ds_write_pebs(struct pebs_tracer *tracer, const void *record, size_t size) 825void ds_suspend_pebs(struct pebs_tracer *tracer)
665{ 826{
666 if (!tracer)
667 return -EINVAL;
668 827
669 return ds_write(tracer->ds.context, ds_pebs, record, size);
670} 828}
671 829
672static void ds_reset_or_clear(struct ds_context *context, 830void ds_resume_pebs(struct pebs_tracer *tracer)
673 enum ds_qualifier qual, int clear)
674{ 831{
675 unsigned long base, end;
676
677 base = ds_get(context->ds, qual, ds_buffer_base);
678 end = ds_get(context->ds, qual, ds_absolute_maximum);
679 832
680 if (clear)
681 memset((void *)base, 0, end - base);
682
683 ds_set(context->ds, qual, ds_index, base);
684} 833}
685 834
686int ds_reset_bts(struct bts_tracer *tracer) 835const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
687{ 836{
688 if (!tracer) 837 if (!tracer)
689 return -EINVAL; 838 return NULL;
690 839
691 ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 0); 840 ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
692 841 return &tracer->trace;
693 return 0;
694} 842}
695 843
696int ds_reset_pebs(struct pebs_tracer *tracer) 844const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
697{ 845{
698 if (!tracer) 846 if (!tracer)
699 return -EINVAL; 847 return NULL;
700 848
701 ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 0); 849 ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
850 tracer->trace.reset_value =
851 *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8));
702 852
703 return 0; 853 return &tracer->trace;
704} 854}
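
The trace view returned here carries both the buffer geometry (ds.begin, ds.top, ds.end, ds.size) and the read callback installed in ds_request_bts(), so a consumer can walk the records without knowing the per-CPU layout. A sketch, using the bts_struct fields already referenced by bts_read() above and ignoring buffer wrap-around for brevity:

	static void dump_bts(struct bts_tracer *tracer)
	{
		const struct bts_trace *trace = ds_read_bts(tracer);
		const unsigned char *at;
		struct bts_struct rec;

		if (!trace)
			return;

		/* ds.top is the next write position; earlier slots hold valid records */
		for (at = trace->ds.begin; at < (const unsigned char *)trace->ds.top;
		     at += trace->ds.size) {
			if (trace->read(tracer, at, &rec) < 0)
				break;
			if (rec.qualifier == bts_branch)
				printk(KERN_DEBUG "[ds] branch %llx -> %llx\n",
				       (unsigned long long)rec.variant.lbr.from,
				       (unsigned long long)rec.variant.lbr.to);
		}
	}
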
705 855
706int ds_clear_bts(struct bts_tracer *tracer) 856int ds_reset_bts(struct bts_tracer *tracer)
707{ 857{
708 if (!tracer) 858 if (!tracer)
709 return -EINVAL; 859 return -EINVAL;
710 860
711 ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 1); 861 tracer->trace.ds.top = tracer->trace.ds.begin;
712
713 return 0;
714}
715
716int ds_clear_pebs(struct pebs_tracer *tracer)
717{
718 if (!tracer)
719 return -EINVAL;
720 862
721 ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 1); 863 ds_set(tracer->ds.context->ds, ds_bts, ds_index,
864 (unsigned long)tracer->trace.ds.top);
722 865
723 return 0; 866 return 0;
724} 867}
725 868
726int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value) 869int ds_reset_pebs(struct pebs_tracer *tracer)
727{ 870{
728 if (!tracer) 871 if (!tracer)
729 return -EINVAL; 872 return -EINVAL;
730 873
731 if (!value) 874 tracer->trace.ds.top = tracer->trace.ds.begin;
732 return -EINVAL;
733 875
734 *value = *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)); 876 ds_set(tracer->ds.context->ds, ds_bts, ds_index,
877 (unsigned long)tracer->trace.ds.top);
735 878
736 return 0; 879 return 0;
737} 880}
@@ -746,35 +889,59 @@ int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value)
746 return 0; 889 return 0;
747} 890}
748 891
749static const struct ds_configuration ds_cfg_var = { 892static const struct ds_configuration ds_cfg_netburst = {
750 .sizeof_ds = sizeof(long) * 12, 893 .name = "netburst",
751 .sizeof_field = sizeof(long), 894 .ctl[dsf_bts] = (1 << 2) | (1 << 3),
752 .sizeof_rec[ds_bts] = sizeof(long) * 3, 895 .ctl[dsf_bts_kernel] = (1 << 5),
896 .ctl[dsf_bts_user] = (1 << 6),
897
898 .sizeof_field = sizeof(long),
899 .sizeof_rec[ds_bts] = sizeof(long) * 3,
753#ifdef __i386__ 900#ifdef __i386__
754 .sizeof_rec[ds_pebs] = sizeof(long) * 10 901 .sizeof_rec[ds_pebs] = sizeof(long) * 10,
755#else 902#else
756 .sizeof_rec[ds_pebs] = sizeof(long) * 18 903 .sizeof_rec[ds_pebs] = sizeof(long) * 18,
757#endif 904#endif
758}; 905};
759static const struct ds_configuration ds_cfg_64 = { 906static const struct ds_configuration ds_cfg_pentium_m = {
760 .sizeof_ds = 8 * 12, 907 .name = "pentium m",
761 .sizeof_field = 8, 908 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
762 .sizeof_rec[ds_bts] = 8 * 3, 909
910 .sizeof_field = sizeof(long),
911 .sizeof_rec[ds_bts] = sizeof(long) * 3,
763#ifdef __i386__ 912#ifdef __i386__
764 .sizeof_rec[ds_pebs] = 8 * 10 913 .sizeof_rec[ds_pebs] = sizeof(long) * 10,
765#else 914#else
766 .sizeof_rec[ds_pebs] = 8 * 18 915 .sizeof_rec[ds_pebs] = sizeof(long) * 18,
767#endif 916#endif
768}; 917};
918static const struct ds_configuration ds_cfg_core2 = {
919 .name = "core 2",
920 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
921 .ctl[dsf_bts_kernel] = (1 << 9),
922 .ctl[dsf_bts_user] = (1 << 10),
923
924 .sizeof_field = 8,
925 .sizeof_rec[ds_bts] = 8 * 3,
926 .sizeof_rec[ds_pebs] = 8 * 18,
927};
769 928
770static inline void 929static void
771ds_configure(const struct ds_configuration *cfg) 930ds_configure(const struct ds_configuration *cfg)
772{ 931{
932 memset(&ds_cfg, 0, sizeof(ds_cfg));
773 ds_cfg = *cfg; 933 ds_cfg = *cfg;
774 934
775 printk(KERN_INFO "DS available\n"); 935 printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name);
936
937 if (!cpu_has_bts) {
938 ds_cfg.ctl[dsf_bts] = 0;
939 printk(KERN_INFO "[ds] bts not available\n");
940 }
941 if (!cpu_has_pebs)
942 printk(KERN_INFO "[ds] pebs not available\n");
776 943
777 BUG_ON(MAX_SIZEOF_DS < ds_cfg.sizeof_ds); 944 WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field));
778} 945}
779 946
780void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) 947void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
@@ -787,10 +954,10 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
787 break; 954 break;
788 case 0xD: 955 case 0xD:
789 case 0xE: /* Pentium M */ 956 case 0xE: /* Pentium M */
790 ds_configure(&ds_cfg_var); 957 ds_configure(&ds_cfg_pentium_m);
791 break; 958 break;
792 default: /* Core2, Atom, ... */ 959 default: /* Core2, Atom, ... */
793 ds_configure(&ds_cfg_64); 960 ds_configure(&ds_cfg_core2);
794 break; 961 break;
795 } 962 }
796 break; 963 break;
@@ -799,7 +966,7 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
799 case 0x0: 966 case 0x0:
800 case 0x1: 967 case 0x1:
801 case 0x2: /* Netburst */ 968 case 0x2: /* Netburst */
802 ds_configure(&ds_cfg_var); 969 ds_configure(&ds_cfg_netburst);
803 break; 970 break;
804 default: 971 default:
805 /* sorry, don't know about them */ 972 /* sorry, don't know about them */
@@ -812,14 +979,52 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
812 } 979 }
813} 980}
814 981
815void ds_free(struct ds_context *context) 982/*
983 * Change the DS configuration from tracing prev to tracing next.
984 */
985void ds_switch_to(struct task_struct *prev, struct task_struct *next)
816{ 986{
817 /* This is called when the task owning the parameter context 987 struct ds_context *prev_ctx = prev->thread.ds_ctx;
818 * is dying. There should not be any user of that context left 988 struct ds_context *next_ctx = next->thread.ds_ctx;
819 * to disturb us, anymore. */ 989
820 unsigned long leftovers = context->count; 990 if (prev_ctx) {
821 while (leftovers--) { 991 update_debugctlmsr(0);
822 put_tracer(context->task); 992
823 ds_put_context(context); 993 if (prev_ctx->bts_master &&
994 (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
995 struct bts_struct ts = {
996 .qualifier = bts_task_departs,
997 .variant.timestamp.jiffies = jiffies_64,
998 .variant.timestamp.pid = prev->pid
999 };
1000 bts_write(prev_ctx->bts_master, &ts);
1001 }
1002 }
1003
1004 if (next_ctx) {
1005 if (next_ctx->bts_master &&
1006 (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
1007 struct bts_struct ts = {
1008 .qualifier = bts_task_arrives,
1009 .variant.timestamp.jiffies = jiffies_64,
1010 .variant.timestamp.pid = next->pid
1011 };
1012 bts_write(next_ctx->bts_master, &ts);
1013 }
1014
1015 wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
824 } 1016 }
1017
1018 update_debugctlmsr(next->thread.debugctlmsr);
1019}
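
ds_switch_to() centralizes the per-task DS/BTS MSR handling on context switch; per the diffstat, process_32.c and process_64.c are adapted to call it. A hedged sketch of the expected hook, reusing the TIF_DS_AREA_MSR flag that ds_get_context() sets (placement in __switch_to_xtra() is assumed, not shown in this patch):

	if (test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR) ||
	    test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR))
		ds_switch_to(prev_p, next_p);
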
1020
1021void ds_copy_thread(struct task_struct *tsk, struct task_struct *father)
1022{
1023 clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR);
1024 tsk->thread.ds_ctx = NULL;
1025}
1026
1027void ds_exit_thread(struct task_struct *tsk)
1028{
1029 WARN_ON(tsk->thread.ds_ctx);
825} 1030}
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 7aafeb5263ef..65a13943e098 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -677,22 +677,6 @@ struct early_res {
677}; 677};
678static struct early_res early_res[MAX_EARLY_RES] __initdata = { 678static struct early_res early_res[MAX_EARLY_RES] __initdata = {
679 { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */ 679 { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
680#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE)
681 { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
682#endif
683#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
684 /*
685 * But first pinch a few for the stack/trampoline stuff
686 * FIXME: Don't need the extra page at 4K, but need to fix
687 * trampoline before removing it. (see the GDT stuff)
688 */
689 { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" },
690 /*
691 * Has to be in very low memory so we can execute
692 * real-mode AP code.
693 */
694 { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" },
695#endif
696 {} 680 {}
697}; 681};
698 682
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 1b894b72c0f5..744aa7fc49d5 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -17,6 +17,7 @@
17#include <asm/io_apic.h> 17#include <asm/io_apic.h>
18#include <asm/apic.h> 18#include <asm/apic.h>
19#include <asm/iommu.h> 19#include <asm/iommu.h>
20#include <asm/gart.h>
20 21
21static void __init fix_hypertransport_config(int num, int slot, int func) 22static void __init fix_hypertransport_config(int num, int slot, int func)
22{ 23{
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 34ad997d3834..23b138e31e9c 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -875,49 +875,6 @@ static struct console early_dbgp_console = {
875}; 875};
876#endif 876#endif
877 877
878/* Console interface to a host file on AMD's SimNow! */
879
880static int simnow_fd;
881
882enum {
883 MAGIC1 = 0xBACCD00A,
884 MAGIC2 = 0xCA110000,
885 XOPEN = 5,
886 XWRITE = 4,
887};
888
889static noinline long simnow(long cmd, long a, long b, long c)
890{
891 long ret;
892
893 asm volatile("cpuid" :
894 "=a" (ret) :
895 "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
896 return ret;
897}
898
899static void __init simnow_init(char *str)
900{
901 char *fn = "klog";
902
903 if (*str == '=')
904 fn = ++str;
905 /* error ignored */
906 simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644);
907}
908
909static void simnow_write(struct console *con, const char *s, unsigned n)
910{
911 simnow(XWRITE, simnow_fd, (unsigned long)s, n);
912}
913
914static struct console simnow_console = {
915 .name = "simnow",
916 .write = simnow_write,
917 .flags = CON_PRINTBUFFER,
918 .index = -1,
919};
920
921/* Direct interface for emergencies */ 878/* Direct interface for emergencies */
922static struct console *early_console = &early_vga_console; 879static struct console *early_console = &early_vga_console;
923static int __initdata early_console_initialized; 880static int __initdata early_console_initialized;
@@ -960,10 +917,6 @@ static int __init setup_early_printk(char *buf)
960 max_ypos = boot_params.screen_info.orig_video_lines; 917 max_ypos = boot_params.screen_info.orig_video_lines;
961 current_ypos = boot_params.screen_info.orig_y; 918 current_ypos = boot_params.screen_info.orig_y;
962 early_console = &early_vga_console; 919 early_console = &early_vga_console;
963 } else if (!strncmp(buf, "simnow", 6)) {
964 simnow_init(buf + 6);
965 early_console = &simnow_console;
966 keep_early = 1;
967#ifdef CONFIG_EARLY_PRINTK_DBGP 920#ifdef CONFIG_EARLY_PRINTK_DBGP
968 } else if (!strncmp(buf, "dbgp", 4)) { 921 } else if (!strncmp(buf, "dbgp", 4)) {
969 if (early_dbgp_init(buf+4) < 0) 922 if (early_dbgp_init(buf+4) < 0)
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 43ceb3f454bf..d6f0490a7391 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -619,28 +619,37 @@ END(syscall_badsys)
61927:; 61927:;
620 620
621/* 621/*
622 * Build the entry stubs and pointer table with 622 * Build the entry stubs and pointer table with some assembler magic.
623 * some assembler magic. 623 * We pack 7 stubs into a single 32-byte chunk, which will fit in a
624 * single cache line on all modern x86 implementations.
624 */ 625 */
625.section .rodata,"a" 626.section .init.rodata,"a"
626ENTRY(interrupt) 627ENTRY(interrupt)
627.text 628.text
628 629 .p2align 5
630 .p2align CONFIG_X86_L1_CACHE_SHIFT
629ENTRY(irq_entries_start) 631ENTRY(irq_entries_start)
630 RING0_INT_FRAME 632 RING0_INT_FRAME
631vector=0 633vector=FIRST_EXTERNAL_VECTOR
632.rept NR_VECTORS 634.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
633 ALIGN 635 .balign 32
634 .if vector 636 .rept 7
637 .if vector < NR_VECTORS
638 .if vector <> FIRST_EXTERNAL_VECTOR
635 CFI_ADJUST_CFA_OFFSET -4 639 CFI_ADJUST_CFA_OFFSET -4
636 .endif 640 .endif
6371: pushl $~(vector) 6411: pushl $(~vector+0x80) /* Note: always in signed byte range */
638 CFI_ADJUST_CFA_OFFSET 4 642 CFI_ADJUST_CFA_OFFSET 4
639 jmp common_interrupt 643 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
640 .previous 644 jmp 2f
645 .endif
646 .previous
641 .long 1b 647 .long 1b
642 .text 648 .text
643vector=vector+1 649vector=vector+1
650 .endif
651 .endr
6522: jmp common_interrupt
644.endr 653.endr
645END(irq_entries_start) 654END(irq_entries_start)
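
The encoding trick keeps each stub tiny: pushl with a signed 8-bit immediate is 2 bytes and the short jmp to the shared tail is 2 more, so seven stubs plus one 5-byte jmp to common_interrupt fit in a 32-byte chunk. The addl $-0x80 in common_interrupt then restores the value the handlers expect. A small C check of that arithmetic (illustrative only; the handler side presumably still derives the vector from ~orig_ax):

	#include <assert.h>

	int main(void)
	{
		unsigned int vector;

		for (vector = 0x20 /* FIRST_EXTERNAL_VECTOR */; vector <= 0xff; vector++) {
			int pushed = (int)(~vector + 0x80);	/* pushl $(~vector+0x80)      */

			assert(-128 <= pushed && pushed <= 127);/* always a signed byte       */

			pushed += -0x80;			/* addl $-0x80,(%esp)         */
			assert(pushed == (int)~vector);		/* same value as the old push */
		}
		return 0;
	}
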
646 655
@@ -652,8 +661,9 @@ END(interrupt)
652 * the CPU automatically disables interrupts when executing an IRQ vector, 661 * the CPU automatically disables interrupts when executing an IRQ vector,
653 * so IRQ-flags tracing has to follow that: 662 * so IRQ-flags tracing has to follow that:
654 */ 663 */
655 ALIGN 664 .p2align CONFIG_X86_L1_CACHE_SHIFT
656common_interrupt: 665common_interrupt:
666 addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */
657 SAVE_ALL 667 SAVE_ALL
658 TRACE_IRQS_OFF 668 TRACE_IRQS_OFF
659 movl %esp,%eax 669 movl %esp,%eax
@@ -678,65 +688,6 @@ ENDPROC(name)
678/* The include is where all of the SMP etc. interrupts come from */ 688/* The include is where all of the SMP etc. interrupts come from */
679#include "entry_arch.h" 689#include "entry_arch.h"
680 690
681KPROBE_ENTRY(page_fault)
682 RING0_EC_FRAME
683 pushl $do_page_fault
684 CFI_ADJUST_CFA_OFFSET 4
685 ALIGN
686error_code:
687 /* the function address is in %fs's slot on the stack */
688 pushl %es
689 CFI_ADJUST_CFA_OFFSET 4
690 /*CFI_REL_OFFSET es, 0*/
691 pushl %ds
692 CFI_ADJUST_CFA_OFFSET 4
693 /*CFI_REL_OFFSET ds, 0*/
694 pushl %eax
695 CFI_ADJUST_CFA_OFFSET 4
696 CFI_REL_OFFSET eax, 0
697 pushl %ebp
698 CFI_ADJUST_CFA_OFFSET 4
699 CFI_REL_OFFSET ebp, 0
700 pushl %edi
701 CFI_ADJUST_CFA_OFFSET 4
702 CFI_REL_OFFSET edi, 0
703 pushl %esi
704 CFI_ADJUST_CFA_OFFSET 4
705 CFI_REL_OFFSET esi, 0
706 pushl %edx
707 CFI_ADJUST_CFA_OFFSET 4
708 CFI_REL_OFFSET edx, 0
709 pushl %ecx
710 CFI_ADJUST_CFA_OFFSET 4
711 CFI_REL_OFFSET ecx, 0
712 pushl %ebx
713 CFI_ADJUST_CFA_OFFSET 4
714 CFI_REL_OFFSET ebx, 0
715 cld
716 pushl %fs
717 CFI_ADJUST_CFA_OFFSET 4
718 /*CFI_REL_OFFSET fs, 0*/
719 movl $(__KERNEL_PERCPU), %ecx
720 movl %ecx, %fs
721 UNWIND_ESPFIX_STACK
722 popl %ecx
723 CFI_ADJUST_CFA_OFFSET -4
724 /*CFI_REGISTER es, ecx*/
725 movl PT_FS(%esp), %edi # get the function address
726 movl PT_ORIG_EAX(%esp), %edx # get the error code
727 movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
728 mov %ecx, PT_FS(%esp)
729 /*CFI_REL_OFFSET fs, ES*/
730 movl $(__USER_DS), %ecx
731 movl %ecx, %ds
732 movl %ecx, %es
733 TRACE_IRQS_OFF
734 movl %esp,%eax # pt_regs pointer
735 call *%edi
736 jmp ret_from_exception
737 CFI_ENDPROC
738KPROBE_END(page_fault)
739
740ENTRY(coprocessor_error) 691ENTRY(coprocessor_error)
741 RING0_INT_FRAME 692 RING0_INT_FRAME
742 pushl $0 693 pushl $0
@@ -767,140 +718,6 @@ ENTRY(device_not_available)
767 CFI_ENDPROC 718 CFI_ENDPROC
768END(device_not_available) 719END(device_not_available)
769 720
770/*
771 * Debug traps and NMI can happen at the one SYSENTER instruction
772 * that sets up the real kernel stack. Check here, since we can't
773 * allow the wrong stack to be used.
774 *
775 * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
776 * already pushed 3 words if it hits on the sysenter instruction:
777 * eflags, cs and eip.
778 *
779 * We just load the right stack, and push the three (known) values
780 * by hand onto the new stack - while updating the return eip past
781 * the instruction that would have done it for sysenter.
782 */
783#define FIX_STACK(offset, ok, label) \
784 cmpw $__KERNEL_CS,4(%esp); \
785 jne ok; \
786label: \
787 movl TSS_sysenter_sp0+offset(%esp),%esp; \
788 CFI_DEF_CFA esp, 0; \
789 CFI_UNDEFINED eip; \
790 pushfl; \
791 CFI_ADJUST_CFA_OFFSET 4; \
792 pushl $__KERNEL_CS; \
793 CFI_ADJUST_CFA_OFFSET 4; \
794 pushl $sysenter_past_esp; \
795 CFI_ADJUST_CFA_OFFSET 4; \
796 CFI_REL_OFFSET eip, 0
797
798KPROBE_ENTRY(debug)
799 RING0_INT_FRAME
800 cmpl $ia32_sysenter_target,(%esp)
801 jne debug_stack_correct
802 FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
803debug_stack_correct:
804 pushl $-1 # mark this as an int
805 CFI_ADJUST_CFA_OFFSET 4
806 SAVE_ALL
807 TRACE_IRQS_OFF
808 xorl %edx,%edx # error code 0
809 movl %esp,%eax # pt_regs pointer
810 call do_debug
811 jmp ret_from_exception
812 CFI_ENDPROC
813KPROBE_END(debug)
814
815/*
816 * NMI is doubly nasty. It can happen _while_ we're handling
817 * a debug fault, and the debug fault hasn't yet been able to
818 * clear up the stack. So we first check whether we got an
819 * NMI on the sysenter entry path, but after that we need to
820 * check whether we got an NMI on the debug path where the debug
821 * fault happened on the sysenter path.
822 */
823KPROBE_ENTRY(nmi)
824 RING0_INT_FRAME
825 pushl %eax
826 CFI_ADJUST_CFA_OFFSET 4
827 movl %ss, %eax
828 cmpw $__ESPFIX_SS, %ax
829 popl %eax
830 CFI_ADJUST_CFA_OFFSET -4
831 je nmi_espfix_stack
832 cmpl $ia32_sysenter_target,(%esp)
833 je nmi_stack_fixup
834 pushl %eax
835 CFI_ADJUST_CFA_OFFSET 4
836 movl %esp,%eax
837 /* Do not access memory above the end of our stack page,
838 * it might not exist.
839 */
840 andl $(THREAD_SIZE-1),%eax
841 cmpl $(THREAD_SIZE-20),%eax
842 popl %eax
843 CFI_ADJUST_CFA_OFFSET -4
844 jae nmi_stack_correct
845 cmpl $ia32_sysenter_target,12(%esp)
846 je nmi_debug_stack_check
847nmi_stack_correct:
848 /* We have a RING0_INT_FRAME here */
849 pushl %eax
850 CFI_ADJUST_CFA_OFFSET 4
851 SAVE_ALL
852 TRACE_IRQS_OFF
853 xorl %edx,%edx # zero error code
854 movl %esp,%eax # pt_regs pointer
855 call do_nmi
856 jmp restore_nocheck_notrace
857 CFI_ENDPROC
858
859nmi_stack_fixup:
860 RING0_INT_FRAME
861 FIX_STACK(12,nmi_stack_correct, 1)
862 jmp nmi_stack_correct
863
864nmi_debug_stack_check:
865 /* We have a RING0_INT_FRAME here */
866 cmpw $__KERNEL_CS,16(%esp)
867 jne nmi_stack_correct
868 cmpl $debug,(%esp)
869 jb nmi_stack_correct
870 cmpl $debug_esp_fix_insn,(%esp)
871 ja nmi_stack_correct
872 FIX_STACK(24,nmi_stack_correct, 1)
873 jmp nmi_stack_correct
874
875nmi_espfix_stack:
876 /* We have a RING0_INT_FRAME here.
877 *
878 * create the pointer to lss back
879 */
880 pushl %ss
881 CFI_ADJUST_CFA_OFFSET 4
882 pushl %esp
883 CFI_ADJUST_CFA_OFFSET 4
884 addw $4, (%esp)
885 /* copy the iret frame of 12 bytes */
886 .rept 3
887 pushl 16(%esp)
888 CFI_ADJUST_CFA_OFFSET 4
889 .endr
890 pushl %eax
891 CFI_ADJUST_CFA_OFFSET 4
892 SAVE_ALL
893 TRACE_IRQS_OFF
894 FIXUP_ESPFIX_STACK # %eax == %esp
895 xorl %edx,%edx # zero error code
896 call do_nmi
897 RESTORE_REGS
898 lss 12+4(%esp), %esp # back to espfix stack
899 CFI_ADJUST_CFA_OFFSET -24
900 jmp irq_return
901 CFI_ENDPROC
902KPROBE_END(nmi)
903
904#ifdef CONFIG_PARAVIRT 721#ifdef CONFIG_PARAVIRT
905ENTRY(native_iret) 722ENTRY(native_iret)
906 iret 723 iret
@@ -916,19 +733,6 @@ ENTRY(native_irq_enable_sysexit)
916END(native_irq_enable_sysexit) 733END(native_irq_enable_sysexit)
917#endif 734#endif
918 735
919KPROBE_ENTRY(int3)
920 RING0_INT_FRAME
921 pushl $-1 # mark this as an int
922 CFI_ADJUST_CFA_OFFSET 4
923 SAVE_ALL
924 TRACE_IRQS_OFF
925 xorl %edx,%edx # zero error code
926 movl %esp,%eax # pt_regs pointer
927 call do_int3
928 jmp ret_from_exception
929 CFI_ENDPROC
930KPROBE_END(int3)
931
932ENTRY(overflow) 736ENTRY(overflow)
933 RING0_INT_FRAME 737 RING0_INT_FRAME
934 pushl $0 738 pushl $0
@@ -993,14 +797,6 @@ ENTRY(stack_segment)
993 CFI_ENDPROC 797 CFI_ENDPROC
994END(stack_segment) 798END(stack_segment)
995 799
996KPROBE_ENTRY(general_protection)
997 RING0_EC_FRAME
998 pushl $do_general_protection
999 CFI_ADJUST_CFA_OFFSET 4
1000 jmp error_code
1001 CFI_ENDPROC
1002KPROBE_END(general_protection)
1003
1004ENTRY(alignment_check) 800ENTRY(alignment_check)
1005 RING0_EC_FRAME 801 RING0_EC_FRAME
1006 pushl $do_alignment_check 802 pushl $do_alignment_check
@@ -1051,6 +847,7 @@ ENTRY(kernel_thread_helper)
1051 push %eax 847 push %eax
1052 CFI_ADJUST_CFA_OFFSET 4 848 CFI_ADJUST_CFA_OFFSET 4
1053 call do_exit 849 call do_exit
850 ud2 # padding for call trace
1054 CFI_ENDPROC 851 CFI_ENDPROC
1055ENDPROC(kernel_thread_helper) 852ENDPROC(kernel_thread_helper)
1056 853
@@ -1259,3 +1056,227 @@ return_to_handler:
1259#include "syscall_table_32.S" 1056#include "syscall_table_32.S"
1260 1057
1261syscall_table_size=(.-sys_call_table) 1058syscall_table_size=(.-sys_call_table)
1059
1060/*
1061 * Some functions should be protected against kprobes
1062 */
1063 .pushsection .kprobes.text, "ax"
1064
1065ENTRY(page_fault)
1066 RING0_EC_FRAME
1067 pushl $do_page_fault
1068 CFI_ADJUST_CFA_OFFSET 4
1069 ALIGN
1070error_code:
1071 /* the function address is in %fs's slot on the stack */
1072 pushl %es
1073 CFI_ADJUST_CFA_OFFSET 4
1074 /*CFI_REL_OFFSET es, 0*/
1075 pushl %ds
1076 CFI_ADJUST_CFA_OFFSET 4
1077 /*CFI_REL_OFFSET ds, 0*/
1078 pushl %eax
1079 CFI_ADJUST_CFA_OFFSET 4
1080 CFI_REL_OFFSET eax, 0
1081 pushl %ebp
1082 CFI_ADJUST_CFA_OFFSET 4
1083 CFI_REL_OFFSET ebp, 0
1084 pushl %edi
1085 CFI_ADJUST_CFA_OFFSET 4
1086 CFI_REL_OFFSET edi, 0
1087 pushl %esi
1088 CFI_ADJUST_CFA_OFFSET 4
1089 CFI_REL_OFFSET esi, 0
1090 pushl %edx
1091 CFI_ADJUST_CFA_OFFSET 4
1092 CFI_REL_OFFSET edx, 0
1093 pushl %ecx
1094 CFI_ADJUST_CFA_OFFSET 4
1095 CFI_REL_OFFSET ecx, 0
1096 pushl %ebx
1097 CFI_ADJUST_CFA_OFFSET 4
1098 CFI_REL_OFFSET ebx, 0
1099 cld
1100 pushl %fs
1101 CFI_ADJUST_CFA_OFFSET 4
1102 /*CFI_REL_OFFSET fs, 0*/
1103 movl $(__KERNEL_PERCPU), %ecx
1104 movl %ecx, %fs
1105 UNWIND_ESPFIX_STACK
1106 popl %ecx
1107 CFI_ADJUST_CFA_OFFSET -4
1108 /*CFI_REGISTER es, ecx*/
1109 movl PT_FS(%esp), %edi # get the function address
1110 movl PT_ORIG_EAX(%esp), %edx # get the error code
1111 movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
1112 mov %ecx, PT_FS(%esp)
1113 /*CFI_REL_OFFSET fs, ES*/
1114 movl $(__USER_DS), %ecx
1115 movl %ecx, %ds
1116 movl %ecx, %es
1117 TRACE_IRQS_OFF
1118 movl %esp,%eax # pt_regs pointer
1119 call *%edi
1120 jmp ret_from_exception
1121 CFI_ENDPROC
1122END(page_fault)
1123
1124/*
1125 * Debug traps and NMI can happen at the one SYSENTER instruction
1126 * that sets up the real kernel stack. Check here, since we can't
1127 * allow the wrong stack to be used.
1128 *
1129 * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
1130 * already pushed 3 words if it hits on the sysenter instruction:
1131 * eflags, cs and eip.
1132 *
1133 * We just load the right stack, and push the three (known) values
1134 * by hand onto the new stack - while updating the return eip past
1135 * the instruction that would have done it for sysenter.
1136 */
1137#define FIX_STACK(offset, ok, label) \
1138 cmpw $__KERNEL_CS,4(%esp); \
1139 jne ok; \
1140label: \
1141 movl TSS_sysenter_sp0+offset(%esp),%esp; \
1142 CFI_DEF_CFA esp, 0; \
1143 CFI_UNDEFINED eip; \
1144 pushfl; \
1145 CFI_ADJUST_CFA_OFFSET 4; \
1146 pushl $__KERNEL_CS; \
1147 CFI_ADJUST_CFA_OFFSET 4; \
1148 pushl $sysenter_past_esp; \
1149 CFI_ADJUST_CFA_OFFSET 4; \
1150 CFI_REL_OFFSET eip, 0
1151
1152ENTRY(debug)
1153 RING0_INT_FRAME
1154 cmpl $ia32_sysenter_target,(%esp)
1155 jne debug_stack_correct
1156 FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
1157debug_stack_correct:
1158 pushl $-1 # mark this as an int
1159 CFI_ADJUST_CFA_OFFSET 4
1160 SAVE_ALL
1161 TRACE_IRQS_OFF
1162 xorl %edx,%edx # error code 0
1163 movl %esp,%eax # pt_regs pointer
1164 call do_debug
1165 jmp ret_from_exception
1166 CFI_ENDPROC
1167END(debug)
1168
1169/*
1170 * NMI is doubly nasty. It can happen _while_ we're handling
1171 * a debug fault, and the debug fault hasn't yet been able to
1172 * clear up the stack. So we first check whether we got an
1173 * NMI on the sysenter entry path, but after that we need to
1174 * check whether we got an NMI on the debug path where the debug
1175 * fault happened on the sysenter path.
1176 */
1177ENTRY(nmi)
1178 RING0_INT_FRAME
1179 pushl %eax
1180 CFI_ADJUST_CFA_OFFSET 4
1181 movl %ss, %eax
1182 cmpw $__ESPFIX_SS, %ax
1183 popl %eax
1184 CFI_ADJUST_CFA_OFFSET -4
1185 je nmi_espfix_stack
1186 cmpl $ia32_sysenter_target,(%esp)
1187 je nmi_stack_fixup
1188 pushl %eax
1189 CFI_ADJUST_CFA_OFFSET 4
1190 movl %esp,%eax
1191 /* Do not access memory above the end of our stack page,
1192 * it might not exist.
1193 */
1194 andl $(THREAD_SIZE-1),%eax
1195 cmpl $(THREAD_SIZE-20),%eax
1196 popl %eax
1197 CFI_ADJUST_CFA_OFFSET -4
1198 jae nmi_stack_correct
1199 cmpl $ia32_sysenter_target,12(%esp)
1200 je nmi_debug_stack_check
1201nmi_stack_correct:
1202 /* We have a RING0_INT_FRAME here */
1203 pushl %eax
1204 CFI_ADJUST_CFA_OFFSET 4
1205 SAVE_ALL
1206 TRACE_IRQS_OFF
1207 xorl %edx,%edx # zero error code
1208 movl %esp,%eax # pt_regs pointer
1209 call do_nmi
1210 jmp restore_nocheck_notrace
1211 CFI_ENDPROC
1212
1213nmi_stack_fixup:
1214 RING0_INT_FRAME
1215 FIX_STACK(12,nmi_stack_correct, 1)
1216 jmp nmi_stack_correct
1217
1218nmi_debug_stack_check:
1219 /* We have a RING0_INT_FRAME here */
1220 cmpw $__KERNEL_CS,16(%esp)
1221 jne nmi_stack_correct
1222 cmpl $debug,(%esp)
1223 jb nmi_stack_correct
1224 cmpl $debug_esp_fix_insn,(%esp)
1225 ja nmi_stack_correct
1226 FIX_STACK(24,nmi_stack_correct, 1)
1227 jmp nmi_stack_correct
1228
1229nmi_espfix_stack:
1230 /* We have a RING0_INT_FRAME here.
1231 *
1232 * create the pointer to lss back
1233 */
1234 pushl %ss
1235 CFI_ADJUST_CFA_OFFSET 4
1236 pushl %esp
1237 CFI_ADJUST_CFA_OFFSET 4
1238 addw $4, (%esp)
1239 /* copy the iret frame of 12 bytes */
1240 .rept 3
1241 pushl 16(%esp)
1242 CFI_ADJUST_CFA_OFFSET 4
1243 .endr
1244 pushl %eax
1245 CFI_ADJUST_CFA_OFFSET 4
1246 SAVE_ALL
1247 TRACE_IRQS_OFF
1248 FIXUP_ESPFIX_STACK # %eax == %esp
1249 xorl %edx,%edx # zero error code
1250 call do_nmi
1251 RESTORE_REGS
1252 lss 12+4(%esp), %esp # back to espfix stack
1253 CFI_ADJUST_CFA_OFFSET -24
1254 jmp irq_return
1255 CFI_ENDPROC
1256END(nmi)
1257
1258ENTRY(int3)
1259 RING0_INT_FRAME
1260 pushl $-1 # mark this as an int
1261 CFI_ADJUST_CFA_OFFSET 4
1262 SAVE_ALL
1263 TRACE_IRQS_OFF
1264 xorl %edx,%edx # zero error code
1265 movl %esp,%eax # pt_regs pointer
1266 call do_int3
1267 jmp ret_from_exception
1268 CFI_ENDPROC
1269END(int3)
1270
1271ENTRY(general_protection)
1272 RING0_EC_FRAME
1273 pushl $do_general_protection
1274 CFI_ADJUST_CFA_OFFSET 4
1275 jmp error_code
1276 CFI_ENDPROC
1277END(general_protection)
1278
1279/*
1280 * End of kprobes section
1281 */
1282 .popsection
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 54e0bbdccb99..e28c7a987793 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -11,15 +11,15 @@
11 * 11 *
12 * NOTE: This code handles signal-recognition, which happens every time 12 * NOTE: This code handles signal-recognition, which happens every time
13 * after an interrupt and after each system call. 13 * after an interrupt and after each system call.
14 * 14 *
 15 * Normal syscalls and interrupts don't save a full stack frame; this is 15 * Normal syscalls and interrupts don't save a full stack frame; this is
 16 * only done for syscall tracing, signals or fork/exec et al. 16 * only done for syscall tracing, signals or fork/exec et al.
17 * 17 *
18 * A note on terminology: 18 * A note on terminology:
19 * - top of stack: Architecture defined interrupt frame from SS to RIP 19 * - top of stack: Architecture defined interrupt frame from SS to RIP
20 * at the top of the kernel process stack. 20 * at the top of the kernel process stack.
 21 * - partial stack frame: partially saved registers up to R11. 21 * - partial stack frame: partially saved registers up to R11.
22 * - full stack frame: Like partial stack frame, but all register saved. 22 * - full stack frame: Like partial stack frame, but all register saved.
23 * 23 *
24 * Some macro usage: 24 * Some macro usage:
25 * - CFI macros are used to generate dwarf2 unwind information for better 25 * - CFI macros are used to generate dwarf2 unwind information for better
@@ -60,7 +60,6 @@
60#define __AUDIT_ARCH_LE 0x40000000 60#define __AUDIT_ARCH_LE 0x40000000
61 61
62 .code64 62 .code64
63
64#ifdef CONFIG_FUNCTION_TRACER 63#ifdef CONFIG_FUNCTION_TRACER
65#ifdef CONFIG_DYNAMIC_FTRACE 64#ifdef CONFIG_DYNAMIC_FTRACE
66ENTRY(mcount) 65ENTRY(mcount)
@@ -71,15 +70,7 @@ ENTRY(ftrace_caller)
71 cmpl $0, function_trace_stop 70 cmpl $0, function_trace_stop
72 jne ftrace_stub 71 jne ftrace_stub
73 72
74 /* taken from glibc */ 73 MCOUNT_SAVE_FRAME
75 subq $0x38, %rsp
76 movq %rax, (%rsp)
77 movq %rcx, 8(%rsp)
78 movq %rdx, 16(%rsp)
79 movq %rsi, 24(%rsp)
80 movq %rdi, 32(%rsp)
81 movq %r8, 40(%rsp)
82 movq %r9, 48(%rsp)
83 74
84 movq 0x38(%rsp), %rdi 75 movq 0x38(%rsp), %rdi
85 movq 8(%rbp), %rsi 76 movq 8(%rbp), %rsi
@@ -89,14 +80,7 @@ ENTRY(ftrace_caller)
89ftrace_call: 80ftrace_call:
90 call ftrace_stub 81 call ftrace_stub
91 82
92 movq 48(%rsp), %r9 83 MCOUNT_RESTORE_FRAME
93 movq 40(%rsp), %r8
94 movq 32(%rsp), %rdi
95 movq 24(%rsp), %rsi
96 movq 16(%rsp), %rdx
97 movq 8(%rsp), %rcx
98 movq (%rsp), %rax
99 addq $0x38, %rsp
100 84
101#ifdef CONFIG_FUNCTION_GRAPH_TRACER 85#ifdef CONFIG_FUNCTION_GRAPH_TRACER
102.globl ftrace_graph_call 86.globl ftrace_graph_call
@@ -130,15 +114,7 @@ ftrace_stub:
130 retq 114 retq
131 115
132trace: 116trace:
133 /* taken from glibc */ 117 MCOUNT_SAVE_FRAME
134 subq $0x38, %rsp
135 movq %rax, (%rsp)
136 movq %rcx, 8(%rsp)
137 movq %rdx, 16(%rsp)
138 movq %rsi, 24(%rsp)
139 movq %rdi, 32(%rsp)
140 movq %r8, 40(%rsp)
141 movq %r9, 48(%rsp)
142 118
143 movq 0x38(%rsp), %rdi 119 movq 0x38(%rsp), %rdi
144 movq 8(%rbp), %rsi 120 movq 8(%rbp), %rsi
@@ -146,14 +122,7 @@ trace:
146 122
147 call *ftrace_trace_function 123 call *ftrace_trace_function
148 124
149 movq 48(%rsp), %r9 125 MCOUNT_RESTORE_FRAME
150 movq 40(%rsp), %r8
151 movq 32(%rsp), %rdi
152 movq 24(%rsp), %rsi
153 movq 16(%rsp), %rdx
154 movq 8(%rsp), %rcx
155 movq (%rsp), %rax
156 addq $0x38, %rsp
157 126
158 jmp ftrace_stub 127 jmp ftrace_stub
159END(mcount) 128END(mcount)
@@ -165,14 +134,7 @@ ENTRY(ftrace_graph_caller)
165 cmpl $0, function_trace_stop 134 cmpl $0, function_trace_stop
166 jne ftrace_stub 135 jne ftrace_stub
167 136
168 subq $0x38, %rsp 137 MCOUNT_SAVE_FRAME
169 movq %rax, (%rsp)
170 movq %rcx, 8(%rsp)
171 movq %rdx, 16(%rsp)
172 movq %rsi, 24(%rsp)
173 movq %rdi, 32(%rsp)
174 movq %r8, 40(%rsp)
175 movq %r9, 48(%rsp)
176 138
177 leaq 8(%rbp), %rdi 139 leaq 8(%rbp), %rdi
178 movq 0x38(%rsp), %rsi 140 movq 0x38(%rsp), %rsi
@@ -180,14 +142,8 @@ ENTRY(ftrace_graph_caller)
180 142
181 call prepare_ftrace_return 143 call prepare_ftrace_return
182 144
183 movq 48(%rsp), %r9 145 MCOUNT_RESTORE_FRAME
184 movq 40(%rsp), %r8 146
185 movq 32(%rsp), %rdi
186 movq 24(%rsp), %rsi
187 movq 16(%rsp), %rdx
188 movq 8(%rsp), %rcx
189 movq (%rsp), %rax
190 addq $0x38, %rsp
191 retq 147 retq
192END(ftrace_graph_caller) 148END(ftrace_graph_caller)
193 149
@@ -225,7 +181,7 @@ return_to_handler:
225 181
226#ifndef CONFIG_PREEMPT 182#ifndef CONFIG_PREEMPT
227#define retint_kernel retint_restore_args 183#define retint_kernel retint_restore_args
228#endif 184#endif
229 185
230#ifdef CONFIG_PARAVIRT 186#ifdef CONFIG_PARAVIRT
231ENTRY(native_usergs_sysret64) 187ENTRY(native_usergs_sysret64)
@@ -244,29 +200,29 @@ ENTRY(native_usergs_sysret64)
244.endm 200.endm
245 201
246/* 202/*
247 * C code is not supposed to know about undefined top of stack. Every time 203 * C code is not supposed to know about undefined top of stack. Every time
248 * a C function with an pt_regs argument is called from the SYSCALL based 204 * a C function with an pt_regs argument is called from the SYSCALL based
249 * fast path FIXUP_TOP_OF_STACK is needed. 205 * fast path FIXUP_TOP_OF_STACK is needed.
250 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs 206 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
251 * manipulation. 207 * manipulation.
252 */ 208 */
253 209
254 /* %rsp:at FRAMEEND */ 210 /* %rsp:at FRAMEEND */
255 .macro FIXUP_TOP_OF_STACK tmp 211 .macro FIXUP_TOP_OF_STACK tmp offset=0
256 movq %gs:pda_oldrsp,\tmp 212 movq %gs:pda_oldrsp,\tmp
257 movq \tmp,RSP(%rsp) 213 movq \tmp,RSP+\offset(%rsp)
258 movq $__USER_DS,SS(%rsp) 214 movq $__USER_DS,SS+\offset(%rsp)
259 movq $__USER_CS,CS(%rsp) 215 movq $__USER_CS,CS+\offset(%rsp)
260 movq $-1,RCX(%rsp) 216 movq $-1,RCX+\offset(%rsp)
261 movq R11(%rsp),\tmp /* get eflags */ 217 movq R11+\offset(%rsp),\tmp /* get eflags */
262 movq \tmp,EFLAGS(%rsp) 218 movq \tmp,EFLAGS+\offset(%rsp)
263 .endm 219 .endm
264 220
265 .macro RESTORE_TOP_OF_STACK tmp,offset=0 221 .macro RESTORE_TOP_OF_STACK tmp offset=0
266 movq RSP-\offset(%rsp),\tmp 222 movq RSP+\offset(%rsp),\tmp
267 movq \tmp,%gs:pda_oldrsp 223 movq \tmp,%gs:pda_oldrsp
268 movq EFLAGS-\offset(%rsp),\tmp 224 movq EFLAGS+\offset(%rsp),\tmp
269 movq \tmp,R11-\offset(%rsp) 225 movq \tmp,R11+\offset(%rsp)
270 .endm 226 .endm
271 227
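Restated in C, FIXUP_TOP_OF_STACK exists because the SYSCALL fast path never writes the hardware-frame slots: before any pt_regs-taking C function runs, the macro synthesizes them from what SYSCALL left behind (the user RSP stashed in the PDA, RFLAGS stashed in r11) and marks RCX as clobbered. A hedged sketch under those assumptions; the selector values and names are placeholders, not kernel constants:

#include <stdint.h>

/* Placeholder selectors for illustration (the kernel uses __USER_CS and
 * __USER_DS; 0x33/0x2b are the conventional x86-64 values). */
#define USER_CS_SKETCH 0x33
#define USER_DS_SKETCH 0x2b

struct top_of_stack_sketch {
	uint64_t r11;                      /* RFLAGS as saved by SYSCALL */
	uint64_t rcx;                      /* return RIP as used by SYSRET */
	uint64_t rip, cs, eflags, rsp, ss; /* the otherwise-undefined slots */
};

/* Illustrative equivalent of FIXUP_TOP_OF_STACK: make the fast-path frame
 * look like a regular interrupt frame so C code can read or rewrite it. */
static void fixup_top_of_stack(struct top_of_stack_sketch *f,
			       uint64_t user_rsp_from_pda)
{
	f->rsp    = user_rsp_from_pda;  /* %gs:pda_oldrsp analogue */
	f->ss     = USER_DS_SKETCH;
	f->cs     = USER_CS_SKETCH;
	f->rcx    = (uint64_t)-1;       /* RCX is not a real saved register here */
	f->eflags = f->r11;             /* SYSCALL put RFLAGS in r11 */
}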
272 .macro FAKE_STACK_FRAME child_rip 228 .macro FAKE_STACK_FRAME child_rip
@@ -278,7 +234,7 @@ ENTRY(native_usergs_sysret64)
278 pushq %rax /* rsp */ 234 pushq %rax /* rsp */
279 CFI_ADJUST_CFA_OFFSET 8 235 CFI_ADJUST_CFA_OFFSET 8
280 CFI_REL_OFFSET rsp,0 236 CFI_REL_OFFSET rsp,0
281 pushq $(1<<9) /* eflags - interrupts on */ 237 pushq $X86_EFLAGS_IF /* eflags - interrupts on */
282 CFI_ADJUST_CFA_OFFSET 8 238 CFI_ADJUST_CFA_OFFSET 8
283 /*CFI_REL_OFFSET rflags,0*/ 239 /*CFI_REL_OFFSET rflags,0*/
284 pushq $__KERNEL_CS /* cs */ 240 pushq $__KERNEL_CS /* cs */
@@ -296,62 +252,184 @@ ENTRY(native_usergs_sysret64)
296 CFI_ADJUST_CFA_OFFSET -(6*8) 252 CFI_ADJUST_CFA_OFFSET -(6*8)
297 .endm 253 .endm
298 254
299 .macro CFI_DEFAULT_STACK start=1 255/*
256 * initial frame state for interrupts (and exceptions without error code)
257 */
258 .macro EMPTY_FRAME start=1 offset=0
300 .if \start 259 .if \start
301 CFI_STARTPROC simple 260 CFI_STARTPROC simple
302 CFI_SIGNAL_FRAME 261 CFI_SIGNAL_FRAME
303 CFI_DEF_CFA rsp,SS+8 262 CFI_DEF_CFA rsp,8+\offset
304 .else 263 .else
305 CFI_DEF_CFA_OFFSET SS+8 264 CFI_DEF_CFA_OFFSET 8+\offset
306 .endif 265 .endif
307 CFI_REL_OFFSET r15,R15
308 CFI_REL_OFFSET r14,R14
309 CFI_REL_OFFSET r13,R13
310 CFI_REL_OFFSET r12,R12
311 CFI_REL_OFFSET rbp,RBP
312 CFI_REL_OFFSET rbx,RBX
313 CFI_REL_OFFSET r11,R11
314 CFI_REL_OFFSET r10,R10
315 CFI_REL_OFFSET r9,R9
316 CFI_REL_OFFSET r8,R8
317 CFI_REL_OFFSET rax,RAX
318 CFI_REL_OFFSET rcx,RCX
319 CFI_REL_OFFSET rdx,RDX
320 CFI_REL_OFFSET rsi,RSI
321 CFI_REL_OFFSET rdi,RDI
322 CFI_REL_OFFSET rip,RIP
323 /*CFI_REL_OFFSET cs,CS*/
324 /*CFI_REL_OFFSET rflags,EFLAGS*/
325 CFI_REL_OFFSET rsp,RSP
326 /*CFI_REL_OFFSET ss,SS*/
327 .endm 266 .endm
267
328/* 268/*
329 * A newly forked process directly context switches into this. 269 * initial frame state for interrupts (and exceptions without error code)
330 */ 270 */
331/* rdi: prev */ 271 .macro INTR_FRAME start=1 offset=0
272 EMPTY_FRAME \start, SS+8+\offset-RIP
273 /*CFI_REL_OFFSET ss, SS+\offset-RIP*/
274 CFI_REL_OFFSET rsp, RSP+\offset-RIP
275 /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/
276 /*CFI_REL_OFFSET cs, CS+\offset-RIP*/
277 CFI_REL_OFFSET rip, RIP+\offset-RIP
278 .endm
279
280/*
281 * initial frame state for exceptions with error code (and interrupts
282 * with vector already pushed)
283 */
284 .macro XCPT_FRAME start=1 offset=0
285 INTR_FRAME \start, RIP+\offset-ORIG_RAX
286 /*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/
287 .endm
288
289/*
290 * frame that enables calling into C.
291 */
292 .macro PARTIAL_FRAME start=1 offset=0
293 XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
294 CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
295 CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
296 CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
297 CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
298 CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
299 CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
300 CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
301 CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
302 CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
303 .endm
304
305/*
306 * frame that enables passing a complete pt_regs to a C function.
307 */
308 .macro DEFAULT_FRAME start=1 offset=0
309 PARTIAL_FRAME \start, R11+\offset-R15
310 CFI_REL_OFFSET rbx, RBX+\offset
311 CFI_REL_OFFSET rbp, RBP+\offset
312 CFI_REL_OFFSET r12, R12+\offset
313 CFI_REL_OFFSET r13, R13+\offset
314 CFI_REL_OFFSET r14, R14+\offset
315 CFI_REL_OFFSET r15, R15+\offset
316 .endm
317
318/* save partial stack frame */
319ENTRY(save_args)
320 XCPT_FRAME
321 cld
322 movq_cfi rdi, RDI+16-ARGOFFSET
323 movq_cfi rsi, RSI+16-ARGOFFSET
324 movq_cfi rdx, RDX+16-ARGOFFSET
325 movq_cfi rcx, RCX+16-ARGOFFSET
326 movq_cfi rax, RAX+16-ARGOFFSET
327 movq_cfi r8, R8+16-ARGOFFSET
328 movq_cfi r9, R9+16-ARGOFFSET
329 movq_cfi r10, R10+16-ARGOFFSET
330 movq_cfi r11, R11+16-ARGOFFSET
331
332 leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */
333 movq_cfi rbp, 8 /* push %rbp */
334 leaq 8(%rsp), %rbp /* mov %rsp, %ebp */
335 testl $3, CS(%rdi)
336 je 1f
337 SWAPGS
338 /*
339 * irqcount is used to check if a CPU is already on an interrupt stack
340 * or not. While this is essentially redundant with preempt_count it is
341 * a little cheaper to use a separate counter in the PDA (short of
342 * moving irq_enter into assembly, which would be too much work)
343 */
3441: incl %gs:pda_irqcount
345 jne 2f
346 popq_cfi %rax /* move return address... */
347 mov %gs:pda_irqstackptr,%rsp
348 EMPTY_FRAME 0
349 pushq_cfi %rax /* ... to the new stack */
350 /*
351 * We entered an interrupt context - irqs are off:
352 */
3532: TRACE_IRQS_OFF
354 ret
355 CFI_ENDPROC
356END(save_args)
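The tail of save_args is the "switch to the per-CPU interrupt stack unless already on it" idiom that the comment describes, with pda_irqcount as the nesting counter: it sits at -1 while no interrupt is in flight, so the increment hits zero exactly once, on the outermost entry. The same idea in plain C, using file-scope variables to stand in for the PDA fields; this illustrates the pattern, it is not the kernel's code:

#include <stddef.h>

static char irq_stack[16 * 1024];
static char *const irq_stack_top = irq_stack + sizeof(irq_stack);

/* Stand-in for %gs:pda_irqcount: -1 means "not in an interrupt". */
static int irq_count = -1;

/* The outermost interrupt switches to the dedicated IRQ stack; nested ones
 * (irq_count already >= 0 after the increment) keep their current stack. */
static void *enter_irq_stack(void *current_sp)
{
	if (++irq_count == 0)
		return irq_stack_top;
	return current_sp;
}

static void leave_irq_stack(void)
{
	irq_count--;	/* back to -1 once the outermost handler returns */
}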
357
358ENTRY(save_rest)
359 PARTIAL_FRAME 1 REST_SKIP+8
360 movq 5*8+16(%rsp), %r11 /* save return address */
361 movq_cfi rbx, RBX+16
362 movq_cfi rbp, RBP+16
363 movq_cfi r12, R12+16
364 movq_cfi r13, R13+16
365 movq_cfi r14, R14+16
366 movq_cfi r15, R15+16
367 movq %r11, 8(%rsp) /* return address */
368 FIXUP_TOP_OF_STACK %r11, 16
369 ret
370 CFI_ENDPROC
371END(save_rest)
372
373/* save complete stack frame */
374ENTRY(save_paranoid)
375 XCPT_FRAME 1 RDI+8
376 cld
377 movq_cfi rdi, RDI+8
378 movq_cfi rsi, RSI+8
379 movq_cfi rdx, RDX+8
380 movq_cfi rcx, RCX+8
381 movq_cfi rax, RAX+8
382 movq_cfi r8, R8+8
383 movq_cfi r9, R9+8
384 movq_cfi r10, R10+8
385 movq_cfi r11, R11+8
386 movq_cfi rbx, RBX+8
387 movq_cfi rbp, RBP+8
388 movq_cfi r12, R12+8
389 movq_cfi r13, R13+8
390 movq_cfi r14, R14+8
391 movq_cfi r15, R15+8
392 movl $1,%ebx
393 movl $MSR_GS_BASE,%ecx
394 rdmsr
395 testl %edx,%edx
396 js 1f /* negative -> in kernel */
397 SWAPGS
398 xorl %ebx,%ebx
3991: ret
400 CFI_ENDPROC
401END(save_paranoid)
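save_paranoid decides whether SWAPGS is needed without trusting any kernel state: it reads MSR_GS_BASE and tests the sign of the upper half, since a kernel GS base is an address in the upper (negative) canonical half while a user-set value is not. The predicate on its own, restated in C for clarity (reading the MSR needs ring 0, so only the test is shown):

#include <stdint.h>

/* Illustrative: mirrors "testl %edx,%edx; js 1f" after rdmsr -- a GS base
 * with bit 63 set is a kernel address, so no SWAPGS is required. */
static int gs_base_is_kernel(uint64_t gs_base)
{
	return (int64_t)gs_base < 0;
}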
402
403/*
404 * A newly forked process directly context switches into this address.
405 *
406 * rdi: prev task we switched from
407 */
332ENTRY(ret_from_fork) 408ENTRY(ret_from_fork)
333 CFI_DEFAULT_STACK 409 DEFAULT_FRAME
410
334 push kernel_eflags(%rip) 411 push kernel_eflags(%rip)
335 CFI_ADJUST_CFA_OFFSET 8 412 CFI_ADJUST_CFA_OFFSET 8
336 popf # reset kernel eflags 413 popf # reset kernel eflags
337 CFI_ADJUST_CFA_OFFSET -8 414 CFI_ADJUST_CFA_OFFSET -8
338 call schedule_tail 415
416 call schedule_tail # rdi: 'prev' task parameter
417
339 GET_THREAD_INFO(%rcx) 418 GET_THREAD_INFO(%rcx)
340 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx) 419
341 jnz rff_trace 420 CFI_REMEMBER_STATE
342rff_action:
343 RESTORE_REST 421 RESTORE_REST
344 testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread? 422
423 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
345 je int_ret_from_sys_call 424 je int_ret_from_sys_call
346 testl $_TIF_IA32,TI_flags(%rcx) 425
426 testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET
347 jnz int_ret_from_sys_call 427 jnz int_ret_from_sys_call
348 RESTORE_TOP_OF_STACK %rdi,ARGOFFSET 428
349 jmp ret_from_sys_call 429 RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
350rff_trace: 430 jmp ret_from_sys_call # go to the SYSRET fastpath
351 movq %rsp,%rdi 431
352 call syscall_trace_leave 432 CFI_RESTORE_STATE
353 GET_THREAD_INFO(%rcx)
354 jmp rff_action
355 CFI_ENDPROC 433 CFI_ENDPROC
356END(ret_from_fork) 434END(ret_from_fork)
357 435
@@ -361,20 +439,20 @@ END(ret_from_fork)
361 * SYSCALL does not save anything on the stack and does not change the 439 * SYSCALL does not save anything on the stack and does not change the
362 * stack pointer. 440 * stack pointer.
363 */ 441 */
364 442
365/* 443/*
366 * Register setup: 444 * Register setup:
367 * rax system call number 445 * rax system call number
368 * rdi arg0 446 * rdi arg0
369 * rcx return address for syscall/sysret, C arg3 447 * rcx return address for syscall/sysret, C arg3
370 * rsi arg1 448 * rsi arg1
371 * rdx arg2 449 * rdx arg2
372 * r10 arg3 (--> moved to rcx for C) 450 * r10 arg3 (--> moved to rcx for C)
373 * r8 arg4 451 * r8 arg4
374 * r9 arg5 452 * r9 arg5
375 * r11 eflags for syscall/sysret, temporary for C 453 * r11 eflags for syscall/sysret, temporary for C
376 * r12-r15,rbp,rbx saved by C code, not touched. 454 * r12-r15,rbp,rbx saved by C code, not touched.
377 * 455 *
378 * Interrupts are off on entry. 456 * Interrupts are off on entry.
379 * Only called from user space. 457 * Only called from user space.
380 * 458 *
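The register table above is the user-visible x86-64 system call convention. For a concrete feel of it, here is a small userspace snippet issuing write(2) directly under that convention; note that arg3 would travel in r10 rather than rcx, and that rcx and r11 must be treated as clobbered because SYSCALL/SYSRET use them for the return RIP and RFLAGS:

#include <stddef.h>

/*
 * Raw syscall following the convention documented above: rax holds the
 * number, arguments go in rdi, rsi, rdx (then r10, r8, r9), and rcx/r11
 * are clobbered by the SYSCALL/SYSRET pair.
 */
static long raw_write(int fd, const void *buf, size_t len)
{
	long ret;
	asm volatile ("syscall"
		      : "=a" (ret)
		      : "0" (1L /* __NR_write on x86-64 */),
			"D" ((long)fd), "S" (buf), "d" (len)
		      : "rcx", "r11", "memory");
	return ret;
}

int main(void)
{
	static const char msg[] = "hello from a raw syscall\n";
	raw_write(1, msg, sizeof(msg) - 1);
	return 0;
}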
@@ -384,7 +462,7 @@ END(ret_from_fork)
384 * When user can change the frames always force IRET. That is because 462 * When user can change the frames always force IRET. That is because
385 * it deals with uncanonical addresses better. SYSRET has trouble 463 * it deals with uncanonical addresses better. SYSRET has trouble
386 * with them due to bugs in both AMD and Intel CPUs. 464 * with them due to bugs in both AMD and Intel CPUs.
387 */ 465 */
388 466
389ENTRY(system_call) 467ENTRY(system_call)
390 CFI_STARTPROC simple 468 CFI_STARTPROC simple
@@ -400,7 +478,7 @@ ENTRY(system_call)
400 */ 478 */
401ENTRY(system_call_after_swapgs) 479ENTRY(system_call_after_swapgs)
402 480
403 movq %rsp,%gs:pda_oldrsp 481 movq %rsp,%gs:pda_oldrsp
404 movq %gs:pda_kernelstack,%rsp 482 movq %gs:pda_kernelstack,%rsp
405 /* 483 /*
406 * No need to follow this irqs off/on section - it's straight 484 * No need to follow this irqs off/on section - it's straight
@@ -408,7 +486,7 @@ ENTRY(system_call_after_swapgs)
408 */ 486 */
409 ENABLE_INTERRUPTS(CLBR_NONE) 487 ENABLE_INTERRUPTS(CLBR_NONE)
410 SAVE_ARGS 8,1 488 SAVE_ARGS 8,1
411 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) 489 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
412 movq %rcx,RIP-ARGOFFSET(%rsp) 490 movq %rcx,RIP-ARGOFFSET(%rsp)
413 CFI_REL_OFFSET rip,RIP-ARGOFFSET 491 CFI_REL_OFFSET rip,RIP-ARGOFFSET
414 GET_THREAD_INFO(%rcx) 492 GET_THREAD_INFO(%rcx)
@@ -422,19 +500,19 @@ system_call_fastpath:
422 movq %rax,RAX-ARGOFFSET(%rsp) 500 movq %rax,RAX-ARGOFFSET(%rsp)
423/* 501/*
424 * Syscall return path ending with SYSRET (fast path) 502 * Syscall return path ending with SYSRET (fast path)
425 * Has incomplete stack frame and undefined top of stack. 503 * Has incomplete stack frame and undefined top of stack.
426 */ 504 */
427ret_from_sys_call: 505ret_from_sys_call:
428 movl $_TIF_ALLWORK_MASK,%edi 506 movl $_TIF_ALLWORK_MASK,%edi
429 /* edi: flagmask */ 507 /* edi: flagmask */
430sysret_check: 508sysret_check:
431 LOCKDEP_SYS_EXIT 509 LOCKDEP_SYS_EXIT
432 GET_THREAD_INFO(%rcx) 510 GET_THREAD_INFO(%rcx)
433 DISABLE_INTERRUPTS(CLBR_NONE) 511 DISABLE_INTERRUPTS(CLBR_NONE)
434 TRACE_IRQS_OFF 512 TRACE_IRQS_OFF
435 movl TI_flags(%rcx),%edx 513 movl TI_flags(%rcx),%edx
436 andl %edi,%edx 514 andl %edi,%edx
437 jnz sysret_careful 515 jnz sysret_careful
438 CFI_REMEMBER_STATE 516 CFI_REMEMBER_STATE
439 /* 517 /*
440 * sysretq will re-enable interrupts: 518 * sysretq will re-enable interrupts:
@@ -449,7 +527,7 @@ sysret_check:
449 527
450 CFI_RESTORE_STATE 528 CFI_RESTORE_STATE
451 /* Handle reschedules */ 529 /* Handle reschedules */
452 /* edx: work, edi: workmask */ 530 /* edx: work, edi: workmask */
453sysret_careful: 531sysret_careful:
454 bt $TIF_NEED_RESCHED,%edx 532 bt $TIF_NEED_RESCHED,%edx
455 jnc sysret_signal 533 jnc sysret_signal
@@ -462,7 +540,7 @@ sysret_careful:
462 CFI_ADJUST_CFA_OFFSET -8 540 CFI_ADJUST_CFA_OFFSET -8
463 jmp sysret_check 541 jmp sysret_check
464 542
465 /* Handle a signal */ 543 /* Handle a signal */
466sysret_signal: 544sysret_signal:
467 TRACE_IRQS_ON 545 TRACE_IRQS_ON
468 ENABLE_INTERRUPTS(CLBR_NONE) 546 ENABLE_INTERRUPTS(CLBR_NONE)
@@ -471,17 +549,20 @@ sysret_signal:
471 jc sysret_audit 549 jc sysret_audit
472#endif 550#endif
473 /* edx: work flags (arg3) */ 551 /* edx: work flags (arg3) */
474 leaq do_notify_resume(%rip),%rax
475 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 552 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
476 xorl %esi,%esi # oldset -> arg2 553 xorl %esi,%esi # oldset -> arg2
477 call ptregscall_common 554 SAVE_REST
555 FIXUP_TOP_OF_STACK %r11
556 call do_notify_resume
557 RESTORE_TOP_OF_STACK %r11
558 RESTORE_REST
478 movl $_TIF_WORK_MASK,%edi 559 movl $_TIF_WORK_MASK,%edi
479 /* Use IRET because user could have changed frame. This 560 /* Use IRET because user could have changed frame. This
480 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ 561 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
481 DISABLE_INTERRUPTS(CLBR_NONE) 562 DISABLE_INTERRUPTS(CLBR_NONE)
482 TRACE_IRQS_OFF 563 TRACE_IRQS_OFF
483 jmp int_with_check 564 jmp int_with_check
484 565
485badsys: 566badsys:
486 movq $-ENOSYS,RAX-ARGOFFSET(%rsp) 567 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
487 jmp ret_from_sys_call 568 jmp ret_from_sys_call
@@ -520,7 +601,7 @@ sysret_audit:
520#endif /* CONFIG_AUDITSYSCALL */ 601#endif /* CONFIG_AUDITSYSCALL */
521 602
522 /* Do syscall tracing */ 603 /* Do syscall tracing */
523tracesys: 604tracesys:
524#ifdef CONFIG_AUDITSYSCALL 605#ifdef CONFIG_AUDITSYSCALL
525 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx) 606 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
526 jz auditsys 607 jz auditsys
@@ -543,8 +624,8 @@ tracesys:
543 call *sys_call_table(,%rax,8) 624 call *sys_call_table(,%rax,8)
544 movq %rax,RAX-ARGOFFSET(%rsp) 625 movq %rax,RAX-ARGOFFSET(%rsp)
545 /* Use IRET because user could have changed frame */ 626 /* Use IRET because user could have changed frame */
546 627
547/* 628/*
548 * Syscall return path ending with IRET. 629 * Syscall return path ending with IRET.
549 * Has correct top of stack, but partial stack frame. 630 * Has correct top of stack, but partial stack frame.
550 */ 631 */
@@ -588,18 +669,18 @@ int_very_careful:
588 TRACE_IRQS_ON 669 TRACE_IRQS_ON
589 ENABLE_INTERRUPTS(CLBR_NONE) 670 ENABLE_INTERRUPTS(CLBR_NONE)
590 SAVE_REST 671 SAVE_REST
591 /* Check for syscall exit trace */ 672 /* Check for syscall exit trace */
592 testl $_TIF_WORK_SYSCALL_EXIT,%edx 673 testl $_TIF_WORK_SYSCALL_EXIT,%edx
593 jz int_signal 674 jz int_signal
594 pushq %rdi 675 pushq %rdi
595 CFI_ADJUST_CFA_OFFSET 8 676 CFI_ADJUST_CFA_OFFSET 8
596 leaq 8(%rsp),%rdi # &ptregs -> arg1 677 leaq 8(%rsp),%rdi # &ptregs -> arg1
597 call syscall_trace_leave 678 call syscall_trace_leave
598 popq %rdi 679 popq %rdi
599 CFI_ADJUST_CFA_OFFSET -8 680 CFI_ADJUST_CFA_OFFSET -8
600 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi 681 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
601 jmp int_restore_rest 682 jmp int_restore_rest
602 683
603int_signal: 684int_signal:
604 testl $_TIF_DO_NOTIFY_MASK,%edx 685 testl $_TIF_DO_NOTIFY_MASK,%edx
605 jz 1f 686 jz 1f
@@ -614,22 +695,24 @@ int_restore_rest:
614 jmp int_with_check 695 jmp int_with_check
615 CFI_ENDPROC 696 CFI_ENDPROC
616END(system_call) 697END(system_call)
617 698
618/* 699/*
619 * Certain special system calls that need to save a complete full stack frame. 700 * Certain special system calls that need to save a complete full stack frame.
620 */ 701 */
621
622 .macro PTREGSCALL label,func,arg 702 .macro PTREGSCALL label,func,arg
623 .globl \label 703ENTRY(\label)
624\label: 704 PARTIAL_FRAME 1 8 /* offset 8: return address */
625 leaq \func(%rip),%rax 705 subq $REST_SKIP, %rsp
626 leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ 706 CFI_ADJUST_CFA_OFFSET REST_SKIP
627 jmp ptregscall_common 707 call save_rest
708 DEFAULT_FRAME 0 8 /* offset 8: return address */
709 leaq 8(%rsp), \arg /* pt_regs pointer */
710 call \func
711 jmp ptregscall_common
712 CFI_ENDPROC
628END(\label) 713END(\label)
629 .endm 714 .endm
630 715
631 CFI_STARTPROC
632
633 PTREGSCALL stub_clone, sys_clone, %r8 716 PTREGSCALL stub_clone, sys_clone, %r8
634 PTREGSCALL stub_fork, sys_fork, %rdi 717 PTREGSCALL stub_fork, sys_fork, %rdi
635 PTREGSCALL stub_vfork, sys_vfork, %rdi 718 PTREGSCALL stub_vfork, sys_vfork, %rdi
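The rewritten PTREGSCALL builds a complete register frame via save_rest and hands its address to the handler in the per-call register named by \arg (for example %r8 for stub_clone). In C terms these are the system calls whose handlers of this era took a trailing struct pt_regs pointer so they could read or rebuild the caller's registers; a hedged sketch of that shape, with an illustrative prototype rather than the kernel's exact one:

struct pt_regs;	/* the real definition lives in the kernel headers */

/*
 * Illustrative shape of a "pt_regs syscall": the assembly stub supplies a
 * pointer to the full saved-register frame as an extra argument, which is
 * what fork/clone/vfork/sigaltstack/iopl need in order to set up or
 * inspect user register state.
 */
long sketch_sys_clone(unsigned long clone_flags, unsigned long newsp,
		      void *parent_tid, void *child_tid,
		      struct pt_regs *regs /* filled in by the stub */);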
@@ -637,25 +720,18 @@ END(\label)
637 PTREGSCALL stub_iopl, sys_iopl, %rsi 720 PTREGSCALL stub_iopl, sys_iopl, %rsi
638 721
639ENTRY(ptregscall_common) 722ENTRY(ptregscall_common)
640 popq %r11 723 DEFAULT_FRAME 1 8 /* offset 8: return address */
641 CFI_ADJUST_CFA_OFFSET -8 724 RESTORE_TOP_OF_STACK %r11, 8
642 CFI_REGISTER rip, r11 725 movq_cfi_restore R15+8, r15
643 SAVE_REST 726 movq_cfi_restore R14+8, r14
644 movq %r11, %r15 727 movq_cfi_restore R13+8, r13
645 CFI_REGISTER rip, r15 728 movq_cfi_restore R12+8, r12
646 FIXUP_TOP_OF_STACK %r11 729 movq_cfi_restore RBP+8, rbp
647 call *%rax 730 movq_cfi_restore RBX+8, rbx
648 RESTORE_TOP_OF_STACK %r11 731 ret $REST_SKIP /* pop extended registers */
649 movq %r15, %r11
650 CFI_REGISTER rip, r11
651 RESTORE_REST
652 pushq %r11
653 CFI_ADJUST_CFA_OFFSET 8
654 CFI_REL_OFFSET rip, 0
655 ret
656 CFI_ENDPROC 732 CFI_ENDPROC
657END(ptregscall_common) 733END(ptregscall_common)
658 734
659ENTRY(stub_execve) 735ENTRY(stub_execve)
660 CFI_STARTPROC 736 CFI_STARTPROC
661 popq %r11 737 popq %r11
@@ -671,11 +747,11 @@ ENTRY(stub_execve)
671 jmp int_ret_from_sys_call 747 jmp int_ret_from_sys_call
672 CFI_ENDPROC 748 CFI_ENDPROC
673END(stub_execve) 749END(stub_execve)
674 750
675/* 751/*
676 * sigreturn is special because it needs to restore all registers on return. 752 * sigreturn is special because it needs to restore all registers on return.
677 * This cannot be done with SYSRET, so use the IRET return path instead. 753 * This cannot be done with SYSRET, so use the IRET return path instead.
678 */ 754 */
679ENTRY(stub_rt_sigreturn) 755ENTRY(stub_rt_sigreturn)
680 CFI_STARTPROC 756 CFI_STARTPROC
681 addq $8, %rsp 757 addq $8, %rsp
@@ -691,70 +767,70 @@ ENTRY(stub_rt_sigreturn)
691END(stub_rt_sigreturn) 767END(stub_rt_sigreturn)
692 768
693/* 769/*
694 * initial frame state for interrupts and exceptions 770 * Build the entry stubs and pointer table with some assembler magic.
771 * We pack 7 stubs into a single 32-byte chunk, which will fit in a
772 * single cache line on all modern x86 implementations.
695 */ 773 */
696 .macro _frame ref 774 .section .init.rodata,"a"
697 CFI_STARTPROC simple 775ENTRY(interrupt)
698 CFI_SIGNAL_FRAME 776 .text
699 CFI_DEF_CFA rsp,SS+8-\ref 777 .p2align 5
700 /*CFI_REL_OFFSET ss,SS-\ref*/ 778 .p2align CONFIG_X86_L1_CACHE_SHIFT
701 CFI_REL_OFFSET rsp,RSP-\ref 779ENTRY(irq_entries_start)
702 /*CFI_REL_OFFSET rflags,EFLAGS-\ref*/ 780 INTR_FRAME
703 /*CFI_REL_OFFSET cs,CS-\ref*/ 781vector=FIRST_EXTERNAL_VECTOR
704 CFI_REL_OFFSET rip,RIP-\ref 782.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
705 .endm 783 .balign 32
784 .rept 7
785 .if vector < NR_VECTORS
786 .if vector <> FIRST_EXTERNAL_VECTOR
787 CFI_ADJUST_CFA_OFFSET -8
788 .endif
7891: pushq $(~vector+0x80) /* Note: always in signed byte range */
790 CFI_ADJUST_CFA_OFFSET 8
791 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
792 jmp 2f
793 .endif
794 .previous
795 .quad 1b
796 .text
797vector=vector+1
798 .endif
799 .endr
8002: jmp common_interrupt
801.endr
802 CFI_ENDPROC
803END(irq_entries_start)
706 804
707/* initial frame state for interrupts (and exceptions without error code) */ 805.previous
708#define INTR_FRAME _frame RIP 806END(interrupt)
709/* initial frame state for exceptions with error code (and interrupts with 807.previous
710 vector already pushed) */
711#define XCPT_FRAME _frame ORIG_RAX
712 808
713/* 809/*
714 * Interrupt entry/exit. 810 * Interrupt entry/exit.
715 * 811 *
716 * Interrupt entry points save only callee clobbered registers in fast path. 812 * Interrupt entry points save only callee clobbered registers in fast path.
717 * 813 *
718 * Entry runs with interrupts off. 814 * Entry runs with interrupts off.
719 */ 815 */
720 816
721/* 0(%rsp): interrupt number */ 817/* 0(%rsp): ~(interrupt number) */
722 .macro interrupt func 818 .macro interrupt func
723 cld 819 subq $10*8, %rsp
724 SAVE_ARGS 820 CFI_ADJUST_CFA_OFFSET 10*8
725 leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler 821 call save_args
726 pushq %rbp 822 PARTIAL_FRAME 0
727 /*
728 * Save rbp twice: One is for marking the stack frame, as usual, and the
729 * other, to fill pt_regs properly. This is because bx comes right
730 * before the last saved register in that structure, and not bp. If the
731 * base pointer were in the place bx is today, this would not be needed.
732 */
733 movq %rbp, -8(%rsp)
734 CFI_ADJUST_CFA_OFFSET 8
735 CFI_REL_OFFSET rbp, 0
736 movq %rsp,%rbp
737 CFI_DEF_CFA_REGISTER rbp
738 testl $3,CS(%rdi)
739 je 1f
740 SWAPGS
741 /* irqcount is used to check if a CPU is already on an interrupt
742 stack or not. While this is essentially redundant with preempt_count
743 it is a little cheaper to use a separate counter in the PDA
744 (short of moving irq_enter into assembly, which would be too
745 much work) */
7461: incl %gs:pda_irqcount
747 cmoveq %gs:pda_irqstackptr,%rsp
748 push %rbp # backlink for old unwinder
749 /*
750 * We entered an interrupt context - irqs are off:
751 */
752 TRACE_IRQS_OFF
753 call \func 823 call \func
754 .endm 824 .endm
755 825
756ENTRY(common_interrupt) 826 /*
827 * The interrupt stubs push (~vector+0x80) onto the stack and
828 * then jump to common_interrupt.
829 */
830 .p2align CONFIG_X86_L1_CACHE_SHIFT
831common_interrupt:
757 XCPT_FRAME 832 XCPT_FRAME
833 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
758 interrupt do_IRQ 834 interrupt do_IRQ
759 /* 0(%rsp): oldrsp-ARGOFFSET */ 835 /* 0(%rsp): oldrsp-ARGOFFSET */
760ret_from_intr: 836ret_from_intr:
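Two size tricks meet in the stub table above. Each stub pushes ~vector+0x80, a value that always fits in a signed byte, so the push stays a two-byte instruction and seven stubs plus the shared jump fit the 32-byte, cache-line-friendly chunk; common_interrupt then applies addq $-0x80, turning the saved value into ~vector, a number in [-256,-1] that the C side can invert to recover the vector. A small self-checking C program for that round trip (pure arithmetic, nothing kernel-specific; 0x20 is FIRST_EXTERNAL_VECTOR on x86):

#include <assert.h>
#include <stdio.h>

int main(void)
{
	for (int vector = 0x20; vector < 256; vector++) {
		int pushed = ~vector + 0x80;		/* what the stub pushes */
		assert(pushed >= -128 && pushed <= 127);/* fits a signed byte imm */

		long on_stack = (long)pushed - 0x80;	/* addq $-0x80,(%rsp) */
		assert(on_stack >= -256 && on_stack <= -1);
		assert(~on_stack == vector);		/* recovery on the C side */
	}
	printf("every external vector encodes to a signed byte and back\n");
	return 0;
}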
@@ -768,12 +844,12 @@ exit_intr:
768 GET_THREAD_INFO(%rcx) 844 GET_THREAD_INFO(%rcx)
769 testl $3,CS-ARGOFFSET(%rsp) 845 testl $3,CS-ARGOFFSET(%rsp)
770 je retint_kernel 846 je retint_kernel
771 847
772 /* Interrupt came from user space */ 848 /* Interrupt came from user space */
773 /* 849 /*
774 * Has a correct top of stack, but a partial stack frame 850 * Has a correct top of stack, but a partial stack frame
775 * %rcx: thread info. Interrupts off. 851 * %rcx: thread info. Interrupts off.
776 */ 852 */
777retint_with_reschedule: 853retint_with_reschedule:
778 movl $_TIF_WORK_MASK,%edi 854 movl $_TIF_WORK_MASK,%edi
779retint_check: 855retint_check:
@@ -846,20 +922,20 @@ retint_careful:
846 pushq %rdi 922 pushq %rdi
847 CFI_ADJUST_CFA_OFFSET 8 923 CFI_ADJUST_CFA_OFFSET 8
848 call schedule 924 call schedule
849 popq %rdi 925 popq %rdi
850 CFI_ADJUST_CFA_OFFSET -8 926 CFI_ADJUST_CFA_OFFSET -8
851 GET_THREAD_INFO(%rcx) 927 GET_THREAD_INFO(%rcx)
852 DISABLE_INTERRUPTS(CLBR_NONE) 928 DISABLE_INTERRUPTS(CLBR_NONE)
853 TRACE_IRQS_OFF 929 TRACE_IRQS_OFF
854 jmp retint_check 930 jmp retint_check
855 931
856retint_signal: 932retint_signal:
857 testl $_TIF_DO_NOTIFY_MASK,%edx 933 testl $_TIF_DO_NOTIFY_MASK,%edx
858 jz retint_swapgs 934 jz retint_swapgs
859 TRACE_IRQS_ON 935 TRACE_IRQS_ON
860 ENABLE_INTERRUPTS(CLBR_NONE) 936 ENABLE_INTERRUPTS(CLBR_NONE)
861 SAVE_REST 937 SAVE_REST
862 movq $-1,ORIG_RAX(%rsp) 938 movq $-1,ORIG_RAX(%rsp)
863 xorl %esi,%esi # oldset 939 xorl %esi,%esi # oldset
864 movq %rsp,%rdi # &pt_regs 940 movq %rsp,%rdi # &pt_regs
865 call do_notify_resume 941 call do_notify_resume
@@ -881,324 +957,211 @@ ENTRY(retint_kernel)
881 jnc retint_restore_args 957 jnc retint_restore_args
882 call preempt_schedule_irq 958 call preempt_schedule_irq
883 jmp exit_intr 959 jmp exit_intr
884#endif 960#endif
885 961
886 CFI_ENDPROC 962 CFI_ENDPROC
887END(common_interrupt) 963END(common_interrupt)
888 964
889/* 965/*
890 * APIC interrupts. 966 * APIC interrupts.
891 */ 967 */
892 .macro apicinterrupt num,func 968.macro apicinterrupt num sym do_sym
969ENTRY(\sym)
893 INTR_FRAME 970 INTR_FRAME
894 pushq $~(\num) 971 pushq $~(\num)
895 CFI_ADJUST_CFA_OFFSET 8 972 CFI_ADJUST_CFA_OFFSET 8
896 interrupt \func 973 interrupt \do_sym
897 jmp ret_from_intr 974 jmp ret_from_intr
898 CFI_ENDPROC 975 CFI_ENDPROC
899 .endm 976END(\sym)
900 977.endm
901ENTRY(thermal_interrupt)
902 apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
903END(thermal_interrupt)
904
905ENTRY(threshold_interrupt)
906 apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
907END(threshold_interrupt)
908
909#ifdef CONFIG_SMP
910ENTRY(reschedule_interrupt)
911 apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
912END(reschedule_interrupt)
913
914 .macro INVALIDATE_ENTRY num
915ENTRY(invalidate_interrupt\num)
916 apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt
917END(invalidate_interrupt\num)
918 .endm
919 978
920 INVALIDATE_ENTRY 0 979#ifdef CONFIG_SMP
921 INVALIDATE_ENTRY 1 980apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
922 INVALIDATE_ENTRY 2 981 irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
923 INVALIDATE_ENTRY 3
924 INVALIDATE_ENTRY 4
925 INVALIDATE_ENTRY 5
926 INVALIDATE_ENTRY 6
927 INVALIDATE_ENTRY 7
928
929ENTRY(call_function_interrupt)
930 apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
931END(call_function_interrupt)
932ENTRY(call_function_single_interrupt)
933 apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt
934END(call_function_single_interrupt)
935ENTRY(irq_move_cleanup_interrupt)
936 apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt
937END(irq_move_cleanup_interrupt)
938#endif 982#endif
939 983
940ENTRY(apic_timer_interrupt) 984apicinterrupt UV_BAU_MESSAGE \
941 apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt 985 uv_bau_message_intr1 uv_bau_message_interrupt
942END(apic_timer_interrupt) 986apicinterrupt LOCAL_TIMER_VECTOR \
987 apic_timer_interrupt smp_apic_timer_interrupt
988
989#ifdef CONFIG_SMP
990apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
991 invalidate_interrupt0 smp_invalidate_interrupt
992apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \
993 invalidate_interrupt1 smp_invalidate_interrupt
994apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \
995 invalidate_interrupt2 smp_invalidate_interrupt
996apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \
997 invalidate_interrupt3 smp_invalidate_interrupt
998apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \
999 invalidate_interrupt4 smp_invalidate_interrupt
1000apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \
1001 invalidate_interrupt5 smp_invalidate_interrupt
1002apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \
1003 invalidate_interrupt6 smp_invalidate_interrupt
1004apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \
1005 invalidate_interrupt7 smp_invalidate_interrupt
1006#endif
943 1007
944ENTRY(uv_bau_message_intr1) 1008apicinterrupt THRESHOLD_APIC_VECTOR \
945 apicinterrupt 220,uv_bau_message_interrupt 1009 threshold_interrupt mce_threshold_interrupt
946END(uv_bau_message_intr1) 1010apicinterrupt THERMAL_APIC_VECTOR \
1011 thermal_interrupt smp_thermal_interrupt
1012
1013#ifdef CONFIG_SMP
1014apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
1015 call_function_single_interrupt smp_call_function_single_interrupt
1016apicinterrupt CALL_FUNCTION_VECTOR \
1017 call_function_interrupt smp_call_function_interrupt
1018apicinterrupt RESCHEDULE_VECTOR \
1019 reschedule_interrupt smp_reschedule_interrupt
1020#endif
947 1021
948ENTRY(error_interrupt) 1022apicinterrupt ERROR_APIC_VECTOR \
949 apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt 1023 error_interrupt smp_error_interrupt
950END(error_interrupt) 1024apicinterrupt SPURIOUS_APIC_VECTOR \
1025 spurious_interrupt smp_spurious_interrupt
951 1026
952ENTRY(spurious_interrupt)
953 apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
954END(spurious_interrupt)
955
956/* 1027/*
957 * Exception entry points. 1028 * Exception entry points.
958 */ 1029 */
959 .macro zeroentry sym 1030.macro zeroentry sym do_sym
1031ENTRY(\sym)
960 INTR_FRAME 1032 INTR_FRAME
961 PARAVIRT_ADJUST_EXCEPTION_FRAME 1033 PARAVIRT_ADJUST_EXCEPTION_FRAME
962 pushq $0 /* push error code/oldrax */ 1034 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
963 CFI_ADJUST_CFA_OFFSET 8 1035 subq $15*8,%rsp
964 pushq %rax /* push real oldrax to the rdi slot */ 1036 CFI_ADJUST_CFA_OFFSET 15*8
965 CFI_ADJUST_CFA_OFFSET 8 1037 call error_entry
966 CFI_REL_OFFSET rax,0 1038 DEFAULT_FRAME 0
967 leaq \sym(%rip),%rax 1039 movq %rsp,%rdi /* pt_regs pointer */
968 jmp error_entry 1040 xorl %esi,%esi /* no error code */
1041 call \do_sym
1042 jmp error_exit /* %ebx: no swapgs flag */
969 CFI_ENDPROC 1043 CFI_ENDPROC
970 .endm 1044END(\sym)
1045.endm
971 1046
972 .macro errorentry sym 1047.macro paranoidzeroentry sym do_sym
973 XCPT_FRAME 1048ENTRY(\sym)
1049 INTR_FRAME
974 PARAVIRT_ADJUST_EXCEPTION_FRAME 1050 PARAVIRT_ADJUST_EXCEPTION_FRAME
975 pushq %rax 1051 pushq $-1 /* ORIG_RAX: no syscall to restart */
976 CFI_ADJUST_CFA_OFFSET 8 1052 CFI_ADJUST_CFA_OFFSET 8
977 CFI_REL_OFFSET rax,0 1053 subq $15*8, %rsp
978 leaq \sym(%rip),%rax 1054 call save_paranoid
979 jmp error_entry 1055 TRACE_IRQS_OFF
1056 movq %rsp,%rdi /* pt_regs pointer */
1057 xorl %esi,%esi /* no error code */
1058 call \do_sym
1059 jmp paranoid_exit /* %ebx: no swapgs flag */
980 CFI_ENDPROC 1060 CFI_ENDPROC
981 .endm 1061END(\sym)
1062.endm
982 1063
983 /* error code is on the stack already */ 1064.macro paranoidzeroentry_ist sym do_sym ist
984 /* handle NMI like exceptions that can happen everywhere */ 1065ENTRY(\sym)
985 .macro paranoidentry sym, ist=0, irqtrace=1 1066 INTR_FRAME
986 SAVE_ALL 1067 PARAVIRT_ADJUST_EXCEPTION_FRAME
987 cld 1068 pushq $-1 /* ORIG_RAX: no syscall to restart */
988 movl $1,%ebx 1069 CFI_ADJUST_CFA_OFFSET 8
989 movl $MSR_GS_BASE,%ecx 1070 subq $15*8, %rsp
990 rdmsr 1071 call save_paranoid
991 testl %edx,%edx
992 js 1f
993 SWAPGS
994 xorl %ebx,%ebx
9951:
996 .if \ist
997 movq %gs:pda_data_offset, %rbp
998 .endif
999 .if \irqtrace
1000 TRACE_IRQS_OFF
1001 .endif
1002 movq %rsp,%rdi
1003 movq ORIG_RAX(%rsp),%rsi
1004 movq $-1,ORIG_RAX(%rsp)
1005 .if \ist
1006 subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
1007 .endif
1008 call \sym
1009 .if \ist
1010 addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
1011 .endif
1012 DISABLE_INTERRUPTS(CLBR_NONE)
1013 .if \irqtrace
1014 TRACE_IRQS_OFF 1072 TRACE_IRQS_OFF
1015 .endif 1073 movq %rsp,%rdi /* pt_regs pointer */
1016 .endm 1074 xorl %esi,%esi /* no error code */
1075 movq %gs:pda_data_offset, %rbp
1076 subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
1077 call \do_sym
1078 addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
1079 jmp paranoid_exit /* %ebx: no swapgs flag */
1080 CFI_ENDPROC
1081END(\sym)
1082.endm
1017 1083
1018 /* 1084.macro errorentry sym do_sym
1019 * "Paranoid" exit path from exception stack. 1085ENTRY(\sym)
1020 * Paranoid because this is used by NMIs and cannot take 1086 XCPT_FRAME
1021 * any kernel state for granted. 1087 PARAVIRT_ADJUST_EXCEPTION_FRAME
1022 * We don't do kernel preemption checks here, because only 1088 subq $15*8,%rsp
1023 * NMI should be common and it does not enable IRQs and 1089 CFI_ADJUST_CFA_OFFSET 15*8
1024 * cannot get reschedule ticks. 1090 call error_entry
1025 * 1091 DEFAULT_FRAME 0
1026 * "trace" is 0 for the NMI handler only, because irq-tracing 1092 movq %rsp,%rdi /* pt_regs pointer */
1027 * is fundamentally NMI-unsafe. (we cannot change the soft and 1093 movq ORIG_RAX(%rsp),%rsi /* get error code */
1028 * hard flags at once, atomically) 1094 movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
1029 */ 1095 call \do_sym
1030 .macro paranoidexit trace=1 1096 jmp error_exit /* %ebx: no swapgs flag */
1031 /* ebx: no swapgs flag */
1032paranoid_exit\trace:
1033 testl %ebx,%ebx /* swapgs needed? */
1034 jnz paranoid_restore\trace
1035 testl $3,CS(%rsp)
1036 jnz paranoid_userspace\trace
1037paranoid_swapgs\trace:
1038 .if \trace
1039 TRACE_IRQS_IRETQ 0
1040 .endif
1041 SWAPGS_UNSAFE_STACK
1042paranoid_restore\trace:
1043 RESTORE_ALL 8
1044 jmp irq_return
1045paranoid_userspace\trace:
1046 GET_THREAD_INFO(%rcx)
1047 movl TI_flags(%rcx),%ebx
1048 andl $_TIF_WORK_MASK,%ebx
1049 jz paranoid_swapgs\trace
1050 movq %rsp,%rdi /* &pt_regs */
1051 call sync_regs
1052 movq %rax,%rsp /* switch stack for scheduling */
1053 testl $_TIF_NEED_RESCHED,%ebx
1054 jnz paranoid_schedule\trace
1055 movl %ebx,%edx /* arg3: thread flags */
1056 .if \trace
1057 TRACE_IRQS_ON
1058 .endif
1059 ENABLE_INTERRUPTS(CLBR_NONE)
1060 xorl %esi,%esi /* arg2: oldset */
1061 movq %rsp,%rdi /* arg1: &pt_regs */
1062 call do_notify_resume
1063 DISABLE_INTERRUPTS(CLBR_NONE)
1064 .if \trace
1065 TRACE_IRQS_OFF
1066 .endif
1067 jmp paranoid_userspace\trace
1068paranoid_schedule\trace:
1069 .if \trace
1070 TRACE_IRQS_ON
1071 .endif
1072 ENABLE_INTERRUPTS(CLBR_ANY)
1073 call schedule
1074 DISABLE_INTERRUPTS(CLBR_ANY)
1075 .if \trace
1076 TRACE_IRQS_OFF
1077 .endif
1078 jmp paranoid_userspace\trace
1079 CFI_ENDPROC 1097 CFI_ENDPROC
1080 .endm 1098END(\sym)
1099.endm
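These entry macros all funnel into one C calling convention: build a full pt_regs, pass its address as the first argument, pass the error code (or zero for the "zeroentry" cases) as the second, and force the orig_ax slot to -1 so later signal handling knows there is no interrupted system call to restart. A hedged sketch of the handler shape this implies; the prototypes are illustrative, not copied from the kernel:

struct pt_regs;	/* real layout comes from the kernel headers */

/*
 * Shape of the C-level handlers these macros call: %rdi carries the
 * pt_regs pointer, %rsi the error code (0 when the CPU pushes none),
 * and regs->orig_ax has already been set to -1 ("no syscall to restart").
 */
void sketch_do_divide_error(struct pt_regs *regs, long error_code);
void sketch_do_general_protection(struct pt_regs *regs, long error_code);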
1081 1100
1082/* 1101 /* error code is on the stack already */
1083 * Exception entry point. This expects an error code/orig_rax on the stack 1102.macro paranoiderrorentry sym do_sym
1084 * and the exception handler in %rax. 1103ENTRY(\sym)
1085 */ 1104 XCPT_FRAME
1086KPROBE_ENTRY(error_entry) 1105 PARAVIRT_ADJUST_EXCEPTION_FRAME
1087 _frame RDI 1106 subq $15*8,%rsp
1088 CFI_REL_OFFSET rax,0 1107 CFI_ADJUST_CFA_OFFSET 15*8
1089 /* rdi slot contains rax, oldrax contains error code */ 1108 call save_paranoid
1090 cld 1109 DEFAULT_FRAME 0
1091 subq $14*8,%rsp
1092 CFI_ADJUST_CFA_OFFSET (14*8)
1093 movq %rsi,13*8(%rsp)
1094 CFI_REL_OFFSET rsi,RSI
1095 movq 14*8(%rsp),%rsi /* load rax from rdi slot */
1096 CFI_REGISTER rax,rsi
1097 movq %rdx,12*8(%rsp)
1098 CFI_REL_OFFSET rdx,RDX
1099 movq %rcx,11*8(%rsp)
1100 CFI_REL_OFFSET rcx,RCX
1101 movq %rsi,10*8(%rsp) /* store rax */
1102 CFI_REL_OFFSET rax,RAX
1103 movq %r8, 9*8(%rsp)
1104 CFI_REL_OFFSET r8,R8
1105 movq %r9, 8*8(%rsp)
1106 CFI_REL_OFFSET r9,R9
1107 movq %r10,7*8(%rsp)
1108 CFI_REL_OFFSET r10,R10
1109 movq %r11,6*8(%rsp)
1110 CFI_REL_OFFSET r11,R11
1111 movq %rbx,5*8(%rsp)
1112 CFI_REL_OFFSET rbx,RBX
1113 movq %rbp,4*8(%rsp)
1114 CFI_REL_OFFSET rbp,RBP
1115 movq %r12,3*8(%rsp)
1116 CFI_REL_OFFSET r12,R12
1117 movq %r13,2*8(%rsp)
1118 CFI_REL_OFFSET r13,R13
1119 movq %r14,1*8(%rsp)
1120 CFI_REL_OFFSET r14,R14
1121 movq %r15,(%rsp)
1122 CFI_REL_OFFSET r15,R15
1123 xorl %ebx,%ebx
1124 testl $3,CS(%rsp)
1125 je error_kernelspace
1126error_swapgs:
1127 SWAPGS
1128error_sti:
1129 TRACE_IRQS_OFF
1130 movq %rdi,RDI(%rsp)
1131 CFI_REL_OFFSET rdi,RDI
1132 movq %rsp,%rdi
1133 movq ORIG_RAX(%rsp),%rsi /* get error code */
1134 movq $-1,ORIG_RAX(%rsp)
1135 call *%rax
1136 /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
1137error_exit:
1138 movl %ebx,%eax
1139 RESTORE_REST
1140 DISABLE_INTERRUPTS(CLBR_NONE)
1141 TRACE_IRQS_OFF 1110 TRACE_IRQS_OFF
1142 GET_THREAD_INFO(%rcx) 1111 movq %rsp,%rdi /* pt_regs pointer */
1143 testl %eax,%eax 1112 movq ORIG_RAX(%rsp),%rsi /* get error code */
1144 jne retint_kernel 1113 movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
1145 LOCKDEP_SYS_EXIT_IRQ 1114 call \do_sym
1146 movl TI_flags(%rcx),%edx 1115 jmp paranoid_exit /* %ebx: no swapgs flag */
1147 movl $_TIF_WORK_MASK,%edi
1148 andl %edi,%edx
1149 jnz retint_careful
1150 jmp retint_swapgs
1151 CFI_ENDPROC 1116 CFI_ENDPROC
1117END(\sym)
1118.endm
1152 1119
1153error_kernelspace: 1120zeroentry divide_error do_divide_error
1154 incl %ebx 1121zeroentry overflow do_overflow
1155 /* There are two places in the kernel that can potentially fault with 1122zeroentry bounds do_bounds
1156 usergs. Handle them here. The exception handlers after 1123zeroentry invalid_op do_invalid_op
1157 iret run with kernel gs again, so don't set the user space flag. 1124zeroentry device_not_available do_device_not_available
1158 B stepping K8s sometimes report an truncated RIP for IRET 1125paranoiderrorentry double_fault do_double_fault
1159 exceptions returning to compat mode. Check for these here too. */ 1126zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
1160 leaq irq_return(%rip),%rcx 1127errorentry invalid_TSS do_invalid_TSS
1161 cmpq %rcx,RIP(%rsp) 1128errorentry segment_not_present do_segment_not_present
1162 je error_swapgs 1129zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
1163 movl %ecx,%ecx /* zero extend */ 1130zeroentry coprocessor_error do_coprocessor_error
1164 cmpq %rcx,RIP(%rsp) 1131errorentry alignment_check do_alignment_check
1165 je error_swapgs 1132zeroentry simd_coprocessor_error do_simd_coprocessor_error
1166 cmpq $gs_change,RIP(%rsp) 1133
1167 je error_swapgs 1134 /* Reload gs selector with exception handling */
1168 jmp error_sti 1135 /* edi: new selector */
1169KPROBE_END(error_entry)
1170
1171 /* Reload gs selector with exception handling */
1172 /* edi: new selector */
1173ENTRY(native_load_gs_index) 1136ENTRY(native_load_gs_index)
1174 CFI_STARTPROC 1137 CFI_STARTPROC
1175 pushf 1138 pushf
1176 CFI_ADJUST_CFA_OFFSET 8 1139 CFI_ADJUST_CFA_OFFSET 8
1177 DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI)) 1140 DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI))
1178 SWAPGS 1141 SWAPGS
1179gs_change: 1142gs_change:
1180 movl %edi,%gs 1143 movl %edi,%gs
11812: mfence /* workaround */ 11442: mfence /* workaround */
1182 SWAPGS 1145 SWAPGS
1183 popf 1146 popf
1184 CFI_ADJUST_CFA_OFFSET -8 1147 CFI_ADJUST_CFA_OFFSET -8
1185 ret 1148 ret
1186 CFI_ENDPROC 1149 CFI_ENDPROC
1187ENDPROC(native_load_gs_index) 1150END(native_load_gs_index)
1188 1151
1189 .section __ex_table,"a" 1152 .section __ex_table,"a"
1190 .align 8 1153 .align 8
1191 .quad gs_change,bad_gs 1154 .quad gs_change,bad_gs
1192 .previous 1155 .previous
1193 .section .fixup,"ax" 1156 .section .fixup,"ax"
1194 /* running with kernelgs */ 1157 /* running with kernelgs */
1195bad_gs: 1158bad_gs:
1196 SWAPGS /* switch back to user gs */ 1159 SWAPGS /* switch back to user gs */
1197 xorl %eax,%eax 1160 xorl %eax,%eax
1198 movl %eax,%gs 1161 movl %eax,%gs
1199 jmp 2b 1162 jmp 2b
1200 .previous 1163 .previous
1201 1164
1202/* 1165/*
1203 * Create a kernel thread. 1166 * Create a kernel thread.
1204 * 1167 *
@@ -1221,7 +1184,7 @@ ENTRY(kernel_thread)
1221 1184
1222 xorl %r8d,%r8d 1185 xorl %r8d,%r8d
1223 xorl %r9d,%r9d 1186 xorl %r9d,%r9d
1224 1187
1225 # clone now 1188 # clone now
1226 call do_fork 1189 call do_fork
1227 movq %rax,RAX(%rsp) 1190 movq %rax,RAX(%rsp)
@@ -1232,15 +1195,15 @@ ENTRY(kernel_thread)
1232 * so internally to the x86_64 port you can rely on kernel_thread() 1195 * so internally to the x86_64 port you can rely on kernel_thread()
1233 * not to reschedule the child before returning, this avoids the need 1196 * not to reschedule the child before returning, this avoids the need
1234 * of hacks for example to fork off the per-CPU idle tasks. 1197 * of hacks for example to fork off the per-CPU idle tasks.
1235 * [Hopefully no generic code relies on the reschedule -AK] 1198 * [Hopefully no generic code relies on the reschedule -AK]
1236 */ 1199 */
1237 RESTORE_ALL 1200 RESTORE_ALL
1238 UNFAKE_STACK_FRAME 1201 UNFAKE_STACK_FRAME
1239 ret 1202 ret
1240 CFI_ENDPROC 1203 CFI_ENDPROC
1241ENDPROC(kernel_thread) 1204END(kernel_thread)
1242 1205
1243child_rip: 1206ENTRY(child_rip)
1244 pushq $0 # fake return address 1207 pushq $0 # fake return address
1245 CFI_STARTPROC 1208 CFI_STARTPROC
1246 /* 1209 /*
@@ -1253,8 +1216,9 @@ child_rip:
1253 # exit 1216 # exit
1254 mov %eax, %edi 1217 mov %eax, %edi
1255 call do_exit 1218 call do_exit
1219 ud2 # padding for call trace
1256 CFI_ENDPROC 1220 CFI_ENDPROC
1257ENDPROC(child_rip) 1221END(child_rip)
1258 1222
1259/* 1223/*
1260 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. 1224 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
@@ -1274,10 +1238,10 @@ ENDPROC(child_rip)
1274ENTRY(kernel_execve) 1238ENTRY(kernel_execve)
1275 CFI_STARTPROC 1239 CFI_STARTPROC
1276 FAKE_STACK_FRAME $0 1240 FAKE_STACK_FRAME $0
1277 SAVE_ALL 1241 SAVE_ALL
1278 movq %rsp,%rcx 1242 movq %rsp,%rcx
1279 call sys_execve 1243 call sys_execve
1280 movq %rax, RAX(%rsp) 1244 movq %rax, RAX(%rsp)
1281 RESTORE_REST 1245 RESTORE_REST
1282 testq %rax,%rax 1246 testq %rax,%rax
1283 je int_ret_from_sys_call 1247 je int_ret_from_sys_call
@@ -1285,129 +1249,7 @@ ENTRY(kernel_execve)
1285 UNFAKE_STACK_FRAME 1249 UNFAKE_STACK_FRAME
1286 ret 1250 ret
1287 CFI_ENDPROC 1251 CFI_ENDPROC
1288ENDPROC(kernel_execve) 1252END(kernel_execve)
1289
1290KPROBE_ENTRY(page_fault)
1291 errorentry do_page_fault
1292KPROBE_END(page_fault)
1293
1294ENTRY(coprocessor_error)
1295 zeroentry do_coprocessor_error
1296END(coprocessor_error)
1297
1298ENTRY(simd_coprocessor_error)
1299 zeroentry do_simd_coprocessor_error
1300END(simd_coprocessor_error)
1301
1302ENTRY(device_not_available)
1303 zeroentry do_device_not_available
1304END(device_not_available)
1305
1306 /* runs on exception stack */
1307KPROBE_ENTRY(debug)
1308 INTR_FRAME
1309 PARAVIRT_ADJUST_EXCEPTION_FRAME
1310 pushq $0
1311 CFI_ADJUST_CFA_OFFSET 8
1312 paranoidentry do_debug, DEBUG_STACK
1313 paranoidexit
1314KPROBE_END(debug)
1315
1316 /* runs on exception stack */
1317KPROBE_ENTRY(nmi)
1318 INTR_FRAME
1319 PARAVIRT_ADJUST_EXCEPTION_FRAME
1320 pushq $-1
1321 CFI_ADJUST_CFA_OFFSET 8
1322 paranoidentry do_nmi, 0, 0
1323#ifdef CONFIG_TRACE_IRQFLAGS
1324 paranoidexit 0
1325#else
1326 jmp paranoid_exit1
1327 CFI_ENDPROC
1328#endif
1329KPROBE_END(nmi)
1330
1331KPROBE_ENTRY(int3)
1332 INTR_FRAME
1333 PARAVIRT_ADJUST_EXCEPTION_FRAME
1334 pushq $0
1335 CFI_ADJUST_CFA_OFFSET 8
1336 paranoidentry do_int3, DEBUG_STACK
1337 jmp paranoid_exit1
1338 CFI_ENDPROC
1339KPROBE_END(int3)
1340
1341ENTRY(overflow)
1342 zeroentry do_overflow
1343END(overflow)
1344
1345ENTRY(bounds)
1346 zeroentry do_bounds
1347END(bounds)
1348
1349ENTRY(invalid_op)
1350 zeroentry do_invalid_op
1351END(invalid_op)
1352
1353ENTRY(coprocessor_segment_overrun)
1354 zeroentry do_coprocessor_segment_overrun
1355END(coprocessor_segment_overrun)
1356
1357 /* runs on exception stack */
1358ENTRY(double_fault)
1359 XCPT_FRAME
1360 PARAVIRT_ADJUST_EXCEPTION_FRAME
1361 paranoidentry do_double_fault
1362 jmp paranoid_exit1
1363 CFI_ENDPROC
1364END(double_fault)
1365
1366ENTRY(invalid_TSS)
1367 errorentry do_invalid_TSS
1368END(invalid_TSS)
1369
1370ENTRY(segment_not_present)
1371 errorentry do_segment_not_present
1372END(segment_not_present)
1373
1374 /* runs on exception stack */
1375ENTRY(stack_segment)
1376 XCPT_FRAME
1377 PARAVIRT_ADJUST_EXCEPTION_FRAME
1378 paranoidentry do_stack_segment
1379 jmp paranoid_exit1
1380 CFI_ENDPROC
1381END(stack_segment)
1382
1383KPROBE_ENTRY(general_protection)
1384 errorentry do_general_protection
1385KPROBE_END(general_protection)
1386
1387ENTRY(alignment_check)
1388 errorentry do_alignment_check
1389END(alignment_check)
1390
1391ENTRY(divide_error)
1392 zeroentry do_divide_error
1393END(divide_error)
1394
1395ENTRY(spurious_interrupt_bug)
1396 zeroentry do_spurious_interrupt_bug
1397END(spurious_interrupt_bug)
1398
1399#ifdef CONFIG_X86_MCE
1400 /* runs on exception stack */
1401ENTRY(machine_check)
1402 INTR_FRAME
1403 PARAVIRT_ADJUST_EXCEPTION_FRAME
1404 pushq $0
1405 CFI_ADJUST_CFA_OFFSET 8
1406 paranoidentry do_machine_check
1407 jmp paranoid_exit1
1408 CFI_ENDPROC
1409END(machine_check)
1410#endif
1411 1253
1412/* Call softirq on interrupt stack. Interrupts are off. */ 1254/* Call softirq on interrupt stack. Interrupts are off. */
1413ENTRY(call_softirq) 1255ENTRY(call_softirq)
@@ -1427,40 +1269,33 @@ ENTRY(call_softirq)
1427 decl %gs:pda_irqcount 1269 decl %gs:pda_irqcount
1428 ret 1270 ret
1429 CFI_ENDPROC 1271 CFI_ENDPROC
1430ENDPROC(call_softirq) 1272END(call_softirq)
1431
1432KPROBE_ENTRY(ignore_sysret)
1433 CFI_STARTPROC
1434 mov $-ENOSYS,%eax
1435 sysret
1436 CFI_ENDPROC
1437ENDPROC(ignore_sysret)
1438 1273
1439#ifdef CONFIG_XEN 1274#ifdef CONFIG_XEN
1440ENTRY(xen_hypervisor_callback) 1275zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
1441 zeroentry xen_do_hypervisor_callback
1442END(xen_hypervisor_callback)
1443 1276
1444/* 1277/*
1445# A note on the "critical region" in our callback handler. 1278 * A note on the "critical region" in our callback handler.
1446# We want to avoid stacking callback handlers due to events occurring 1279 * We want to avoid stacking callback handlers due to events occurring
1447# during handling of the last event. To do this, we keep events disabled 1280 * during handling of the last event. To do this, we keep events disabled
1448# until we've done all processing. HOWEVER, we must enable events before 1281 * until we've done all processing. HOWEVER, we must enable events before
1449# popping the stack frame (can't be done atomically) and so it would still 1282 * popping the stack frame (can't be done atomically) and so it would still
1450# be possible to get enough handler activations to overflow the stack. 1283 * be possible to get enough handler activations to overflow the stack.
1451# Although unlikely, bugs of that kind are hard to track down, so we'd 1284 * Although unlikely, bugs of that kind are hard to track down, so we'd
1452# like to avoid the possibility. 1285 * like to avoid the possibility.
1453# So, on entry to the handler we detect whether we interrupted an 1286 * So, on entry to the handler we detect whether we interrupted an
1454# existing activation in its critical region -- if so, we pop the current 1287 * existing activation in its critical region -- if so, we pop the current
1455# activation and restart the handler using the previous one. 1288 * activation and restart the handler using the previous one.
1456*/ 1289 */
1457ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) 1290ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1458 CFI_STARTPROC 1291 CFI_STARTPROC
1459/* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will 1292/*
1460 see the correct pointer to the pt_regs */ 1293 * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
1294 * see the correct pointer to the pt_regs
1295 */
1461 movq %rdi, %rsp # we don't return, adjust the stack frame 1296 movq %rdi, %rsp # we don't return, adjust the stack frame
1462 CFI_ENDPROC 1297 CFI_ENDPROC
1463 CFI_DEFAULT_STACK 1298 DEFAULT_FRAME
146411: incl %gs:pda_irqcount 129911: incl %gs:pda_irqcount
1465 movq %rsp,%rbp 1300 movq %rsp,%rbp
1466 CFI_DEF_CFA_REGISTER rbp 1301 CFI_DEF_CFA_REGISTER rbp
@@ -1475,23 +1310,26 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1475END(do_hypervisor_callback) 1310END(do_hypervisor_callback)
1476 1311
1477/* 1312/*
1478# Hypervisor uses this for application faults while it executes. 1313 * Hypervisor uses this for application faults while it executes.
1479# We get here for two reasons: 1314 * We get here for two reasons:
1480# 1. Fault while reloading DS, ES, FS or GS 1315 * 1. Fault while reloading DS, ES, FS or GS
1481# 2. Fault while executing IRET 1316 * 2. Fault while executing IRET
1482# Category 1 we do not need to fix up as Xen has already reloaded all segment 1317 * Category 1 we do not need to fix up as Xen has already reloaded all segment
1483# registers that could be reloaded and zeroed the others. 1318 * registers that could be reloaded and zeroed the others.
1484# Category 2 we fix up by killing the current process. We cannot use the 1319 * Category 2 we fix up by killing the current process. We cannot use the
1485# normal Linux return path in this case because if we use the IRET hypercall 1320 * normal Linux return path in this case because if we use the IRET hypercall
1486# to pop the stack frame we end up in an infinite loop of failsafe callbacks. 1321 * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
1487# We distinguish between categories by comparing each saved segment register 1322 * We distinguish between categories by comparing each saved segment register
1488# with its current contents: any discrepancy means we in category 1. 1323 * with its current contents: any discrepancy means we in category 1.
1489*/ 1324 */
1490ENTRY(xen_failsafe_callback) 1325ENTRY(xen_failsafe_callback)
1491 framesz = (RIP-0x30) /* workaround buggy gas */ 1326 INTR_FRAME 1 (6*8)
1492 _frame framesz 1327 /*CFI_REL_OFFSET gs,GS*/
1493 CFI_REL_OFFSET rcx, 0 1328 /*CFI_REL_OFFSET fs,FS*/
1494 CFI_REL_OFFSET r11, 8 1329 /*CFI_REL_OFFSET es,ES*/
1330 /*CFI_REL_OFFSET ds,DS*/
1331 CFI_REL_OFFSET r11,8
1332 CFI_REL_OFFSET rcx,0
1495 movw %ds,%cx 1333 movw %ds,%cx
1496 cmpw %cx,0x10(%rsp) 1334 cmpw %cx,0x10(%rsp)
1497 CFI_REMEMBER_STATE 1335 CFI_REMEMBER_STATE
@@ -1512,12 +1350,9 @@ ENTRY(xen_failsafe_callback)
1512 CFI_RESTORE r11 1350 CFI_RESTORE r11
1513 addq $0x30,%rsp 1351 addq $0x30,%rsp
1514 CFI_ADJUST_CFA_OFFSET -0x30 1352 CFI_ADJUST_CFA_OFFSET -0x30
1515 pushq $0 1353 pushq_cfi $0 /* RIP */
1516 CFI_ADJUST_CFA_OFFSET 8 1354 pushq_cfi %r11
1517 pushq %r11 1355 pushq_cfi %rcx
1518 CFI_ADJUST_CFA_OFFSET 8
1519 pushq %rcx
1520 CFI_ADJUST_CFA_OFFSET 8
1521 jmp general_protection 1356 jmp general_protection
1522 CFI_RESTORE_STATE 1357 CFI_RESTORE_STATE
15231: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ 13581: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
@@ -1527,11 +1362,223 @@ ENTRY(xen_failsafe_callback)
1527 CFI_RESTORE r11 1362 CFI_RESTORE r11
1528 addq $0x30,%rsp 1363 addq $0x30,%rsp
1529 CFI_ADJUST_CFA_OFFSET -0x30 1364 CFI_ADJUST_CFA_OFFSET -0x30
1530 pushq $0 1365 pushq_cfi $0
1531 CFI_ADJUST_CFA_OFFSET 8
1532 SAVE_ALL 1366 SAVE_ALL
1533 jmp error_exit 1367 jmp error_exit
1534 CFI_ENDPROC 1368 CFI_ENDPROC
1535END(xen_failsafe_callback) 1369END(xen_failsafe_callback)
1536 1370
1537#endif /* CONFIG_XEN */ 1371#endif /* CONFIG_XEN */
1372
1373/*
1374 * Some functions should be protected against kprobes
1375 */
1376 .pushsection .kprobes.text, "ax"
1377
1378paranoidzeroentry_ist debug do_debug DEBUG_STACK
1379paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
1380paranoiderrorentry stack_segment do_stack_segment
1381errorentry general_protection do_general_protection
1382errorentry page_fault do_page_fault
1383#ifdef CONFIG_X86_MCE
1384paranoidzeroentry machine_check do_machine_check
1385#endif
1386
1387 /*
1388 * "Paranoid" exit path from exception stack.
1389 * Paranoid because this is used by NMIs and cannot take
1390 * any kernel state for granted.
1391 * We don't do kernel preemption checks here, because only
1392 * NMI should be common and it does not enable IRQs and
1393 * cannot get reschedule ticks.
1394 *
1395 * "trace" is 0 for the NMI handler only, because irq-tracing
1396 * is fundamentally NMI-unsafe. (we cannot change the soft and
1397 * hard flags at once, atomically)
1398 */
1399
1400 /* ebx: no swapgs flag */
1401ENTRY(paranoid_exit)
1402 INTR_FRAME
1403 DISABLE_INTERRUPTS(CLBR_NONE)
1404 TRACE_IRQS_OFF
1405 testl %ebx,%ebx /* swapgs needed? */
1406 jnz paranoid_restore
1407 testl $3,CS(%rsp)
1408 jnz paranoid_userspace
1409paranoid_swapgs:
1410 TRACE_IRQS_IRETQ 0
1411 SWAPGS_UNSAFE_STACK
1412paranoid_restore:
1413 RESTORE_ALL 8
1414 jmp irq_return
1415paranoid_userspace:
1416 GET_THREAD_INFO(%rcx)
1417 movl TI_flags(%rcx),%ebx
1418 andl $_TIF_WORK_MASK,%ebx
1419 jz paranoid_swapgs
1420 movq %rsp,%rdi /* &pt_regs */
1421 call sync_regs
1422 movq %rax,%rsp /* switch stack for scheduling */
1423 testl $_TIF_NEED_RESCHED,%ebx
1424 jnz paranoid_schedule
1425 movl %ebx,%edx /* arg3: thread flags */
1426 TRACE_IRQS_ON
1427 ENABLE_INTERRUPTS(CLBR_NONE)
1428 xorl %esi,%esi /* arg2: oldset */
1429 movq %rsp,%rdi /* arg1: &pt_regs */
1430 call do_notify_resume
1431 DISABLE_INTERRUPTS(CLBR_NONE)
1432 TRACE_IRQS_OFF
1433 jmp paranoid_userspace
1434paranoid_schedule:
1435 TRACE_IRQS_ON
1436 ENABLE_INTERRUPTS(CLBR_ANY)
1437 call schedule
1438 DISABLE_INTERRUPTS(CLBR_ANY)
1439 TRACE_IRQS_OFF
1440 jmp paranoid_userspace
1441 CFI_ENDPROC
1442END(paranoid_exit)
1443
1444/*
1445 * Exception entry point. This expects an error code/orig_rax on the stack.
1446 * returns in "no swapgs flag" in %ebx.
1447 */
1448ENTRY(error_entry)
1449 XCPT_FRAME
1450 CFI_ADJUST_CFA_OFFSET 15*8
1451 /* oldrax contains error code */
1452 cld
1453 movq_cfi rdi, RDI+8
1454 movq_cfi rsi, RSI+8
1455 movq_cfi rdx, RDX+8
1456 movq_cfi rcx, RCX+8
1457 movq_cfi rax, RAX+8
1458 movq_cfi r8, R8+8
1459 movq_cfi r9, R9+8
1460 movq_cfi r10, R10+8
1461 movq_cfi r11, R11+8
1462 movq_cfi rbx, RBX+8
1463 movq_cfi rbp, RBP+8
1464 movq_cfi r12, R12+8
1465 movq_cfi r13, R13+8
1466 movq_cfi r14, R14+8
1467 movq_cfi r15, R15+8
1468 xorl %ebx,%ebx
1469 testl $3,CS+8(%rsp)
1470 je error_kernelspace
1471error_swapgs:
1472 SWAPGS
1473error_sti:
1474 TRACE_IRQS_OFF
1475 ret
1476 CFI_ENDPROC
1477
1478/*
1479 * There are two places in the kernel that can potentially fault with
1480 * usergs. Handle them here. The exception handlers after iret run with
1481 * kernel gs again, so don't set the user space flag. B stepping K8s
1482 * sometimes report an truncated RIP for IRET exceptions returning to
1483 * compat mode. Check for these here too.
1484 */
1485error_kernelspace:
1486 incl %ebx
1487 leaq irq_return(%rip),%rcx
1488 cmpq %rcx,RIP+8(%rsp)
1489 je error_swapgs
1490 movl %ecx,%ecx /* zero extend */
1491 cmpq %rcx,RIP+8(%rsp)
1492 je error_swapgs
1493 cmpq $gs_change,RIP+8(%rsp)
1494 je error_swapgs
1495 jmp error_sti
1496END(error_entry)
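The error_kernelspace comment above covers a CPU quirk: B-stepping K8 parts can report only the low 32 bits of RIP for IRET faults taken while returning to compat mode, so the saved RIP is compared against irq_return both as-is and in zero-extended 32-bit form (the "movl %ecx,%ecx" is that truncation). The same test restated in C, with placeholder names for the addresses involved:

#include <stdint.h>

/* Illustrative: does the reported RIP point at irq_return, either exactly
 * or as the 32-bit-truncated value a buggy CPU may report? */
static int faulted_in_irq_return(uint64_t saved_rip, uint64_t irq_return_addr)
{
	if (saved_rip == irq_return_addr)
		return 1;
	if (saved_rip == (uint32_t)irq_return_addr)	/* truncated RIP report */
		return 1;
	return 0;
}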
1497
1498
1499/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
1500ENTRY(error_exit)
1501 DEFAULT_FRAME
1502 movl %ebx,%eax
1503 RESTORE_REST
1504 DISABLE_INTERRUPTS(CLBR_NONE)
1505 TRACE_IRQS_OFF
1506 GET_THREAD_INFO(%rcx)
1507 testl %eax,%eax
1508 jne retint_kernel
1509 LOCKDEP_SYS_EXIT_IRQ
1510 movl TI_flags(%rcx),%edx
1511 movl $_TIF_WORK_MASK,%edi
1512 andl %edi,%edx
1513 jnz retint_careful
1514 jmp retint_swapgs
1515 CFI_ENDPROC
1516END(error_exit)
1517
1518
1519 /* runs on exception stack */
1520ENTRY(nmi)
1521 INTR_FRAME
1522 PARAVIRT_ADJUST_EXCEPTION_FRAME
1523 pushq_cfi $-1
1524 subq $15*8, %rsp
1525 CFI_ADJUST_CFA_OFFSET 15*8
1526 call save_paranoid
1527 DEFAULT_FRAME 0
1528 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
1529 movq %rsp,%rdi
1530 movq $-1,%rsi
1531 call do_nmi
1532#ifdef CONFIG_TRACE_IRQFLAGS
1533 /* paranoidexit; without TRACE_IRQS_OFF */
1534 /* ebx: no swapgs flag */
1535 DISABLE_INTERRUPTS(CLBR_NONE)
1536 testl %ebx,%ebx /* swapgs needed? */
1537 jnz nmi_restore
1538 testl $3,CS(%rsp)
1539 jnz nmi_userspace
1540nmi_swapgs:
1541 SWAPGS_UNSAFE_STACK
1542nmi_restore:
1543 RESTORE_ALL 8
1544 jmp irq_return
1545nmi_userspace:
1546 GET_THREAD_INFO(%rcx)
1547 movl TI_flags(%rcx),%ebx
1548 andl $_TIF_WORK_MASK,%ebx
1549 jz nmi_swapgs
1550 movq %rsp,%rdi /* &pt_regs */
1551 call sync_regs
1552 movq %rax,%rsp /* switch stack for scheduling */
1553 testl $_TIF_NEED_RESCHED,%ebx
1554 jnz nmi_schedule
1555 movl %ebx,%edx /* arg3: thread flags */
1556 ENABLE_INTERRUPTS(CLBR_NONE)
1557 xorl %esi,%esi /* arg2: oldset */
1558 movq %rsp,%rdi /* arg1: &pt_regs */
1559 call do_notify_resume
1560 DISABLE_INTERRUPTS(CLBR_NONE)
1561 jmp nmi_userspace
1562nmi_schedule:
1563 ENABLE_INTERRUPTS(CLBR_ANY)
1564 call schedule
1565 DISABLE_INTERRUPTS(CLBR_ANY)
1566 jmp nmi_userspace
1567 CFI_ENDPROC
1568#else
1569 jmp paranoid_exit
1570 CFI_ENDPROC
1571#endif
1572END(nmi)
1573
1574ENTRY(ignore_sysret)
1575 CFI_STARTPROC
1576 mov $-ENOSYS,%eax
1577 sysret
1578 CFI_ENDPROC
1579END(ignore_sysret)
1580
1581/*
1582 * End of kprobes section
1583 */
1584 .popsection
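
The paranoid_exit and nmi paths above all make the same decision: %ebx carries a "no swapgs needed" flag set up by save_paranoid, CS on the stack says whether the exception interrupted user mode, and any pending TIF work has to be handled before the final SWAPGS/IRETQ. The following is a rough, standalone C model of that control flow only; it is purely illustrative, not kernel code, and all names are invented.

#include <stdbool.h>
#include <stdio.h>

enum exit_action {
	RESTORE_ONLY,		/* paranoid_restore: GS base already correct */
	SWAPGS_AND_RESTORE,	/* paranoid_swapgs */
	HANDLE_USER_WORK	/* paranoid_userspace: signals/reschedule first */
};

static enum exit_action paranoid_exit_model(bool no_swapgs_flag,
					    bool interrupted_user_mode,
					    unsigned long tif_work)
{
	if (no_swapgs_flag)			/* ebx != 0: skip swapgs entirely */
		return RESTORE_ONLY;
	if (interrupted_user_mode && tif_work)	/* sync_regs + do_notify_resume loop */
		return HANDLE_USER_WORK;
	return SWAPGS_AND_RESTORE;
}

int main(void)
{
	/* NMI hit user mode with a pending reschedule: work is handled first. */
	printf("%d\n", paranoid_exit_model(false, true, 1));
	return 0;
}
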
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index 0e88be11227d..b193e082f6ce 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -10,6 +10,7 @@
10 10
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/threads.h> 12#include <linux/threads.h>
13#include <linux/cpu.h>
13#include <linux/cpumask.h> 14#include <linux/cpumask.h>
14#include <linux/string.h> 15#include <linux/string.h>
15#include <linux/ctype.h> 16#include <linux/ctype.h>
@@ -17,6 +18,9 @@
17#include <linux/sched.h> 18#include <linux/sched.h>
18#include <linux/module.h> 19#include <linux/module.h>
19#include <linux/hardirq.h> 20#include <linux/hardirq.h>
21#include <linux/timer.h>
22#include <linux/proc_fs.h>
23#include <asm/current.h>
20#include <asm/smp.h> 24#include <asm/smp.h>
21#include <asm/ipi.h> 25#include <asm/ipi.h>
22#include <asm/genapic.h> 26#include <asm/genapic.h>
@@ -383,6 +387,103 @@ static __init void uv_rtc_init(void)
383} 387}
384 388
385/* 389/*
390 * percpu heartbeat timer
391 */
392static void uv_heartbeat(unsigned long ignored)
393{
394 struct timer_list *timer = &uv_hub_info->scir.timer;
395 unsigned char bits = uv_hub_info->scir.state;
396
397 /* flip heartbeat bit */
398 bits ^= SCIR_CPU_HEARTBEAT;
399
400 /* is this cpu idle? */
401 if (idle_cpu(raw_smp_processor_id()))
402 bits &= ~SCIR_CPU_ACTIVITY;
403 else
404 bits |= SCIR_CPU_ACTIVITY;
405
406 /* update system controller interface reg */
407 uv_set_scir_bits(bits);
408
409 /* enable next timer period */
410 mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL);
411}
412
413static void __cpuinit uv_heartbeat_enable(int cpu)
414{
415 if (!uv_cpu_hub_info(cpu)->scir.enabled) {
416 struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer;
417
418 uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY);
419 setup_timer(timer, uv_heartbeat, cpu);
420 timer->expires = jiffies + SCIR_CPU_HB_INTERVAL;
421 add_timer_on(timer, cpu);
422 uv_cpu_hub_info(cpu)->scir.enabled = 1;
423 }
424
425 /* check boot cpu */
426 if (!uv_cpu_hub_info(0)->scir.enabled)
427 uv_heartbeat_enable(0);
428}
429
430#ifdef CONFIG_HOTPLUG_CPU
431static void __cpuinit uv_heartbeat_disable(int cpu)
432{
433 if (uv_cpu_hub_info(cpu)->scir.enabled) {
434 uv_cpu_hub_info(cpu)->scir.enabled = 0;
435 del_timer(&uv_cpu_hub_info(cpu)->scir.timer);
436 }
437 uv_set_cpu_scir_bits(cpu, 0xff);
438}
439
440/*
441 * cpu hotplug notifier
442 */
443static __cpuinit int uv_scir_cpu_notify(struct notifier_block *self,
444 unsigned long action, void *hcpu)
445{
446 long cpu = (long)hcpu;
447
448 switch (action) {
449 case CPU_ONLINE:
450 uv_heartbeat_enable(cpu);
451 break;
452 case CPU_DOWN_PREPARE:
453 uv_heartbeat_disable(cpu);
454 break;
455 default:
456 break;
457 }
458 return NOTIFY_OK;
459}
460
461static __init void uv_scir_register_cpu_notifier(void)
462{
463 hotcpu_notifier(uv_scir_cpu_notify, 0);
464}
465
466#else /* !CONFIG_HOTPLUG_CPU */
467
468static __init void uv_scir_register_cpu_notifier(void)
469{
470}
471
472static __init int uv_init_heartbeat(void)
473{
474 int cpu;
475
476 if (is_uv_system())
477 for_each_online_cpu(cpu)
478 uv_heartbeat_enable(cpu);
479 return 0;
480}
481
482late_initcall(uv_init_heartbeat);
483
484#endif /* !CONFIG_HOTPLUG_CPU */
485
486/*
386 * Called on each cpu to initialize the per_cpu UV data area. 487 * Called on each cpu to initialize the per_cpu UV data area.
387 * ZZZ hotplug not supported yet 488 * ZZZ hotplug not supported yet
388 */ 489 */
@@ -455,7 +556,7 @@ void __init uv_system_init(void)
455 556
456 uv_bios_init(); 557 uv_bios_init();
457 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, 558 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
458 &uv_coherency_id, &uv_region_size); 559 &sn_coherency_id, &sn_region_size);
459 uv_rtc_init(); 560 uv_rtc_init();
460 561
461 for_each_present_cpu(cpu) { 562 for_each_present_cpu(cpu) {
@@ -466,8 +567,7 @@ void __init uv_system_init(void)
466 uv_blade_info[blade].nr_possible_cpus++; 567 uv_blade_info[blade].nr_possible_cpus++;
467 568
468 uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; 569 uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
469 uv_cpu_hub_info(cpu)->lowmem_remap_top = 570 uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size;
470 lowmem_redir_base + lowmem_redir_size;
471 uv_cpu_hub_info(cpu)->m_val = m_val; 571 uv_cpu_hub_info(cpu)->m_val = m_val;
472 uv_cpu_hub_info(cpu)->n_val = m_val; 572 uv_cpu_hub_info(cpu)->n_val = m_val;
473 uv_cpu_hub_info(cpu)->numa_blade_id = blade; 573 uv_cpu_hub_info(cpu)->numa_blade_id = blade;
@@ -477,7 +577,8 @@ void __init uv_system_init(void)
477 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; 577 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
478 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; 578 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
479 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; 579 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
480 uv_cpu_hub_info(cpu)->coherency_domain_number = uv_coherency_id; 580 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
581 uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu;
481 uv_node_to_blade[nid] = blade; 582 uv_node_to_blade[nid] = blade;
482 uv_cpu_to_blade[cpu] = blade; 583 uv_cpu_to_blade[cpu] = blade;
483 max_pnode = max(pnode, max_pnode); 584 max_pnode = max(pnode, max_pnode);
@@ -494,4 +595,6 @@ void __init uv_system_init(void)
494 map_mmioh_high(max_pnode); 595 map_mmioh_high(max_pnode);
495 596
496 uv_cpu_init(); 597 uv_cpu_init();
598 uv_scir_register_cpu_notifier();
599 proc_mkdir("sgi_uv", NULL);
497} 600}
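
The heartbeat added above is an instance of a common pattern: a timer armed on a specific CPU with add_timer_on() that re-arms itself from its own handler, so it keeps running on that CPU. A minimal generic sketch of the pattern follows; the names, interval, and sampled state are illustrative and not part of the patch.

#include <linux/timer.h>
#include <linux/jiffies.h>

#define HB_INTERVAL	(HZ / 2)	/* assumed period */

static struct timer_list hb_timer;

static void hb_tick(unsigned long data)
{
	/* sample whatever per-cpu state is of interest here ... */

	/* re-arm; the timer stays on the CPU its handler runs on */
	mod_timer(&hb_timer, jiffies + HB_INTERVAL);
}

static void hb_start_on(int cpu)
{
	setup_timer(&hb_timer, hb_tick, 0);
	hb_timer.expires = jiffies + HB_INTERVAL;
	add_timer_on(&hb_timer, cpu);		/* first expiry fires on 'cpu' */
}
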
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
index 1dcb0f13897e..3e66bd364a9d 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/head.c
@@ -35,7 +35,6 @@ void __init reserve_ebda_region(void)
35 35
36 /* start of EBDA area */ 36 /* start of EBDA area */
37 ebda_addr = get_bios_ebda(); 37 ebda_addr = get_bios_ebda();
38 printk(KERN_INFO "BIOS EBDA/lowmem at: %08x/%08x\n", ebda_addr, lowmem);
39 38
40 /* Fixup: bios puts an EBDA in the top 64K segment */ 39 /* Fixup: bios puts an EBDA in the top 64K segment */
41 /* of conventional memory, but does not adjust lowmem. */ 40 /* of conventional memory, but does not adjust lowmem. */
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index fa1d25dd83e3..ac108d1fe182 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -12,9 +12,12 @@
12#include <asm/sections.h> 12#include <asm/sections.h>
13#include <asm/e820.h> 13#include <asm/e820.h>
14#include <asm/bios_ebda.h> 14#include <asm/bios_ebda.h>
15#include <asm/trampoline.h>
15 16
16void __init i386_start_kernel(void) 17void __init i386_start_kernel(void)
17{ 18{
19 reserve_trampoline_memory();
20
18 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); 21 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
19 22
20#ifdef CONFIG_BLK_DEV_INITRD 23#ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index d16084f90649..388e05a5fc17 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -24,6 +24,7 @@
24#include <asm/kdebug.h> 24#include <asm/kdebug.h>
25#include <asm/e820.h> 25#include <asm/e820.h>
26#include <asm/bios_ebda.h> 26#include <asm/bios_ebda.h>
27#include <asm/trampoline.h>
27 28
28/* boot cpu pda */ 29/* boot cpu pda */
29static struct x8664_pda _boot_cpu_pda __read_mostly; 30static struct x8664_pda _boot_cpu_pda __read_mostly;
@@ -120,6 +121,8 @@ void __init x86_64_start_reservations(char *real_mode_data)
120{ 121{
121 copy_bootdata(__va(real_mode_data)); 122 copy_bootdata(__va(real_mode_data));
122 123
124 reserve_trampoline_memory();
125
123 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); 126 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
124 127
125#ifdef CONFIG_BLK_DEV_INITRD 128#ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index e76d7e272974..cd759ad90690 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -33,7 +33,9 @@
33 * HPET address is set in acpi/boot.c, when an ACPI entry exists 33 * HPET address is set in acpi/boot.c, when an ACPI entry exists
34 */ 34 */
35unsigned long hpet_address; 35unsigned long hpet_address;
36unsigned long hpet_num_timers; 36#ifdef CONFIG_PCI_MSI
37static unsigned long hpet_num_timers;
38#endif
37static void __iomem *hpet_virt_address; 39static void __iomem *hpet_virt_address;
38 40
39struct hpet_dev { 41struct hpet_dev {
@@ -811,7 +813,7 @@ int __init hpet_enable(void)
811 813
812out_nohpet: 814out_nohpet:
813 hpet_clear_mapping(); 815 hpet_clear_mapping();
814 boot_hpet_disable = 1; 816 hpet_address = 0;
815 return 0; 817 return 0;
816} 818}
817 819
@@ -834,10 +836,11 @@ static __init int hpet_late_init(void)
834 836
835 hpet_address = force_hpet_address; 837 hpet_address = force_hpet_address;
836 hpet_enable(); 838 hpet_enable();
837 if (!hpet_virt_address)
838 return -ENODEV;
839 } 839 }
840 840
841 if (!hpet_virt_address)
842 return -ENODEV;
843
841 hpet_reserve_platform_timers(hpet_readl(HPET_ID)); 844 hpet_reserve_platform_timers(hpet_readl(HPET_ID));
842 845
843 for_each_online_cpu(cpu) { 846 for_each_online_cpu(cpu) {
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index a4f93b4120c1..d39918076bb4 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -14,7 +14,6 @@ static struct fs_struct init_fs = INIT_FS;
14static struct signal_struct init_signals = INIT_SIGNALS(init_signals); 14static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
15static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); 15static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
16struct mm_struct init_mm = INIT_MM(init_mm); 16struct mm_struct init_mm = INIT_MM(init_mm);
17EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */
18 17
19/* 18/*
20 * Initial thread structure. 19 * Initial thread structure.
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index 1cbf7c8d46e0..3e070bb961d7 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -2472,10 +2472,9 @@ static void set_ir_ioapic_affinity_irq(unsigned int irq,
2472asmlinkage void smp_irq_move_cleanup_interrupt(void) 2472asmlinkage void smp_irq_move_cleanup_interrupt(void)
2473{ 2473{
2474 unsigned vector, me; 2474 unsigned vector, me;
2475
2475 ack_APIC_irq(); 2476 ack_APIC_irq();
2476#ifdef CONFIG_X86_64
2477 exit_idle(); 2477 exit_idle();
2478#endif
2479 irq_enter(); 2478 irq_enter();
2480 2479
2481 me = smp_processor_id(); 2480 me = smp_processor_id();
@@ -2520,7 +2519,7 @@ static void irq_complete_move(struct irq_desc **descp)
2520 if (likely(!cfg->move_desc_pending)) 2519 if (likely(!cfg->move_desc_pending))
2521 return; 2520 return;
2522 2521
2523 /* domain is not change, but affinity is changed */ 2522 /* domain has not changed, but affinity did */
2524 me = smp_processor_id(); 2523 me = smp_processor_id();
2525 if (cpu_isset(me, desc->affinity)) { 2524 if (cpu_isset(me, desc->affinity)) {
2526 *descp = desc = move_irq_desc(desc, me); 2525 *descp = desc = move_irq_desc(desc, me);
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index fca2991443f5..6383d50f82ea 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -13,12 +13,12 @@
13#include <linux/seq_file.h> 13#include <linux/seq_file.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/delay.h> 15#include <linux/delay.h>
16#include <linux/ftrace.h>
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
17#include <asm/io_apic.h> 18#include <asm/io_apic.h>
18#include <asm/idle.h> 19#include <asm/idle.h>
19#include <asm/smp.h> 20#include <asm/smp.h>
20 21
21#ifdef CONFIG_DEBUG_STACKOVERFLOW
22/* 22/*
23 * Probabilistic stack overflow check: 23 * Probabilistic stack overflow check:
24 * 24 *
@@ -28,26 +28,25 @@
28 */ 28 */
29static inline void stack_overflow_check(struct pt_regs *regs) 29static inline void stack_overflow_check(struct pt_regs *regs)
30{ 30{
31#ifdef CONFIG_DEBUG_STACKOVERFLOW
31 u64 curbase = (u64)task_stack_page(current); 32 u64 curbase = (u64)task_stack_page(current);
32 static unsigned long warned = -60*HZ; 33
33 34 WARN_ONCE(regs->sp >= curbase &&
34 if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE && 35 regs->sp <= curbase + THREAD_SIZE &&
35 regs->sp < curbase + sizeof(struct thread_info) + 128 && 36 regs->sp < curbase + sizeof(struct thread_info) +
36 time_after(jiffies, warned + 60*HZ)) { 37 sizeof(struct pt_regs) + 128,
37 printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", 38
38 current->comm, curbase, regs->sp); 39 "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
39 show_stack(NULL,NULL); 40 current->comm, curbase, regs->sp);
40 warned = jiffies;
41 }
42}
43#endif 41#endif
42}
44 43
45/* 44/*
46 * do_IRQ handles all normal device IRQ's (the special 45 * do_IRQ handles all normal device IRQ's (the special
47 * SMP cross-CPU interrupts have their own specific 46 * SMP cross-CPU interrupts have their own specific
48 * handlers). 47 * handlers).
49 */ 48 */
50asmlinkage unsigned int do_IRQ(struct pt_regs *regs) 49asmlinkage unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
51{ 50{
52 struct pt_regs *old_regs = set_irq_regs(regs); 51 struct pt_regs *old_regs = set_irq_regs(regs);
53 struct irq_desc *desc; 52 struct irq_desc *desc;
@@ -60,9 +59,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
60 irq_enter(); 59 irq_enter();
61 irq = __get_cpu_var(vector_irq)[vector]; 60 irq = __get_cpu_var(vector_irq)[vector];
62 61
63#ifdef CONFIG_DEBUG_STACKOVERFLOW
64 stack_overflow_check(regs); 62 stack_overflow_check(regs);
65#endif
66 63
67 desc = irq_to_desc(irq); 64 desc = irq_to_desc(irq);
68 if (likely(desc)) 65 if (likely(desc))
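
The hunk above replaces a hand-rolled "warn at most once per minute" printk with WARN_ONCE(), which evaluates the condition in place, prints the message plus a backtrace, and disables itself after the first hit. A minimal sketch of that idiom, with a hypothetical check:

#include <linux/kernel.h>
#include <linux/sched.h>

/*
 * Warn (once) if the stack pointer gets within 'margin' bytes of the
 * bottom of the current task's stack.  Purely illustrative.
 */
static void check_stack_margin(unsigned long sp, unsigned long stack_base,
			       unsigned long margin)
{
	WARN_ONCE(sp >= stack_base && sp < stack_base + margin,
		  "%s: sp %#lx within %lu bytes of stack base %#lx\n",
		  current->comm, sp, margin, stack_base);
}
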
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 61aa2a1004b5..84723295f88a 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -140,7 +140,7 @@ void __init native_init_IRQ(void)
140 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { 140 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
141 /* SYSCALL_VECTOR was reserved in trap_init. */ 141 /* SYSCALL_VECTOR was reserved in trap_init. */
142 if (i != SYSCALL_VECTOR) 142 if (i != SYSCALL_VECTOR)
143 set_intr_gate(i, interrupt[i]); 143 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
144 } 144 }
145 145
146 146
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 1020919efe1c..31ebfe38e96c 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -24,41 +24,6 @@
24#include <asm/i8259.h> 24#include <asm/i8259.h>
25 25
26/* 26/*
27 * Common place to define all x86 IRQ vectors
28 *
29 * This builds up the IRQ handler stubs using some ugly macros in irq.h
30 *
31 * These macros create the low-level assembly IRQ routines that save
32 * register context and call do_IRQ(). do_IRQ() then does all the
33 * operations that are needed to keep the AT (or SMP IOAPIC)
34 * interrupt-controller happy.
35 */
36
37#define IRQ_NAME2(nr) nr##_interrupt(void)
38#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
39
40/*
41 * SMP has a few special interrupts for IPI messages
42 */
43
44#define BUILD_IRQ(nr) \
45 asmlinkage void IRQ_NAME(nr); \
46 asm("\n.text\n.p2align\n" \
47 "IRQ" #nr "_interrupt:\n\t" \
48 "push $~(" #nr ") ; " \
49 "jmp common_interrupt\n" \
50 ".previous");
51
52#define BI(x,y) \
53 BUILD_IRQ(x##y)
54
55#define BUILD_16_IRQS(x) \
56 BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
57 BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
58 BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
59 BI(x,c) BI(x,d) BI(x,e) BI(x,f)
60
61/*
62 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: 27 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
63 * (these are usually mapped to vectors 0x30-0x3f) 28 * (these are usually mapped to vectors 0x30-0x3f)
64 */ 29 */
@@ -73,37 +38,6 @@
73 * 38 *
74 * (these are usually mapped into the 0x30-0xff vector range) 39 * (these are usually mapped into the 0x30-0xff vector range)
75 */ 40 */
76 BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
77BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
78BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
79BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
80
81#undef BUILD_16_IRQS
82#undef BI
83
84
85#define IRQ(x,y) \
86 IRQ##x##y##_interrupt
87
88#define IRQLIST_16(x) \
89 IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
90 IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
91 IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
92 IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
93
94/* for the irq vectors */
95static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
96 IRQLIST_16(0x2), IRQLIST_16(0x3),
97 IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
98 IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
99 IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf)
100};
101
102#undef IRQ
103#undef IRQLIST_16
104
105
106
107 41
108/* 42/*
109 * IRQ2 is cascade interrupt to second interrupt controller 43 * IRQ2 is cascade interrupt to second interrupt controller
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 5f8e5d75a254..c25fdb382292 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -10,7 +10,7 @@
10 * This driver allows to upgrade microcode on AMD 10 * This driver allows to upgrade microcode on AMD
11 * family 0x10 and 0x11 processors. 11 * family 0x10 and 0x11 processors.
12 * 12 *
13 * Licensed unter the terms of the GNU General Public 13 * Licensed under the terms of the GNU General Public
14 * License version 2. See file COPYING for details. 14 * License version 2. See file COPYING for details.
15*/ 15*/
16 16
@@ -32,9 +32,9 @@
32#include <linux/platform_device.h> 32#include <linux/platform_device.h>
33#include <linux/pci.h> 33#include <linux/pci.h>
34#include <linux/pci_ids.h> 34#include <linux/pci_ids.h>
35#include <linux/uaccess.h>
35 36
36#include <asm/msr.h> 37#include <asm/msr.h>
37#include <asm/uaccess.h>
38#include <asm/processor.h> 38#include <asm/processor.h>
39#include <asm/microcode.h> 39#include <asm/microcode.h>
40 40
@@ -47,43 +47,38 @@ MODULE_LICENSE("GPL v2");
47#define UCODE_UCODE_TYPE 0x00000001 47#define UCODE_UCODE_TYPE 0x00000001
48 48
49struct equiv_cpu_entry { 49struct equiv_cpu_entry {
50 unsigned int installed_cpu; 50 u32 installed_cpu;
51 unsigned int fixed_errata_mask; 51 u32 fixed_errata_mask;
52 unsigned int fixed_errata_compare; 52 u32 fixed_errata_compare;
53 unsigned int equiv_cpu; 53 u16 equiv_cpu;
54}; 54 u16 res;
55} __attribute__((packed));
55 56
56struct microcode_header_amd { 57struct microcode_header_amd {
57 unsigned int data_code; 58 u32 data_code;
58 unsigned int patch_id; 59 u32 patch_id;
59 unsigned char mc_patch_data_id[2]; 60 u16 mc_patch_data_id;
60 unsigned char mc_patch_data_len; 61 u8 mc_patch_data_len;
61 unsigned char init_flag; 62 u8 init_flag;
62 unsigned int mc_patch_data_checksum; 63 u32 mc_patch_data_checksum;
63 unsigned int nb_dev_id; 64 u32 nb_dev_id;
64 unsigned int sb_dev_id; 65 u32 sb_dev_id;
65 unsigned char processor_rev_id[2]; 66 u16 processor_rev_id;
66 unsigned char nb_rev_id; 67 u8 nb_rev_id;
67 unsigned char sb_rev_id; 68 u8 sb_rev_id;
68 unsigned char bios_api_rev; 69 u8 bios_api_rev;
69 unsigned char reserved1[3]; 70 u8 reserved1[3];
70 unsigned int match_reg[8]; 71 u32 match_reg[8];
71}; 72} __attribute__((packed));
72 73
73struct microcode_amd { 74struct microcode_amd {
74 struct microcode_header_amd hdr; 75 struct microcode_header_amd hdr;
75 unsigned int mpb[0]; 76 unsigned int mpb[0];
76}; 77};
77 78
78#define UCODE_MAX_SIZE (2048) 79#define UCODE_MAX_SIZE 2048
79#define DEFAULT_UCODE_DATASIZE (896) 80#define UCODE_CONTAINER_SECTION_HDR 8
80#define MC_HEADER_SIZE (sizeof(struct microcode_header_amd)) 81#define UCODE_CONTAINER_HEADER_SIZE 12
81#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
82#define DWSIZE (sizeof(u32))
83/* For now we support a fixed ucode total size only */
84#define get_totalsize(mc) \
85 ((((struct microcode_amd *)mc)->hdr.mc_patch_data_len * 28) \
86 + MC_HEADER_SIZE)
87 82
88/* serialize access to the physical write */ 83/* serialize access to the physical write */
89static DEFINE_SPINLOCK(microcode_update_lock); 84static DEFINE_SPINLOCK(microcode_update_lock);
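
The structures above switch to fixed-width u8/u16/u32 fields and __attribute__((packed)) because they are overlaid directly onto the binary microcode container, so their layout must match the file format exactly on every build. A small illustration of the idea with a hypothetical record:

#include <linux/types.h>

/* Hypothetical on-disk record; must mirror the file layout byte for byte. */
struct fw_record {
	u32 magic;
	u16 version;
	u8  flags;
	u8  reserved;
	u32 payload_len;
} __attribute__((packed));

/*
 * sizeof(struct fw_record) is 12 on every architecture, so a raw image
 * can be parsed by casting:  const struct fw_record *r = (const void *)buf;
 */
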
@@ -93,31 +88,24 @@ static struct equiv_cpu_entry *equiv_cpu_table;
93static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) 88static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
94{ 89{
95 struct cpuinfo_x86 *c = &cpu_data(cpu); 90 struct cpuinfo_x86 *c = &cpu_data(cpu);
91 u32 dummy;
96 92
97 memset(csig, 0, sizeof(*csig)); 93 memset(csig, 0, sizeof(*csig));
98
99 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { 94 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
100 printk(KERN_ERR "microcode: CPU%d not a capable AMD processor\n", 95 printk(KERN_WARNING "microcode: CPU%d: AMD CPU family 0x%x not "
101 cpu); 96 "supported\n", cpu, c->x86);
102 return -1; 97 return -1;
103 } 98 }
104 99 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
105 asm volatile("movl %1, %%ecx; rdmsr" 100 printk(KERN_INFO "microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev);
106 : "=a" (csig->rev)
107 : "i" (0x0000008B) : "ecx");
108
109 printk(KERN_INFO "microcode: collect_cpu_info_amd : patch_id=0x%x\n",
110 csig->rev);
111
112 return 0; 101 return 0;
113} 102}
114 103
115static int get_matching_microcode(int cpu, void *mc, int rev) 104static int get_matching_microcode(int cpu, void *mc, int rev)
116{ 105{
117 struct microcode_header_amd *mc_header = mc; 106 struct microcode_header_amd *mc_header = mc;
118 struct pci_dev *nb_pci_dev, *sb_pci_dev;
119 unsigned int current_cpu_id; 107 unsigned int current_cpu_id;
120 unsigned int equiv_cpu_id = 0x00; 108 u16 equiv_cpu_id = 0;
121 unsigned int i = 0; 109 unsigned int i = 0;
122 110
123 BUG_ON(equiv_cpu_table == NULL); 111 BUG_ON(equiv_cpu_table == NULL);
@@ -132,57 +120,25 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
132 } 120 }
133 121
134 if (!equiv_cpu_id) { 122 if (!equiv_cpu_id) {
135 printk(KERN_ERR "microcode: CPU%d cpu_id " 123 printk(KERN_WARNING "microcode: CPU%d: cpu revision "
136 "not found in equivalent cpu table \n", cpu); 124 "not listed in equivalent cpu table\n", cpu);
137 return 0; 125 return 0;
138 } 126 }
139 127
140 if ((mc_header->processor_rev_id[0]) != (equiv_cpu_id & 0xff)) { 128 if (mc_header->processor_rev_id != equiv_cpu_id) {
141 printk(KERN_ERR 129 printk(KERN_ERR "microcode: CPU%d: patch mismatch "
142 "microcode: CPU%d patch does not match " 130 "(processor_rev_id: %x, equiv_cpu_id: %x)\n",
143 "(patch is %x, cpu extended is %x) \n", 131 cpu, mc_header->processor_rev_id, equiv_cpu_id);
144 cpu, mc_header->processor_rev_id[0],
145 (equiv_cpu_id & 0xff));
146 return 0; 132 return 0;
147 } 133 }
148 134
149 if ((mc_header->processor_rev_id[1]) != ((equiv_cpu_id >> 16) & 0xff)) { 135 /* ucode might be chipset specific -- currently we don't support this */
150 printk(KERN_ERR "microcode: CPU%d patch does not match " 136 if (mc_header->nb_dev_id || mc_header->sb_dev_id) {
151 "(patch is %x, cpu base id is %x) \n", 137 printk(KERN_ERR "microcode: CPU%d: loading of chipset "
152 cpu, mc_header->processor_rev_id[1], 138 "specific code not yet supported\n", cpu);
153 ((equiv_cpu_id >> 16) & 0xff));
154
155 return 0; 139 return 0;
156 } 140 }
157 141
158 /* ucode may be northbridge specific */
159 if (mc_header->nb_dev_id) {
160 nb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD,
161 (mc_header->nb_dev_id & 0xff),
162 NULL);
163 if ((!nb_pci_dev) ||
164 (mc_header->nb_rev_id != nb_pci_dev->revision)) {
165 printk(KERN_ERR "microcode: CPU%d NB mismatch \n", cpu);
166 pci_dev_put(nb_pci_dev);
167 return 0;
168 }
169 pci_dev_put(nb_pci_dev);
170 }
171
172 /* ucode may be southbridge specific */
173 if (mc_header->sb_dev_id) {
174 sb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD,
175 (mc_header->sb_dev_id & 0xff),
176 NULL);
177 if ((!sb_pci_dev) ||
178 (mc_header->sb_rev_id != sb_pci_dev->revision)) {
179 printk(KERN_ERR "microcode: CPU%d SB mismatch \n", cpu);
180 pci_dev_put(sb_pci_dev);
181 return 0;
182 }
183 pci_dev_put(sb_pci_dev);
184 }
185
186 if (mc_header->patch_id <= rev) 142 if (mc_header->patch_id <= rev)
187 return 0; 143 return 0;
188 144
@@ -192,12 +148,10 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
192static void apply_microcode_amd(int cpu) 148static void apply_microcode_amd(int cpu)
193{ 149{
194 unsigned long flags; 150 unsigned long flags;
195 unsigned int eax, edx; 151 u32 rev, dummy;
196 unsigned int rev;
197 int cpu_num = raw_smp_processor_id(); 152 int cpu_num = raw_smp_processor_id();
198 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; 153 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
199 struct microcode_amd *mc_amd = uci->mc; 154 struct microcode_amd *mc_amd = uci->mc;
200 unsigned long addr;
201 155
202 /* We should bind the task to the CPU */ 156 /* We should bind the task to the CPU */
203 BUG_ON(cpu_num != cpu); 157 BUG_ON(cpu_num != cpu);
@@ -206,42 +160,34 @@ static void apply_microcode_amd(int cpu)
206 return; 160 return;
207 161
208 spin_lock_irqsave(&microcode_update_lock, flags); 162 spin_lock_irqsave(&microcode_update_lock, flags);
209 163 wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
210 addr = (unsigned long)&mc_amd->hdr.data_code;
211 edx = (unsigned int)(((unsigned long)upper_32_bits(addr)));
212 eax = (unsigned int)(((unsigned long)lower_32_bits(addr)));
213
214 asm volatile("movl %0, %%ecx; wrmsr" :
215 : "i" (0xc0010020), "a" (eax), "d" (edx) : "ecx");
216
217 /* get patch id after patching */ 164 /* get patch id after patching */
218 asm volatile("movl %1, %%ecx; rdmsr" 165 rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
219 : "=a" (rev)
220 : "i" (0x0000008B) : "ecx");
221
222 spin_unlock_irqrestore(&microcode_update_lock, flags); 166 spin_unlock_irqrestore(&microcode_update_lock, flags);
223 167
224 /* check current patch id and patch's id for match */ 168 /* check current patch id and patch's id for match */
225 if (rev != mc_amd->hdr.patch_id) { 169 if (rev != mc_amd->hdr.patch_id) {
226 printk(KERN_ERR "microcode: CPU%d update from revision " 170 printk(KERN_ERR "microcode: CPU%d: update failed "
227 "0x%x to 0x%x failed\n", cpu_num, 171 "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id);
228 mc_amd->hdr.patch_id, rev);
229 return; 172 return;
230 } 173 }
231 174
232 printk(KERN_INFO "microcode: CPU%d updated from revision " 175 printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n",
233 "0x%x to 0x%x \n", 176 cpu, rev);
234 cpu_num, uci->cpu_sig.rev, mc_amd->hdr.patch_id);
235 177
236 uci->cpu_sig.rev = rev; 178 uci->cpu_sig.rev = rev;
237} 179}
238 180
239static void * get_next_ucode(u8 *buf, unsigned int size, 181static int get_ucode_data(void *to, const u8 *from, size_t n)
240 int (*get_ucode_data)(void *, const void *, size_t), 182{
241 unsigned int *mc_size) 183 memcpy(to, from, n);
184 return 0;
185}
186
187static void *get_next_ucode(const u8 *buf, unsigned int size,
188 unsigned int *mc_size)
242{ 189{
243 unsigned int total_size; 190 unsigned int total_size;
244#define UCODE_CONTAINER_SECTION_HDR 8
245 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; 191 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR];
246 void *mc; 192 void *mc;
247 193
@@ -249,39 +195,37 @@ static void * get_next_ucode(u8 *buf, unsigned int size,
249 return NULL; 195 return NULL;
250 196
251 if (section_hdr[0] != UCODE_UCODE_TYPE) { 197 if (section_hdr[0] != UCODE_UCODE_TYPE) {
252 printk(KERN_ERR "microcode: error! " 198 printk(KERN_ERR "microcode: error: invalid type field in "
253 "Wrong microcode payload type field\n"); 199 "container file section header\n");
254 return NULL; 200 return NULL;
255 } 201 }
256 202
257 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); 203 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8));
258 204
259 printk(KERN_INFO "microcode: size %u, total_size %u\n", 205 printk(KERN_DEBUG "microcode: size %u, total_size %u\n",
260 size, total_size); 206 size, total_size);
261 207
262 if (total_size > size || total_size > UCODE_MAX_SIZE) { 208 if (total_size > size || total_size > UCODE_MAX_SIZE) {
263 printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); 209 printk(KERN_ERR "microcode: error: size mismatch\n");
264 return NULL; 210 return NULL;
265 } 211 }
266 212
267 mc = vmalloc(UCODE_MAX_SIZE); 213 mc = vmalloc(UCODE_MAX_SIZE);
268 if (mc) { 214 if (mc) {
269 memset(mc, 0, UCODE_MAX_SIZE); 215 memset(mc, 0, UCODE_MAX_SIZE);
270 if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size)) { 216 if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR,
217 total_size)) {
271 vfree(mc); 218 vfree(mc);
272 mc = NULL; 219 mc = NULL;
273 } else 220 } else
274 *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; 221 *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
275 } 222 }
276#undef UCODE_CONTAINER_SECTION_HDR
277 return mc; 223 return mc;
278} 224}
279 225
280 226
281static int install_equiv_cpu_table(u8 *buf, 227static int install_equiv_cpu_table(const u8 *buf)
282 int (*get_ucode_data)(void *, const void *, size_t))
283{ 228{
284#define UCODE_CONTAINER_HEADER_SIZE 12
285 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE]; 229 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE];
286 unsigned int *buf_pos = (unsigned int *)container_hdr; 230 unsigned int *buf_pos = (unsigned int *)container_hdr;
287 unsigned long size; 231 unsigned long size;
@@ -292,14 +236,15 @@ static int install_equiv_cpu_table(u8 *buf,
292 size = buf_pos[2]; 236 size = buf_pos[2];
293 237
294 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { 238 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
295 printk(KERN_ERR "microcode: error! " 239 printk(KERN_ERR "microcode: error: invalid type field in "
296 "Wrong microcode equivalnet cpu table\n"); 240 "container file section header\n");
297 return 0; 241 return 0;
298 } 242 }
299 243
300 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); 244 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size);
301 if (!equiv_cpu_table) { 245 if (!equiv_cpu_table) {
302 printk(KERN_ERR "microcode: error, can't allocate memory for equiv CPU table\n"); 246 printk(KERN_ERR "microcode: failed to allocate "
247 "equivalent CPU table\n");
303 return 0; 248 return 0;
304 } 249 }
305 250
@@ -310,7 +255,6 @@ static int install_equiv_cpu_table(u8 *buf,
310 } 255 }
311 256
312 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ 257 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
313#undef UCODE_CONTAINER_HEADER_SIZE
314} 258}
315 259
316static void free_equiv_cpu_table(void) 260static void free_equiv_cpu_table(void)
@@ -321,18 +265,20 @@ static void free_equiv_cpu_table(void)
321 } 265 }
322} 266}
323 267
324static int generic_load_microcode(int cpu, void *data, size_t size, 268static int generic_load_microcode(int cpu, const u8 *data, size_t size)
325 int (*get_ucode_data)(void *, const void *, size_t))
326{ 269{
327 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 270 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
328 u8 *ucode_ptr = data, *new_mc = NULL, *mc; 271 const u8 *ucode_ptr = data;
272 void *new_mc = NULL;
273 void *mc;
329 int new_rev = uci->cpu_sig.rev; 274 int new_rev = uci->cpu_sig.rev;
330 unsigned int leftover; 275 unsigned int leftover;
331 unsigned long offset; 276 unsigned long offset;
332 277
333 offset = install_equiv_cpu_table(ucode_ptr, get_ucode_data); 278 offset = install_equiv_cpu_table(ucode_ptr);
334 if (!offset) { 279 if (!offset) {
335 printk(KERN_ERR "microcode: installing equivalent cpu table failed\n"); 280 printk(KERN_ERR "microcode: failed to create "
281 "equivalent cpu table\n");
336 return -EINVAL; 282 return -EINVAL;
337 } 283 }
338 284
@@ -343,7 +289,7 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
343 unsigned int uninitialized_var(mc_size); 289 unsigned int uninitialized_var(mc_size);
344 struct microcode_header_amd *mc_header; 290 struct microcode_header_amd *mc_header;
345 291
346 mc = get_next_ucode(ucode_ptr, leftover, get_ucode_data, &mc_size); 292 mc = get_next_ucode(ucode_ptr, leftover, &mc_size);
347 if (!mc) 293 if (!mc)
348 break; 294 break;
349 295
@@ -353,7 +299,7 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
353 vfree(new_mc); 299 vfree(new_mc);
354 new_rev = mc_header->patch_id; 300 new_rev = mc_header->patch_id;
355 new_mc = mc; 301 new_mc = mc;
356 } else 302 } else
357 vfree(mc); 303 vfree(mc);
358 304
359 ucode_ptr += mc_size; 305 ucode_ptr += mc_size;
@@ -365,9 +311,9 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
365 if (uci->mc) 311 if (uci->mc)
366 vfree(uci->mc); 312 vfree(uci->mc);
367 uci->mc = new_mc; 313 uci->mc = new_mc;
368 pr_debug("microcode: CPU%d found a matching microcode update with" 314 pr_debug("microcode: CPU%d found a matching microcode "
369 " version 0x%x (current=0x%x)\n", 315 "update with version 0x%x (current=0x%x)\n",
370 cpu, new_rev, uci->cpu_sig.rev); 316 cpu, new_rev, uci->cpu_sig.rev);
371 } else 317 } else
372 vfree(new_mc); 318 vfree(new_mc);
373 } 319 }
@@ -377,12 +323,6 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
377 return (int)leftover; 323 return (int)leftover;
378} 324}
379 325
380static int get_ucode_fw(void *to, const void *from, size_t n)
381{
382 memcpy(to, from, n);
383 return 0;
384}
385
386static int request_microcode_fw(int cpu, struct device *device) 326static int request_microcode_fw(int cpu, struct device *device)
387{ 327{
388 const char *fw_name = "amd-ucode/microcode_amd.bin"; 328 const char *fw_name = "amd-ucode/microcode_amd.bin";
@@ -394,12 +334,11 @@ static int request_microcode_fw(int cpu, struct device *device)
394 334
395 ret = request_firmware(&firmware, fw_name, device); 335 ret = request_firmware(&firmware, fw_name, device);
396 if (ret) { 336 if (ret) {
397 printk(KERN_ERR "microcode: ucode data file %s load failed\n", fw_name); 337 printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
398 return ret; 338 return ret;
399 } 339 }
400 340
401 ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size, 341 ret = generic_load_microcode(cpu, firmware->data, firmware->size);
402 &get_ucode_fw);
403 342
404 release_firmware(firmware); 343 release_firmware(firmware);
405 344
@@ -408,8 +347,8 @@ static int request_microcode_fw(int cpu, struct device *device)
408 347
409static int request_microcode_user(int cpu, const void __user *buf, size_t size) 348static int request_microcode_user(int cpu, const void __user *buf, size_t size)
410{ 349{
411 printk(KERN_WARNING "microcode: AMD microcode update via /dev/cpu/microcode" 350 printk(KERN_INFO "microcode: AMD microcode update via "
412 "is not supported\n"); 351 "/dev/cpu/microcode not supported\n");
413 return -1; 352 return -1;
414} 353}
415 354
@@ -433,3 +372,4 @@ struct microcode_ops * __init init_amd_microcode(void)
433{ 372{
434 return &microcode_amd_ops; 373 return &microcode_amd_ops;
435} 374}
375
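
Elsewhere in this file the open-coded rdmsr/wrmsr inline assembly is replaced by the standard helpers: wrmsrl() hands the address of the patch header to the loader MSR and rdmsr() reads back the active patch level. A compressed sketch of the resulting sequence, using the MSR names introduced by the patch; locking and validation are omitted:

#include <asm/msr.h>

static u32 amd_apply_patch(const void *patch_hdr)
{
	u32 rev, dummy;

	/* point the patch loader MSR at the start of the patch header */
	wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(unsigned long)patch_hdr);

	/* read the patch level MSR to see which revision is now active */
	rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
	return rev;
}
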
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 82fb2809ce32..c9b721ba968c 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -99,7 +99,7 @@ MODULE_LICENSE("GPL");
99 99
100#define MICROCODE_VERSION "2.00" 100#define MICROCODE_VERSION "2.00"
101 101
102struct microcode_ops *microcode_ops; 102static struct microcode_ops *microcode_ops;
103 103
104/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ 104/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
105static DEFINE_MUTEX(microcode_mutex); 105static DEFINE_MUTEX(microcode_mutex);
@@ -203,7 +203,7 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
203#endif 203#endif
204 204
205/* fake device for request_firmware */ 205/* fake device for request_firmware */
206struct platform_device *microcode_pdev; 206static struct platform_device *microcode_pdev;
207 207
208static ssize_t reload_store(struct sys_device *dev, 208static ssize_t reload_store(struct sys_device *dev,
209 struct sysdev_attribute *attr, 209 struct sysdev_attribute *attr,
@@ -272,13 +272,18 @@ static struct attribute_group mc_attr_group = {
272 .name = "microcode", 272 .name = "microcode",
273}; 273};
274 274
275static void microcode_fini_cpu(int cpu) 275static void __microcode_fini_cpu(int cpu)
276{ 276{
277 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 277 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
278 278
279 mutex_lock(&microcode_mutex);
280 microcode_ops->microcode_fini_cpu(cpu); 279 microcode_ops->microcode_fini_cpu(cpu);
281 uci->valid = 0; 280 uci->valid = 0;
281}
282
283static void microcode_fini_cpu(int cpu)
284{
285 mutex_lock(&microcode_mutex);
286 __microcode_fini_cpu(cpu);
282 mutex_unlock(&microcode_mutex); 287 mutex_unlock(&microcode_mutex);
283} 288}
284 289
@@ -306,12 +311,16 @@ static int microcode_resume_cpu(int cpu)
306 * to this cpu (a bit of paranoia): 311 * to this cpu (a bit of paranoia):
307 */ 312 */
308 if (microcode_ops->collect_cpu_info(cpu, &nsig)) { 313 if (microcode_ops->collect_cpu_info(cpu, &nsig)) {
309 microcode_fini_cpu(cpu); 314 __microcode_fini_cpu(cpu);
315 printk(KERN_ERR "failed to collect_cpu_info for resuming cpu #%d\n",
316 cpu);
310 return -1; 317 return -1;
311 } 318 }
312 319
313 if (memcmp(&nsig, &uci->cpu_sig, sizeof(nsig))) { 320 if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) {
314 microcode_fini_cpu(cpu); 321 __microcode_fini_cpu(cpu);
322 printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n",
323 cpu);
315 /* Should we look for a new ucode here? */ 324 /* Should we look for a new ucode here? */
316 return 1; 325 return 1;
317 } 326 }
@@ -319,7 +328,7 @@ static int microcode_resume_cpu(int cpu)
319 return 0; 328 return 0;
320} 329}
321 330
322void microcode_update_cpu(int cpu) 331static void microcode_update_cpu(int cpu)
323{ 332{
324 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 333 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
325 int err = 0; 334 int err = 0;
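
The __microcode_fini_cpu()/microcode_fini_cpu() split above follows the usual kernel convention: the double-underscore variant assumes the caller already holds the lock, and the plain-named wrapper takes and releases it, so a caller that needs several steps inside one critical section can use the former. A generic sketch of the convention (names illustrative):

#include <linux/mutex.h>

static DEFINE_MUTEX(state_mutex);
static int state_valid;

/* Caller must hold state_mutex. */
static void __state_reset(void)
{
	state_valid = 0;
}

/* Public entry point: takes the lock around the unlocked helper. */
static void state_reset(void)
{
	mutex_lock(&state_mutex);
	__state_reset();
	mutex_unlock(&state_mutex);
}
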
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 622dc4a21784..b7f4c929e615 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -155,6 +155,7 @@ static DEFINE_SPINLOCK(microcode_update_lock);
155static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) 155static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
156{ 156{
157 struct cpuinfo_x86 *c = &cpu_data(cpu_num); 157 struct cpuinfo_x86 *c = &cpu_data(cpu_num);
158 unsigned long flags;
158 unsigned int val[2]; 159 unsigned int val[2];
159 160
160 memset(csig, 0, sizeof(*csig)); 161 memset(csig, 0, sizeof(*csig));
@@ -174,11 +175,16 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
174 csig->pf = 1 << ((val[1] >> 18) & 7); 175 csig->pf = 1 << ((val[1] >> 18) & 7);
175 } 176 }
176 177
178 /* serialize access to the physical write to MSR 0x79 */
179 spin_lock_irqsave(&microcode_update_lock, flags);
180
177 wrmsr(MSR_IA32_UCODE_REV, 0, 0); 181 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
178 /* see notes above for revision 1.07. Apparent chip bug */ 182 /* see notes above for revision 1.07. Apparent chip bug */
179 sync_core(); 183 sync_core();
180 /* get the current revision from MSR 0x8B */ 184 /* get the current revision from MSR 0x8B */
181 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); 185 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
186 spin_unlock_irqrestore(&microcode_update_lock, flags);
187
182 pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", 188 pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
183 csig->sig, csig->pf, csig->rev); 189 csig->sig, csig->pf, csig->rev);
184 190
@@ -465,7 +471,7 @@ static void microcode_fini_cpu(int cpu)
465 uci->mc = NULL; 471 uci->mc = NULL;
466} 472}
467 473
468struct microcode_ops microcode_intel_ops = { 474static struct microcode_ops microcode_intel_ops = {
469 .request_microcode_user = request_microcode_user, 475 .request_microcode_user = request_microcode_user,
470 .request_microcode_fw = request_microcode_fw, 476 .request_microcode_fw = request_microcode_fw,
471 .collect_cpu_info = collect_cpu_info, 477 .collect_cpu_info = collect_cpu_info,
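
The collect_cpu_info() change above wraps the three-step revision read (zero MSR_IA32_UCODE_REV, serialize with sync_core(), read the revision back) in microcode_update_lock with interrupts disabled, so it cannot interleave with a concurrent update. Consolidated into one helper, the guarded sequence looks roughly like this; the lock name here is a stand-in:

#include <linux/spinlock.h>
#include <asm/msr.h>
#include <asm/processor.h>

static DEFINE_SPINLOCK(ucode_lock);	/* stands in for microcode_update_lock */

static unsigned int read_intel_ucode_rev(void)
{
	unsigned long flags;
	unsigned int lo, rev;

	spin_lock_irqsave(&ucode_lock, flags);
	wrmsr(MSR_IA32_UCODE_REV, 0, 0);	/* clear the signature MSR */
	sync_core();				/* serialize before reading (rev 1.07 note) */
	rdmsr(MSR_IA32_UCODE_REV, lo, rev);	/* revision is returned in the high half */
	spin_unlock_irqrestore(&ucode_lock, flags);

	return rev;
}
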
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 13316cf57cdb..8bd1bf9622a7 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -208,12 +208,17 @@ static int __init setup_nmi_watchdog(char *str)
208 ++str; 208 ++str;
209 } 209 }
210 210
211 get_option(&str, &nmi); 211 if (!strncmp(str, "lapic", 5))
212 212 nmi_watchdog = NMI_LOCAL_APIC;
213 if (nmi >= NMI_INVALID) 213 else if (!strncmp(str, "ioapic", 6))
214 return 0; 214 nmi_watchdog = NMI_IO_APIC;
215 else {
216 get_option(&str, &nmi);
217 if (nmi >= NMI_INVALID)
218 return 0;
219 nmi_watchdog = nmi;
220 }
215 221
216 nmi_watchdog = nmi;
217 return 1; 222 return 1;
218} 223}
219__setup("nmi_watchdog=", setup_nmi_watchdog); 224__setup("nmi_watchdog=", setup_nmi_watchdog);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 192624820217..19a1044a0cd9 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -6,6 +6,7 @@
6#include <asm/proto.h> 6#include <asm/proto.h>
7#include <asm/dma.h> 7#include <asm/dma.h>
8#include <asm/iommu.h> 8#include <asm/iommu.h>
9#include <asm/gart.h>
9#include <asm/calgary.h> 10#include <asm/calgary.h>
10#include <asm/amd_iommu.h> 11#include <asm/amd_iommu.h>
11 12
@@ -30,11 +31,6 @@ int no_iommu __read_mostly;
30/* Set this to 1 if there is a HW IOMMU in the system */ 31/* Set this to 1 if there is a HW IOMMU in the system */
31int iommu_detected __read_mostly = 0; 32int iommu_detected __read_mostly = 0;
32 33
33/* This tells the BIO block layer to assume merging. Default to off
34 because we cannot guarantee merging later. */
35int iommu_bio_merge __read_mostly = 0;
36EXPORT_SYMBOL(iommu_bio_merge);
37
38dma_addr_t bad_dma_address __read_mostly = 0; 34dma_addr_t bad_dma_address __read_mostly = 0;
39EXPORT_SYMBOL(bad_dma_address); 35EXPORT_SYMBOL(bad_dma_address);
40 36
@@ -105,11 +101,15 @@ static void __init dma32_free_bootmem(void)
105 dma32_bootmem_ptr = NULL; 101 dma32_bootmem_ptr = NULL;
106 dma32_bootmem_size = 0; 102 dma32_bootmem_size = 0;
107} 103}
104#endif
108 105
109void __init pci_iommu_alloc(void) 106void __init pci_iommu_alloc(void)
110{ 107{
108#ifdef CONFIG_X86_64
111 /* free the range so iommu could get some range less than 4G */ 109 /* free the range so iommu could get some range less than 4G */
112 dma32_free_bootmem(); 110 dma32_free_bootmem();
111#endif
112
113 /* 113 /*
114 * The order of these functions is important for 114 * The order of these functions is important for
115 * fall-back/fail-over reasons 115 * fall-back/fail-over reasons
@@ -125,15 +125,6 @@ void __init pci_iommu_alloc(void)
125 pci_swiotlb_init(); 125 pci_swiotlb_init();
126} 126}
127 127
128unsigned long iommu_nr_pages(unsigned long addr, unsigned long len)
129{
130 unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
131
132 return size >> PAGE_SHIFT;
133}
134EXPORT_SYMBOL(iommu_nr_pages);
135#endif
136
137void *dma_generic_alloc_coherent(struct device *dev, size_t size, 128void *dma_generic_alloc_coherent(struct device *dev, size_t size,
138 dma_addr_t *dma_addr, gfp_t flag) 129 dma_addr_t *dma_addr, gfp_t flag)
139{ 130{
@@ -188,7 +179,6 @@ static __init int iommu_setup(char *p)
188 } 179 }
189 180
190 if (!strncmp(p, "biomerge", 8)) { 181 if (!strncmp(p, "biomerge", 8)) {
191 iommu_bio_merge = 4096;
192 iommu_merge = 1; 182 iommu_merge = 1;
193 force_iommu = 1; 183 force_iommu = 1;
194 } 184 }
@@ -300,8 +290,8 @@ fs_initcall(pci_iommu_init);
300static __devinit void via_no_dac(struct pci_dev *dev) 290static __devinit void via_no_dac(struct pci_dev *dev)
301{ 291{
302 if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { 292 if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
303 printk(KERN_INFO "PCI: VIA PCI bridge detected." 293 printk(KERN_INFO
304 "Disabling DAC.\n"); 294 "PCI: VIA PCI bridge detected. Disabling DAC.\n");
305 forbid_dac = 1; 295 forbid_dac = 1;
306 } 296 }
307} 297}
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index ba7ad83e20a8..a35eaa379ff6 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -745,10 +745,8 @@ void __init gart_iommu_init(void)
745 unsigned long scratch; 745 unsigned long scratch;
746 long i; 746 long i;
747 747
748 if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) { 748 if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0)
749 printk(KERN_INFO "PCI-GART: No AMD GART found.\n");
750 return; 749 return;
751 }
752 750
753#ifndef CONFIG_AGP_AMD64 751#ifndef CONFIG_AGP_AMD64
754 no_agp = 1; 752 no_agp = 1;
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 3c539d111abb..242c3440687f 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -3,6 +3,8 @@
3#include <linux/pci.h> 3#include <linux/pci.h>
4#include <linux/cache.h> 4#include <linux/cache.h>
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/swiotlb.h>
7#include <linux/bootmem.h>
6#include <linux/dma-mapping.h> 8#include <linux/dma-mapping.h>
7 9
8#include <asm/iommu.h> 10#include <asm/iommu.h>
@@ -11,6 +13,31 @@
11 13
12int swiotlb __read_mostly; 14int swiotlb __read_mostly;
13 15
16void *swiotlb_alloc_boot(size_t size, unsigned long nslabs)
17{
18 return alloc_bootmem_low_pages(size);
19}
20
21void *swiotlb_alloc(unsigned order, unsigned long nslabs)
22{
23 return (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, order);
24}
25
26dma_addr_t swiotlb_phys_to_bus(phys_addr_t paddr)
27{
28 return paddr;
29}
30
31phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr)
32{
33 return baddr;
34}
35
36int __weak swiotlb_arch_range_needs_mapping(void *ptr, size_t size)
37{
38 return 0;
39}
40
14static dma_addr_t 41static dma_addr_t
15swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size, 42swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size,
16 int direction) 43 int direction)
@@ -50,8 +77,10 @@ struct dma_mapping_ops swiotlb_dma_ops = {
50void __init pci_swiotlb_init(void) 77void __init pci_swiotlb_init(void)
51{ 78{
52 /* don't initialize swiotlb if iommu=off (no_iommu=1) */ 79 /* don't initialize swiotlb if iommu=off (no_iommu=1) */
80#ifdef CONFIG_X86_64
53 if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) 81 if (!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)
54 swiotlb = 1; 82 swiotlb = 1;
83#endif
55 if (swiotlb_force) 84 if (swiotlb_force)
56 swiotlb = 1; 85 swiotlb = 1;
57 if (swiotlb) { 86 if (swiotlb) {
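
The new swiotlb_alloc_boot(), swiotlb_alloc(), swiotlb_phys_to_bus() and swiotlb_bus_to_phys() functions above supply x86 definitions for hooks that the generic swiotlb code declares __weak, so the strong definitions here replace the library defaults at link time. A minimal illustration of that mechanism with a hypothetical hook; the two definitions live in different object files:

#include <linux/compiler.h>

/* In generic library code: a weak default, used if nobody overrides it. */
unsigned long __weak arch_bus_address(unsigned long paddr)
{
	return paddr;			/* identity mapping by default */
}

/* In architecture code (a separate file): a strong definition with the
 * same signature silently wins over the weak one when the kernel links. */
unsigned long arch_bus_address(unsigned long paddr)
{
	return paddr + 0x80000000UL;	/* hypothetical fixed bus offset */
}
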
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 95d811a9594f..e68bb9e30864 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -1,6 +1,7 @@
1#include <linux/errno.h> 1#include <linux/errno.h>
2#include <linux/kernel.h> 2#include <linux/kernel.h>
3#include <linux/mm.h> 3#include <linux/mm.h>
4#include <asm/idle.h>
4#include <linux/smp.h> 5#include <linux/smp.h>
5#include <linux/slab.h> 6#include <linux/slab.h>
6#include <linux/sched.h> 7#include <linux/sched.h>
@@ -302,7 +303,7 @@ static void c1e_idle(void)
302 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); 303 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
303 if (lo & K8_INTP_C1E_ACTIVE_MASK) { 304 if (lo & K8_INTP_C1E_ACTIVE_MASK) {
304 c1e_detected = 1; 305 c1e_detected = 1;
305 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 306 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
306 mark_tsc_unstable("TSC halt in AMD C1E"); 307 mark_tsc_unstable("TSC halt in AMD C1E");
307 printk(KERN_INFO "System has AMD C1E enabled\n"); 308 printk(KERN_INFO "System has AMD C1E enabled\n");
308 set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E); 309 set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 24c2276aa453..3ba155d24884 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -60,6 +60,7 @@
60#include <asm/idle.h> 60#include <asm/idle.h>
61#include <asm/syscalls.h> 61#include <asm/syscalls.h>
62#include <asm/smp.h> 62#include <asm/smp.h>
63#include <asm/ds.h>
63 64
64asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 65asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
65 66
@@ -251,14 +252,8 @@ void exit_thread(void)
251 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; 252 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
252 put_cpu(); 253 put_cpu();
253 } 254 }
254#ifdef CONFIG_X86_DS 255
255 /* Free any DS contexts that have not been properly released. */ 256 ds_exit_thread(current);
256 if (unlikely(current->thread.ds_ctx)) {
257 /* we clear debugctl to make sure DS is not used. */
258 update_debugctlmsr(0);
259 ds_free(current->thread.ds_ctx);
260 }
261#endif /* CONFIG_X86_DS */
262} 257}
263 258
264void flush_thread(void) 259void flush_thread(void)
@@ -340,6 +335,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
340 kfree(p->thread.io_bitmap_ptr); 335 kfree(p->thread.io_bitmap_ptr);
341 p->thread.io_bitmap_max = 0; 336 p->thread.io_bitmap_max = 0;
342 } 337 }
338
339 ds_copy_thread(p, current);
340
341 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
342 p->thread.debugctlmsr = 0;
343
343 return err; 344 return err;
344} 345}
345 346
@@ -420,48 +421,19 @@ int set_tsc_mode(unsigned int val)
420 return 0; 421 return 0;
421} 422}
422 423
423#ifdef CONFIG_X86_DS
424static int update_debugctl(struct thread_struct *prev,
425 struct thread_struct *next, unsigned long debugctl)
426{
427 unsigned long ds_prev = 0;
428 unsigned long ds_next = 0;
429
430 if (prev->ds_ctx)
431 ds_prev = (unsigned long)prev->ds_ctx->ds;
432 if (next->ds_ctx)
433 ds_next = (unsigned long)next->ds_ctx->ds;
434
435 if (ds_next != ds_prev) {
436 /* we clear debugctl to make sure DS
437 * is not in use when we change it */
438 debugctl = 0;
439 update_debugctlmsr(0);
440 wrmsr(MSR_IA32_DS_AREA, ds_next, 0);
441 }
442 return debugctl;
443}
444#else
445static int update_debugctl(struct thread_struct *prev,
446 struct thread_struct *next, unsigned long debugctl)
447{
448 return debugctl;
449}
450#endif /* CONFIG_X86_DS */
451
452static noinline void 424static noinline void
453__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, 425__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
454 struct tss_struct *tss) 426 struct tss_struct *tss)
455{ 427{
456 struct thread_struct *prev, *next; 428 struct thread_struct *prev, *next;
457 unsigned long debugctl;
458 429
459 prev = &prev_p->thread; 430 prev = &prev_p->thread;
460 next = &next_p->thread; 431 next = &next_p->thread;
461 432
462 debugctl = update_debugctl(prev, next, prev->debugctlmsr); 433 if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
463 434 test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
464 if (next->debugctlmsr != debugctl) 435 ds_switch_to(prev_p, next_p);
436 else if (next->debugctlmsr != prev->debugctlmsr)
465 update_debugctlmsr(next->debugctlmsr); 437 update_debugctlmsr(next->debugctlmsr);
466 438
467 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { 439 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
@@ -483,15 +455,6 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
483 hard_enable_TSC(); 455 hard_enable_TSC();
484 } 456 }
485 457
486#ifdef CONFIG_X86_PTRACE_BTS
487 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
488 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
489
490 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
491 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
492#endif /* CONFIG_X86_PTRACE_BTS */
493
494
495 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { 458 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
496 /* 459 /*
497 * Disable the bitmap via an invalid offset. We still cache 460 * Disable the bitmap via an invalid offset. We still cache
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index fbb321d53d34..416fb9282f4f 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -53,6 +53,7 @@
53#include <asm/ia32.h> 53#include <asm/ia32.h>
54#include <asm/idle.h> 54#include <asm/idle.h>
55#include <asm/syscalls.h> 55#include <asm/syscalls.h>
56#include <asm/ds.h>
56 57
57asmlinkage extern void ret_from_fork(void); 58asmlinkage extern void ret_from_fork(void);
58 59
@@ -236,14 +237,8 @@ void exit_thread(void)
236 t->io_bitmap_max = 0; 237 t->io_bitmap_max = 0;
237 put_cpu(); 238 put_cpu();
238 } 239 }
239#ifdef CONFIG_X86_DS 240
240 /* Free any DS contexts that have not been properly released. */ 241 ds_exit_thread(current);
241 if (unlikely(t->ds_ctx)) {
242 /* we clear debugctl to make sure DS is not used. */
243 update_debugctlmsr(0);
244 ds_free(t->ds_ctx);
245 }
246#endif /* CONFIG_X86_DS */
247} 242}
248 243
249void flush_thread(void) 244void flush_thread(void)
@@ -373,6 +368,12 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
373 if (err) 368 if (err)
374 goto out; 369 goto out;
375 } 370 }
371
372 ds_copy_thread(p, me);
373
374 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
375 p->thread.debugctlmsr = 0;
376
376 err = 0; 377 err = 0;
377out: 378out:
378 if (err && p->thread.io_bitmap_ptr) { 379 if (err && p->thread.io_bitmap_ptr) {
@@ -471,35 +472,14 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
471 struct tss_struct *tss) 472 struct tss_struct *tss)
472{ 473{
473 struct thread_struct *prev, *next; 474 struct thread_struct *prev, *next;
474 unsigned long debugctl;
475 475
476 prev = &prev_p->thread, 476 prev = &prev_p->thread,
477 next = &next_p->thread; 477 next = &next_p->thread;
478 478
479 debugctl = prev->debugctlmsr; 479 if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
480 480 test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
481#ifdef CONFIG_X86_DS 481 ds_switch_to(prev_p, next_p);
482 { 482 else if (next->debugctlmsr != prev->debugctlmsr)
483 unsigned long ds_prev = 0, ds_next = 0;
484
485 if (prev->ds_ctx)
486 ds_prev = (unsigned long)prev->ds_ctx->ds;
487 if (next->ds_ctx)
488 ds_next = (unsigned long)next->ds_ctx->ds;
489
490 if (ds_next != ds_prev) {
491 /*
492 * We clear debugctl to make sure DS
493 * is not in use when we change it:
494 */
495 debugctl = 0;
496 update_debugctlmsr(0);
497 wrmsrl(MSR_IA32_DS_AREA, ds_next);
498 }
499 }
500#endif /* CONFIG_X86_DS */
501
502 if (next->debugctlmsr != debugctl)
503 update_debugctlmsr(next->debugctlmsr); 483 update_debugctlmsr(next->debugctlmsr);
504 484
505 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { 485 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
@@ -534,14 +514,6 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
534 */ 514 */
535 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); 515 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
536 } 516 }
537
538#ifdef CONFIG_X86_PTRACE_BTS
539 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
540 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
541
542 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
543 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
544#endif /* CONFIG_X86_PTRACE_BTS */
545} 517}
546 518
547/* 519/*
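
Both the 32-bit and 64-bit __switch_to_xtra() paths above now reduce to the same DEBUGCTL dispatch. A minimal consolidated sketch of that pattern, using only the helpers the patch itself calls (ds_switch_to(), update_debugctlmsr()); the wrapper name is illustrative, not part of the patch:

	/* Sketch: if either task owns a DS area (BTS/PEBS), let the DS layer
	 * manage MSR_IA32_DS_AREA and DEBUGCTL across the context switch;
	 * otherwise only write DEBUGCTL when its value actually changes. */
	static void switch_debugctl_sketch(struct task_struct *prev_p,
					   struct task_struct *next_p)
	{
		struct thread_struct *prev = &prev_p->thread;
		struct thread_struct *next = &next_p->thread;

		if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
		    test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
			ds_switch_to(prev_p, next_p);
		else if (next->debugctlmsr != prev->debugctlmsr)
			update_debugctlmsr(next->debugctlmsr);
	}
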
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 2c8ec1ba75e6..0a5df5f82fb9 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -581,153 +581,91 @@ static int ioperm_get(struct task_struct *target,
581} 581}
582 582
583#ifdef CONFIG_X86_PTRACE_BTS 583#ifdef CONFIG_X86_PTRACE_BTS
584/*
585 * The configuration for a particular BTS hardware implementation.
586 */
587struct bts_configuration {
588 /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */
589 unsigned char sizeof_bts;
590 /* the size of a field in the BTS record in bytes */
591 unsigned char sizeof_field;
592 /* a bitmask to enable/disable BTS in DEBUGCTL MSR */
593 unsigned long debugctl_mask;
594};
595static struct bts_configuration bts_cfg;
596
597#define BTS_MAX_RECORD_SIZE (8 * 3)
598
599
600/*
601 * Branch Trace Store (BTS) uses the following format. Different
602 * architectures vary in the size of those fields.
603 * - source linear address
604 * - destination linear address
605 * - flags
606 *
607 * Later architectures use 64bit pointers throughout, whereas earlier
608 * architectures use 32bit pointers in 32bit mode.
609 *
610 * We compute the base address for the first 8 fields based on:
611 * - the field size stored in the DS configuration
612 * - the relative field position
613 *
614 * In order to store additional information in the BTS buffer, we use
615 * a special source address to indicate that the record requires
616 * special interpretation.
617 *
618 * Netburst indicated via a bit in the flags field whether the branch
619 * was predicted; this is ignored.
620 */
621
622enum bts_field {
623 bts_from = 0,
624 bts_to,
625 bts_flags,
626
627 bts_escape = (unsigned long)-1,
628 bts_qual = bts_to,
629 bts_jiffies = bts_flags
630};
631
632static inline unsigned long bts_get(const char *base, enum bts_field field)
633{
634 base += (bts_cfg.sizeof_field * field);
635 return *(unsigned long *)base;
636}
637
638static inline void bts_set(char *base, enum bts_field field, unsigned long val)
639{
640 base += (bts_cfg.sizeof_field * field);;
641 (*(unsigned long *)base) = val;
642}
643
644/*
645 * Translate a BTS record from the raw format into the bts_struct format
646 *
647 * out (out): bts_struct interpretation
648 * raw: raw BTS record
649 */
650static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw)
651{
652 memset(out, 0, sizeof(*out));
653 if (bts_get(raw, bts_from) == bts_escape) {
654 out->qualifier = bts_get(raw, bts_qual);
655 out->variant.jiffies = bts_get(raw, bts_jiffies);
656 } else {
657 out->qualifier = BTS_BRANCH;
658 out->variant.lbr.from_ip = bts_get(raw, bts_from);
659 out->variant.lbr.to_ip = bts_get(raw, bts_to);
660 }
661}
662
663static int ptrace_bts_read_record(struct task_struct *child, size_t index, 584static int ptrace_bts_read_record(struct task_struct *child, size_t index,
664 struct bts_struct __user *out) 585 struct bts_struct __user *out)
665{ 586{
666 struct bts_struct ret; 587 const struct bts_trace *trace;
667 const void *bts_record; 588 struct bts_struct bts;
668 size_t bts_index, bts_end; 589 const unsigned char *at;
669 int error; 590 int error;
670 591
671 error = ds_get_bts_end(child->bts, &bts_end); 592 trace = ds_read_bts(child->bts);
672 if (error < 0) 593 if (!trace)
673 return error; 594 return -EPERM;
674 595
675 if (bts_end <= index) 596 at = trace->ds.top - ((index + 1) * trace->ds.size);
676 return -EINVAL; 597 if ((void *)at < trace->ds.begin)
598 at += (trace->ds.n * trace->ds.size);
677 599
678 error = ds_get_bts_index(child->bts, &bts_index); 600 if (!trace->read)
679 if (error < 0) 601 return -EOPNOTSUPP;
680 return error;
681
682 /* translate the ptrace bts index into the ds bts index */
683 bts_index += bts_end - (index + 1);
684 if (bts_end <= bts_index)
685 bts_index -= bts_end;
686 602
687 error = ds_access_bts(child->bts, bts_index, &bts_record); 603 error = trace->read(child->bts, at, &bts);
688 if (error < 0) 604 if (error < 0)
689 return error; 605 return error;
690 606
691 ptrace_bts_translate_record(&ret, bts_record); 607 if (copy_to_user(out, &bts, sizeof(bts)))
692
693 if (copy_to_user(out, &ret, sizeof(ret)))
694 return -EFAULT; 608 return -EFAULT;
695 609
696 return sizeof(ret); 610 return sizeof(bts);
697} 611}
698 612
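
The index arithmetic in the new ptrace_bts_read_record() above treats the DS buffer as a ring and counts backwards from the write pointer, so ptrace index 0 names the most recent record. A small sketch of just that mapping, pulled out into a helper (the helper name is illustrative; ds.top, ds.begin, ds.n and ds.size are the fields the patch uses):

	/* Map a ptrace-visible record index onto an address in the DS ring
	 * buffer: walk backwards from the write pointer (top) and wrap
	 * around to the end of the buffer on underflow. */
	static const unsigned char *
	bts_index_to_addr(const struct bts_trace *trace, size_t index)
	{
		const unsigned char *at;

		at = trace->ds.top - ((index + 1) * trace->ds.size);
		if ((void *)at < trace->ds.begin)
			at += trace->ds.n * trace->ds.size;

		return at;
	}
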
699static int ptrace_bts_drain(struct task_struct *child, 613static int ptrace_bts_drain(struct task_struct *child,
700 long size, 614 long size,
701 struct bts_struct __user *out) 615 struct bts_struct __user *out)
702{ 616{
703 struct bts_struct ret; 617 const struct bts_trace *trace;
704 const unsigned char *raw; 618 const unsigned char *at;
705 size_t end, i; 619 int error, drained = 0;
706 int error;
707 620
708 error = ds_get_bts_index(child->bts, &end); 621 trace = ds_read_bts(child->bts);
709 if (error < 0) 622 if (!trace)
710 return error; 623 return -EPERM;
711 624
712 if (size < (end * sizeof(struct bts_struct))) 625 if (!trace->read)
626 return -EOPNOTSUPP;
627
628 if (size < (trace->ds.top - trace->ds.begin))
713 return -EIO; 629 return -EIO;
714 630
715 error = ds_access_bts(child->bts, 0, (const void **)&raw); 631 for (at = trace->ds.begin; (void *)at < trace->ds.top;
716 if (error < 0) 632 out++, drained++, at += trace->ds.size) {
717 return error; 633 struct bts_struct bts;
634 int error;
718 635
719 for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) { 636 error = trace->read(child->bts, at, &bts);
720 ptrace_bts_translate_record(&ret, raw); 637 if (error < 0)
638 return error;
721 639
722 if (copy_to_user(out, &ret, sizeof(ret))) 640 if (copy_to_user(out, &bts, sizeof(bts)))
723 return -EFAULT; 641 return -EFAULT;
724 } 642 }
725 643
726 error = ds_clear_bts(child->bts); 644 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
645
646 error = ds_reset_bts(child->bts);
727 if (error < 0) 647 if (error < 0)
728 return error; 648 return error;
729 649
730 return end; 650 return drained;
651}
652
653static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size)
654{
655 child->bts_buffer = alloc_locked_buffer(size);
656 if (!child->bts_buffer)
657 return -ENOMEM;
658
659 child->bts_size = size;
660
661 return 0;
662}
663
664static void ptrace_bts_free_buffer(struct task_struct *child)
665{
666 free_locked_buffer(child->bts_buffer, child->bts_size);
667 child->bts_buffer = NULL;
668 child->bts_size = 0;
731} 669}
732 670
733static int ptrace_bts_config(struct task_struct *child, 671static int ptrace_bts_config(struct task_struct *child,
@@ -735,136 +673,86 @@ static int ptrace_bts_config(struct task_struct *child,
735 const struct ptrace_bts_config __user *ucfg) 673 const struct ptrace_bts_config __user *ucfg)
736{ 674{
737 struct ptrace_bts_config cfg; 675 struct ptrace_bts_config cfg;
738 int error = 0; 676 unsigned int flags = 0;
739
740 error = -EOPNOTSUPP;
741 if (!bts_cfg.sizeof_bts)
742 goto errout;
743 677
744 error = -EIO;
745 if (cfg_size < sizeof(cfg)) 678 if (cfg_size < sizeof(cfg))
746 goto errout; 679 return -EIO;
747 680
748 error = -EFAULT;
749 if (copy_from_user(&cfg, ucfg, sizeof(cfg))) 681 if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
750 goto errout; 682 return -EFAULT;
751
752 error = -EINVAL;
753 if ((cfg.flags & PTRACE_BTS_O_SIGNAL) &&
754 !(cfg.flags & PTRACE_BTS_O_ALLOC))
755 goto errout;
756
757 if (cfg.flags & PTRACE_BTS_O_ALLOC) {
758 bts_ovfl_callback_t ovfl = NULL;
759 unsigned int sig = 0;
760
761 error = -EINVAL;
762 if (cfg.size < (10 * bts_cfg.sizeof_bts))
763 goto errout;
764 683
765 if (cfg.flags & PTRACE_BTS_O_SIGNAL) { 684 if (child->bts) {
766 if (!cfg.signal) 685 ds_release_bts(child->bts);
767 goto errout; 686 child->bts = NULL;
687 }
768 688
769 error = -EOPNOTSUPP; 689 if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
770 goto errout; 690 if (!cfg.signal)
691 return -EINVAL;
771 692
772 sig = cfg.signal; 693 return -EOPNOTSUPP;
773 }
774 694
775 if (child->bts) { 695 child->thread.bts_ovfl_signal = cfg.signal;
776 (void)ds_release_bts(child->bts); 696 }
777 kfree(child->bts_buffer);
778 697
779 child->bts = NULL; 698 if ((cfg.flags & PTRACE_BTS_O_ALLOC) &&
780 child->bts_buffer = NULL; 699 (cfg.size != child->bts_size)) {
781 } 700 int error;
782 701
783 error = -ENOMEM; 702 ptrace_bts_free_buffer(child);
784 child->bts_buffer = kzalloc(cfg.size, GFP_KERNEL);
785 if (!child->bts_buffer)
786 goto errout;
787
788 child->bts = ds_request_bts(child, child->bts_buffer, cfg.size,
789 ovfl, /* th = */ (size_t)-1);
790 if (IS_ERR(child->bts)) {
791 error = PTR_ERR(child->bts);
792 kfree(child->bts_buffer);
793 child->bts = NULL;
794 child->bts_buffer = NULL;
795 goto errout;
796 }
797 703
798 child->thread.bts_ovfl_signal = sig; 704 error = ptrace_bts_allocate_buffer(child, cfg.size);
705 if (error < 0)
706 return error;
799 } 707 }
800 708
801 error = -EINVAL;
802 if (!child->thread.ds_ctx && cfg.flags)
803 goto errout;
804
805 if (cfg.flags & PTRACE_BTS_O_TRACE) 709 if (cfg.flags & PTRACE_BTS_O_TRACE)
806 child->thread.debugctlmsr |= bts_cfg.debugctl_mask; 710 flags |= BTS_USER;
807 else
808 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
809 711
810 if (cfg.flags & PTRACE_BTS_O_SCHED) 712 if (cfg.flags & PTRACE_BTS_O_SCHED)
811 set_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 713 flags |= BTS_TIMESTAMPS;
812 else
813 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
814 714
815 error = sizeof(cfg); 715 child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size,
716 /* ovfl = */ NULL, /* th = */ (size_t)-1,
717 flags);
718 if (IS_ERR(child->bts)) {
719 int error = PTR_ERR(child->bts);
816 720
817out: 721 ptrace_bts_free_buffer(child);
818 if (child->thread.debugctlmsr) 722 child->bts = NULL;
819 set_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
820 else
821 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
822 723
823 return error; 724 return error;
725 }
824 726
825errout: 727 return sizeof(cfg);
826 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
827 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
828 goto out;
829} 728}
830 729
831static int ptrace_bts_status(struct task_struct *child, 730static int ptrace_bts_status(struct task_struct *child,
832 long cfg_size, 731 long cfg_size,
833 struct ptrace_bts_config __user *ucfg) 732 struct ptrace_bts_config __user *ucfg)
834{ 733{
734 const struct bts_trace *trace;
835 struct ptrace_bts_config cfg; 735 struct ptrace_bts_config cfg;
836 size_t end;
837 const void *base, *max;
838 int error;
839 736
840 if (cfg_size < sizeof(cfg)) 737 if (cfg_size < sizeof(cfg))
841 return -EIO; 738 return -EIO;
842 739
843 error = ds_get_bts_end(child->bts, &end); 740 trace = ds_read_bts(child->bts);
844 if (error < 0) 741 if (!trace)
845 return error; 742 return -EPERM;
846
847 error = ds_access_bts(child->bts, /* index = */ 0, &base);
848 if (error < 0)
849 return error;
850
851 error = ds_access_bts(child->bts, /* index = */ end, &max);
852 if (error < 0)
853 return error;
854 743
855 memset(&cfg, 0, sizeof(cfg)); 744 memset(&cfg, 0, sizeof(cfg));
856 cfg.size = (max - base); 745 cfg.size = trace->ds.end - trace->ds.begin;
857 cfg.signal = child->thread.bts_ovfl_signal; 746 cfg.signal = child->thread.bts_ovfl_signal;
858 cfg.bts_size = sizeof(struct bts_struct); 747 cfg.bts_size = sizeof(struct bts_struct);
859 748
860 if (cfg.signal) 749 if (cfg.signal)
861 cfg.flags |= PTRACE_BTS_O_SIGNAL; 750 cfg.flags |= PTRACE_BTS_O_SIGNAL;
862 751
863 if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && 752 if (trace->ds.flags & BTS_USER)
864 child->thread.debugctlmsr & bts_cfg.debugctl_mask)
865 cfg.flags |= PTRACE_BTS_O_TRACE; 753 cfg.flags |= PTRACE_BTS_O_TRACE;
866 754
867 if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) 755 if (trace->ds.flags & BTS_TIMESTAMPS)
868 cfg.flags |= PTRACE_BTS_O_SCHED; 756 cfg.flags |= PTRACE_BTS_O_SCHED;
869 757
870 if (copy_to_user(ucfg, &cfg, sizeof(cfg))) 758 if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
@@ -873,106 +761,77 @@ static int ptrace_bts_status(struct task_struct *child,
873 return sizeof(cfg); 761 return sizeof(cfg);
874} 762}
875 763
876static int ptrace_bts_write_record(struct task_struct *child, 764static int ptrace_bts_clear(struct task_struct *child)
877 const struct bts_struct *in)
878{ 765{
879 unsigned char bts_record[BTS_MAX_RECORD_SIZE]; 766 const struct bts_trace *trace;
880 767
881 BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts); 768 trace = ds_read_bts(child->bts);
769 if (!trace)
770 return -EPERM;
882 771
883 memset(bts_record, 0, bts_cfg.sizeof_bts); 772 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
884 switch (in->qualifier) {
885 case BTS_INVALID:
886 break;
887 773
888 case BTS_BRANCH: 774 return ds_reset_bts(child->bts);
889 bts_set(bts_record, bts_from, in->variant.lbr.from_ip); 775}
890 bts_set(bts_record, bts_to, in->variant.lbr.to_ip);
891 break;
892 776
893 case BTS_TASK_ARRIVES: 777static int ptrace_bts_size(struct task_struct *child)
894 case BTS_TASK_DEPARTS: 778{
895 bts_set(bts_record, bts_from, bts_escape); 779 const struct bts_trace *trace;
896 bts_set(bts_record, bts_qual, in->qualifier);
897 bts_set(bts_record, bts_jiffies, in->variant.jiffies);
898 break;
899 780
900 default: 781 trace = ds_read_bts(child->bts);
901 return -EINVAL; 782 if (!trace)
902 } 783 return -EPERM;
903 784
904 return ds_write_bts(child->bts, bts_record, bts_cfg.sizeof_bts); 785 return (trace->ds.top - trace->ds.begin) / trace->ds.size;
905} 786}
906 787
907void ptrace_bts_take_timestamp(struct task_struct *tsk, 788static void ptrace_bts_fork(struct task_struct *tsk)
908 enum bts_qualifier qualifier)
909{ 789{
910 struct bts_struct rec = { 790 tsk->bts = NULL;
911 .qualifier = qualifier, 791 tsk->bts_buffer = NULL;
912 .variant.jiffies = jiffies_64 792 tsk->bts_size = 0;
913 }; 793 tsk->thread.bts_ovfl_signal = 0;
914
915 ptrace_bts_write_record(tsk, &rec);
916} 794}
917 795
918static const struct bts_configuration bts_cfg_netburst = { 796static void ptrace_bts_untrace(struct task_struct *child)
919 .sizeof_bts = sizeof(long) * 3, 797{
920 .sizeof_field = sizeof(long), 798 if (unlikely(child->bts)) {
921 .debugctl_mask = (1<<2)|(1<<3)|(1<<5) 799 ds_release_bts(child->bts);
922}; 800 child->bts = NULL;
923 801
924static const struct bts_configuration bts_cfg_pentium_m = { 802 /* We cannot update total_vm and locked_vm since
925 .sizeof_bts = sizeof(long) * 3, 803 child's mm is already gone. But we can reclaim the
926 .sizeof_field = sizeof(long), 804 memory. */
927 .debugctl_mask = (1<<6)|(1<<7) 805 kfree(child->bts_buffer);
928}; 806 child->bts_buffer = NULL;
807 child->bts_size = 0;
808 }
809}
929 810
930static const struct bts_configuration bts_cfg_core2 = { 811static void ptrace_bts_detach(struct task_struct *child)
931 .sizeof_bts = 8 * 3, 812{
932 .sizeof_field = 8, 813 if (unlikely(child->bts)) {
933 .debugctl_mask = (1<<6)|(1<<7)|(1<<9) 814 ds_release_bts(child->bts);
934}; 815 child->bts = NULL;
935 816
936static inline void bts_configure(const struct bts_configuration *cfg) 817 ptrace_bts_free_buffer(child);
818 }
819}
820#else
821static inline void ptrace_bts_fork(struct task_struct *tsk) {}
822static inline void ptrace_bts_detach(struct task_struct *child) {}
823static inline void ptrace_bts_untrace(struct task_struct *child) {}
824#endif /* CONFIG_X86_PTRACE_BTS */
825
826void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags)
937{ 827{
938 bts_cfg = *cfg; 828 ptrace_bts_fork(child);
939} 829}
940 830
941void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c) 831void x86_ptrace_untrace(struct task_struct *child)
942{ 832{
943 switch (c->x86) { 833 ptrace_bts_untrace(child);
944 case 0x6:
945 switch (c->x86_model) {
946 case 0 ... 0xC:
947 /* sorry, don't know about them */
948 break;
949 case 0xD:
950 case 0xE: /* Pentium M */
951 bts_configure(&bts_cfg_pentium_m);
952 break;
953 default: /* Core2, Atom, ... */
954 bts_configure(&bts_cfg_core2);
955 break;
956 }
957 break;
958 case 0xF:
959 switch (c->x86_model) {
960 case 0x0:
961 case 0x1:
962 case 0x2: /* Netburst */
963 bts_configure(&bts_cfg_netburst);
964 break;
965 default:
966 /* sorry, don't know about them */
967 break;
968 }
969 break;
970 default:
971 /* sorry, don't know about them */
972 break;
973 }
974} 834}
975#endif /* CONFIG_X86_PTRACE_BTS */
976 835
977/* 836/*
978 * Called by kernel/ptrace.c when detaching.. 837 * Called by kernel/ptrace.c when detaching..
@@ -985,19 +844,7 @@ void ptrace_disable(struct task_struct *child)
985#ifdef TIF_SYSCALL_EMU 844#ifdef TIF_SYSCALL_EMU
986 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); 845 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
987#endif 846#endif
988#ifdef CONFIG_X86_PTRACE_BTS 847 ptrace_bts_detach(child);
989 if (child->bts) {
990 (void)ds_release_bts(child->bts);
991 kfree(child->bts_buffer);
992 child->bts_buffer = NULL;
993
994 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
995 if (!child->thread.debugctlmsr)
996 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
997
998 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
999 }
1000#endif /* CONFIG_X86_PTRACE_BTS */
1001} 848}
1002 849
1003#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 850#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
@@ -1128,16 +975,9 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
1128 (child, data, (struct ptrace_bts_config __user *)addr); 975 (child, data, (struct ptrace_bts_config __user *)addr);
1129 break; 976 break;
1130 977
1131 case PTRACE_BTS_SIZE: { 978 case PTRACE_BTS_SIZE:
1132 size_t size; 979 ret = ptrace_bts_size(child);
1133
1134 ret = ds_get_bts_index(child->bts, &size);
1135 if (ret == 0) {
1136 BUG_ON(size != (int) size);
1137 ret = (int) size;
1138 }
1139 break; 980 break;
1140 }
1141 981
1142 case PTRACE_BTS_GET: 982 case PTRACE_BTS_GET:
1143 ret = ptrace_bts_read_record 983 ret = ptrace_bts_read_record
@@ -1145,7 +985,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
1145 break; 985 break;
1146 986
1147 case PTRACE_BTS_CLEAR: 987 case PTRACE_BTS_CLEAR:
1148 ret = ds_clear_bts(child->bts); 988 ret = ptrace_bts_clear(child);
1149 break; 989 break;
1150 990
1151 case PTRACE_BTS_DRAIN: 991 case PTRACE_BTS_DRAIN:
@@ -1408,6 +1248,14 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
1408 1248
1409 case PTRACE_GET_THREAD_AREA: 1249 case PTRACE_GET_THREAD_AREA:
1410 case PTRACE_SET_THREAD_AREA: 1250 case PTRACE_SET_THREAD_AREA:
1251#ifdef CONFIG_X86_PTRACE_BTS
1252 case PTRACE_BTS_CONFIG:
1253 case PTRACE_BTS_STATUS:
1254 case PTRACE_BTS_SIZE:
1255 case PTRACE_BTS_GET:
1256 case PTRACE_BTS_CLEAR:
1257 case PTRACE_BTS_DRAIN:
1258#endif /* CONFIG_X86_PTRACE_BTS */
1411 return arch_ptrace(child, request, addr, data); 1259 return arch_ptrace(child, request, addr, data);
1412 1260
1413 default: 1261 default:
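
A hedged user-space sketch of how a debugger might drive the PTRACE_BTS_* requests wired up above. The request constants and struct ptrace_bts_config come from the x86 ptrace ABI header; struct bts_struct is the record layout from the kernel's asm/ds.h, which a debugger would need a copy of. The buffer size, error handling and the minimum size accepted by the DS layer are assumptions, not taken from the patch:

	#include <sys/types.h>
	#include <sys/ptrace.h>
	#include <asm/ptrace-abi.h>	/* PTRACE_BTS_*, struct ptrace_bts_config */

	static void trace_branches(pid_t child)
	{
		struct ptrace_bts_config cfg = {
			.size  = 64 * 1024,	/* assumed BTS buffer size, in bytes */
			.flags = PTRACE_BTS_O_ALLOC | PTRACE_BTS_O_TRACE,
		};
		long i, n;

		/* addr = config, data = sizeof(config), per arch_ptrace() above */
		ptrace(PTRACE_BTS_CONFIG, child, &cfg, sizeof(cfg));

		/* ... let the child run for a while ... */

		n = ptrace(PTRACE_BTS_SIZE, child, 0, 0);
		for (i = 0; i < n; i++) {
			struct bts_struct bts;

			/* addr = output record, data = index; index 0 is newest */
			ptrace(PTRACE_BTS_GET, child, &bts, i);
		}
	}

The compat_arch_ptrace() hunk above forwards the same six requests for 32-bit debuggers, so the calls look identical from a compat tracer.
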
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 67465ed89310..309949e9e1c1 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -168,6 +168,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31,
168 ich_force_enable_hpet); 168 ich_force_enable_hpet);
169DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1, 169DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1,
170 ich_force_enable_hpet); 170 ich_force_enable_hpet);
171DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_4,
172 ich_force_enable_hpet);
171DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7, 173DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
172 ich_force_enable_hpet); 174 ich_force_enable_hpet);
173 175
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 32fe42f26e79..ae0d8042cf69 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -93,11 +93,13 @@
93#include <asm/desc.h> 93#include <asm/desc.h>
94#include <asm/dma.h> 94#include <asm/dma.h>
95#include <asm/iommu.h> 95#include <asm/iommu.h>
96#include <asm/gart.h>
96#include <asm/mmu_context.h> 97#include <asm/mmu_context.h>
97#include <asm/proto.h> 98#include <asm/proto.h>
98 99
99#include <mach_apic.h> 100#include <mach_apic.h>
100#include <asm/paravirt.h> 101#include <asm/paravirt.h>
102#include <asm/hypervisor.h>
101 103
102#include <asm/percpu.h> 104#include <asm/percpu.h>
103#include <asm/topology.h> 105#include <asm/topology.h>
@@ -448,6 +450,7 @@ static void __init reserve_early_setup_data(void)
448 * @size: Size of the crashkernel memory to reserve. 450 * @size: Size of the crashkernel memory to reserve.
449 * Returns the base address on success, and -1ULL on failure. 451 * Returns the base address on success, and -1ULL on failure.
450 */ 452 */
453static
451unsigned long long __init find_and_reserve_crashkernel(unsigned long long size) 454unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
452{ 455{
453 const unsigned long long alignment = 16<<20; /* 16M */ 456 const unsigned long long alignment = 16<<20; /* 16M */
@@ -600,157 +603,7 @@ static struct x86_quirks default_x86_quirks __initdata = {
600 603
601struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; 604struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
602 605
603/* 606#ifdef CONFIG_X86_RESERVE_LOW_64K
604 * Some BIOSes seem to corrupt the low 64k of memory during events
605 * like suspend/resume and unplugging an HDMI cable. Reserve all
606 * remaining free memory in that area and fill it with a distinct
607 * pattern.
608 */
609#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
610#define MAX_SCAN_AREAS 8
611
612static int __read_mostly memory_corruption_check = -1;
613
614static unsigned __read_mostly corruption_check_size = 64*1024;
615static unsigned __read_mostly corruption_check_period = 60; /* seconds */
616
617static struct e820entry scan_areas[MAX_SCAN_AREAS];
618static int num_scan_areas;
619
620
621static int set_corruption_check(char *arg)
622{
623 char *end;
624
625 memory_corruption_check = simple_strtol(arg, &end, 10);
626
627 return (*end == 0) ? 0 : -EINVAL;
628}
629early_param("memory_corruption_check", set_corruption_check);
630
631static int set_corruption_check_period(char *arg)
632{
633 char *end;
634
635 corruption_check_period = simple_strtoul(arg, &end, 10);
636
637 return (*end == 0) ? 0 : -EINVAL;
638}
639early_param("memory_corruption_check_period", set_corruption_check_period);
640
641static int set_corruption_check_size(char *arg)
642{
643 char *end;
644 unsigned size;
645
646 size = memparse(arg, &end);
647
648 if (*end == '\0')
649 corruption_check_size = size;
650
651 return (size == corruption_check_size) ? 0 : -EINVAL;
652}
653early_param("memory_corruption_check_size", set_corruption_check_size);
654
655
656static void __init setup_bios_corruption_check(void)
657{
658 u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */
659
660 if (memory_corruption_check == -1) {
661 memory_corruption_check =
662#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
663 1
664#else
665 0
666#endif
667 ;
668 }
669
670 if (corruption_check_size == 0)
671 memory_corruption_check = 0;
672
673 if (!memory_corruption_check)
674 return;
675
676 corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
677
678 while(addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
679 u64 size;
680 addr = find_e820_area_size(addr, &size, PAGE_SIZE);
681
682 if (addr == 0)
683 break;
684
685 if ((addr + size) > corruption_check_size)
686 size = corruption_check_size - addr;
687
688 if (size == 0)
689 break;
690
691 e820_update_range(addr, size, E820_RAM, E820_RESERVED);
692 scan_areas[num_scan_areas].addr = addr;
693 scan_areas[num_scan_areas].size = size;
694 num_scan_areas++;
695
696 /* Assume we've already mapped this early memory */
697 memset(__va(addr), 0, size);
698
699 addr += size;
700 }
701
702 printk(KERN_INFO "Scanning %d areas for low memory corruption\n",
703 num_scan_areas);
704 update_e820();
705}
706
707static struct timer_list periodic_check_timer;
708
709void check_for_bios_corruption(void)
710{
711 int i;
712 int corruption = 0;
713
714 if (!memory_corruption_check)
715 return;
716
717 for(i = 0; i < num_scan_areas; i++) {
718 unsigned long *addr = __va(scan_areas[i].addr);
719 unsigned long size = scan_areas[i].size;
720
721 for(; size; addr++, size -= sizeof(unsigned long)) {
722 if (!*addr)
723 continue;
724 printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n",
725 addr, __pa(addr), *addr);
726 corruption = 1;
727 *addr = 0;
728 }
729 }
730
731 WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n");
732}
733
734static void periodic_check_for_corruption(unsigned long data)
735{
736 check_for_bios_corruption();
737 mod_timer(&periodic_check_timer, round_jiffies(jiffies + corruption_check_period*HZ));
738}
739
740void start_periodic_check_for_corruption(void)
741{
742 if (!memory_corruption_check || corruption_check_period == 0)
743 return;
744
745 printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n",
746 corruption_check_period);
747
748 init_timer(&periodic_check_timer);
749 periodic_check_timer.function = &periodic_check_for_corruption;
750 periodic_check_for_corruption(0);
751}
752#endif
753
754static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) 607static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
755{ 608{
756 printk(KERN_NOTICE 609 printk(KERN_NOTICE
@@ -762,6 +615,7 @@ static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
762 615
763 return 0; 616 return 0;
764} 617}
618#endif
765 619
766/* List of systems that have known low memory corruption BIOS problems */ 620/* List of systems that have known low memory corruption BIOS problems */
767static struct dmi_system_id __initdata bad_bios_dmi_table[] = { 621static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
@@ -920,6 +774,12 @@ void __init setup_arch(char **cmdline_p)
920 774
921 dmi_check_system(bad_bios_dmi_table); 775 dmi_check_system(bad_bios_dmi_table);
922 776
777 /*
778 * VMware detection requires dmi to be available, so this
779 * needs to be done after dmi_scan_machine, for the BP.
780 */
781 init_hypervisor(&boot_cpu_data);
782
923#ifdef CONFIG_X86_32 783#ifdef CONFIG_X86_32
924 probe_roms(); 784 probe_roms();
925#endif 785#endif
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 0b63b08e7530..49f3f709ee1f 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -339,25 +339,25 @@ static const cpumask_t cpu_mask_none;
339/* 339/*
340 * Returns a pointer to the bitmask of CPUs on Node 'node'. 340 * Returns a pointer to the bitmask of CPUs on Node 'node'.
341 */ 341 */
342const cpumask_t *_node_to_cpumask_ptr(int node) 342const cpumask_t *cpumask_of_node(int node)
343{ 343{
344 if (node_to_cpumask_map == NULL) { 344 if (node_to_cpumask_map == NULL) {
345 printk(KERN_WARNING 345 printk(KERN_WARNING
346 "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n", 346 "cpumask_of_node(%d): no node_to_cpumask_map!\n",
347 node); 347 node);
348 dump_stack(); 348 dump_stack();
349 return (const cpumask_t *)&cpu_online_map; 349 return (const cpumask_t *)&cpu_online_map;
350 } 350 }
351 if (node >= nr_node_ids) { 351 if (node >= nr_node_ids) {
352 printk(KERN_WARNING 352 printk(KERN_WARNING
353 "_node_to_cpumask_ptr(%d): node > nr_node_ids(%d)\n", 353 "cpumask_of_node(%d): node > nr_node_ids(%d)\n",
354 node, nr_node_ids); 354 node, nr_node_ids);
355 dump_stack(); 355 dump_stack();
356 return &cpu_mask_none; 356 return &cpu_mask_none;
357 } 357 }
358 return &node_to_cpumask_map[node]; 358 return &node_to_cpumask_map[node];
359} 359}
360EXPORT_SYMBOL(_node_to_cpumask_ptr); 360EXPORT_SYMBOL(cpumask_of_node);
361 361
362/* 362/*
363 * Returns a bitmask of CPUs on Node 'node'. 363 * Returns a bitmask of CPUs on Node 'node'.
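
The rename from _node_to_cpumask_ptr() to cpumask_of_node() keeps the same semantics; callers only switch to the new name. A minimal sketch of a typical caller, assuming the new cpumask iterator for_each_cpu() is available in this tree (the function name below is illustrative):

	/* Count the online CPUs that the topology places on 'node'. */
	static int count_cpus_on_node(int node)
	{
		const cpumask_t *mask = cpumask_of_node(node);
		int cpu, n = 0;

		for_each_cpu(cpu, mask)
			if (cpu_online(cpu))
				n++;

		return n;
	}
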
diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h
deleted file mode 100644
index cc673aa55ce4..000000000000
--- a/arch/x86/kernel/sigframe.h
+++ /dev/null
@@ -1,42 +0,0 @@
1#ifdef CONFIG_X86_32
2struct sigframe {
3 char __user *pretcode;
4 int sig;
5 struct sigcontext sc;
6 /*
7 * fpstate is unused. fpstate is moved/allocated after
8 * retcode[] below. This movement allows to have the FP state and the
9 * future state extensions (xsave) stay together.
10 * And at the same time retaining the unused fpstate, prevents changing
11 * the offset of extramask[] in the sigframe and thus prevent any
12 * legacy application accessing/modifying it.
13 */
14 struct _fpstate fpstate_unused;
15 unsigned long extramask[_NSIG_WORDS-1];
16 char retcode[8];
17 /* fp state follows here */
18};
19
20struct rt_sigframe {
21 char __user *pretcode;
22 int sig;
23 struct siginfo __user *pinfo;
24 void __user *puc;
25 struct siginfo info;
26 struct ucontext uc;
27 char retcode[8];
28 /* fp state follows here */
29};
30#else
31struct rt_sigframe {
32 char __user *pretcode;
33 struct ucontext uc;
34 struct siginfo info;
35 /* fp state follows here */
36};
37
38int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
39 sigset_t *set, struct pt_regs *regs);
40int ia32_setup_frame(int sig, struct k_sigaction *ka,
41 sigset_t *set, struct pt_regs *regs);
42#endif
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal.c
index d6dd057d0f22..89bb7668041d 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal.c
@@ -1,36 +1,41 @@
1/* 1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds 2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
3 * 4 *
4 * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson 5 * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
5 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes 6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
7 * 2000-2002 x86-64 support by Andi Kleen
6 */ 8 */
7#include <linux/list.h>
8 9
9#include <linux/personality.h> 10#include <linux/sched.h>
10#include <linux/binfmts.h> 11#include <linux/mm.h>
11#include <linux/suspend.h> 12#include <linux/smp.h>
12#include <linux/kernel.h> 13#include <linux/kernel.h>
13#include <linux/ptrace.h>
14#include <linux/signal.h> 14#include <linux/signal.h>
15#include <linux/stddef.h>
16#include <linux/unistd.h>
17#include <linux/errno.h> 15#include <linux/errno.h>
18#include <linux/sched.h>
19#include <linux/wait.h> 16#include <linux/wait.h>
17#include <linux/ptrace.h>
20#include <linux/tracehook.h> 18#include <linux/tracehook.h>
21#include <linux/elf.h> 19#include <linux/unistd.h>
22#include <linux/smp.h> 20#include <linux/stddef.h>
23#include <linux/mm.h> 21#include <linux/personality.h>
22#include <linux/uaccess.h>
24 23
25#include <asm/processor.h> 24#include <asm/processor.h>
26#include <asm/ucontext.h> 25#include <asm/ucontext.h>
27#include <asm/uaccess.h>
28#include <asm/i387.h> 26#include <asm/i387.h>
29#include <asm/vdso.h> 27#include <asm/vdso.h>
28
29#ifdef CONFIG_X86_64
30#include <asm/proto.h>
31#include <asm/ia32_unistd.h>
32#include <asm/mce.h>
33#endif /* CONFIG_X86_64 */
34
30#include <asm/syscall.h> 35#include <asm/syscall.h>
31#include <asm/syscalls.h> 36#include <asm/syscalls.h>
32 37
33#include "sigframe.h" 38#include <asm/sigframe.h>
34 39
35#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) 40#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
36 41
@@ -45,74 +50,6 @@
45# define FIX_EFLAGS __FIX_EFLAGS 50# define FIX_EFLAGS __FIX_EFLAGS
46#endif 51#endif
47 52
48/*
49 * Atomically swap in the new signal mask, and wait for a signal.
50 */
51asmlinkage int
52sys_sigsuspend(int history0, int history1, old_sigset_t mask)
53{
54 mask &= _BLOCKABLE;
55 spin_lock_irq(&current->sighand->siglock);
56 current->saved_sigmask = current->blocked;
57 siginitset(&current->blocked, mask);
58 recalc_sigpending();
59 spin_unlock_irq(&current->sighand->siglock);
60
61 current->state = TASK_INTERRUPTIBLE;
62 schedule();
63 set_restore_sigmask();
64
65 return -ERESTARTNOHAND;
66}
67
68asmlinkage int
69sys_sigaction(int sig, const struct old_sigaction __user *act,
70 struct old_sigaction __user *oact)
71{
72 struct k_sigaction new_ka, old_ka;
73 int ret;
74
75 if (act) {
76 old_sigset_t mask;
77
78 if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
79 __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
80 __get_user(new_ka.sa.sa_restorer, &act->sa_restorer))
81 return -EFAULT;
82
83 __get_user(new_ka.sa.sa_flags, &act->sa_flags);
84 __get_user(mask, &act->sa_mask);
85 siginitset(&new_ka.sa.sa_mask, mask);
86 }
87
88 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
89
90 if (!ret && oact) {
91 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
92 __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
93 __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer))
94 return -EFAULT;
95
96 __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
97 __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
98 }
99
100 return ret;
101}
102
103asmlinkage int sys_sigaltstack(unsigned long bx)
104{
105 /*
106 * This is needed to make gcc realize it doesn't own the
107 * "struct pt_regs"
108 */
109 struct pt_regs *regs = (struct pt_regs *)&bx;
110 const stack_t __user *uss = (const stack_t __user *)bx;
111 stack_t __user *uoss = (stack_t __user *)regs->cx;
112
113 return do_sigaltstack(uss, uoss, regs->sp);
114}
115
116#define COPY(x) { \ 53#define COPY(x) { \
117 err |= __get_user(regs->x, &sc->x); \ 54 err |= __get_user(regs->x, &sc->x); \
118} 55}
@@ -123,7 +60,7 @@ asmlinkage int sys_sigaltstack(unsigned long bx)
123 regs->seg = tmp; \ 60 regs->seg = tmp; \
124} 61}
125 62
126#define COPY_SEG_STRICT(seg) { \ 63#define COPY_SEG_CPL3(seg) { \
127 unsigned short tmp; \ 64 unsigned short tmp; \
128 err |= __get_user(tmp, &sc->seg); \ 65 err |= __get_user(tmp, &sc->seg); \
129 regs->seg = tmp | 3; \ 66 regs->seg = tmp | 3; \
@@ -135,9 +72,6 @@ asmlinkage int sys_sigaltstack(unsigned long bx)
135 loadsegment(seg, tmp); \ 72 loadsegment(seg, tmp); \
136} 73}
137 74
138/*
139 * Do a signal return; undo the signal stack.
140 */
141static int 75static int
142restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, 76restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
143 unsigned long *pax) 77 unsigned long *pax)
@@ -149,14 +83,36 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
149 /* Always make any pending restarted system calls return -EINTR */ 83 /* Always make any pending restarted system calls return -EINTR */
150 current_thread_info()->restart_block.fn = do_no_restart_syscall; 84 current_thread_info()->restart_block.fn = do_no_restart_syscall;
151 85
86#ifdef CONFIG_X86_32
152 GET_SEG(gs); 87 GET_SEG(gs);
153 COPY_SEG(fs); 88 COPY_SEG(fs);
154 COPY_SEG(es); 89 COPY_SEG(es);
155 COPY_SEG(ds); 90 COPY_SEG(ds);
91#endif /* CONFIG_X86_32 */
92
156 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); 93 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
157 COPY(dx); COPY(cx); COPY(ip); 94 COPY(dx); COPY(cx); COPY(ip);
158 COPY_SEG_STRICT(cs); 95
159 COPY_SEG_STRICT(ss); 96#ifdef CONFIG_X86_64
97 COPY(r8);
98 COPY(r9);
99 COPY(r10);
100 COPY(r11);
101 COPY(r12);
102 COPY(r13);
103 COPY(r14);
104 COPY(r15);
105#endif /* CONFIG_X86_64 */
106
107#ifdef CONFIG_X86_32
108 COPY_SEG_CPL3(cs);
109 COPY_SEG_CPL3(ss);
110#else /* !CONFIG_X86_32 */
111 /* Kernel saves and restores only the CS segment register on signals,
112 * which is the bare minimum needed to allow mixed 32/64-bit code.
113 * App's signal handler can save/restore other segments if needed. */
114 COPY_SEG_CPL3(cs);
115#endif /* CONFIG_X86_32 */
160 116
161 err |= __get_user(tmpflags, &sc->flags); 117 err |= __get_user(tmpflags, &sc->flags);
162 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); 118 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
@@ -169,102 +125,24 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
169 return err; 125 return err;
170} 126}
171 127
172asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
173{
174 struct sigframe __user *frame;
175 struct pt_regs *regs;
176 unsigned long ax;
177 sigset_t set;
178
179 regs = (struct pt_regs *) &__unused;
180 frame = (struct sigframe __user *)(regs->sp - 8);
181
182 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
183 goto badframe;
184 if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1
185 && __copy_from_user(&set.sig[1], &frame->extramask,
186 sizeof(frame->extramask))))
187 goto badframe;
188
189 sigdelsetmask(&set, ~_BLOCKABLE);
190 spin_lock_irq(&current->sighand->siglock);
191 current->blocked = set;
192 recalc_sigpending();
193 spin_unlock_irq(&current->sighand->siglock);
194
195 if (restore_sigcontext(regs, &frame->sc, &ax))
196 goto badframe;
197 return ax;
198
199badframe:
200 if (show_unhandled_signals && printk_ratelimit()) {
201 printk("%s%s[%d] bad frame in sigreturn frame:"
202 "%p ip:%lx sp:%lx oeax:%lx",
203 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
204 current->comm, task_pid_nr(current), frame, regs->ip,
205 regs->sp, regs->orig_ax);
206 print_vma_addr(" in ", regs->ip);
207 printk(KERN_CONT "\n");
208 }
209
210 force_sig(SIGSEGV, current);
211
212 return 0;
213}
214
215static long do_rt_sigreturn(struct pt_regs *regs)
216{
217 struct rt_sigframe __user *frame;
218 unsigned long ax;
219 sigset_t set;
220
221 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
222 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
223 goto badframe;
224 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
225 goto badframe;
226
227 sigdelsetmask(&set, ~_BLOCKABLE);
228 spin_lock_irq(&current->sighand->siglock);
229 current->blocked = set;
230 recalc_sigpending();
231 spin_unlock_irq(&current->sighand->siglock);
232
233 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
234 goto badframe;
235
236 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
237 goto badframe;
238
239 return ax;
240
241badframe:
242 signal_fault(regs, frame, "rt_sigreturn");
243 return 0;
244}
245
246asmlinkage int sys_rt_sigreturn(unsigned long __unused)
247{
248 struct pt_regs *regs = (struct pt_regs *)&__unused;
249
250 return do_rt_sigreturn(regs);
251}
252
253/*
254 * Set up a signal frame.
255 */
256static int 128static int
257setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, 129setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
258 struct pt_regs *regs, unsigned long mask) 130 struct pt_regs *regs, unsigned long mask)
259{ 131{
260 int tmp, err = 0; 132 int err = 0;
261 133
262 err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs); 134#ifdef CONFIG_X86_32
263 savesegment(gs, tmp); 135 {
264 err |= __put_user(tmp, (unsigned int __user *)&sc->gs); 136 unsigned int tmp;
265 137
138 savesegment(gs, tmp);
139 err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
140 }
141 err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs);
266 err |= __put_user(regs->es, (unsigned int __user *)&sc->es); 142 err |= __put_user(regs->es, (unsigned int __user *)&sc->es);
267 err |= __put_user(regs->ds, (unsigned int __user *)&sc->ds); 143 err |= __put_user(regs->ds, (unsigned int __user *)&sc->ds);
144#endif /* CONFIG_X86_32 */
145
268 err |= __put_user(regs->di, &sc->di); 146 err |= __put_user(regs->di, &sc->di);
269 err |= __put_user(regs->si, &sc->si); 147 err |= __put_user(regs->si, &sc->si);
270 err |= __put_user(regs->bp, &sc->bp); 148 err |= __put_user(regs->bp, &sc->bp);
@@ -273,19 +151,33 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
273 err |= __put_user(regs->dx, &sc->dx); 151 err |= __put_user(regs->dx, &sc->dx);
274 err |= __put_user(regs->cx, &sc->cx); 152 err |= __put_user(regs->cx, &sc->cx);
275 err |= __put_user(regs->ax, &sc->ax); 153 err |= __put_user(regs->ax, &sc->ax);
154#ifdef CONFIG_X86_64
155 err |= __put_user(regs->r8, &sc->r8);
156 err |= __put_user(regs->r9, &sc->r9);
157 err |= __put_user(regs->r10, &sc->r10);
158 err |= __put_user(regs->r11, &sc->r11);
159 err |= __put_user(regs->r12, &sc->r12);
160 err |= __put_user(regs->r13, &sc->r13);
161 err |= __put_user(regs->r14, &sc->r14);
162 err |= __put_user(regs->r15, &sc->r15);
163#endif /* CONFIG_X86_64 */
164
276 err |= __put_user(current->thread.trap_no, &sc->trapno); 165 err |= __put_user(current->thread.trap_no, &sc->trapno);
277 err |= __put_user(current->thread.error_code, &sc->err); 166 err |= __put_user(current->thread.error_code, &sc->err);
278 err |= __put_user(regs->ip, &sc->ip); 167 err |= __put_user(regs->ip, &sc->ip);
168#ifdef CONFIG_X86_32
279 err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs); 169 err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs);
280 err |= __put_user(regs->flags, &sc->flags); 170 err |= __put_user(regs->flags, &sc->flags);
281 err |= __put_user(regs->sp, &sc->sp_at_signal); 171 err |= __put_user(regs->sp, &sc->sp_at_signal);
282 err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); 172 err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss);
173#else /* !CONFIG_X86_32 */
174 err |= __put_user(regs->flags, &sc->flags);
175 err |= __put_user(regs->cs, &sc->cs);
176 err |= __put_user(0, &sc->gs);
177 err |= __put_user(0, &sc->fs);
178#endif /* CONFIG_X86_32 */
283 179
284 tmp = save_i387_xstate(fpstate); 180 err |= __put_user(fpstate, &sc->fpstate);
285 if (tmp < 0)
286 err = 1;
287 else
288 err |= __put_user(tmp ? fpstate : NULL, &sc->fpstate);
289 181
290 /* non-iBCS2 extensions.. */ 182 /* non-iBCS2 extensions.. */
291 err |= __put_user(mask, &sc->oldmask); 183 err |= __put_user(mask, &sc->oldmask);
@@ -295,6 +187,32 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
295} 187}
296 188
297/* 189/*
190 * Set up a signal frame.
191 */
192#ifdef CONFIG_X86_32
193static const struct {
194 u16 poplmovl;
195 u32 val;
196 u16 int80;
197} __attribute__((packed)) retcode = {
198 0xb858, /* popl %eax; movl $..., %eax */
199 __NR_sigreturn,
200 0x80cd, /* int $0x80 */
201};
202
203static const struct {
204 u8 movl;
205 u32 val;
206 u16 int80;
207 u8 pad;
208} __attribute__((packed)) rt_retcode = {
209 0xb8, /* movl $..., %eax */
210 __NR_rt_sigreturn,
211 0x80cd, /* int $0x80 */
212 0
213};
214
215/*
298 * Determine which stack to use.. 216 * Determine which stack to use..
299 */ 217 */
300static inline void __user * 218static inline void __user *
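
The packed retcode/rt_retcode structures above replace the three separate __put_user() calls with a single u64 store; on little-endian x86 the u16/u32 fields decompose into the same byte sequence as before. A sketch of the bytes the 'retcode' trampoline lays down (the array name is illustrative; 119 is __NR_sigreturn on i386):

	static const unsigned char sigreturn_trampoline[8] = {
		0x58,			/* popl %eax                        */
		0xb8, 119, 0, 0, 0,	/* movl $__NR_sigreturn, %eax       */
		0xcd, 0x80,		/* int  $0x80                       */
	};
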
@@ -328,6 +246,8 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
328 if (used_math()) { 246 if (used_math()) {
329 sp = sp - sig_xstate_size; 247 sp = sp - sig_xstate_size;
330 *fpstate = (struct _fpstate *) sp; 248 *fpstate = (struct _fpstate *) sp;
249 if (save_i387_xstate(*fpstate) < 0)
250 return (void __user *)-1L;
331 } 251 }
332 252
333 sp -= frame_size; 253 sp -= frame_size;
@@ -383,9 +303,7 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
383 * reasons and because gdb uses it as a signature to notice 303 * reasons and because gdb uses it as a signature to notice
384 * signal handler stack frames. 304 * signal handler stack frames.
385 */ 305 */
386 err |= __put_user(0xb858, (short __user *)(frame->retcode+0)); 306 err |= __put_user(*((u64 *)&retcode), (u64 *)frame->retcode);
387 err |= __put_user(__NR_sigreturn, (int __user *)(frame->retcode+2));
388 err |= __put_user(0x80cd, (short __user *)(frame->retcode+6));
389 307
390 if (err) 308 if (err)
391 return -EFAULT; 309 return -EFAULT;
@@ -454,9 +372,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
454 * reasons and because gdb uses it as a signature to notice 372 * reasons and because gdb uses it as a signature to notice
455 * signal handler stack frames. 373 * signal handler stack frames.
456 */ 374 */
457 err |= __put_user(0xb8, (char __user *)(frame->retcode+0)); 375 err |= __put_user(*((u64 *)&rt_retcode), (u64 *)frame->retcode);
458 err |= __put_user(__NR_rt_sigreturn, (int __user *)(frame->retcode+1));
459 err |= __put_user(0x80cd, (short __user *)(frame->retcode+5));
460 376
461 if (err) 377 if (err)
462 return -EFAULT; 378 return -EFAULT;
@@ -475,23 +391,293 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
475 391
476 return 0; 392 return 0;
477} 393}
394#else /* !CONFIG_X86_32 */
395/*
396 * Determine which stack to use..
397 */
398static void __user *
399get_stack(struct k_sigaction *ka, unsigned long sp, unsigned long size)
400{
401 /* Default to using normal stack - redzone*/
402 sp -= 128;
403
404 /* This is the X/Open sanctioned signal stack switching. */
405 if (ka->sa.sa_flags & SA_ONSTACK) {
406 if (sas_ss_flags(sp) == 0)
407 sp = current->sas_ss_sp + current->sas_ss_size;
408 }
409
410 return (void __user *)round_down(sp - size, 64);
411}
412
413static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
414 sigset_t *set, struct pt_regs *regs)
415{
416 struct rt_sigframe __user *frame;
417 void __user *fp = NULL;
418 int err = 0;
419 struct task_struct *me = current;
420
421 if (used_math()) {
422 fp = get_stack(ka, regs->sp, sig_xstate_size);
423 frame = (void __user *)round_down(
424 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
425
426 if (save_i387_xstate(fp) < 0)
427 return -EFAULT;
428 } else
429 frame = get_stack(ka, regs->sp, sizeof(struct rt_sigframe)) - 8;
430
431 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
432 return -EFAULT;
433
434 if (ka->sa.sa_flags & SA_SIGINFO) {
435 if (copy_siginfo_to_user(&frame->info, info))
436 return -EFAULT;
437 }
438
439 /* Create the ucontext. */
440 if (cpu_has_xsave)
441 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
442 else
443 err |= __put_user(0, &frame->uc.uc_flags);
444 err |= __put_user(0, &frame->uc.uc_link);
445 err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
446 err |= __put_user(sas_ss_flags(regs->sp),
447 &frame->uc.uc_stack.ss_flags);
448 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
449 err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]);
450 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
451
452 /* Set up to return from userspace. If provided, use a stub
453 already in userspace. */
454 /* x86-64 should always use SA_RESTORER. */
455 if (ka->sa.sa_flags & SA_RESTORER) {
456 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
457 } else {
458 /* could use a vstub here */
459 return -EFAULT;
460 }
461
462 if (err)
463 return -EFAULT;
464
465 /* Set up registers for signal handler */
466 regs->di = sig;
467 /* In case the signal handler was declared without prototypes */
468 regs->ax = 0;
469
470 /* This also works for non SA_SIGINFO handlers because they expect the
471 next argument after the signal number on the stack. */
472 regs->si = (unsigned long)&frame->info;
473 regs->dx = (unsigned long)&frame->uc;
474 regs->ip = (unsigned long) ka->sa.sa_handler;
475
476 regs->sp = (unsigned long)frame;
477
478 /* Set up the CS register to run signal handlers in 64-bit mode,
479 even if the handler happens to be interrupting 32-bit code. */
480 regs->cs = __USER_CS;
481
482 return 0;
483}
484#endif /* CONFIG_X86_32 */
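
In the 64-bit __setup_rt_frame() above, get_stack() first steps over the 128-byte red zone of the interrupted context, and the final round_down(..., 16) - 8 leaves the frame address congruent to 8 mod 16, so the handler starts with the stack aligned as if it had just been called (the ABI wants %rsp + 8 to be 16-byte aligned at function entry). A minimal sketch of that placement, with an illustrative name and the FP-save case omitted:

	static unsigned long place_rt_frame(unsigned long sp, unsigned long framesize)
	{
		sp -= 128;				/* skip the red zone      */
		sp = round_down(sp - framesize, 16) - 8;	/* sp % 16 == 8   */
		return sp;	/* handler entry sees a "just called" stack       */
	}
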
485
486#ifdef CONFIG_X86_32
487/*
488 * Atomically swap in the new signal mask, and wait for a signal.
489 */
490asmlinkage int
491sys_sigsuspend(int history0, int history1, old_sigset_t mask)
492{
493 mask &= _BLOCKABLE;
494 spin_lock_irq(&current->sighand->siglock);
495 current->saved_sigmask = current->blocked;
496 siginitset(&current->blocked, mask);
497 recalc_sigpending();
498 spin_unlock_irq(&current->sighand->siglock);
499
500 current->state = TASK_INTERRUPTIBLE;
501 schedule();
502 set_restore_sigmask();
503
504 return -ERESTARTNOHAND;
505}
506
507asmlinkage int
508sys_sigaction(int sig, const struct old_sigaction __user *act,
509 struct old_sigaction __user *oact)
510{
511 struct k_sigaction new_ka, old_ka;
512 int ret;
513
514 if (act) {
515 old_sigset_t mask;
516
517 if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
518 __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
519 __get_user(new_ka.sa.sa_restorer, &act->sa_restorer))
520 return -EFAULT;
521
522 __get_user(new_ka.sa.sa_flags, &act->sa_flags);
523 __get_user(mask, &act->sa_mask);
524 siginitset(&new_ka.sa.sa_mask, mask);
525 }
526
527 ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
528
529 if (!ret && oact) {
530 if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
531 __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
532 __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer))
533 return -EFAULT;
534
535 __put_user(old_ka.sa.sa_flags, &oact->sa_flags);
536 __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
537 }
538
539 return ret;
540}
541#endif /* CONFIG_X86_32 */
542
543#ifdef CONFIG_X86_32
544asmlinkage int sys_sigaltstack(unsigned long bx)
545{
546 /*
547 * This is needed to make gcc realize it doesn't own the
548 * "struct pt_regs"
549 */
550 struct pt_regs *regs = (struct pt_regs *)&bx;
551 const stack_t __user *uss = (const stack_t __user *)bx;
552 stack_t __user *uoss = (stack_t __user *)regs->cx;
553
554 return do_sigaltstack(uss, uoss, regs->sp);
555}
556#else /* !CONFIG_X86_32 */
557asmlinkage long
558sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
559 struct pt_regs *regs)
560{
561 return do_sigaltstack(uss, uoss, regs->sp);
562}
563#endif /* CONFIG_X86_32 */
564
565/*
566 * Do a signal return; undo the signal stack.
567 */
568#ifdef CONFIG_X86_32
569asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
570{
571 struct sigframe __user *frame;
572 struct pt_regs *regs;
573 unsigned long ax;
574 sigset_t set;
575
576 regs = (struct pt_regs *) &__unused;
577 frame = (struct sigframe __user *)(regs->sp - 8);
578
579 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
580 goto badframe;
581 if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1
582 && __copy_from_user(&set.sig[1], &frame->extramask,
583 sizeof(frame->extramask))))
584 goto badframe;
585
586 sigdelsetmask(&set, ~_BLOCKABLE);
587 spin_lock_irq(&current->sighand->siglock);
588 current->blocked = set;
589 recalc_sigpending();
590 spin_unlock_irq(&current->sighand->siglock);
591
592 if (restore_sigcontext(regs, &frame->sc, &ax))
593 goto badframe;
594 return ax;
595
596badframe:
597 signal_fault(regs, frame, "sigreturn");
598
599 return 0;
600}
601#endif /* CONFIG_X86_32 */
602
603static long do_rt_sigreturn(struct pt_regs *regs)
604{
605 struct rt_sigframe __user *frame;
606 unsigned long ax;
607 sigset_t set;
608
609 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
610 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
611 goto badframe;
612 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
613 goto badframe;
614
615 sigdelsetmask(&set, ~_BLOCKABLE);
616 spin_lock_irq(&current->sighand->siglock);
617 current->blocked = set;
618 recalc_sigpending();
619 spin_unlock_irq(&current->sighand->siglock);
620
621 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
622 goto badframe;
623
624 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
625 goto badframe;
626
627 return ax;
628
629badframe:
630 signal_fault(regs, frame, "rt_sigreturn");
631 return 0;
632}
633
634#ifdef CONFIG_X86_32
635asmlinkage int sys_rt_sigreturn(struct pt_regs regs)
636{
637 return do_rt_sigreturn(&regs);
638}
639#else /* !CONFIG_X86_32 */
640asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
641{
642 return do_rt_sigreturn(regs);
643}
644#endif /* CONFIG_X86_32 */
478 645
479/* 646/*
480 * OK, we're invoking a handler: 647 * OK, we're invoking a handler:
481 */ 648 */
482static int signr_convert(int sig) 649static int signr_convert(int sig)
483{ 650{
651#ifdef CONFIG_X86_32
484 struct thread_info *info = current_thread_info(); 652 struct thread_info *info = current_thread_info();
485 653
486 if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32) 654 if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32)
487 return info->exec_domain->signal_invmap[sig]; 655 return info->exec_domain->signal_invmap[sig];
656#endif /* CONFIG_X86_32 */
488 return sig; 657 return sig;
489} 658}
490 659
660#ifdef CONFIG_X86_32
661
491#define is_ia32 1 662#define is_ia32 1
492#define ia32_setup_frame __setup_frame 663#define ia32_setup_frame __setup_frame
493#define ia32_setup_rt_frame __setup_rt_frame 664#define ia32_setup_rt_frame __setup_rt_frame
494 665
666#else /* !CONFIG_X86_32 */
667
668#ifdef CONFIG_IA32_EMULATION
669#define is_ia32 test_thread_flag(TIF_IA32)
670#else /* !CONFIG_IA32_EMULATION */
671#define is_ia32 0
672#endif /* CONFIG_IA32_EMULATION */
673
674int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
675 sigset_t *set, struct pt_regs *regs);
676int ia32_setup_frame(int sig, struct k_sigaction *ka,
677 sigset_t *set, struct pt_regs *regs);
678
679#endif /* CONFIG_X86_32 */
680
495static int 681static int
496setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 682setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
497 sigset_t *set, struct pt_regs *regs) 683 sigset_t *set, struct pt_regs *regs)
@@ -592,7 +778,13 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
592 return 0; 778 return 0;
593} 779}
594 780
781#ifdef CONFIG_X86_32
595#define NR_restart_syscall __NR_restart_syscall 782#define NR_restart_syscall __NR_restart_syscall
783#else /* !CONFIG_X86_32 */
784#define NR_restart_syscall \
785 test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall
786#endif /* CONFIG_X86_32 */
787
596/* 788/*
597 * Note that 'init' is a special process: it doesn't get signals it doesn't 789 * Note that 'init' is a special process: it doesn't get signals it doesn't
598 * want to handle. Thus you cannot kill init even with a SIGKILL even by 790 * want to handle. Thus you cannot kill init even with a SIGKILL even by
@@ -704,8 +896,9 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
704 struct task_struct *me = current; 896 struct task_struct *me = current;
705 897
706 if (show_unhandled_signals && printk_ratelimit()) { 898 if (show_unhandled_signals && printk_ratelimit()) {
707 printk(KERN_INFO 899 printk("%s"
708 "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", 900 "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
901 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
709 me->comm, me->pid, where, frame, 902 me->comm, me->pid, where, frame,
710 regs->ip, regs->sp, regs->orig_ax); 903 regs->ip, regs->sp, regs->orig_ax);
711 print_vma_addr(" in ", regs->ip); 904 print_vma_addr(" in ", regs->ip);
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
deleted file mode 100644
index a5c9627f4db9..000000000000
--- a/arch/x86/kernel/signal_64.c
+++ /dev/null
@@ -1,516 +0,0 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
4 *
5 * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
7 * 2000-2002 x86-64 support by Andi Kleen
8 */
9
10#include <linux/sched.h>
11#include <linux/mm.h>
12#include <linux/smp.h>
13#include <linux/kernel.h>
14#include <linux/signal.h>
15#include <linux/errno.h>
16#include <linux/wait.h>
17#include <linux/ptrace.h>
18#include <linux/tracehook.h>
19#include <linux/unistd.h>
20#include <linux/stddef.h>
21#include <linux/personality.h>
22#include <linux/compiler.h>
23#include <linux/uaccess.h>
24
25#include <asm/processor.h>
26#include <asm/ucontext.h>
27#include <asm/i387.h>
28#include <asm/proto.h>
29#include <asm/ia32_unistd.h>
30#include <asm/mce.h>
31#include <asm/syscall.h>
32#include <asm/syscalls.h>
33#include "sigframe.h"
34
35#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
36
37#define __FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \
38 X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
39 X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
40 X86_EFLAGS_CF)
41
42#ifdef CONFIG_X86_32
43# define FIX_EFLAGS (__FIX_EFLAGS | X86_EFLAGS_RF)
44#else
45# define FIX_EFLAGS __FIX_EFLAGS
46#endif
47
48asmlinkage long
49sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
50 struct pt_regs *regs)
51{
52 return do_sigaltstack(uss, uoss, regs->sp);
53}
54
55#define COPY(x) { \
56 err |= __get_user(regs->x, &sc->x); \
57}
58
59#define COPY_SEG_STRICT(seg) { \
60 unsigned short tmp; \
61 err |= __get_user(tmp, &sc->seg); \
62 regs->seg = tmp | 3; \
63}
64
65/*
66 * Do a signal return; undo the signal stack.
67 */
68static int
69restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
70 unsigned long *pax)
71{
72 void __user *buf;
73 unsigned int tmpflags;
74 unsigned int err = 0;
75
76 /* Always make any pending restarted system calls return -EINTR */
77 current_thread_info()->restart_block.fn = do_no_restart_syscall;
78
79 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
80 COPY(dx); COPY(cx); COPY(ip);
81 COPY(r8);
82 COPY(r9);
83 COPY(r10);
84 COPY(r11);
85 COPY(r12);
86 COPY(r13);
87 COPY(r14);
88 COPY(r15);
89
90 /* Kernel saves and restores only the CS segment register on signals,
91 * which is the bare minimum needed to allow mixed 32/64-bit code.
92 * App's signal handler can save/restore other segments if needed. */
93 COPY_SEG_STRICT(cs);
94
95 err |= __get_user(tmpflags, &sc->flags);
96 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
97 regs->orig_ax = -1; /* disable syscall checks */
98
99 err |= __get_user(buf, &sc->fpstate);
100 err |= restore_i387_xstate(buf);
101
102 err |= __get_user(*pax, &sc->ax);
103 return err;
104}
105
106static long do_rt_sigreturn(struct pt_regs *regs)
107{
108 struct rt_sigframe __user *frame;
109 unsigned long ax;
110 sigset_t set;
111
112 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
113 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
114 goto badframe;
115 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
116 goto badframe;
117
118 sigdelsetmask(&set, ~_BLOCKABLE);
119 spin_lock_irq(&current->sighand->siglock);
120 current->blocked = set;
121 recalc_sigpending();
122 spin_unlock_irq(&current->sighand->siglock);
123
124 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
125 goto badframe;
126
127 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
128 goto badframe;
129
130 return ax;
131
132badframe:
133 signal_fault(regs, frame, "rt_sigreturn");
134 return 0;
135}
136
137asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
138{
139 return do_rt_sigreturn(regs);
140}
141
142/*
143 * Set up a signal frame.
144 */
145
146static inline int
147setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
148 unsigned long mask, struct task_struct *me)
149{
150 int err = 0;
151
152 err |= __put_user(regs->cs, &sc->cs);
153 err |= __put_user(0, &sc->gs);
154 err |= __put_user(0, &sc->fs);
155
156 err |= __put_user(regs->di, &sc->di);
157 err |= __put_user(regs->si, &sc->si);
158 err |= __put_user(regs->bp, &sc->bp);
159 err |= __put_user(regs->sp, &sc->sp);
160 err |= __put_user(regs->bx, &sc->bx);
161 err |= __put_user(regs->dx, &sc->dx);
162 err |= __put_user(regs->cx, &sc->cx);
163 err |= __put_user(regs->ax, &sc->ax);
164 err |= __put_user(regs->r8, &sc->r8);
165 err |= __put_user(regs->r9, &sc->r9);
166 err |= __put_user(regs->r10, &sc->r10);
167 err |= __put_user(regs->r11, &sc->r11);
168 err |= __put_user(regs->r12, &sc->r12);
169 err |= __put_user(regs->r13, &sc->r13);
170 err |= __put_user(regs->r14, &sc->r14);
171 err |= __put_user(regs->r15, &sc->r15);
172 err |= __put_user(me->thread.trap_no, &sc->trapno);
173 err |= __put_user(me->thread.error_code, &sc->err);
174 err |= __put_user(regs->ip, &sc->ip);
175 err |= __put_user(regs->flags, &sc->flags);
176 err |= __put_user(mask, &sc->oldmask);
177 err |= __put_user(me->thread.cr2, &sc->cr2);
178
179 return err;
180}
181
182/*
183 * Determine which stack to use..
184 */
185
186static void __user *
187get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
188{
189 unsigned long sp;
190
191 /* Default to using normal stack - redzone*/
192 sp = regs->sp - 128;
193
194 /* This is the X/Open sanctioned signal stack switching. */
195 if (ka->sa.sa_flags & SA_ONSTACK) {
196 if (sas_ss_flags(sp) == 0)
197 sp = current->sas_ss_sp + current->sas_ss_size;
198 }
199
200 return (void __user *)round_down(sp - size, 64);
201}
202
203static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
204 sigset_t *set, struct pt_regs *regs)
205{
206 struct rt_sigframe __user *frame;
207 void __user *fp = NULL;
208 int err = 0;
209 struct task_struct *me = current;
210
211 if (used_math()) {
212 fp = get_stack(ka, regs, sig_xstate_size);
213 frame = (void __user *)round_down(
214 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
215
216 if (save_i387_xstate(fp) < 0)
217 return -EFAULT;
218 } else
219 frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
220
221 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
222 return -EFAULT;
223
224 if (ka->sa.sa_flags & SA_SIGINFO) {
225 if (copy_siginfo_to_user(&frame->info, info))
226 return -EFAULT;
227 }
228
229 /* Create the ucontext. */
230 if (cpu_has_xsave)
231 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
232 else
233 err |= __put_user(0, &frame->uc.uc_flags);
234 err |= __put_user(0, &frame->uc.uc_link);
235 err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
236 err |= __put_user(sas_ss_flags(regs->sp),
237 &frame->uc.uc_stack.ss_flags);
238 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
239 err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
240 err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
241 if (sizeof(*set) == 16) {
242 __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
243 __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
244 } else
245 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
246
247 /* Set up to return from userspace. If provided, use a stub
248 already in userspace. */
249 /* x86-64 should always use SA_RESTORER. */
250 if (ka->sa.sa_flags & SA_RESTORER) {
251 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
252 } else {
253 /* could use a vstub here */
254 return -EFAULT;
255 }
256
257 if (err)
258 return -EFAULT;
259
260 /* Set up registers for signal handler */
261 regs->di = sig;
262 /* In case the signal handler was declared without prototypes */
263 regs->ax = 0;
264
265 /* This also works for non SA_SIGINFO handlers because they expect the
266 next argument after the signal number on the stack. */
267 regs->si = (unsigned long)&frame->info;
268 regs->dx = (unsigned long)&frame->uc;
269 regs->ip = (unsigned long) ka->sa.sa_handler;
270
271 regs->sp = (unsigned long)frame;
272
273 /* Set up the CS register to run signal handlers in 64-bit mode,
274 even if the handler happens to be interrupting 32-bit code. */
275 regs->cs = __USER_CS;
276
277 return 0;
278}
279
280/*
281 * OK, we're invoking a handler
282 */
283static int signr_convert(int sig)
284{
285 return sig;
286}
287
288#ifdef CONFIG_IA32_EMULATION
289#define is_ia32 test_thread_flag(TIF_IA32)
290#else
291#define is_ia32 0
292#endif
293
294static int
295setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
296 sigset_t *set, struct pt_regs *regs)
297{
298 int usig = signr_convert(sig);
299 int ret;
300
301 /* Set up the stack frame */
302 if (is_ia32) {
303 if (ka->sa.sa_flags & SA_SIGINFO)
304 ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
305 else
306 ret = ia32_setup_frame(usig, ka, set, regs);
307 } else
308 ret = __setup_rt_frame(sig, ka, info, set, regs);
309
310 if (ret) {
311 force_sigsegv(sig, current);
312 return -EFAULT;
313 }
314
315 return ret;
316}
317
318static int
319handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
320 sigset_t *oldset, struct pt_regs *regs)
321{
322 int ret;
323
324 /* Are we from a system call? */
325 if (syscall_get_nr(current, regs) >= 0) {
326 /* If so, check system call restarting.. */
327 switch (syscall_get_error(current, regs)) {
328 case -ERESTART_RESTARTBLOCK:
329 case -ERESTARTNOHAND:
330 regs->ax = -EINTR;
331 break;
332
333 case -ERESTARTSYS:
334 if (!(ka->sa.sa_flags & SA_RESTART)) {
335 regs->ax = -EINTR;
336 break;
337 }
338 /* fallthrough */
339 case -ERESTARTNOINTR:
340 regs->ax = regs->orig_ax;
341 regs->ip -= 2;
342 break;
343 }
344 }
345
346 /*
347 * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF
348 * flag so that register information in the sigcontext is correct.
349 */
350 if (unlikely(regs->flags & X86_EFLAGS_TF) &&
351 likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
352 regs->flags &= ~X86_EFLAGS_TF;
353
354 ret = setup_rt_frame(sig, ka, info, oldset, regs);
355
356 if (ret)
357 return ret;
358
359#ifdef CONFIG_X86_64
360 /*
361 * This has nothing to do with segment registers,
362 * despite the name. This magic affects uaccess.h
363 * macros' behavior. Reset it to the normal setting.
364 */
365 set_fs(USER_DS);
366#endif
367
368 /*
369 * Clear the direction flag as per the ABI for function entry.
370 */
371 regs->flags &= ~X86_EFLAGS_DF;
372
373 /*
374 * Clear TF when entering the signal handler, but
375 * notify any tracer that was single-stepping it.
376 * The tracer may want to single-step inside the
377 * handler too.
378 */
379 regs->flags &= ~X86_EFLAGS_TF;
380
381 spin_lock_irq(&current->sighand->siglock);
382 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
383 if (!(ka->sa.sa_flags & SA_NODEFER))
384 sigaddset(&current->blocked, sig);
385 recalc_sigpending();
386 spin_unlock_irq(&current->sighand->siglock);
387
388 tracehook_signal_handler(sig, info, ka, regs,
389 test_thread_flag(TIF_SINGLESTEP));
390
391 return 0;
392}
393
394#define NR_restart_syscall \
395 test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall
396/*
397 * Note that 'init' is a special process: it doesn't get signals it doesn't
398 * want to handle. Thus you cannot kill init even with a SIGKILL even by
399 * mistake.
400 */
401static void do_signal(struct pt_regs *regs)
402{
403 struct k_sigaction ka;
404 siginfo_t info;
405 int signr;
406 sigset_t *oldset;
407
408 /*
409 * We want the common case to go fast, which is why we may in certain
410 * cases get here from kernel mode. Just return without doing anything
411 * if so.
412 * X86_32: vm86 regs switched out by assembly code before reaching
413 * here, so testing against kernel CS suffices.
414 */
415 if (!user_mode(regs))
416 return;
417
418 if (current_thread_info()->status & TS_RESTORE_SIGMASK)
419 oldset = &current->saved_sigmask;
420 else
421 oldset = &current->blocked;
422
423 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
424 if (signr > 0) {
425 /*
426 * Re-enable any watchpoints before delivering the
427 * signal to user space. The processor register will
428 * have been cleared if the watchpoint triggered
429 * inside the kernel.
430 */
431 if (current->thread.debugreg7)
432 set_debugreg(current->thread.debugreg7, 7);
433
434 /* Whee! Actually deliver the signal. */
435 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
436 /*
437 * A signal was successfully delivered; the saved
438 * sigmask will have been stored in the signal frame,
439 * and will be restored by sigreturn, so we can simply
440 * clear the TS_RESTORE_SIGMASK flag.
441 */
442 current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
443 }
444 return;
445 }
446
447 /* Did we come from a system call? */
448 if (syscall_get_nr(current, regs) >= 0) {
449 /* Restart the system call - no handlers present */
450 switch (syscall_get_error(current, regs)) {
451 case -ERESTARTNOHAND:
452 case -ERESTARTSYS:
453 case -ERESTARTNOINTR:
454 regs->ax = regs->orig_ax;
455 regs->ip -= 2;
456 break;
457
458 case -ERESTART_RESTARTBLOCK:
459 regs->ax = NR_restart_syscall;
460 regs->ip -= 2;
461 break;
462 }
463 }
464
465 /*
466 * If there's no signal to deliver, we just put the saved sigmask
467 * back.
468 */
469 if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
470 current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
471 sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
472 }
473}
474
475/*
476 * notification of userspace execution resumption
477 * - triggered by the TIF_WORK_MASK flags
478 */
479void
480do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
481{
482#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
483 /* notify userspace of pending MCEs */
484 if (thread_info_flags & _TIF_MCE_NOTIFY)
485 mce_notify_user();
486#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
487
488 /* deal with pending signal delivery */
489 if (thread_info_flags & _TIF_SIGPENDING)
490 do_signal(regs);
491
492 if (thread_info_flags & _TIF_NOTIFY_RESUME) {
493 clear_thread_flag(TIF_NOTIFY_RESUME);
494 tracehook_notify_resume(regs);
495 }
496
497#ifdef CONFIG_X86_32
498 clear_thread_flag(TIF_IRET);
499#endif /* CONFIG_X86_32 */
500}
501
502void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
503{
504 struct task_struct *me = current;
505
506 if (show_unhandled_signals && printk_ratelimit()) {
507 printk(KERN_INFO
508 "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
509 me->comm, me->pid, where, frame,
510 regs->ip, regs->sp, regs->orig_ax);
511 print_vma_addr(" in ", regs->ip);
512 printk(KERN_CONT "\n");
513 }
514
515 force_sig(SIGSEGV, me);
516}
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 49ed667b06f3..beea2649a240 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -165,11 +165,7 @@ static void native_smp_send_stop(void)
165void smp_reschedule_interrupt(struct pt_regs *regs) 165void smp_reschedule_interrupt(struct pt_regs *regs)
166{ 166{
167 ack_APIC_irq(); 167 ack_APIC_irq();
168#ifdef CONFIG_X86_32 168 inc_irq_stat(irq_resched_count);
169 __get_cpu_var(irq_stat).irq_resched_count++;
170#else
171 add_pda(irq_resched_count, 1);
172#endif
173} 169}
174 170
175void smp_call_function_interrupt(struct pt_regs *regs) 171void smp_call_function_interrupt(struct pt_regs *regs)
@@ -177,11 +173,7 @@ void smp_call_function_interrupt(struct pt_regs *regs)
177 ack_APIC_irq(); 173 ack_APIC_irq();
178 irq_enter(); 174 irq_enter();
179 generic_smp_call_function_interrupt(); 175 generic_smp_call_function_interrupt();
180#ifdef CONFIG_X86_32 176 inc_irq_stat(irq_call_count);
181 __get_cpu_var(irq_stat).irq_call_count++;
182#else
183 add_pda(irq_call_count, 1);
184#endif
185 irq_exit(); 177 irq_exit();
186} 178}
187 179
@@ -190,11 +182,7 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
190 ack_APIC_irq(); 182 ack_APIC_irq();
191 irq_enter(); 183 irq_enter();
192 generic_smp_call_function_single_interrupt(); 184 generic_smp_call_function_single_interrupt();
193#ifdef CONFIG_X86_32 185 inc_irq_stat(irq_call_count);
194 __get_cpu_var(irq_stat).irq_call_count++;
195#else
196 add_pda(irq_call_count, 1);
197#endif
198 irq_exit(); 186 irq_exit();
199} 187}
200 188
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 1a9941b11150..9e177a4077ee 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -282,7 +282,7 @@ static int __cpuinitdata unsafe_smp;
282/* 282/*
283 * Activate a secondary processor. 283 * Activate a secondary processor.
284 */ 284 */
285static void __cpuinit start_secondary(void *unused) 285notrace static void __cpuinit start_secondary(void *unused)
286{ 286{
287 /* 287 /*
288 * Don't put *anything* before cpu_init(), SMP booting is too 288 * Don't put *anything* before cpu_init(), SMP booting is too
@@ -496,7 +496,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
496} 496}
497 497
498/* maps the cpu to the sched domain representing multi-core */ 498/* maps the cpu to the sched domain representing multi-core */
499cpumask_t cpu_coregroup_map(int cpu) 499const struct cpumask *cpu_coregroup_mask(int cpu)
500{ 500{
501 struct cpuinfo_x86 *c = &cpu_data(cpu); 501 struct cpuinfo_x86 *c = &cpu_data(cpu);
502 /* 502 /*
@@ -504,9 +504,14 @@ cpumask_t cpu_coregroup_map(int cpu)
504 * And for power savings, we return cpu_core_map 504 * And for power savings, we return cpu_core_map
505 */ 505 */
506 if (sched_mc_power_savings || sched_smt_power_savings) 506 if (sched_mc_power_savings || sched_smt_power_savings)
507 return per_cpu(cpu_core_map, cpu); 507 return &per_cpu(cpu_core_map, cpu);
508 else 508 else
509 return c->llc_shared_map; 509 return &c->llc_shared_map;
510}
511
512cpumask_t cpu_coregroup_map(int cpu)
513{
514 return *cpu_coregroup_mask(cpu);
510} 515}
511 516
512static void impress_friends(void) 517static void impress_friends(void)
@@ -1075,8 +1080,10 @@ static int __init smp_sanity_check(unsigned max_cpus)
1075#endif 1080#endif
1076 1081
1077 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { 1082 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
1078 printk(KERN_WARNING "weird, boot CPU (#%d) not listed" 1083 printk(KERN_WARNING
1079 "by the BIOS.\n", hard_smp_processor_id()); 1084 "weird, boot CPU (#%d) not listed by the BIOS.\n",
1085 hard_smp_processor_id());
1086
1080 physid_set(hard_smp_processor_id(), phys_cpu_present_map); 1087 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
1081 } 1088 }
1082 1089
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index 77b400f06ea2..65309e4cb1c0 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -75,7 +75,7 @@ EXPORT_SYMBOL(profile_pc);
75irqreturn_t timer_interrupt(int irq, void *dev_id) 75irqreturn_t timer_interrupt(int irq, void *dev_id)
76{ 76{
77 /* Keep nmi watchdog up to date */ 77 /* Keep nmi watchdog up to date */
78 per_cpu(irq_stat, smp_processor_id()).irq0_irqs++; 78 inc_irq_stat(irq0_irqs);
79 79
80#ifdef CONFIG_X86_IO_APIC 80#ifdef CONFIG_X86_IO_APIC
81 if (timer_ack) { 81 if (timer_ack) {
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index cb19d650c216..891e7a7c4334 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -49,9 +49,9 @@ unsigned long profile_pc(struct pt_regs *regs)
49} 49}
50EXPORT_SYMBOL(profile_pc); 50EXPORT_SYMBOL(profile_pc);
51 51
52irqreturn_t timer_interrupt(int irq, void *dev_id) 52static irqreturn_t timer_interrupt(int irq, void *dev_id)
53{ 53{
54 add_pda(irq0_irqs, 1); 54 inc_irq_stat(irq0_irqs);
55 55
56 global_clock_event->event_handler(global_clock_event); 56 global_clock_event->event_handler(global_clock_event);
57 57
@@ -80,6 +80,8 @@ unsigned long __init calibrate_cpu(void)
80 break; 80 break;
81 no_ctr_free = (i == 4); 81 no_ctr_free = (i == 4);
82 if (no_ctr_free) { 82 if (no_ctr_free) {
83 WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... "
84 "cpu_khz value may be incorrect.\n");
83 i = 3; 85 i = 3;
84 rdmsrl(MSR_K7_EVNTSEL3, evntsel3); 86 rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
85 wrmsrl(MSR_K7_EVNTSEL3, 0); 87 wrmsrl(MSR_K7_EVNTSEL3, 0);
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
index 174ea90d1cbd..ce5054642247 100644
--- a/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@ -34,9 +34,8 @@ static DEFINE_SPINLOCK(tlbstate_lock);
34 */ 34 */
35void leave_mm(int cpu) 35void leave_mm(int cpu)
36{ 36{
37 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) 37 BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK);
38 BUG(); 38 cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask);
39 cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
40 load_cr3(swapper_pg_dir); 39 load_cr3(swapper_pg_dir);
41} 40}
42EXPORT_SYMBOL_GPL(leave_mm); 41EXPORT_SYMBOL_GPL(leave_mm);
@@ -104,8 +103,8 @@ void smp_invalidate_interrupt(struct pt_regs *regs)
104 * BUG(); 103 * BUG();
105 */ 104 */
106 105
107 if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) { 106 if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) {
108 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) { 107 if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) {
109 if (flush_va == TLB_FLUSH_ALL) 108 if (flush_va == TLB_FLUSH_ALL)
110 local_flush_tlb(); 109 local_flush_tlb();
111 else 110 else
@@ -119,7 +118,7 @@ void smp_invalidate_interrupt(struct pt_regs *regs)
119 smp_mb__after_clear_bit(); 118 smp_mb__after_clear_bit();
120out: 119out:
121 put_cpu_no_resched(); 120 put_cpu_no_resched();
122 __get_cpu_var(irq_stat).irq_tlb_count++; 121 inc_irq_stat(irq_tlb_count);
123} 122}
124 123
125void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, 124void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
@@ -238,7 +237,7 @@ static void do_flush_tlb_all(void *info)
238 unsigned long cpu = smp_processor_id(); 237 unsigned long cpu = smp_processor_id();
239 238
240 __flush_tlb_all(); 239 __flush_tlb_all();
241 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY) 240 if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY)
242 leave_mm(cpu); 241 leave_mm(cpu);
243} 242}
244 243
diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c
index de6f1bda0c50..f8be6f1d2e48 100644
--- a/arch/x86/kernel/tlb_64.c
+++ b/arch/x86/kernel/tlb_64.c
@@ -154,7 +154,7 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
154out: 154out:
155 ack_APIC_irq(); 155 ack_APIC_irq();
156 cpu_clear(cpu, f->flush_cpumask); 156 cpu_clear(cpu, f->flush_cpumask);
157 add_pda(irq_tlb_count, 1); 157 inc_irq_stat(irq_tlb_count);
158} 158}
159 159
160void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, 160void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 04431f34fd16..6a00e5faaa74 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -566,14 +566,10 @@ static int __init uv_ptc_init(void)
566 if (!is_uv_system()) 566 if (!is_uv_system())
567 return 0; 567 return 0;
568 568
569 if (!proc_mkdir("sgi_uv", NULL))
570 return -EINVAL;
571
572 proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL); 569 proc_uv_ptc = create_proc_entry(UV_PTC_BASENAME, 0444, NULL);
573 if (!proc_uv_ptc) { 570 if (!proc_uv_ptc) {
574 printk(KERN_ERR "unable to create %s proc entry\n", 571 printk(KERN_ERR "unable to create %s proc entry\n",
575 UV_PTC_BASENAME); 572 UV_PTC_BASENAME);
576 remove_proc_entry("sgi_uv", NULL);
577 return -EINVAL; 573 return -EINVAL;
578 } 574 }
579 proc_uv_ptc->proc_fops = &proc_uv_ptc_operations; 575 proc_uv_ptc->proc_fops = &proc_uv_ptc_operations;
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index 1106fac6024d..808031a5ba19 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -1,10 +1,26 @@
1#include <linux/io.h> 1#include <linux/io.h>
2 2
3#include <asm/trampoline.h> 3#include <asm/trampoline.h>
4#include <asm/e820.h>
4 5
5/* ready for x86_64 and x86 */ 6/* ready for x86_64 and x86 */
6unsigned char *trampoline_base = __va(TRAMPOLINE_BASE); 7unsigned char *trampoline_base = __va(TRAMPOLINE_BASE);
7 8
9void __init reserve_trampoline_memory(void)
10{
11#ifdef CONFIG_X86_32
12 /*
13 * But first pinch a few for the stack/trampoline stuff
14 * FIXME: Don't need the extra page at 4K, but need to fix
15 * trampoline before removing it. (see the GDT stuff)
16 */
17 reserve_early(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE");
18#endif
19 /* Has to be in very low memory so we can execute real-mode AP code. */
20 reserve_early(TRAMPOLINE_BASE, TRAMPOLINE_BASE + TRAMPOLINE_SIZE,
21 "TRAMPOLINE");
22}
23
8/* 24/*
9 * Currently trivial. Write the real->protected mode 25 * Currently trivial. Write the real->protected mode
10 * bootstrap into the page concerned. The caller 26 * bootstrap into the page concerned. The caller
@@ -12,7 +28,6 @@ unsigned char *trampoline_base = __va(TRAMPOLINE_BASE);
12 */ 28 */
13unsigned long setup_trampoline(void) 29unsigned long setup_trampoline(void)
14{ 30{
15 memcpy(trampoline_base, trampoline_data, 31 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE);
16 trampoline_end - trampoline_data);
17 return virt_to_phys(trampoline_base); 32 return virt_to_phys(trampoline_base);
18} 33}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 4a6dff39a470..2d1f4c7e4052 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -481,11 +481,7 @@ do_nmi(struct pt_regs *regs, long error_code)
481{ 481{
482 nmi_enter(); 482 nmi_enter();
483 483
484#ifdef CONFIG_X86_32 484 inc_irq_stat(__nmi_count);
485 { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); }
486#else
487 add_pda(__nmi_count, 1);
488#endif
489 485
490 if (!ignore_nmis) 486 if (!ignore_nmis)
491 default_do_nmi(regs); 487 default_do_nmi(regs);
@@ -664,7 +660,7 @@ void math_error(void __user *ip)
664{ 660{
665 struct task_struct *task; 661 struct task_struct *task;
666 siginfo_t info; 662 siginfo_t info;
667 unsigned short cwd, swd; 663 unsigned short cwd, swd, err;
668 664
669 /* 665 /*
670 * Save the info for the exception handler and clear the error. 666 * Save the info for the exception handler and clear the error.
@@ -675,7 +671,6 @@ void math_error(void __user *ip)
675 task->thread.error_code = 0; 671 task->thread.error_code = 0;
676 info.si_signo = SIGFPE; 672 info.si_signo = SIGFPE;
677 info.si_errno = 0; 673 info.si_errno = 0;
678 info.si_code = __SI_FAULT;
679 info.si_addr = ip; 674 info.si_addr = ip;
680 /* 675 /*
681 * (~cwd & swd) will mask out exceptions that are not set to unmasked 676 * (~cwd & swd) will mask out exceptions that are not set to unmasked
@@ -689,34 +684,31 @@ void math_error(void __user *ip)
689 */ 684 */
690 cwd = get_fpu_cwd(task); 685 cwd = get_fpu_cwd(task);
691 swd = get_fpu_swd(task); 686 swd = get_fpu_swd(task);
692 switch (swd & ~cwd & 0x3f) { 687
693 case 0x000: /* No unmasked exception */ 688 err = swd & ~cwd & 0x3f;
689
694#ifdef CONFIG_X86_32 690#ifdef CONFIG_X86_32
691 if (!err)
695 return; 692 return;
696#endif 693#endif
697 default: /* Multiple exceptions */ 694
698 break; 695 if (err & 0x001) { /* Invalid op */
699 case 0x001: /* Invalid Op */
700 /* 696 /*
701 * swd & 0x240 == 0x040: Stack Underflow 697 * swd & 0x240 == 0x040: Stack Underflow
702 * swd & 0x240 == 0x240: Stack Overflow 698 * swd & 0x240 == 0x240: Stack Overflow
703 * User must clear the SF bit (0x40) if set 699 * User must clear the SF bit (0x40) if set
704 */ 700 */
705 info.si_code = FPE_FLTINV; 701 info.si_code = FPE_FLTINV;
706 break; 702 } else if (err & 0x004) { /* Divide by Zero */
707 case 0x002: /* Denormalize */
708 case 0x010: /* Underflow */
709 info.si_code = FPE_FLTUND;
710 break;
711 case 0x004: /* Zero Divide */
712 info.si_code = FPE_FLTDIV; 703 info.si_code = FPE_FLTDIV;
713 break; 704 } else if (err & 0x008) { /* Overflow */
714 case 0x008: /* Overflow */
715 info.si_code = FPE_FLTOVF; 705 info.si_code = FPE_FLTOVF;
716 break; 706 } else if (err & 0x012) { /* Denormal, Underflow */
717 case 0x020: /* Precision */ 707 info.si_code = FPE_FLTUND;
708 } else if (err & 0x020) { /* Precision */
718 info.si_code = FPE_FLTRES; 709 info.si_code = FPE_FLTRES;
719 break; 710 } else {
711 info.si_code = __SI_FAULT|SI_KERNEL; /* WTF? */
720 } 712 }
721 force_sig_info(SIGFPE, &info, task); 713 force_sig_info(SIGFPE, &info, task);
722} 714}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 424093b157d3..599e58168631 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -15,6 +15,7 @@
15#include <asm/vgtod.h> 15#include <asm/vgtod.h>
16#include <asm/time.h> 16#include <asm/time.h>
17#include <asm/delay.h> 17#include <asm/delay.h>
18#include <asm/hypervisor.h>
18 19
19unsigned int cpu_khz; /* TSC clocks / usec, not used here */ 20unsigned int cpu_khz; /* TSC clocks / usec, not used here */
20EXPORT_SYMBOL(cpu_khz); 21EXPORT_SYMBOL(cpu_khz);
@@ -31,6 +32,7 @@ static int tsc_unstable;
31 erroneous rdtsc usage on !cpu_has_tsc processors */ 32 erroneous rdtsc usage on !cpu_has_tsc processors */
32static int tsc_disabled = -1; 33static int tsc_disabled = -1;
33 34
35static int tsc_clocksource_reliable;
34/* 36/*
35 * Scheduler clock - returns current time in nanosec units. 37 * Scheduler clock - returns current time in nanosec units.
36 */ 38 */
@@ -98,6 +100,15 @@ int __init notsc_setup(char *str)
98 100
99__setup("notsc", notsc_setup); 101__setup("notsc", notsc_setup);
100 102
103static int __init tsc_setup(char *str)
104{
105 if (!strcmp(str, "reliable"))
106 tsc_clocksource_reliable = 1;
107 return 1;
108}
109
110__setup("tsc=", tsc_setup);
111
101#define MAX_RETRIES 5 112#define MAX_RETRIES 5
102#define SMI_TRESHOLD 50000 113#define SMI_TRESHOLD 50000
103 114
@@ -352,9 +363,15 @@ unsigned long native_calibrate_tsc(void)
352{ 363{
353 u64 tsc1, tsc2, delta, ref1, ref2; 364 u64 tsc1, tsc2, delta, ref1, ref2;
354 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; 365 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
355 unsigned long flags, latch, ms, fast_calibrate; 366 unsigned long flags, latch, ms, fast_calibrate, tsc_khz;
356 int hpet = is_hpet_enabled(), i, loopmin; 367 int hpet = is_hpet_enabled(), i, loopmin;
357 368
369 tsc_khz = get_hypervisor_tsc_freq();
370 if (tsc_khz) {
371 printk(KERN_INFO "TSC: Frequency read from the hypervisor\n");
372 return tsc_khz;
373 }
374
358 local_irq_save(flags); 375 local_irq_save(flags);
359 fast_calibrate = quick_pit_calibrate(); 376 fast_calibrate = quick_pit_calibrate();
360 local_irq_restore(flags); 377 local_irq_restore(flags);
@@ -731,24 +748,21 @@ static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
731 {} 748 {}
732}; 749};
733 750
734/* 751static void __init check_system_tsc_reliable(void)
735 * Geode_LX - the OLPC CPU has a possibly a very reliable TSC 752{
736 */
737#ifdef CONFIG_MGEODE_LX 753#ifdef CONFIG_MGEODE_LX
738/* RTSC counts during suspend */ 754 /* RTSC counts during suspend */
739#define RTSC_SUSP 0x100 755#define RTSC_SUSP 0x100
740
741static void __init check_geode_tsc_reliable(void)
742{
743 unsigned long res_low, res_high; 756 unsigned long res_low, res_high;
744 757
745 rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); 758 rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
759 /* Geode_LX - the OLPC CPU has a possibly a very reliable TSC */
746 if (res_low & RTSC_SUSP) 760 if (res_low & RTSC_SUSP)
747 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; 761 tsc_clocksource_reliable = 1;
748}
749#else
750static inline void check_geode_tsc_reliable(void) { }
751#endif 762#endif
763 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
764 tsc_clocksource_reliable = 1;
765}
752 766
753/* 767/*
754 * Make an educated guess if the TSC is trustworthy and synchronized 768 * Make an educated guess if the TSC is trustworthy and synchronized
@@ -783,6 +797,8 @@ static void __init init_tsc_clocksource(void)
783{ 797{
784 clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, 798 clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
785 clocksource_tsc.shift); 799 clocksource_tsc.shift);
800 if (tsc_clocksource_reliable)
801 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
786 /* lower the rating if we already know its unstable: */ 802 /* lower the rating if we already know its unstable: */
787 if (check_tsc_unstable()) { 803 if (check_tsc_unstable()) {
788 clocksource_tsc.rating = 0; 804 clocksource_tsc.rating = 0;
@@ -843,7 +859,7 @@ void __init tsc_init(void)
843 if (unsynchronized_tsc()) 859 if (unsynchronized_tsc())
844 mark_tsc_unstable("TSCs unsynchronized"); 860 mark_tsc_unstable("TSCs unsynchronized");
845 861
846 check_geode_tsc_reliable(); 862 check_system_tsc_reliable();
847 init_tsc_clocksource(); 863 init_tsc_clocksource();
848} 864}
849 865
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 1c0dfbca87c1..bf36328f6ef9 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -112,6 +112,12 @@ void __cpuinit check_tsc_sync_source(int cpu)
112 if (unsynchronized_tsc()) 112 if (unsynchronized_tsc())
113 return; 113 return;
114 114
115 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
116 printk(KERN_INFO
117 "Skipping synchronization checks as TSC is reliable.\n");
118 return;
119 }
120
115 printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", 121 printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:",
116 smp_processor_id(), cpu); 122 smp_processor_id(), cpu);
117 123
@@ -165,7 +171,7 @@ void __cpuinit check_tsc_sync_target(void)
165{ 171{
166 int cpus = 2; 172 int cpus = 2;
167 173
168 if (unsynchronized_tsc()) 174 if (unsynchronized_tsc() || boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
169 return; 175 return;
170 176
171 /* 177 /*
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 22fd6577156a..23206ba16874 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -266,109 +266,6 @@ static void vmi_nop(void)
266{ 266{
267} 267}
268 268
269#ifdef CONFIG_DEBUG_PAGE_TYPE
270
271#ifdef CONFIG_X86_PAE
272#define MAX_BOOT_PTS (2048+4+1)
273#else
274#define MAX_BOOT_PTS (1024+1)
275#endif
276
277/*
278 * During boot, mem_map is not yet available in paging_init, so stash
279 * all the boot page allocations here.
280 */
281static struct {
282 u32 pfn;
283 int type;
284} boot_page_allocations[MAX_BOOT_PTS];
285static int num_boot_page_allocations;
286static int boot_allocations_applied;
287
288void vmi_apply_boot_page_allocations(void)
289{
290 int i;
291 BUG_ON(!mem_map);
292 for (i = 0; i < num_boot_page_allocations; i++) {
293 struct page *page = pfn_to_page(boot_page_allocations[i].pfn);
294 page->type = boot_page_allocations[i].type;
295 page->type = boot_page_allocations[i].type &
296 ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
297 }
298 boot_allocations_applied = 1;
299}
300
301static void record_page_type(u32 pfn, int type)
302{
303 BUG_ON(num_boot_page_allocations >= MAX_BOOT_PTS);
304 boot_page_allocations[num_boot_page_allocations].pfn = pfn;
305 boot_page_allocations[num_boot_page_allocations].type = type;
306 num_boot_page_allocations++;
307}
308
309static void check_zeroed_page(u32 pfn, int type, struct page *page)
310{
311 u32 *ptr;
312 int i;
313 int limit = PAGE_SIZE / sizeof(int);
314
315 if (page_address(page))
316 ptr = (u32 *)page_address(page);
317 else
318 ptr = (u32 *)__va(pfn << PAGE_SHIFT);
319 /*
320 * When cloning the root in non-PAE mode, only the userspace
321 * pdes need to be zeroed.
322 */
323 if (type & VMI_PAGE_CLONE)
324 limit = KERNEL_PGD_BOUNDARY;
325 for (i = 0; i < limit; i++)
326 BUG_ON(ptr[i]);
327}
328
329/*
330 * We stash the page type into struct page so we can verify the page
331 * types are used properly.
332 */
333static void vmi_set_page_type(u32 pfn, int type)
334{
335 /* PAE can have multiple roots per page - don't track */
336 if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
337 return;
338
339 if (boot_allocations_applied) {
340 struct page *page = pfn_to_page(pfn);
341 if (type != VMI_PAGE_NORMAL)
342 BUG_ON(page->type);
343 else
344 BUG_ON(page->type == VMI_PAGE_NORMAL);
345 page->type = type & ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
346 if (type & VMI_PAGE_ZEROED)
347 check_zeroed_page(pfn, type, page);
348 } else {
349 record_page_type(pfn, type);
350 }
351}
352
353static void vmi_check_page_type(u32 pfn, int type)
354{
355 /* PAE can have multiple roots per page - skip checks */
356 if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
357 return;
358
359 type &= ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
360 if (boot_allocations_applied) {
361 struct page *page = pfn_to_page(pfn);
362 BUG_ON((page->type ^ type) & VMI_PAGE_PAE);
363 BUG_ON(type == VMI_PAGE_NORMAL && page->type);
364 BUG_ON((type & page->type) == 0);
365 }
366}
367#else
368#define vmi_set_page_type(p,t) do { } while (0)
369#define vmi_check_page_type(p,t) do { } while (0)
370#endif
371
372#ifdef CONFIG_HIGHPTE 269#ifdef CONFIG_HIGHPTE
373static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) 270static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
374{ 271{
@@ -395,7 +292,6 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
395 292
396static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn) 293static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
397{ 294{
398 vmi_set_page_type(pfn, VMI_PAGE_L1);
399 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); 295 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
400} 296}
401 297
@@ -406,27 +302,22 @@ static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
406 * It is called only for swapper_pg_dir, which already has 302 * It is called only for swapper_pg_dir, which already has
407 * data on it. 303 * data on it.
408 */ 304 */
409 vmi_set_page_type(pfn, VMI_PAGE_L2);
410 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); 305 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
411} 306}
412 307
413static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count) 308static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
414{ 309{
415 vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
416 vmi_check_page_type(clonepfn, VMI_PAGE_L2);
417 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); 310 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
418} 311}
419 312
420static void vmi_release_pte(unsigned long pfn) 313static void vmi_release_pte(unsigned long pfn)
421{ 314{
422 vmi_ops.release_page(pfn, VMI_PAGE_L1); 315 vmi_ops.release_page(pfn, VMI_PAGE_L1);
423 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
424} 316}
425 317
426static void vmi_release_pmd(unsigned long pfn) 318static void vmi_release_pmd(unsigned long pfn)
427{ 319{
428 vmi_ops.release_page(pfn, VMI_PAGE_L2); 320 vmi_ops.release_page(pfn, VMI_PAGE_L2);
429 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
430} 321}
431 322
432/* 323/*
@@ -450,26 +341,22 @@ static void vmi_release_pmd(unsigned long pfn)
450 341
451static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 342static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
452{ 343{
453 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
454 vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); 344 vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
455} 345}
456 346
457static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 347static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
458{ 348{
459 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
460 vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0)); 349 vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
461} 350}
462 351
463static void vmi_set_pte(pte_t *ptep, pte_t pte) 352static void vmi_set_pte(pte_t *ptep, pte_t pte)
464{ 353{
465 /* XXX because of set_pmd_pte, this can be called on PT or PD layers */ 354 /* XXX because of set_pmd_pte, this can be called on PT or PD layers */
466 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE | VMI_PAGE_PD);
467 vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT); 355 vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
468} 356}
469 357
470static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) 358static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
471{ 359{
472 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
473 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); 360 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
474} 361}
475 362
@@ -477,10 +364,8 @@ static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
477{ 364{
478#ifdef CONFIG_X86_PAE 365#ifdef CONFIG_X86_PAE
479 const pte_t pte = { .pte = pmdval.pmd }; 366 const pte_t pte = { .pte = pmdval.pmd };
480 vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD);
481#else 367#else
482 const pte_t pte = { pmdval.pud.pgd.pgd }; 368 const pte_t pte = { pmdval.pud.pgd.pgd };
483 vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PGD);
484#endif 369#endif
485 vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD); 370 vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
486} 371}
@@ -502,7 +387,6 @@ static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
502 387
503static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) 388static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
504{ 389{
505 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
506 vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1)); 390 vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1));
507} 391}
508 392
@@ -510,21 +394,18 @@ static void vmi_set_pud(pud_t *pudp, pud_t pudval)
510{ 394{
511 /* Um, eww */ 395 /* Um, eww */
512 const pte_t pte = { .pte = pudval.pgd.pgd }; 396 const pte_t pte = { .pte = pudval.pgd.pgd };
513 vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD);
514 vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP); 397 vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
515} 398}
516 399
517static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 400static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
518{ 401{
519 const pte_t pte = { .pte = 0 }; 402 const pte_t pte = { .pte = 0 };
520 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
521 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); 403 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
522} 404}
523 405
524static void vmi_pmd_clear(pmd_t *pmd) 406static void vmi_pmd_clear(pmd_t *pmd)
525{ 407{
526 const pte_t pte = { .pte = 0 }; 408 const pte_t pte = { .pte = 0 };
527 vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD);
528 vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD); 409 vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
529} 410}
530#endif 411#endif
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index a9b8560adbc2..82c67559dde7 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -44,6 +44,7 @@ SECTIONS
44 SCHED_TEXT 44 SCHED_TEXT
45 LOCK_TEXT 45 LOCK_TEXT
46 KPROBES_TEXT 46 KPROBES_TEXT
47 IRQENTRY_TEXT
47 *(.fixup) 48 *(.fixup)
48 *(.gnu.warning) 49 *(.gnu.warning)
49 _etext = .; /* End of text section */ 50 _etext = .; /* End of text section */
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 46e05447405b..1a614c0e6bef 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -35,6 +35,7 @@ SECTIONS
35 SCHED_TEXT 35 SCHED_TEXT
36 LOCK_TEXT 36 LOCK_TEXT
37 KPROBES_TEXT 37 KPROBES_TEXT
38 IRQENTRY_TEXT
38 *(.fixup) 39 *(.fixup)
39 *(.gnu.warning) 40 *(.gnu.warning)
40 _etext = .; /* End of text section */ 41 _etext = .; /* End of text section */
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 6f3d3d4cd973..44153afc9067 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -131,7 +131,16 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
131 gettimeofday(tv,NULL); 131 gettimeofday(tv,NULL);
132 return; 132 return;
133 } 133 }
134
135 /*
136 * Surround the RDTSC by barriers, to make sure it's not
137 * speculated to outside the seqlock critical section and
138 * does not cause time warps:
139 */
140 rdtsc_barrier();
134 now = vread(); 141 now = vread();
142 rdtsc_barrier();
143
135 base = __vsyscall_gtod_data.clock.cycle_last; 144 base = __vsyscall_gtod_data.clock.cycle_last;
136 mask = __vsyscall_gtod_data.clock.mask; 145 mask = __vsyscall_gtod_data.clock.mask;
137 mult = __vsyscall_gtod_data.clock.mult; 146 mult = __vsyscall_gtod_data.clock.mult;