Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile | 98
-rw-r--r--  arch/x86/kernel/Makefile_32 | 88
-rw-r--r--  arch/x86/kernel/Makefile_64 | 45
-rw-r--r--  arch/x86/kernel/acpi/Makefile | 2
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 48
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 87
-rw-r--r--  arch/x86/kernel/acpi/sleep_32.c | 70
-rw-r--r--  arch/x86/kernel/acpi/sleep_64.c | 117
-rw-r--r--  arch/x86/kernel/acpi/wakeup_32.S | 2
-rw-r--r--  arch/x86/kernel/acpi/wakeup_64.S | 32
-rw-r--r--  arch/x86/kernel/alternative.c | 40
-rw-r--r--  arch/x86/kernel/aperture_64.c | 374
-rw-r--r--  arch/x86/kernel/apic_32.c | 158
-rw-r--r--  arch/x86/kernel/apic_64.c | 1259
-rw-r--r--  arch/x86/kernel/apm_32.c | 389
-rw-r--r--  arch/x86/kernel/asm-offsets_32.c | 65
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c | 56
-rw-r--r--  arch/x86/kernel/bootflag.c | 50
-rw-r--r--  arch/x86/kernel/bugs_64.c | 1
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 1
-rw-r--r--  arch/x86/kernel/cpu/addon_cpuid_features.c | 2
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 23
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 5
-rw-r--r--  arch/x86/kernel/cpu/common.c | 214
-rw-r--r--  arch/x86/kernel/cpu/cpu.h | 14
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 25
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longhaul.c | 2
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 13
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-lib.c | 5
-rw-r--r--  arch/x86/kernel/cpu/cyrix.c | 8
-rw-r--r--  arch/x86/kernel/cpu/feature_names.c | 83
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 40
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 30
-rw-r--r--  arch/x86/kernel/cpu/mcheck/k7.c | 25
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.h | 2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_32.c | 4
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_64.c | 47
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 49
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p4.c | 35
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p5.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p6.c | 23
-rw-r--r--  arch/x86/kernel/cpu/mcheck/winchip.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/amd.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cyrix.c | 110
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 27
-rw-r--r--  arch/x86/kernel/cpu/mtrr/if.c | 23
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c | 170
-rw-r--r--  arch/x86/kernel/cpu/mtrr/mtrr.h | 13
-rw-r--r--  arch/x86/kernel/cpu/mtrr/state.c | 3
-rw-r--r--  arch/x86/kernel/cpu/perfctr-watchdog.c | 1
-rw-r--r--  arch/x86/kernel/cpu/proc.c | 76
-rw-r--r--  arch/x86/kernel/cpuid.c | 60
-rw-r--r--  arch/x86/kernel/doublefault_32.c | 19
-rw-r--r--  arch/x86/kernel/ds.c | 464
-rw-r--r--  arch/x86/kernel/e820_32.c | 241
-rw-r--r--  arch/x86/kernel/e820_64.c | 437
-rw-r--r--  arch/x86/kernel/early-quirks.c | 127
-rw-r--r--  arch/x86/kernel/early_printk.c | 2
-rw-r--r--  arch/x86/kernel/efi.c | 515
-rw-r--r--  arch/x86/kernel/efi_32.c | 618
-rw-r--r--  arch/x86/kernel/efi_64.c | 134
-rw-r--r--  arch/x86/kernel/efi_stub_64.S | 109
-rw-r--r--  arch/x86/kernel/entry_32.S | 26
-rw-r--r--  arch/x86/kernel/entry_64.S | 127
-rw-r--r--  arch/x86/kernel/genapic_64.c | 15
-rw-r--r--  arch/x86/kernel/geode_32.c | 48
-rw-r--r--  arch/x86/kernel/head64.c | 63
-rw-r--r--  arch/x86/kernel/head_32.S | 17
-rw-r--r--  arch/x86/kernel/head_64.S | 67
-rw-r--r--  arch/x86/kernel/hpet.c | 62
-rw-r--r--  arch/x86/kernel/i386_ksyms_32.c | 7
-rw-r--r--  arch/x86/kernel/i387.c | 479
-rw-r--r--  arch/x86/kernel/i387_32.c | 544
-rw-r--r--  arch/x86/kernel/i387_64.c | 150
-rw-r--r--  arch/x86/kernel/i8237.c | 2
-rw-r--r--  arch/x86/kernel/i8253.c | 72
-rw-r--r--  arch/x86/kernel/i8259_32.c | 26
-rw-r--r--  arch/x86/kernel/i8259_64.c | 162
-rw-r--r--  arch/x86/kernel/init_task.c | 1
-rw-r--r--  arch/x86/kernel/io_apic_32.c | 27
-rw-r--r--  arch/x86/kernel/io_apic_64.c | 114
-rw-r--r--  arch/x86/kernel/io_delay.c | 114
-rw-r--r--  arch/x86/kernel/ioport.c (renamed from arch/x86/kernel/ioport_32.c) | 85
-rw-r--r--  arch/x86/kernel/ioport_64.c | 117
-rw-r--r--  arch/x86/kernel/irq_32.c | 22
-rw-r--r--  arch/x86/kernel/irq_64.c | 30
-rw-r--r--  arch/x86/kernel/kdebugfs.c | 65
-rw-r--r--  arch/x86/kernel/kprobes.c | 1066
-rw-r--r--  arch/x86/kernel/kprobes_32.c | 756
-rw-r--r--  arch/x86/kernel/kprobes_64.c | 749
-rw-r--r--  arch/x86/kernel/ldt.c (renamed from arch/x86/kernel/ldt_32.c) | 113
-rw-r--r--  arch/x86/kernel/ldt_64.c | 250
-rw-r--r--  arch/x86/kernel/machine_kexec_32.c | 4
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c | 5
-rw-r--r--  arch/x86/kernel/mfgpt_32.c | 30
-rw-r--r--  arch/x86/kernel/microcode.c | 30
-rw-r--r--  arch/x86/kernel/mpparse_32.c | 39
-rw-r--r--  arch/x86/kernel/mpparse_64.c | 28
-rw-r--r--  arch/x86/kernel/msr.c | 22
-rw-r--r--  arch/x86/kernel/nmi_32.c | 25
-rw-r--r--  arch/x86/kernel/nmi_64.c | 101
-rw-r--r--  arch/x86/kernel/numaq_32.c | 2
-rw-r--r--  arch/x86/kernel/paravirt.c (renamed from arch/x86/kernel/paravirt_32.c) | 96
-rw-r--r--  arch/x86/kernel/paravirt_patch_32.c | 49
-rw-r--r--  arch/x86/kernel/paravirt_patch_64.c | 57
-rw-r--r--  arch/x86/kernel/pci-calgary_64.c | 43
-rw-r--r--  arch/x86/kernel/pci-dma_64.c | 3
-rw-r--r--  arch/x86/kernel/pci-gart_64.c | 549
-rw-r--r--  arch/x86/kernel/pci-swiotlb_64.c | 1
-rw-r--r--  arch/x86/kernel/pmtimer_64.c | 4
-rw-r--r--  arch/x86/kernel/process_32.c | 430
-rw-r--r--  arch/x86/kernel/process_64.c | 331
-rw-r--r--  arch/x86/kernel/ptrace.c | 1566
-rw-r--r--  arch/x86/kernel/ptrace_32.c | 717
-rw-r--r--  arch/x86/kernel/ptrace_64.c | 621
-rw-r--r--  arch/x86/kernel/quirks.c | 71
-rw-r--r--  arch/x86/kernel/reboot.c (renamed from arch/x86/kernel/reboot_32.c) | 284
-rw-r--r--  arch/x86/kernel/reboot_64.c | 176
-rw-r--r--  arch/x86/kernel/reboot_fixups_32.c | 14
-rw-r--r--  arch/x86/kernel/rtc.c | 204
-rw-r--r--  arch/x86/kernel/scx200_32.c | 2
-rw-r--r--  arch/x86/kernel/setup64.c | 59
-rw-r--r--  arch/x86/kernel/setup_32.c | 285
-rw-r--r--  arch/x86/kernel/setup_64.c | 624
-rw-r--r--  arch/x86/kernel/signal_32.c | 228
-rw-r--r--  arch/x86/kernel/signal_64.c | 136
-rw-r--r--  arch/x86/kernel/smp_32.c | 15
-rw-r--r--  arch/x86/kernel/smp_64.c | 91
-rw-r--r--  arch/x86/kernel/smpboot_32.c | 63
-rw-r--r--  arch/x86/kernel/smpboot_64.c | 81
-rw-r--r--  arch/x86/kernel/smpcommon_32.c | 7
-rw-r--r--  arch/x86/kernel/srat_32.c | 10
-rw-r--r--  arch/x86/kernel/stacktrace.c | 33
-rw-r--r--  arch/x86/kernel/step.c | 203
-rw-r--r--  arch/x86/kernel/suspend_64.c | 38
-rw-r--r--  arch/x86/kernel/suspend_asm_64.S | 32
-rw-r--r--  arch/x86/kernel/sys_x86_64.c | 98
-rw-r--r--  arch/x86/kernel/syscall_table_32.S | 4
-rw-r--r--  arch/x86/kernel/sysenter_32.c | 346
-rw-r--r--  arch/x86/kernel/test_nx.c | 173
-rw-r--r--  arch/x86/kernel/test_rodata.c | 86
-rw-r--r--  arch/x86/kernel/time_32.c | 114
-rw-r--r--  arch/x86/kernel/time_64.c | 187
-rw-r--r--  arch/x86/kernel/tls.c | 213
-rw-r--r--  arch/x86/kernel/tls.h | 21
-rw-r--r--  arch/x86/kernel/topology.c | 22
-rw-r--r--  arch/x86/kernel/trampoline_32.S | 7
-rw-r--r--  arch/x86/kernel/trampoline_64.S | 3
-rw-r--r--  arch/x86/kernel/traps_32.c | 357
-rw-r--r--  arch/x86/kernel/traps_64.c | 368
-rw-r--r--  arch/x86/kernel/tsc_32.c | 62
-rw-r--r--  arch/x86/kernel/tsc_64.c | 100
-rw-r--r--  arch/x86/kernel/tsc_sync.c | 30
-rw-r--r--  arch/x86/kernel/vm86_32.c | 115
-rw-r--r--  arch/x86/kernel/vmi_32.c | 126
-rw-r--r--  arch/x86/kernel/vmiclock_32.c | 3
-rw-r--r--  arch/x86/kernel/vmlinux_32.lds.S | 22
-rw-r--r--  arch/x86/kernel/vmlinux_64.lds.S | 42
-rw-r--r--  arch/x86/kernel/vsmp_64.c | 11
-rw-r--r--  arch/x86/kernel/vsyscall-int80_32.S | 53
-rw-r--r--  arch/x86/kernel/vsyscall-note_32.S | 45
-rw-r--r--  arch/x86/kernel/vsyscall-sigreturn_32.S | 143
-rw-r--r--  arch/x86/kernel/vsyscall-sysenter_32.S | 122
-rw-r--r--  arch/x86/kernel/vsyscall_32.S | 15
-rw-r--r--  arch/x86/kernel/vsyscall_32.lds.S | 67
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 11
-rw-r--r--  arch/x86/kernel/x8664_ksyms_64.c | 13
167 files changed, 11607 insertions, 10943 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 38573340b143..21dc1a061bf1 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -1,9 +1,93 @@
1ifeq ($(CONFIG_X86_32),y) 1#
2include ${srctree}/arch/x86/kernel/Makefile_32 2# Makefile for the linux kernel.
3else 3#
4include ${srctree}/arch/x86/kernel/Makefile_64 4
5extra-y := head_$(BITS).o init_task.o vmlinux.lds
6extra-$(CONFIG_X86_64) += head64.o
7
8CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
9CFLAGS_vsyscall_64.o := $(PROFILING) -g0
10
11obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
12obj-y += traps_$(BITS).o irq_$(BITS).o
13obj-y += time_$(BITS).o ioport.o ldt.o
14obj-y += setup_$(BITS).o i8259_$(BITS).o
15obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
16obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
17obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o
18obj-y += pci-dma_$(BITS).o bootflag.o e820_$(BITS).o
19obj-y += quirks.o i8237.o topology.o kdebugfs.o
20obj-y += alternative.o i8253.o
21obj-$(CONFIG_X86_64) += pci-nommu_64.o bugs_64.o
22obj-y += tsc_$(BITS).o io_delay.o rtc.o
23
24obj-y += i387.o
25obj-y += ptrace.o
26obj-y += ds.o
27obj-$(CONFIG_X86_32) += tls.o
28obj-$(CONFIG_IA32_EMULATION) += tls.o
29obj-y += step.o
30obj-$(CONFIG_STACKTRACE) += stacktrace.o
31obj-y += cpu/
32obj-y += acpi/
33obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o
34obj-$(CONFIG_X86_64) += reboot.o
35obj-$(CONFIG_MCA) += mca_32.o
36obj-$(CONFIG_X86_MSR) += msr.o
37obj-$(CONFIG_X86_CPUID) += cpuid.o
38obj-$(CONFIG_MICROCODE) += microcode.o
39obj-$(CONFIG_PCI) += early-quirks.o
40apm-y := apm_32.o
41obj-$(CONFIG_APM) += apm.o
42obj-$(CONFIG_X86_SMP) += smp_$(BITS).o smpboot_$(BITS).o tsc_sync.o
43obj-$(CONFIG_X86_32_SMP) += smpcommon_32.o
44obj-$(CONFIG_X86_64_SMP) += smp_64.o smpboot_64.o tsc_sync.o
45obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
46obj-$(CONFIG_X86_MPPARSE) += mpparse_$(BITS).o
47obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi_$(BITS).o
48obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o
49obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
50obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
51obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
52obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
53obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
54obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o
55obj-$(CONFIG_X86_VSMP) += vsmp_64.o
56obj-$(CONFIG_KPROBES) += kprobes.o
57obj-$(CONFIG_MODULES) += module_$(BITS).o
58obj-$(CONFIG_ACPI_SRAT) += srat_32.o
59obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
60obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
61obj-$(CONFIG_VM86) += vm86_32.o
62obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
63
64obj-$(CONFIG_HPET_TIMER) += hpet.o
65
66obj-$(CONFIG_K8_NB) += k8.o
67obj-$(CONFIG_MGEODE_LX) += geode_32.o mfgpt_32.o
68obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
69obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
70
71obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
72obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
73
74ifdef CONFIG_INPUT_PCSPKR
75obj-y += pcspeaker.o
5endif 76endif
6 77
7# Workaround to delete .lds files with make clean 78obj-$(CONFIG_SCx200) += scx200.o
8# The problem is that we do not enter Makefile_32 with make clean. 79scx200-y += scx200_32.o
9clean-files := vsyscall*.lds vsyscall*.so 80
81###
82# 64 bit specific files
83ifeq ($(CONFIG_X86_64),y)
84 obj-y += genapic_64.o genapic_flat_64.o
85 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
86 obj-$(CONFIG_AUDIT) += audit_64.o
87 obj-$(CONFIG_PM) += suspend_64.o
88 obj-$(CONFIG_HIBERNATION) += suspend_asm_64.o
89
90 obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
91 obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
92 obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o
93endif
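
Note: the unified Makefile above keys most of its object lists off $(BITS), which is not defined in this file. A minimal sketch of the arch/x86/Makefile fragment it presumably relies on (assumed for illustration, not part of this diff):

ifeq ($(CONFIG_X86_32),y)
        # 32-bit build: head_$(BITS).o expands to head_32.o, and so on
        BITS := 32
else
        # 64-bit build: head_$(BITS).o expands to head_64.o, and so on
        BITS := 64
endif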
diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32
deleted file mode 100644
index a7bc93c27662..000000000000
--- a/arch/x86/kernel/Makefile_32
+++ /dev/null
@@ -1,88 +0,0 @@
1#
2# Makefile for the linux kernel.
3#
4
5extra-y := head_32.o init_task.o vmlinux.lds
6CPPFLAGS_vmlinux.lds += -Ui386
7
8obj-y := process_32.o signal_32.o entry_32.o traps_32.o irq_32.o \
9 ptrace_32.o time_32.o ioport_32.o ldt_32.o setup_32.o i8259_32.o sys_i386_32.o \
10 pci-dma_32.o i386_ksyms_32.o i387_32.o bootflag.o e820_32.o\
11 quirks.o i8237.o topology.o alternative.o i8253.o tsc_32.o
12
13obj-$(CONFIG_STACKTRACE) += stacktrace.o
14obj-y += cpu/
15obj-y += acpi/
16obj-$(CONFIG_X86_BIOS_REBOOT) += reboot_32.o
17obj-$(CONFIG_MCA) += mca_32.o
18obj-$(CONFIG_X86_MSR) += msr.o
19obj-$(CONFIG_X86_CPUID) += cpuid.o
20obj-$(CONFIG_MICROCODE) += microcode.o
21obj-$(CONFIG_PCI) += early-quirks.o
22obj-$(CONFIG_APM) += apm_32.o
23obj-$(CONFIG_X86_SMP) += smp_32.o smpboot_32.o tsc_sync.o
24obj-$(CONFIG_SMP) += smpcommon_32.o
25obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_32.o
26obj-$(CONFIG_X86_MPPARSE) += mpparse_32.o
27obj-$(CONFIG_X86_LOCAL_APIC) += apic_32.o nmi_32.o
28obj-$(CONFIG_X86_IO_APIC) += io_apic_32.o
29obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
30obj-$(CONFIG_KEXEC) += machine_kexec_32.o relocate_kernel_32.o crash.o
31obj-$(CONFIG_CRASH_DUMP) += crash_dump_32.o
32obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
33obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o
34obj-$(CONFIG_KPROBES) += kprobes_32.o
35obj-$(CONFIG_MODULES) += module_32.o
36obj-y += sysenter_32.o vsyscall_32.o
37obj-$(CONFIG_ACPI_SRAT) += srat_32.o
38obj-$(CONFIG_EFI) += efi_32.o efi_stub_32.o
39obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
40obj-$(CONFIG_VM86) += vm86_32.o
41obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
42obj-$(CONFIG_HPET_TIMER) += hpet.o
43obj-$(CONFIG_K8_NB) += k8.o
44obj-$(CONFIG_MGEODE_LX) += geode_32.o mfgpt_32.o
45
46obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
47obj-$(CONFIG_PARAVIRT) += paravirt_32.o
48obj-y += pcspeaker.o
49
50obj-$(CONFIG_SCx200) += scx200_32.o
51
52# vsyscall_32.o contains the vsyscall DSO images as __initdata.
53# We must build both images before we can assemble it.
54# Note: kbuild does not track this dependency due to usage of .incbin
55$(obj)/vsyscall_32.o: $(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so
56targets += $(foreach F,int80 sysenter,vsyscall-$F_32.o vsyscall-$F_32.so)
57targets += vsyscall-note_32.o vsyscall_32.lds
58
59# The DSO images are built using a special linker script.
60quiet_cmd_syscall = SYSCALL $@
61 cmd_syscall = $(CC) -m elf_i386 -nostdlib $(SYSCFLAGS_$(@F)) \
62 -Wl,-T,$(filter-out FORCE,$^) -o $@
63
64export CPPFLAGS_vsyscall_32.lds += -P -C -Ui386
65
66vsyscall-flags = -shared -s -Wl,-soname=linux-gate.so.1 \
67 $(call ld-option, -Wl$(comma)--hash-style=sysv)
68SYSCFLAGS_vsyscall-sysenter_32.so = $(vsyscall-flags)
69SYSCFLAGS_vsyscall-int80_32.so = $(vsyscall-flags)
70
71$(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so: \
72$(obj)/vsyscall-%.so: $(src)/vsyscall_32.lds \
73 $(obj)/vsyscall-%.o $(obj)/vsyscall-note_32.o FORCE
74 $(call if_changed,syscall)
75
76# We also create a special relocatable object that should mirror the symbol
77# table and layout of the linked DSO. With ld -R we can then refer to
78# these symbols in the kernel code rather than hand-coded addresses.
79extra-y += vsyscall-syms.o
80$(obj)/built-in.o: $(obj)/vsyscall-syms.o
81$(obj)/built-in.o: ld_flags += -R $(obj)/vsyscall-syms.o
82
83SYSCFLAGS_vsyscall-syms.o = -r
84$(obj)/vsyscall-syms.o: $(src)/vsyscall_32.lds \
85 $(obj)/vsyscall-sysenter_32.o $(obj)/vsyscall-note_32.o FORCE
86 $(call if_changed,syscall)
87
88
diff --git a/arch/x86/kernel/Makefile_64 b/arch/x86/kernel/Makefile_64
deleted file mode 100644
index 5a88890d8ee9..000000000000
--- a/arch/x86/kernel/Makefile_64
+++ /dev/null
@@ -1,45 +0,0 @@
1#
2# Makefile for the linux kernel.
3#
4
5extra-y := head_64.o head64.o init_task.o vmlinux.lds
6CPPFLAGS_vmlinux.lds += -Ux86_64
7EXTRA_AFLAGS := -traditional
8
9obj-y := process_64.o signal_64.o entry_64.o traps_64.o irq_64.o \
10 ptrace_64.o time_64.o ioport_64.o ldt_64.o setup_64.o i8259_64.o sys_x86_64.o \
11 x8664_ksyms_64.o i387_64.o syscall_64.o vsyscall_64.o \
12 setup64.o bootflag.o e820_64.o reboot_64.o quirks.o i8237.o \
13 pci-dma_64.o pci-nommu_64.o alternative.o hpet.o tsc_64.o bugs_64.o \
14 i8253.o
15
16obj-$(CONFIG_STACKTRACE) += stacktrace.o
17obj-y += cpu/
18obj-y += acpi/
19obj-$(CONFIG_X86_MSR) += msr.o
20obj-$(CONFIG_MICROCODE) += microcode.o
21obj-$(CONFIG_X86_CPUID) += cpuid.o
22obj-$(CONFIG_SMP) += smp_64.o smpboot_64.o trampoline_64.o tsc_sync.o
23obj-y += apic_64.o nmi_64.o
24obj-y += io_apic_64.o mpparse_64.o genapic_64.o genapic_flat_64.o
25obj-$(CONFIG_KEXEC) += machine_kexec_64.o relocate_kernel_64.o crash.o
26obj-$(CONFIG_CRASH_DUMP) += crash_dump_64.o
27obj-$(CONFIG_PM) += suspend_64.o
28obj-$(CONFIG_HIBERNATION) += suspend_asm_64.o
29obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
30obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
31obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
32obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o
33obj-$(CONFIG_KPROBES) += kprobes_64.o
34obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
35obj-$(CONFIG_X86_VSMP) += vsmp_64.o
36obj-$(CONFIG_K8_NB) += k8.o
37obj-$(CONFIG_AUDIT) += audit_64.o
38
39obj-$(CONFIG_MODULES) += module_64.o
40obj-$(CONFIG_PCI) += early-quirks.o
41
42obj-y += topology.o
43obj-y += pcspeaker.o
44
45CFLAGS_vsyscall_64.o := $(PROFILING) -g0
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index 1351c3982ee4..19d3d6e9d09b 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -1,5 +1,5 @@
1obj-$(CONFIG_ACPI) += boot.o 1obj-$(CONFIG_ACPI) += boot.o
2obj-$(CONFIG_ACPI_SLEEP) += sleep_$(BITS).o wakeup_$(BITS).o 2obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
3 3
4ifneq ($(CONFIG_ACPI_PROCESSOR),) 4ifneq ($(CONFIG_ACPI_PROCESSOR),)
5obj-y += cstate.o processor.o 5obj-y += cstate.o processor.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 0ca27c7b0e8d..fc8825d4b996 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -496,7 +496,8 @@ EXPORT_SYMBOL(acpi_register_gsi);
496 * ACPI based hotplug support for CPU 496 * ACPI based hotplug support for CPU
497 */ 497 */
498#ifdef CONFIG_ACPI_HOTPLUG_CPU 498#ifdef CONFIG_ACPI_HOTPLUG_CPU
499int acpi_map_lsapic(acpi_handle handle, int *pcpu) 499
500static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
500{ 501{
501 struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; 502 struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
502 union acpi_object *obj; 503 union acpi_object *obj;
@@ -551,6 +552,11 @@ int acpi_map_lsapic(acpi_handle handle, int *pcpu)
551 return 0; 552 return 0;
552} 553}
553 554
555/* wrapper to silence section mismatch warning */
556int __ref acpi_map_lsapic(acpi_handle handle, int *pcpu)
557{
558 return _acpi_map_lsapic(handle, pcpu);
559}
554EXPORT_SYMBOL(acpi_map_lsapic); 560EXPORT_SYMBOL(acpi_map_lsapic);
555 561
556int acpi_unmap_lsapic(int cpu) 562int acpi_unmap_lsapic(int cpu)
@@ -581,25 +587,6 @@ int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base)
581 587
582EXPORT_SYMBOL(acpi_unregister_ioapic); 588EXPORT_SYMBOL(acpi_unregister_ioapic);
583 589
584static unsigned long __init
585acpi_scan_rsdp(unsigned long start, unsigned long length)
586{
587 unsigned long offset = 0;
588 unsigned long sig_len = sizeof("RSD PTR ") - 1;
589
590 /*
591 * Scan all 16-byte boundaries of the physical memory region for the
592 * RSDP signature.
593 */
594 for (offset = 0; offset < length; offset += 16) {
595 if (strncmp((char *)(phys_to_virt(start) + offset), "RSD PTR ", sig_len))
596 continue;
597 return (start + offset);
598 }
599
600 return 0;
601}
602
603static int __init acpi_parse_sbf(struct acpi_table_header *table) 590static int __init acpi_parse_sbf(struct acpi_table_header *table)
604{ 591{
605 struct acpi_table_boot *sb; 592 struct acpi_table_boot *sb;
@@ -742,27 +729,6 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table)
742 return 0; 729 return 0;
743} 730}
744 731
745unsigned long __init acpi_find_rsdp(void)
746{
747 unsigned long rsdp_phys = 0;
748
749 if (efi_enabled) {
750 if (efi.acpi20 != EFI_INVALID_TABLE_ADDR)
751 return efi.acpi20;
752 else if (efi.acpi != EFI_INVALID_TABLE_ADDR)
753 return efi.acpi;
754 }
755 /*
756 * Scan memory looking for the RSDP signature. First search EBDA (low
757 * memory) paragraphs and then search upper memory (E0000-FFFFF).
758 */
759 rsdp_phys = acpi_scan_rsdp(0, 0x400);
760 if (!rsdp_phys)
761 rsdp_phys = acpi_scan_rsdp(0xE0000, 0x20000);
762
763 return rsdp_phys;
764}
765
766#ifdef CONFIG_X86_LOCAL_APIC 732#ifdef CONFIG_X86_LOCAL_APIC
767/* 733/*
768 * Parse LAPIC entries in MADT 734 * Parse LAPIC entries in MADT
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
new file mode 100644
index 000000000000..6bc815cd8cb3
--- /dev/null
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -0,0 +1,87 @@
1/*
2 * sleep.c - x86-specific ACPI sleep support.
3 *
4 * Copyright (C) 2001-2003 Patrick Mochel
5 * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
6 */
7
8#include <linux/acpi.h>
9#include <linux/bootmem.h>
10#include <linux/dmi.h>
11#include <linux/cpumask.h>
12
13#include <asm/smp.h>
14
15/* address in low memory of the wakeup routine. */
16unsigned long acpi_wakeup_address = 0;
17unsigned long acpi_realmode_flags;
18extern char wakeup_start, wakeup_end;
19
20extern unsigned long acpi_copy_wakeup_routine(unsigned long);
21
22/**
23 * acpi_save_state_mem - save kernel state
24 *
25 * Create an identity mapped page table and copy the wakeup routine to
26 * low memory.
27 */
28int acpi_save_state_mem(void)
29{
30 if (!acpi_wakeup_address) {
31 printk(KERN_ERR "Could not allocate memory during boot, S3 disabled\n");
32 return -ENOMEM;
33 }
34 memcpy((void *)acpi_wakeup_address, &wakeup_start,
35 &wakeup_end - &wakeup_start);
36 acpi_copy_wakeup_routine(acpi_wakeup_address);
37
38 return 0;
39}
40
41/*
42 * acpi_restore_state - undo effects of acpi_save_state_mem
43 */
44void acpi_restore_state_mem(void)
45{
46}
47
48
49/**
50 * acpi_reserve_bootmem - do _very_ early ACPI initialisation
51 *
52 * We allocate a page from the first 1MB of memory for the wakeup
53 * routine for when we come back from a sleep state. The
54 * runtime allocator allows specification of <16MB pages, but not
55 * <1MB pages.
56 */
57void __init acpi_reserve_bootmem(void)
58{
59 if ((&wakeup_end - &wakeup_start) > PAGE_SIZE*2) {
60 printk(KERN_ERR
61 "ACPI: Wakeup code way too big, S3 disabled.\n");
62 return;
63 }
64
65 acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
66 if (!acpi_wakeup_address)
67 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
68}
69
70
71static int __init acpi_sleep_setup(char *str)
72{
73 while ((str != NULL) && (*str != '\0')) {
74 if (strncmp(str, "s3_bios", 7) == 0)
75 acpi_realmode_flags |= 1;
76 if (strncmp(str, "s3_mode", 7) == 0)
77 acpi_realmode_flags |= 2;
78 if (strncmp(str, "s3_beep", 7) == 0)
79 acpi_realmode_flags |= 4;
80 str = strchr(str, ',');
81 if (str != NULL)
82 str += strspn(str, ", \t");
83 }
84 return 1;
85}
86
87__setup("acpi_sleep=", acpi_sleep_setup);
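
For example, booting with acpi_sleep=s3_bios,s3_mode would be parsed by acpi_sleep_setup() above into acpi_realmode_flags == 0x3 (s3_bios | s3_mode); s3_beep would additionally set bit 2.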
diff --git a/arch/x86/kernel/acpi/sleep_32.c b/arch/x86/kernel/acpi/sleep_32.c
index 10699489cfe7..63fe5525e026 100644
--- a/arch/x86/kernel/acpi/sleep_32.c
+++ b/arch/x86/kernel/acpi/sleep_32.c
@@ -12,76 +12,6 @@
12 12
13#include <asm/smp.h> 13#include <asm/smp.h>
14 14
15/* address in low memory of the wakeup routine. */
16unsigned long acpi_wakeup_address = 0;
17unsigned long acpi_realmode_flags;
18extern char wakeup_start, wakeup_end;
19
20extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
21
22/**
23 * acpi_save_state_mem - save kernel state
24 *
25 * Create an identity mapped page table and copy the wakeup routine to
26 * low memory.
27 */
28int acpi_save_state_mem(void)
29{
30 if (!acpi_wakeup_address)
31 return 1;
32 memcpy((void *)acpi_wakeup_address, &wakeup_start,
33 &wakeup_end - &wakeup_start);
34 acpi_copy_wakeup_routine(acpi_wakeup_address);
35
36 return 0;
37}
38
39/*
40 * acpi_restore_state - undo effects of acpi_save_state_mem
41 */
42void acpi_restore_state_mem(void)
43{
44}
45
46/**
47 * acpi_reserve_bootmem - do _very_ early ACPI initialisation
48 *
49 * We allocate a page from the first 1MB of memory for the wakeup
50 * routine for when we come back from a sleep state. The
51 * runtime allocator allows specification of <16MB pages, but not
52 * <1MB pages.
53 */
54void __init acpi_reserve_bootmem(void)
55{
56 if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) {
57 printk(KERN_ERR
58 "ACPI: Wakeup code way too big, S3 disabled.\n");
59 return;
60 }
61
62 acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
63 if (!acpi_wakeup_address)
64 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
65}
66
67static int __init acpi_sleep_setup(char *str)
68{
69 while ((str != NULL) && (*str != '\0')) {
70 if (strncmp(str, "s3_bios", 7) == 0)
71 acpi_realmode_flags |= 1;
72 if (strncmp(str, "s3_mode", 7) == 0)
73 acpi_realmode_flags |= 2;
74 if (strncmp(str, "s3_beep", 7) == 0)
75 acpi_realmode_flags |= 4;
76 str = strchr(str, ',');
77 if (str != NULL)
78 str += strspn(str, ", \t");
79 }
80 return 1;
81}
82
83__setup("acpi_sleep=", acpi_sleep_setup);
84
85/* Ouch, we want to delete this. We already have better version in userspace, in 15/* Ouch, we want to delete this. We already have better version in userspace, in
86 s2ram from suspend.sf.net project */ 16 s2ram from suspend.sf.net project */
87static __init int reset_videomode_after_s3(const struct dmi_system_id *d) 17static __init int reset_videomode_after_s3(const struct dmi_system_id *d)
diff --git a/arch/x86/kernel/acpi/sleep_64.c b/arch/x86/kernel/acpi/sleep_64.c
deleted file mode 100644
index da42de261ba8..000000000000
--- a/arch/x86/kernel/acpi/sleep_64.c
+++ /dev/null
@@ -1,117 +0,0 @@
1/*
2 * acpi.c - Architecture-Specific Low-Level ACPI Support
3 *
4 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
5 * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
6 * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
7 * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port)
8 * Copyright (C) 2003 Pavel Machek, SuSE Labs
9 *
10 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25 *
26 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
27 */
28
29#include <linux/kernel.h>
30#include <linux/init.h>
31#include <linux/types.h>
32#include <linux/stddef.h>
33#include <linux/slab.h>
34#include <linux/pci.h>
35#include <linux/bootmem.h>
36#include <linux/acpi.h>
37#include <linux/cpumask.h>
38
39#include <asm/mpspec.h>
40#include <asm/io.h>
41#include <asm/apic.h>
42#include <asm/apicdef.h>
43#include <asm/page.h>
44#include <asm/pgtable.h>
45#include <asm/pgalloc.h>
46#include <asm/io_apic.h>
47#include <asm/proto.h>
48#include <asm/tlbflush.h>
49
50/* --------------------------------------------------------------------------
51 Low-Level Sleep Support
52 -------------------------------------------------------------------------- */
53
54/* address in low memory of the wakeup routine. */
55unsigned long acpi_wakeup_address = 0;
56unsigned long acpi_realmode_flags;
57extern char wakeup_start, wakeup_end;
58
59extern unsigned long acpi_copy_wakeup_routine(unsigned long);
60
61/**
62 * acpi_save_state_mem - save kernel state
63 *
64 * Create an identity mapped page table and copy the wakeup routine to
65 * low memory.
66 */
67int acpi_save_state_mem(void)
68{
69 memcpy((void *)acpi_wakeup_address, &wakeup_start,
70 &wakeup_end - &wakeup_start);
71 acpi_copy_wakeup_routine(acpi_wakeup_address);
72
73 return 0;
74}
75
76/*
77 * acpi_restore_state
78 */
79void acpi_restore_state_mem(void)
80{
81}
82
83/**
84 * acpi_reserve_bootmem - do _very_ early ACPI initialisation
85 *
86 * We allocate a page in low memory for the wakeup
87 * routine for when we come back from a sleep state. The
88 * runtime allocator allows specification of <16M pages, but not
89 * <1M pages.
90 */
91void __init acpi_reserve_bootmem(void)
92{
93 acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
94 if ((&wakeup_end - &wakeup_start) > (PAGE_SIZE*2))
95 printk(KERN_CRIT
96 "ACPI: Wakeup code way too big, will crash on attempt"
97 " to suspend\n");
98}
99
100static int __init acpi_sleep_setup(char *str)
101{
102 while ((str != NULL) && (*str != '\0')) {
103 if (strncmp(str, "s3_bios", 7) == 0)
104 acpi_realmode_flags |= 1;
105 if (strncmp(str, "s3_mode", 7) == 0)
106 acpi_realmode_flags |= 2;
107 if (strncmp(str, "s3_beep", 7) == 0)
108 acpi_realmode_flags |= 4;
109 str = strchr(str, ',');
110 if (str != NULL)
111 str += strspn(str, ", \t");
112 }
113 return 1;
114}
115
116__setup("acpi_sleep=", acpi_sleep_setup);
117
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
index 1e931aaf2ef6..f53e3277f8e5 100644
--- a/arch/x86/kernel/acpi/wakeup_32.S
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -1,4 +1,4 @@
1.text 1 .section .text.page_aligned
2#include <linux/linkage.h> 2#include <linux/linkage.h>
3#include <asm/segment.h> 3#include <asm/segment.h>
4#include <asm/page.h> 4#include <asm/page.h>
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 5ed3bc5c61d7..2e1b9e0d0767 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -344,13 +344,13 @@ do_suspend_lowlevel:
344 call save_processor_state 344 call save_processor_state
345 345
346 movq $saved_context, %rax 346 movq $saved_context, %rax
347 movq %rsp, pt_regs_rsp(%rax) 347 movq %rsp, pt_regs_sp(%rax)
348 movq %rbp, pt_regs_rbp(%rax) 348 movq %rbp, pt_regs_bp(%rax)
349 movq %rsi, pt_regs_rsi(%rax) 349 movq %rsi, pt_regs_si(%rax)
350 movq %rdi, pt_regs_rdi(%rax) 350 movq %rdi, pt_regs_di(%rax)
351 movq %rbx, pt_regs_rbx(%rax) 351 movq %rbx, pt_regs_bx(%rax)
352 movq %rcx, pt_regs_rcx(%rax) 352 movq %rcx, pt_regs_cx(%rax)
353 movq %rdx, pt_regs_rdx(%rax) 353 movq %rdx, pt_regs_dx(%rax)
354 movq %r8, pt_regs_r8(%rax) 354 movq %r8, pt_regs_r8(%rax)
355 movq %r9, pt_regs_r9(%rax) 355 movq %r9, pt_regs_r9(%rax)
356 movq %r10, pt_regs_r10(%rax) 356 movq %r10, pt_regs_r10(%rax)
@@ -360,7 +360,7 @@ do_suspend_lowlevel:
360 movq %r14, pt_regs_r14(%rax) 360 movq %r14, pt_regs_r14(%rax)
361 movq %r15, pt_regs_r15(%rax) 361 movq %r15, pt_regs_r15(%rax)
362 pushfq 362 pushfq
363 popq pt_regs_eflags(%rax) 363 popq pt_regs_flags(%rax)
364 364
365 movq $.L97, saved_rip(%rip) 365 movq $.L97, saved_rip(%rip)
366 366
@@ -391,15 +391,15 @@ do_suspend_lowlevel:
391 movq %rbx, %cr2 391 movq %rbx, %cr2
392 movq saved_context_cr0(%rax), %rbx 392 movq saved_context_cr0(%rax), %rbx
393 movq %rbx, %cr0 393 movq %rbx, %cr0
394 pushq pt_regs_eflags(%rax) 394 pushq pt_regs_flags(%rax)
395 popfq 395 popfq
396 movq pt_regs_rsp(%rax), %rsp 396 movq pt_regs_sp(%rax), %rsp
397 movq pt_regs_rbp(%rax), %rbp 397 movq pt_regs_bp(%rax), %rbp
398 movq pt_regs_rsi(%rax), %rsi 398 movq pt_regs_si(%rax), %rsi
399 movq pt_regs_rdi(%rax), %rdi 399 movq pt_regs_di(%rax), %rdi
400 movq pt_regs_rbx(%rax), %rbx 400 movq pt_regs_bx(%rax), %rbx
401 movq pt_regs_rcx(%rax), %rcx 401 movq pt_regs_cx(%rax), %rcx
402 movq pt_regs_rdx(%rax), %rdx 402 movq pt_regs_dx(%rax), %rdx
403 movq pt_regs_r8(%rax), %r8 403 movq pt_regs_r8(%rax), %r8
404 movq pt_regs_r9(%rax), %r9 404 movq pt_regs_r9(%rax), %r9
405 movq pt_regs_r10(%rax), %r10 405 movq pt_regs_r10(%rax), %r10
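
The pt_regs_rsp -> pt_regs_sp style renames above track the unified struct pt_regs field names (sp, bp, si, di, bx, cx, dx, flags); the assembler constants themselves are generated from asm-offsets_64.c. A standalone sketch of the underlying offsetof() idea (illustrative struct layout and names, not the kernel's):

#include <stddef.h>
#include <stdio.h>

/* Illustrative layout only; the real struct pt_regs lives in the kernel. */
struct pt_regs_sketch {
	unsigned long bx, cx, dx, si, di, bp, sp, flags;
};

int main(void)
{
	/* asm-offsets emits lines like these into a generated header so that
	 * assembly such as wakeup_64.S can write pt_regs_sp(%rax). */
	printf("#define pt_regs_sp %zu\n", offsetof(struct pt_regs_sketch, sp));
	printf("#define pt_regs_flags %zu\n", offsetof(struct pt_regs_sketch, flags));
	return 0;
}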
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index d6405e0842b5..45d79ea890ae 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -273,6 +273,7 @@ struct smp_alt_module {
273}; 273};
274static LIST_HEAD(smp_alt_modules); 274static LIST_HEAD(smp_alt_modules);
275static DEFINE_SPINLOCK(smp_alt); 275static DEFINE_SPINLOCK(smp_alt);
276static int smp_mode = 1; /* protected by smp_alt */
276 277
277void alternatives_smp_module_add(struct module *mod, char *name, 278void alternatives_smp_module_add(struct module *mod, char *name,
278 void *locks, void *locks_end, 279 void *locks, void *locks_end,
@@ -341,12 +342,13 @@ void alternatives_smp_switch(int smp)
341 342
342#ifdef CONFIG_LOCKDEP 343#ifdef CONFIG_LOCKDEP
343 /* 344 /*
344 * A not yet fixed binutils section handling bug prevents 345 * Older binutils section handling bug prevented
345 * alternatives-replacement from working reliably, so turn 346 * alternatives-replacement from working reliably.
346 * it off: 347 *
348 * If this still occurs then you should see a hang
349 * or crash shortly after this line:
347 */ 350 */
348 printk("lockdep: not fixing up alternatives.\n"); 351 printk("lockdep: fixing up alternatives.\n");
349 return;
350#endif 352#endif
351 353
352 if (noreplace_smp || smp_alt_once) 354 if (noreplace_smp || smp_alt_once)
@@ -354,21 +356,29 @@ void alternatives_smp_switch(int smp)
354 BUG_ON(!smp && (num_online_cpus() > 1)); 356 BUG_ON(!smp && (num_online_cpus() > 1));
355 357
356 spin_lock_irqsave(&smp_alt, flags); 358 spin_lock_irqsave(&smp_alt, flags);
357 if (smp) { 359
360 /*
361 * Avoid unnecessary switches because it forces JIT based VMs to
362 * throw away all cached translations, which can be quite costly.
363 */
364 if (smp == smp_mode) {
365 /* nothing */
366 } else if (smp) {
358 printk(KERN_INFO "SMP alternatives: switching to SMP code\n"); 367 printk(KERN_INFO "SMP alternatives: switching to SMP code\n");
359 clear_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); 368 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
360 clear_bit(X86_FEATURE_UP, cpu_data(0).x86_capability); 369 clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
361 list_for_each_entry(mod, &smp_alt_modules, next) 370 list_for_each_entry(mod, &smp_alt_modules, next)
362 alternatives_smp_lock(mod->locks, mod->locks_end, 371 alternatives_smp_lock(mod->locks, mod->locks_end,
363 mod->text, mod->text_end); 372 mod->text, mod->text_end);
364 } else { 373 } else {
365 printk(KERN_INFO "SMP alternatives: switching to UP code\n"); 374 printk(KERN_INFO "SMP alternatives: switching to UP code\n");
366 set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); 375 set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
367 set_bit(X86_FEATURE_UP, cpu_data(0).x86_capability); 376 set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
368 list_for_each_entry(mod, &smp_alt_modules, next) 377 list_for_each_entry(mod, &smp_alt_modules, next)
369 alternatives_smp_unlock(mod->locks, mod->locks_end, 378 alternatives_smp_unlock(mod->locks, mod->locks_end,
370 mod->text, mod->text_end); 379 mod->text, mod->text_end);
371 } 380 }
381 smp_mode = smp;
372 spin_unlock_irqrestore(&smp_alt, flags); 382 spin_unlock_irqrestore(&smp_alt, flags);
373} 383}
374 384
@@ -431,8 +441,9 @@ void __init alternative_instructions(void)
431 if (smp_alt_once) { 441 if (smp_alt_once) {
432 if (1 == num_possible_cpus()) { 442 if (1 == num_possible_cpus()) {
433 printk(KERN_INFO "SMP alternatives: switching to UP code\n"); 443 printk(KERN_INFO "SMP alternatives: switching to UP code\n");
434 set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); 444 set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
435 set_bit(X86_FEATURE_UP, cpu_data(0).x86_capability); 445 set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
446
436 alternatives_smp_unlock(__smp_locks, __smp_locks_end, 447 alternatives_smp_unlock(__smp_locks, __smp_locks_end,
437 _text, _etext); 448 _text, _etext);
438 } 449 }
@@ -440,7 +451,10 @@ void __init alternative_instructions(void)
440 alternatives_smp_module_add(NULL, "core kernel", 451 alternatives_smp_module_add(NULL, "core kernel",
441 __smp_locks, __smp_locks_end, 452 __smp_locks, __smp_locks_end,
442 _text, _etext); 453 _text, _etext);
443 alternatives_smp_switch(0); 454
455 /* Only switch to UP mode if we don't immediately boot others */
456 if (num_possible_cpus() == 1 || setup_max_cpus <= 1)
457 alternatives_smp_switch(0);
444 } 458 }
445#endif 459#endif
446 apply_paravirt(__parainstructions, __parainstructions_end); 460 apply_paravirt(__parainstructions, __parainstructions_end);
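
The open-coded set_bit()/clear_bit() calls on x86_capability are replaced above by set_cpu_cap()/clear_cpu_cap() helpers. A standalone sketch of what such helpers amount to, i.e. toggling one feature bit in a per-CPU capability bitmap (names and layout here are illustrative, not the kernel's):

#include <stdio.h>

#define LONG_BITS (8 * sizeof(unsigned long))

struct cpuinfo_sketch {
	unsigned long x86_capability[8];	/* feature-flag bitmap */
};

static void set_cap(struct cpuinfo_sketch *c, unsigned int bit)
{
	c->x86_capability[bit / LONG_BITS] |= 1UL << (bit % LONG_BITS);
}

static void clear_cap(struct cpuinfo_sketch *c, unsigned int bit)
{
	c->x86_capability[bit / LONG_BITS] &= ~(1UL << (bit % LONG_BITS));
}

int main(void)
{
	struct cpuinfo_sketch boot = { { 0 } };

	set_cap(&boot, 19);		/* an X86_FEATURE_UP-style bit number */
	printf("word0 = %#lx\n", boot.x86_capability[0]);
	clear_cap(&boot, 19);
	printf("word0 = %#lx\n", boot.x86_capability[0]);
	return 0;
}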
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 5b6992799c9d..608152a2a05e 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -1,12 +1,12 @@
1/* 1/*
2 * Firmware replacement code. 2 * Firmware replacement code.
3 * 3 *
4 * Work around broken BIOSes that don't set an aperture or only set the 4 * Work around broken BIOSes that don't set an aperture or only set the
5 * aperture in the AGP bridge. 5 * aperture in the AGP bridge.
6 * If all fails map the aperture over some low memory. This is cheaper than 6 * If all fails map the aperture over some low memory. This is cheaper than
7 * doing bounce buffering. The memory is lost. This is done at early boot 7 * doing bounce buffering. The memory is lost. This is done at early boot
8 * because only the bootmem allocator can allocate 32+MB. 8 * because only the bootmem allocator can allocate 32+MB.
9 * 9 *
10 * Copyright 2002 Andi Kleen, SuSE Labs. 10 * Copyright 2002 Andi Kleen, SuSE Labs.
11 */ 11 */
12#include <linux/kernel.h> 12#include <linux/kernel.h>
@@ -30,7 +30,7 @@ int gart_iommu_aperture_disabled __initdata = 0;
30int gart_iommu_aperture_allowed __initdata = 0; 30int gart_iommu_aperture_allowed __initdata = 0;
31 31
32int fallback_aper_order __initdata = 1; /* 64MB */ 32int fallback_aper_order __initdata = 1; /* 64MB */
33int fallback_aper_force __initdata = 0; 33int fallback_aper_force __initdata = 0;
34 34
35int fix_aperture __initdata = 1; 35int fix_aperture __initdata = 1;
36 36
@@ -49,167 +49,270 @@ static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
49/* This code runs before the PCI subsystem is initialized, so just 49/* This code runs before the PCI subsystem is initialized, so just
50 access the northbridge directly. */ 50 access the northbridge directly. */
51 51
52static u32 __init allocate_aperture(void) 52static u32 __init allocate_aperture(void)
53{ 53{
54 u32 aper_size; 54 u32 aper_size;
55 void *p; 55 void *p;
56 56
57 if (fallback_aper_order > 7) 57 if (fallback_aper_order > 7)
58 fallback_aper_order = 7; 58 fallback_aper_order = 7;
59 aper_size = (32 * 1024 * 1024) << fallback_aper_order; 59 aper_size = (32 * 1024 * 1024) << fallback_aper_order;
60 60
61 /* 61 /*
62 * Aperture has to be naturally aligned. This means an 2GB aperture won't 62 * Aperture has to be naturally aligned. This means a 2GB aperture
63 * have much chance of finding a place in the lower 4GB of memory. 63 * won't have much chance of finding a place in the lower 4GB of
64 * Unfortunately we cannot move it up because that would make the 64 * memory. Unfortunately we cannot move it up because that would
65 * IOMMU useless. 65 * make the IOMMU useless.
66 */ 66 */
67 p = __alloc_bootmem_nopanic(aper_size, aper_size, 0); 67 p = __alloc_bootmem_nopanic(aper_size, aper_size, 0);
68 if (!p || __pa(p)+aper_size > 0xffffffff) { 68 if (!p || __pa(p)+aper_size > 0xffffffff) {
69 printk("Cannot allocate aperture memory hole (%p,%uK)\n", 69 printk(KERN_ERR
70 p, aper_size>>10); 70 "Cannot allocate aperture memory hole (%p,%uK)\n",
71 p, aper_size>>10);
71 if (p) 72 if (p)
72 free_bootmem(__pa(p), aper_size); 73 free_bootmem(__pa(p), aper_size);
73 return 0; 74 return 0;
74 } 75 }
75 printk("Mapping aperture over %d KB of RAM @ %lx\n", 76 printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n",
76 aper_size >> 10, __pa(p)); 77 aper_size >> 10, __pa(p));
77 insert_aperture_resource((u32)__pa(p), aper_size); 78 insert_aperture_resource((u32)__pa(p), aper_size);
78 return (u32)__pa(p); 79
80 return (u32)__pa(p);
79} 81}
80 82
81static int __init aperture_valid(u64 aper_base, u32 aper_size) 83static int __init aperture_valid(u64 aper_base, u32 aper_size)
82{ 84{
83 if (!aper_base) 85 if (!aper_base)
84 return 0;
85 if (aper_size < 64*1024*1024) {
86 printk("Aperture too small (%d MB)\n", aper_size>>20);
87 return 0; 86 return 0;
88 } 87
89 if (aper_base + aper_size > 0x100000000UL) { 88 if (aper_base + aper_size > 0x100000000UL) {
90 printk("Aperture beyond 4GB. Ignoring.\n"); 89 printk(KERN_ERR "Aperture beyond 4GB. Ignoring.\n");
91 return 0; 90 return 0;
92 } 91 }
93 if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { 92 if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
94 printk("Aperture pointing to e820 RAM. Ignoring.\n"); 93 printk(KERN_ERR "Aperture pointing to e820 RAM. Ignoring.\n");
95 return 0; 94 return 0;
96 } 95 }
96 if (aper_size < 64*1024*1024) {
97 printk(KERN_ERR "Aperture too small (%d MB)\n", aper_size>>20);
98 return 0;
99 }
100
97 return 1; 101 return 1;
98} 102}
99 103
100/* Find a PCI capability */ 104/* Find a PCI capability */
101static __u32 __init find_cap(int num, int slot, int func, int cap) 105static __u32 __init find_cap(int num, int slot, int func, int cap)
102{ 106{
103 u8 pos;
104 int bytes; 107 int bytes;
105 if (!(read_pci_config_16(num,slot,func,PCI_STATUS) & PCI_STATUS_CAP_LIST)) 108 u8 pos;
109
110 if (!(read_pci_config_16(num, slot, func, PCI_STATUS) &
111 PCI_STATUS_CAP_LIST))
106 return 0; 112 return 0;
107 pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST); 113
108 for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { 114 pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST);
115 for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
109 u8 id; 116 u8 id;
110 pos &= ~3; 117
111 id = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID); 118 pos &= ~3;
119 id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID);
112 if (id == 0xff) 120 if (id == 0xff)
113 break; 121 break;
114 if (id == cap) 122 if (id == cap)
115 return pos; 123 return pos;
116 pos = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT); 124 pos = read_pci_config_byte(num, slot, func,
117 } 125 pos+PCI_CAP_LIST_NEXT);
126 }
118 return 0; 127 return 0;
119} 128}
120 129
121/* Read a standard AGPv3 bridge header */ 130/* Read a standard AGPv3 bridge header */
122static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) 131static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
123{ 132{
124 u32 apsize; 133 u32 apsize;
125 u32 apsizereg; 134 u32 apsizereg;
126 int nbits; 135 int nbits;
127 u32 aper_low, aper_hi; 136 u32 aper_low, aper_hi;
128 u64 aper; 137 u64 aper;
129 138
130 printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func); 139 printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", num, slot, func);
131 apsizereg = read_pci_config_16(num,slot,func, cap + 0x14); 140 apsizereg = read_pci_config_16(num, slot, func, cap + 0x14);
132 if (apsizereg == 0xffffffff) { 141 if (apsizereg == 0xffffffff) {
133 printk("APSIZE in AGP bridge unreadable\n"); 142 printk(KERN_ERR "APSIZE in AGP bridge unreadable\n");
134 return 0; 143 return 0;
135 } 144 }
136 145
137 apsize = apsizereg & 0xfff; 146 apsize = apsizereg & 0xfff;
138 /* Some BIOS use weird encodings not in the AGPv3 table. */ 147 /* Some BIOS use weird encodings not in the AGPv3 table. */
139 if (apsize & 0xff) 148 if (apsize & 0xff)
140 apsize |= 0xf00; 149 apsize |= 0xf00;
141 nbits = hweight16(apsize); 150 nbits = hweight16(apsize);
142 *order = 7 - nbits; 151 *order = 7 - nbits;
143 if ((int)*order < 0) /* < 32MB */ 152 if ((int)*order < 0) /* < 32MB */
144 *order = 0; 153 *order = 0;
145 154
146 aper_low = read_pci_config(num,slot,func, 0x10); 155 aper_low = read_pci_config(num, slot, func, 0x10);
147 aper_hi = read_pci_config(num,slot,func,0x14); 156 aper_hi = read_pci_config(num, slot, func, 0x14);
148 aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); 157 aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32);
149 158
150 printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", 159 printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n",
151 aper, 32 << *order, apsizereg); 160 aper, 32 << *order, apsizereg);
152 161
153 if (!aperture_valid(aper, (32*1024*1024) << *order)) 162 if (!aperture_valid(aper, (32*1024*1024) << *order))
154 return 0; 163 return 0;
155 return (u32)aper; 164 return (u32)aper;
156} 165}
157
158/* Look for an AGP bridge. Windows only expects the aperture in the
159 AGP bridge and some BIOS forget to initialize the Northbridge too.
160 Work around this here.
161
162 Do an PCI bus scan by hand because we're running before the PCI
163 subsystem.
164 166
165 All K8 AGP bridges are AGPv3 compliant, so we can do this scan 167/*
166 generically. It's probably overkill to always scan all slots because 168 * Look for an AGP bridge. Windows only expects the aperture in the
167 the AGP bridges should be always an own bus on the HT hierarchy, 169 * AGP bridge and some BIOS forget to initialize the Northbridge too.
168 but do it here for future safety. */ 170 * Work around this here.
171 *
172 * Do an PCI bus scan by hand because we're running before the PCI
173 * subsystem.
174 *
175 * All K8 AGP bridges are AGPv3 compliant, so we can do this scan
176 * generically. It's probably overkill to always scan all slots because
177 * the AGP bridges should be always an own bus on the HT hierarchy,
178 * but do it here for future safety.
179 */
169static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) 180static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
170{ 181{
171 int num, slot, func; 182 int num, slot, func;
172 183
173 /* Poor man's PCI discovery */ 184 /* Poor man's PCI discovery */
174 for (num = 0; num < 256; num++) { 185 for (num = 0; num < 256; num++) {
175 for (slot = 0; slot < 32; slot++) { 186 for (slot = 0; slot < 32; slot++) {
176 for (func = 0; func < 8; func++) { 187 for (func = 0; func < 8; func++) {
177 u32 class, cap; 188 u32 class, cap;
178 u8 type; 189 u8 type;
179 class = read_pci_config(num,slot,func, 190 class = read_pci_config(num, slot, func,
180 PCI_CLASS_REVISION); 191 PCI_CLASS_REVISION);
181 if (class == 0xffffffff) 192 if (class == 0xffffffff)
182 break; 193 break;
183 194
184 switch (class >> 16) { 195 switch (class >> 16) {
185 case PCI_CLASS_BRIDGE_HOST: 196 case PCI_CLASS_BRIDGE_HOST:
186 case PCI_CLASS_BRIDGE_OTHER: /* needed? */ 197 case PCI_CLASS_BRIDGE_OTHER: /* needed? */
187 /* AGP bridge? */ 198 /* AGP bridge? */
188 cap = find_cap(num,slot,func,PCI_CAP_ID_AGP); 199 cap = find_cap(num, slot, func,
200 PCI_CAP_ID_AGP);
189 if (!cap) 201 if (!cap)
190 break; 202 break;
191 *valid_agp = 1; 203 *valid_agp = 1;
192 return read_agp(num,slot,func,cap,order); 204 return read_agp(num, slot, func, cap,
193 } 205 order);
194 206 }
207
195 /* No multi-function device? */ 208 /* No multi-function device? */
196 type = read_pci_config_byte(num,slot,func, 209 type = read_pci_config_byte(num, slot, func,
197 PCI_HEADER_TYPE); 210 PCI_HEADER_TYPE);
198 if (!(type & 0x80)) 211 if (!(type & 0x80))
199 break; 212 break;
200 } 213 }
201 } 214 }
202 } 215 }
203 printk("No AGP bridge found\n"); 216 printk(KERN_INFO "No AGP bridge found\n");
217
204 return 0; 218 return 0;
205} 219}
206 220
221static int gart_fix_e820 __initdata = 1;
222
223static int __init parse_gart_mem(char *p)
224{
225 if (!p)
226 return -EINVAL;
227
228 if (!strncmp(p, "off", 3))
229 gart_fix_e820 = 0;
230 else if (!strncmp(p, "on", 2))
231 gart_fix_e820 = 1;
232
233 return 0;
234}
235early_param("gart_fix_e820", parse_gart_mem);
236
237void __init early_gart_iommu_check(void)
238{
239 /*
240 * in case it is enabled before, esp for kexec/kdump,
241 * previous kernel already enable that. memset called
242 * by allocate_aperture/__alloc_bootmem_nopanic cause restart.
243 * or second kernel have different position for GART hole. and new
244 * kernel could use hole as RAM that is still used by GART set by
245 * first kernel
246 * or BIOS forget to put that in reserved.
247 * try to update e820 to make that region as reserved.
248 */
249 int fix, num;
250 u32 ctl;
251 u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
252 u64 aper_base = 0, last_aper_base = 0;
253 int aper_enabled = 0, last_aper_enabled = 0;
254
255 if (!early_pci_allowed())
256 return;
257
258 fix = 0;
259 for (num = 24; num < 32; num++) {
260 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
261 continue;
262
263 ctl = read_pci_config(0, num, 3, 0x90);
264 aper_enabled = ctl & 1;
265 aper_order = (ctl >> 1) & 7;
266 aper_size = (32 * 1024 * 1024) << aper_order;
267 aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff;
268 aper_base <<= 25;
269
270 if ((last_aper_order && aper_order != last_aper_order) ||
271 (last_aper_base && aper_base != last_aper_base) ||
272 (last_aper_enabled && aper_enabled != last_aper_enabled)) {
273 fix = 1;
274 break;
275 }
276 last_aper_order = aper_order;
277 last_aper_base = aper_base;
278 last_aper_enabled = aper_enabled;
279 }
280
281 if (!fix && !aper_enabled)
282 return;
283
284 if (!aper_base || !aper_size || aper_base + aper_size > 0x100000000UL)
285 fix = 1;
286
287 if (gart_fix_e820 && !fix && aper_enabled) {
288 if (e820_any_mapped(aper_base, aper_base + aper_size,
289 E820_RAM)) {
290 /* reserved it, so we can resuse it in second kernel */
291 printk(KERN_INFO "update e820 for GART\n");
292 add_memory_region(aper_base, aper_size, E820_RESERVED);
293 update_e820();
294 }
295 return;
296 }
297
298 /* different nodes have different setting, disable them all at first*/
299 for (num = 24; num < 32; num++) {
300 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
301 continue;
302
303 ctl = read_pci_config(0, num, 3, 0x90);
304 ctl &= ~1;
305 write_pci_config(0, num, 3, 0x90, ctl);
306 }
307
308}
309
207void __init gart_iommu_hole_init(void) 310void __init gart_iommu_hole_init(void)
208{ 311{
209 int fix, num;
210 u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; 312 u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
211 u64 aper_base, last_aper_base = 0; 313 u64 aper_base, last_aper_base = 0;
212 int valid_agp = 0; 314 int fix, num, valid_agp = 0;
315 int node;
213 316
214 if (gart_iommu_aperture_disabled || !fix_aperture || 317 if (gart_iommu_aperture_disabled || !fix_aperture ||
215 !early_pci_allowed()) 318 !early_pci_allowed())
@@ -218,24 +321,26 @@ void __init gart_iommu_hole_init(void)
218 printk(KERN_INFO "Checking aperture...\n"); 321 printk(KERN_INFO "Checking aperture...\n");
219 322
220 fix = 0; 323 fix = 0;
221 for (num = 24; num < 32; num++) { 324 node = 0;
325 for (num = 24; num < 32; num++) {
222 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) 326 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
223 continue; 327 continue;
224 328
225 iommu_detected = 1; 329 iommu_detected = 1;
226 gart_iommu_aperture = 1; 330 gart_iommu_aperture = 1;
227 331
228 aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; 332 aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7;
229 aper_size = (32 * 1024 * 1024) << aper_order; 333 aper_size = (32 * 1024 * 1024) << aper_order;
230 aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; 334 aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff;
231 aper_base <<= 25; 335 aper_base <<= 25;
336
337 printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n",
338 node, aper_base, aper_size >> 20);
339 node++;
232 340
233 printk("CPU %d: aperture @ %Lx size %u MB\n", num-24,
234 aper_base, aper_size>>20);
235
236 if (!aperture_valid(aper_base, aper_size)) { 341 if (!aperture_valid(aper_base, aper_size)) {
237 fix = 1; 342 fix = 1;
238 break; 343 break;
239 } 344 }
240 345
241 if ((last_aper_order && aper_order != last_aper_order) || 346 if ((last_aper_order && aper_order != last_aper_order) ||
@@ -245,55 +350,64 @@ void __init gart_iommu_hole_init(void)
245 } 350 }
246 last_aper_order = aper_order; 351 last_aper_order = aper_order;
247 last_aper_base = aper_base; 352 last_aper_base = aper_base;
248 } 353 }
249 354
250 if (!fix && !fallback_aper_force) { 355 if (!fix && !fallback_aper_force) {
251 if (last_aper_base) { 356 if (last_aper_base) {
252 unsigned long n = (32 * 1024 * 1024) << last_aper_order; 357 unsigned long n = (32 * 1024 * 1024) << last_aper_order;
358
253 insert_aperture_resource((u32)last_aper_base, n); 359 insert_aperture_resource((u32)last_aper_base, n);
254 } 360 }
255 return; 361 return;
256 } 362 }
257 363
258 if (!fallback_aper_force) 364 if (!fallback_aper_force)
259 aper_alloc = search_agp_bridge(&aper_order, &valid_agp); 365 aper_alloc = search_agp_bridge(&aper_order, &valid_agp);
260 366
261 if (aper_alloc) { 367 if (aper_alloc) {
262 /* Got the aperture from the AGP bridge */ 368 /* Got the aperture from the AGP bridge */
263 } else if (swiotlb && !valid_agp) { 369 } else if (swiotlb && !valid_agp) {
264 /* Do nothing */ 370 /* Do nothing */
265 } else if ((!no_iommu && end_pfn > MAX_DMA32_PFN) || 371 } else if ((!no_iommu && end_pfn > MAX_DMA32_PFN) ||
266 force_iommu || 372 force_iommu ||
267 valid_agp || 373 valid_agp ||
268 fallback_aper_force) { 374 fallback_aper_force) {
269 printk("Your BIOS doesn't leave a aperture memory hole\n"); 375 printk(KERN_ERR
270 printk("Please enable the IOMMU option in the BIOS setup\n"); 376 "Your BIOS doesn't leave a aperture memory hole\n");
271 printk("This costs you %d MB of RAM\n", 377 printk(KERN_ERR
272 32 << fallback_aper_order); 378 "Please enable the IOMMU option in the BIOS setup\n");
379 printk(KERN_ERR
380 "This costs you %d MB of RAM\n",
381 32 << fallback_aper_order);
273 382
274 aper_order = fallback_aper_order; 383 aper_order = fallback_aper_order;
275 aper_alloc = allocate_aperture(); 384 aper_alloc = allocate_aperture();
276 if (!aper_alloc) { 385 if (!aper_alloc) {
277 /* Could disable AGP and IOMMU here, but it's probably 386 /*
278 not worth it. But the later users cannot deal with 387 * Could disable AGP and IOMMU here, but it's
279 bad apertures and turning on the aperture over memory 388 * probably not worth it. But the later users
280 causes very strange problems, so it's better to 389 * cannot deal with bad apertures and turning
281 panic early. */ 390 * on the aperture over memory causes very
391 * strange problems, so it's better to panic
392 * early.
393 */
282 panic("Not enough memory for aperture"); 394 panic("Not enough memory for aperture");
283 } 395 }
284 } else { 396 } else {
285 return; 397 return;
286 } 398 }
287 399
288 /* Fix up the north bridges */ 400 /* Fix up the north bridges */
289 for (num = 24; num < 32; num++) { 401 for (num = 24; num < 32; num++) {
290 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) 402 if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
291 continue; 403 continue;
292 404
293 /* Don't enable translation yet. That is done later. 405 /*
294 Assume this BIOS didn't initialise the GART so 406 * Don't enable translation yet. That is done later.
295 just overwrite all previous bits */ 407 * Assume this BIOS didn't initialise the GART so
296 write_pci_config(0, num, 3, 0x90, aper_order<<1); 408 * just overwrite all previous bits
297 write_pci_config(0, num, 3, 0x94, aper_alloc>>25); 409 */
298 } 410 write_pci_config(0, num, 3, 0x90, aper_order<<1);
299} 411 write_pci_config(0, num, 3, 0x94, aper_alloc>>25);
412 }
413}
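
For reference, the gart_fix_e820 early_param() added above makes this workaround tunable from the kernel command line; e.g. booting with gart_fix_e820=off skips the e820 reservation that early_gart_iommu_check() would otherwise add for a GART region left enabled by a previous (e.g. kexec'd) kernel.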
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index edb5108e5d0e..35a568ea8400 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -43,12 +43,10 @@
43#include <mach_apicdef.h> 43#include <mach_apicdef.h>
44#include <mach_ipi.h> 44#include <mach_ipi.h>
45 45
46#include "io_ports.h"
47
48/* 46/*
49 * Sanity check 47 * Sanity check
50 */ 48 */
51#if (SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F 49#if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F)
52# error SPURIOUS_APIC_VECTOR definition error 50# error SPURIOUS_APIC_VECTOR definition error
53#endif 51#endif
54 52
@@ -57,7 +55,7 @@
57 * 55 *
58 * -1=force-disable, +1=force-enable 56 * -1=force-disable, +1=force-enable
59 */ 57 */
60static int enable_local_apic __initdata = 0; 58static int enable_local_apic __initdata;
61 59
62/* Local APIC timer verification ok */ 60/* Local APIC timer verification ok */
63static int local_apic_timer_verify_ok; 61static int local_apic_timer_verify_ok;
@@ -101,6 +99,8 @@ static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
101/* Local APIC was disabled by the BIOS and enabled by the kernel */ 99/* Local APIC was disabled by the BIOS and enabled by the kernel */
102static int enabled_via_apicbase; 100static int enabled_via_apicbase;
103 101
102static unsigned long apic_phys;
103
104/* 104/*
105 * Get the LAPIC version 105 * Get the LAPIC version
106 */ 106 */
@@ -110,7 +110,7 @@ static inline int lapic_get_version(void)
110} 110}
111 111
112/* 112/*
113 * Check, if the APIC is integrated or a seperate chip 113 * Check, if the APIC is integrated or a separate chip
114 */ 114 */
115static inline int lapic_is_integrated(void) 115static inline int lapic_is_integrated(void)
116{ 116{
@@ -135,9 +135,9 @@ void apic_wait_icr_idle(void)
135 cpu_relax(); 135 cpu_relax();
136} 136}
137 137
138unsigned long safe_apic_wait_icr_idle(void) 138u32 safe_apic_wait_icr_idle(void)
139{ 139{
140 unsigned long send_status; 140 u32 send_status;
141 int timeout; 141 int timeout;
142 142
143 timeout = 0; 143 timeout = 0;
@@ -154,7 +154,7 @@ unsigned long safe_apic_wait_icr_idle(void)
154/** 154/**
155 * enable_NMI_through_LVT0 - enable NMI through local vector table 0 155 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
156 */ 156 */
157void enable_NMI_through_LVT0 (void * dummy) 157void __cpuinit enable_NMI_through_LVT0(void)
158{ 158{
159 unsigned int v = APIC_DM_NMI; 159 unsigned int v = APIC_DM_NMI;
160 160
@@ -379,8 +379,10 @@ void __init setup_boot_APIC_clock(void)
379 */ 379 */
380 if (local_apic_timer_disabled) { 380 if (local_apic_timer_disabled) {
381 /* No broadcast on UP ! */ 381 /* No broadcast on UP ! */
382 if (num_possible_cpus() > 1) 382 if (num_possible_cpus() > 1) {
383 lapic_clockevent.mult = 1;
383 setup_APIC_timer(); 384 setup_APIC_timer();
385 }
384 return; 386 return;
385 } 387 }
386 388
@@ -434,7 +436,7 @@ void __init setup_boot_APIC_clock(void)
434 "with PM Timer: %ldms instead of 100ms\n", 436 "with PM Timer: %ldms instead of 100ms\n",
435 (long)res); 437 (long)res);
436 /* Correct the lapic counter value */ 438 /* Correct the lapic counter value */
437 res = (((u64) delta ) * pm_100ms); 439 res = (((u64) delta) * pm_100ms);
438 do_div(res, deltapm); 440 do_div(res, deltapm);
439 printk(KERN_INFO "APIC delta adjusted to PM-Timer: " 441 printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
440 "%lu (%ld)\n", (unsigned long) res, delta); 442 "%lu (%ld)\n", (unsigned long) res, delta);
@@ -472,6 +474,19 @@ void __init setup_boot_APIC_clock(void)
472 474
473 local_apic_timer_verify_ok = 1; 475 local_apic_timer_verify_ok = 1;
474 476
477 /*
478 * Do a sanity check on the APIC calibration result
479 */
480 if (calibration_result < (1000000 / HZ)) {
481 local_irq_enable();
482 printk(KERN_WARNING
483 "APIC frequency too slow, disabling apic timer\n");
484 /* No broadcast on UP ! */
485 if (num_possible_cpus() > 1)
486 setup_APIC_timer();
487 return;
488 }
489
475 /* We trust the pm timer based calibration */ 490 /* We trust the pm timer based calibration */
476 if (!pm_referenced) { 491 if (!pm_referenced) {
477 apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); 492 apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
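
The sanity check added in the hunk above compares calibration_result (APIC timer ticks per jiffy) against 1000000/HZ, which amounts to requiring the timer clock to run at 1 MHz or more. A small standalone illustration with made-up numbers (the HZ value is assumed for the example only):

#include <stdio.h>

#define HZ 250                          /* assumed tick rate, for illustration only */

int main(void)
{
        long result = 800000;           /* hypothetical: 0.8 MHz APIC timer clock */
        long calibration_result = result / HZ;

        if (calibration_result < (1000000 / HZ))
                printf("APIC frequency too slow (%ld Hz), apic timer disabled\n",
                       result);
        else
                printf("APIC timer ok: %ld ticks per jiffy\n", calibration_result);
        return 0;
}
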
@@ -563,6 +578,9 @@ static void local_apic_timer_interrupt(void)
563 return; 578 return;
564 } 579 }
565 580
581 /*
582 * the NMI deadlock-detector uses this.
583 */
566 per_cpu(irq_stat, cpu).apic_timer_irqs++; 584 per_cpu(irq_stat, cpu).apic_timer_irqs++;
567 585
568 evt->event_handler(evt); 586 evt->event_handler(evt);
@@ -576,8 +594,7 @@ static void local_apic_timer_interrupt(void)
576 * [ if a single-CPU system runs an SMP kernel then we call the local 594 * [ if a single-CPU system runs an SMP kernel then we call the local
577 * interrupt as well. Thus we cannot inline the local irq ... ] 595 * interrupt as well. Thus we cannot inline the local irq ... ]
578 */ 596 */
579 597void smp_apic_timer_interrupt(struct pt_regs *regs)
580void fastcall smp_apic_timer_interrupt(struct pt_regs *regs)
581{ 598{
582 struct pt_regs *old_regs = set_irq_regs(regs); 599 struct pt_regs *old_regs = set_irq_regs(regs);
583 600
@@ -616,9 +633,14 @@ int setup_profiling_timer(unsigned int multiplier)
616 */ 633 */
617void clear_local_APIC(void) 634void clear_local_APIC(void)
618{ 635{
619 int maxlvt = lapic_get_maxlvt(); 636 int maxlvt;
620 unsigned long v; 637 u32 v;
638
639 /* APIC hasn't been mapped yet */
640 if (!apic_phys)
641 return;
621 642
643 maxlvt = lapic_get_maxlvt();
622 /* 644 /*
623 * Masking an LVT entry can trigger a local APIC error 645 * Masking an LVT entry can trigger a local APIC error
624 * if the vector is zero. Mask LVTERR first to prevent this. 646 * if the vector is zero. Mask LVTERR first to prevent this.
@@ -976,7 +998,8 @@ void __cpuinit setup_local_APIC(void)
976 value |= APIC_LVT_LEVEL_TRIGGER; 998 value |= APIC_LVT_LEVEL_TRIGGER;
977 apic_write_around(APIC_LVT1, value); 999 apic_write_around(APIC_LVT1, value);
978 1000
979 if (integrated && !esr_disable) { /* !82489DX */ 1001 if (integrated && !esr_disable) {
1002 /* !82489DX */
980 maxlvt = lapic_get_maxlvt(); 1003 maxlvt = lapic_get_maxlvt();
981 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ 1004 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
982 apic_write(APIC_ESR, 0); 1005 apic_write(APIC_ESR, 0);
@@ -1020,7 +1043,7 @@ void __cpuinit setup_local_APIC(void)
1020/* 1043/*
1021 * Detect and initialize APIC 1044 * Detect and initialize APIC
1022 */ 1045 */
1023static int __init detect_init_APIC (void) 1046static int __init detect_init_APIC(void)
1024{ 1047{
1025 u32 h, l, features; 1048 u32 h, l, features;
1026 1049
@@ -1077,7 +1100,7 @@ static int __init detect_init_APIC (void)
1077 printk(KERN_WARNING "Could not enable APIC!\n"); 1100 printk(KERN_WARNING "Could not enable APIC!\n");
1078 return -1; 1101 return -1;
1079 } 1102 }
1080 set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); 1103 set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1081 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; 1104 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
1082 1105
1083 /* The BIOS may have set up the APIC at some other address */ 1106 /* The BIOS may have set up the APIC at some other address */
@@ -1104,8 +1127,6 @@ no_apic:
1104 */ 1127 */
1105void __init init_apic_mappings(void) 1128void __init init_apic_mappings(void)
1106{ 1129{
1107 unsigned long apic_phys;
1108
1109 /* 1130 /*
1110 * If no local APIC can be found then set up a fake all 1131 * If no local APIC can be found then set up a fake all
1111 * zeroes page to simulate the local APIC and another 1132 * zeroes page to simulate the local APIC and another
@@ -1164,10 +1185,10 @@ fake_ioapic_page:
1164 * This initializes the IO-APIC and APIC hardware if this is 1185 * This initializes the IO-APIC and APIC hardware if this is
1165 * a UP kernel. 1186 * a UP kernel.
1166 */ 1187 */
1167int __init APIC_init_uniprocessor (void) 1188int __init APIC_init_uniprocessor(void)
1168{ 1189{
1169 if (enable_local_apic < 0) 1190 if (enable_local_apic < 0)
1170 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); 1191 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1171 1192
1172 if (!smp_found_config && !cpu_has_apic) 1193 if (!smp_found_config && !cpu_has_apic)
1173 return -1; 1194 return -1;
@@ -1179,7 +1200,7 @@ int __init APIC_init_uniprocessor (void)
1179 APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { 1200 APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
1180 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", 1201 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
1181 boot_cpu_physical_apicid); 1202 boot_cpu_physical_apicid);
1182 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); 1203 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1183 return -1; 1204 return -1;
1184 } 1205 }
1185 1206
@@ -1210,50 +1231,6 @@ int __init APIC_init_uniprocessor (void)
1210} 1231}
1211 1232
1212/* 1233/*
1213 * APIC command line parameters
1214 */
1215static int __init parse_lapic(char *arg)
1216{
1217 enable_local_apic = 1;
1218 return 0;
1219}
1220early_param("lapic", parse_lapic);
1221
1222static int __init parse_nolapic(char *arg)
1223{
1224 enable_local_apic = -1;
1225 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
1226 return 0;
1227}
1228early_param("nolapic", parse_nolapic);
1229
1230static int __init parse_disable_lapic_timer(char *arg)
1231{
1232 local_apic_timer_disabled = 1;
1233 return 0;
1234}
1235early_param("nolapic_timer", parse_disable_lapic_timer);
1236
1237static int __init parse_lapic_timer_c2_ok(char *arg)
1238{
1239 local_apic_timer_c2_ok = 1;
1240 return 0;
1241}
1242early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1243
1244static int __init apic_set_verbosity(char *str)
1245{
1246 if (strcmp("debug", str) == 0)
1247 apic_verbosity = APIC_DEBUG;
1248 else if (strcmp("verbose", str) == 0)
1249 apic_verbosity = APIC_VERBOSE;
1250 return 1;
1251}
1252
1253__setup("apic=", apic_set_verbosity);
1254
1255
1256/*
1257 * Local APIC interrupts 1234 * Local APIC interrupts
1258 */ 1235 */
1259 1236
@@ -1306,7 +1283,7 @@ void smp_error_interrupt(struct pt_regs *regs)
1306 6: Received illegal vector 1283 6: Received illegal vector
1307 7: Illegal register address 1284 7: Illegal register address
1308 */ 1285 */
1309 printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", 1286 printk(KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n",
1310 smp_processor_id(), v , v1); 1287 smp_processor_id(), v , v1);
1311 irq_exit(); 1288 irq_exit();
1312} 1289}
@@ -1393,7 +1370,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1393 value = apic_read(APIC_LVT0); 1370 value = apic_read(APIC_LVT0);
1394 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | 1371 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1395 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | 1372 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1396 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); 1373 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1397 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; 1374 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1398 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); 1375 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1399 apic_write_around(APIC_LVT0, value); 1376 apic_write_around(APIC_LVT0, value);
@@ -1530,7 +1507,7 @@ static int lapic_resume(struct sys_device *dev)
1530 */ 1507 */
1531 1508
1532static struct sysdev_class lapic_sysclass = { 1509static struct sysdev_class lapic_sysclass = {
1533 set_kset_name("lapic"), 1510 .name = "lapic",
1534 .resume = lapic_resume, 1511 .resume = lapic_resume,
1535 .suspend = lapic_suspend, 1512 .suspend = lapic_suspend,
1536}; 1513};
@@ -1565,3 +1542,46 @@ device_initcall(init_lapic_sysfs);
1565static void apic_pm_activate(void) { } 1542static void apic_pm_activate(void) { }
1566 1543
1567#endif /* CONFIG_PM */ 1544#endif /* CONFIG_PM */
1545
1546/*
1547 * APIC command line parameters
1548 */
1549static int __init parse_lapic(char *arg)
1550{
1551 enable_local_apic = 1;
1552 return 0;
1553}
1554early_param("lapic", parse_lapic);
1555
1556static int __init parse_nolapic(char *arg)
1557{
1558 enable_local_apic = -1;
1559 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1560 return 0;
1561}
1562early_param("nolapic", parse_nolapic);
1563
1564static int __init parse_disable_lapic_timer(char *arg)
1565{
1566 local_apic_timer_disabled = 1;
1567 return 0;
1568}
1569early_param("nolapic_timer", parse_disable_lapic_timer);
1570
1571static int __init parse_lapic_timer_c2_ok(char *arg)
1572{
1573 local_apic_timer_c2_ok = 1;
1574 return 0;
1575}
1576early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1577
1578static int __init apic_set_verbosity(char *str)
1579{
1580 if (strcmp("debug", str) == 0)
1581 apic_verbosity = APIC_DEBUG;
1582 else if (strcmp("verbose", str) == 0)
1583 apic_verbosity = APIC_VERBOSE;
1584 return 1;
1585}
1586__setup("apic=", apic_set_verbosity);
1587
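
The boot parameters moved to the end of the file above drive a single tri-state flag: enable_local_apic is -1 for force-disable ("nolapic"), +1 for force-enable ("lapic"), and 0 by default. A minimal userspace sketch of that tri-state handling (the parsing helper below is hypothetical; in the kernel these handlers are wired up through early_param()):

#include <stdio.h>
#include <string.h>

/* tri-state as in the file above: -1 force-disable, +1 force-enable, 0 auto */
static int enable_local_apic;

static void parse_boot_option(const char *opt)
{
        if (strcmp(opt, "lapic") == 0)
                enable_local_apic = 1;
        else if (strcmp(opt, "nolapic") == 0)
                enable_local_apic = -1;
}

int main(void)
{
        parse_boot_option("nolapic");
        printf("enable_local_apic = %d\n", enable_local_apic);
        return 0;
}
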
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index f28ccb588fba..d8d03e09dea2 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -23,32 +23,37 @@
23#include <linux/mc146818rtc.h> 23#include <linux/mc146818rtc.h>
24#include <linux/kernel_stat.h> 24#include <linux/kernel_stat.h>
25#include <linux/sysdev.h> 25#include <linux/sysdev.h>
26#include <linux/module.h>
27#include <linux/ioport.h> 26#include <linux/ioport.h>
28#include <linux/clockchips.h> 27#include <linux/clockchips.h>
28#include <linux/acpi_pmtmr.h>
29#include <linux/module.h>
29 30
30#include <asm/atomic.h> 31#include <asm/atomic.h>
31#include <asm/smp.h> 32#include <asm/smp.h>
32#include <asm/mtrr.h> 33#include <asm/mtrr.h>
33#include <asm/mpspec.h> 34#include <asm/mpspec.h>
35#include <asm/hpet.h>
34#include <asm/pgalloc.h> 36#include <asm/pgalloc.h>
35#include <asm/mach_apic.h> 37#include <asm/mach_apic.h>
36#include <asm/nmi.h> 38#include <asm/nmi.h>
37#include <asm/idle.h> 39#include <asm/idle.h>
38#include <asm/proto.h> 40#include <asm/proto.h>
39#include <asm/timex.h> 41#include <asm/timex.h>
40#include <asm/hpet.h>
41#include <asm/apic.h> 42#include <asm/apic.h>
42 43
43int apic_verbosity;
44int disable_apic_timer __cpuinitdata; 44int disable_apic_timer __cpuinitdata;
45static int apic_calibrate_pmtmr __initdata; 45static int apic_calibrate_pmtmr __initdata;
46int disable_apic;
46 47
47/* Local APIC timer works in C2? */ 48/* Local APIC timer works in C2 */
48int local_apic_timer_c2_ok; 49int local_apic_timer_c2_ok;
49EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); 50EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
50 51
51static struct resource *ioapic_resources; 52/*
53 * Debug level, exported for io_apic.c
54 */
55int apic_verbosity;
56
52static struct resource lapic_resource = { 57static struct resource lapic_resource = {
53 .name = "Local APIC", 58 .name = "Local APIC",
54 .flags = IORESOURCE_MEM | IORESOURCE_BUSY, 59 .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
@@ -60,10 +65,8 @@ static int lapic_next_event(unsigned long delta,
60 struct clock_event_device *evt); 65 struct clock_event_device *evt);
61static void lapic_timer_setup(enum clock_event_mode mode, 66static void lapic_timer_setup(enum clock_event_mode mode,
62 struct clock_event_device *evt); 67 struct clock_event_device *evt);
63
64static void lapic_timer_broadcast(cpumask_t mask); 68static void lapic_timer_broadcast(cpumask_t mask);
65 69static void apic_pm_activate(void);
66static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen);
67 70
68static struct clock_event_device lapic_clockevent = { 71static struct clock_event_device lapic_clockevent = {
69 .name = "lapic", 72 .name = "lapic",
@@ -78,6 +81,150 @@ static struct clock_event_device lapic_clockevent = {
78}; 81};
79static DEFINE_PER_CPU(struct clock_event_device, lapic_events); 82static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
80 83
84static unsigned long apic_phys;
85
86/*
87 * Get the LAPIC version
88 */
89static inline int lapic_get_version(void)
90{
91 return GET_APIC_VERSION(apic_read(APIC_LVR));
92}
93
94/*
 95 * Check, if the APIC is integrated or a separate chip
96 */
97static inline int lapic_is_integrated(void)
98{
99 return 1;
100}
101
102/*
103 * Check, whether this is a modern or a first generation APIC
104 */
105static int modern_apic(void)
106{
107 /* AMD systems use old APIC versions, so check the CPU */
108 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
109 boot_cpu_data.x86 >= 0xf)
110 return 1;
111 return lapic_get_version() >= 0x14;
112}
113
114void apic_wait_icr_idle(void)
115{
116 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
117 cpu_relax();
118}
119
120u32 safe_apic_wait_icr_idle(void)
121{
122 u32 send_status;
123 int timeout;
124
125 timeout = 0;
126 do {
127 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
128 if (!send_status)
129 break;
130 udelay(100);
131 } while (timeout++ < 1000);
132
133 return send_status;
134}
135
136/**
137 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
138 */
139void __cpuinit enable_NMI_through_LVT0(void)
140{
141 unsigned int v;
142
143 /* unmask and set to NMI */
144 v = APIC_DM_NMI;
145 apic_write(APIC_LVT0, v);
146}
147
148/**
149 * lapic_get_maxlvt - get the maximum number of local vector table entries
150 */
151int lapic_get_maxlvt(void)
152{
153 unsigned int v, maxlvt;
154
155 v = apic_read(APIC_LVR);
156 maxlvt = GET_APIC_MAXLVT(v);
157 return maxlvt;
158}
159
160/*
161 * This function sets up the local APIC timer, with a timeout of
162 * 'clocks' APIC bus clock. During calibration we actually call
163 * this function twice on the boot CPU, once with a bogus timeout
164 * value, second time for real. The other (noncalibrating) CPUs
165 * call this function only once, with the real, calibrated value.
166 *
167 * We do reads before writes even if unnecessary, to get around the
168 * P5 APIC double write bug.
169 */
170
171static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
172{
173 unsigned int lvtt_value, tmp_value;
174
175 lvtt_value = LOCAL_TIMER_VECTOR;
176 if (!oneshot)
177 lvtt_value |= APIC_LVT_TIMER_PERIODIC;
178 if (!irqen)
179 lvtt_value |= APIC_LVT_MASKED;
180
181 apic_write(APIC_LVTT, lvtt_value);
182
183 /*
184 * Divide PICLK by 16
185 */
186 tmp_value = apic_read(APIC_TDCR);
187 apic_write(APIC_TDCR, (tmp_value
188 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
189 | APIC_TDR_DIV_16);
190
191 if (!oneshot)
192 apic_write(APIC_TMICT, clocks);
193}
194
195/*
196 * Setup extended LVT, AMD specific (K8, family 10h)
197 *
198 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
199 * MCE interrupts are supported. Thus MCE offset must be set to 0.
200 */
201
202#define APIC_EILVT_LVTOFF_MCE 0
203#define APIC_EILVT_LVTOFF_IBS 1
204
205static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
206{
207 unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
208 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
209
210 apic_write(reg, v);
211}
212
213u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
214{
215 setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
216 return APIC_EILVT_LVTOFF_MCE;
217}
218
219u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
220{
221 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
222 return APIC_EILVT_LVTOFF_IBS;
223}
224
225/*
226 * Program the next event, relative to now
227 */
81static int lapic_next_event(unsigned long delta, 228static int lapic_next_event(unsigned long delta,
82 struct clock_event_device *evt) 229 struct clock_event_device *evt)
83{ 230{
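
The AMD extended-LVT helpers added in the hunk above select a register by offset and pack the entry as (mask << 16) | (msg_type << 8) | vector; the comment notes that offset 0 corresponds to APIC500. A standalone sketch of that packing (the example values are made up, and the 0x500 base is taken from that comment rather than from a register reference):

#include <stdint.h>
#include <stdio.h>

#define APIC_EILVT0     0x500           /* extended LVT base, per the "APIC500" comment */

static unsigned int eilvt_reg(uint8_t lvt_off)
{
        return ((unsigned int)lvt_off << 4) + APIC_EILVT0;  /* one register per 0x10 */
}

static unsigned int eilvt_value(uint8_t vector, uint8_t msg_type, uint8_t mask)
{
        return ((unsigned int)mask << 16) | ((unsigned int)msg_type << 8) | vector;
}

int main(void)
{
        /* hypothetical IBS entry: offset 1, vector 0xf8, made-up message type, unmasked */
        printf("reg %#x <- value %#x\n", eilvt_reg(1), eilvt_value(0xf8, 0x4, 0));
        return 0;
}
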
@@ -85,6 +232,9 @@ static int lapic_next_event(unsigned long delta,
85 return 0; 232 return 0;
86} 233}
87 234
235/*
236 * Setup the lapic timer in periodic or oneshot mode
237 */
88static void lapic_timer_setup(enum clock_event_mode mode, 238static void lapic_timer_setup(enum clock_event_mode mode,
89 struct clock_event_device *evt) 239 struct clock_event_device *evt)
90{ 240{
@@ -127,75 +277,261 @@ static void lapic_timer_broadcast(cpumask_t mask)
127#endif 277#endif
128} 278}
129 279
130static void apic_pm_activate(void); 280/*
 281 * Setup the local APIC timer for this CPU. Copy the initialized values
282 * of the boot CPU and register the clock event in the framework.
283 */
284static void setup_APIC_timer(void)
285{
286 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
131 287
132void apic_wait_icr_idle(void) 288 memcpy(levt, &lapic_clockevent, sizeof(*levt));
289 levt->cpumask = cpumask_of_cpu(smp_processor_id());
290
291 clockevents_register_device(levt);
292}
293
294/*
295 * In this function we calibrate APIC bus clocks to the external
296 * timer. Unfortunately we cannot use jiffies and the timer irq
297 * to calibrate, since some later bootup code depends on getting
298 * the first irq? Ugh.
299 *
300 * We want to do the calibration only once since we
 301 * want to have local timer irqs synchronous. CPUs connected
302 * by the same APIC bus have the very same bus frequency.
303 * And we want to have irqs off anyways, no accidental
304 * APIC irq that way.
305 */
306
307#define TICK_COUNT 100000000
308
309static void __init calibrate_APIC_clock(void)
133{ 310{
134 while (apic_read(APIC_ICR) & APIC_ICR_BUSY) 311 unsigned apic, apic_start;
135 cpu_relax(); 312 unsigned long tsc, tsc_start;
313 int result;
314
315 local_irq_disable();
316
317 /*
318 * Put whatever arbitrary (but long enough) timeout
319 * value into the APIC clock, we just want to get the
320 * counter running for calibration.
321 *
322 * No interrupt enable !
323 */
324 __setup_APIC_LVTT(250000000, 0, 0);
325
326 apic_start = apic_read(APIC_TMCCT);
327#ifdef CONFIG_X86_PM_TIMER
328 if (apic_calibrate_pmtmr && pmtmr_ioport) {
329 pmtimer_wait(5000); /* 5ms wait */
330 apic = apic_read(APIC_TMCCT);
331 result = (apic_start - apic) * 1000L / 5;
332 } else
333#endif
334 {
335 rdtscll(tsc_start);
336
337 do {
338 apic = apic_read(APIC_TMCCT);
339 rdtscll(tsc);
340 } while ((tsc - tsc_start) < TICK_COUNT &&
341 (apic_start - apic) < TICK_COUNT);
342
343 result = (apic_start - apic) * 1000L * tsc_khz /
344 (tsc - tsc_start);
345 }
346
347 local_irq_enable();
348
349 printk(KERN_DEBUG "APIC timer calibration result %d\n", result);
350
351 printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n",
352 result / 1000 / 1000, result / 1000 % 1000);
353
354 /* Calculate the scaled math multiplication factor */
355 lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, 32);
356 lapic_clockevent.max_delta_ns =
357 clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
358 lapic_clockevent.min_delta_ns =
359 clockevent_delta2ns(0xF, &lapic_clockevent);
360
361 calibration_result = result / HZ;
136} 362}
137 363
138unsigned int safe_apic_wait_icr_idle(void) 364/*
365 * Setup the boot APIC
366 *
367 * Calibrate and verify the result.
368 */
369void __init setup_boot_APIC_clock(void)
139{ 370{
140 unsigned int send_status; 371 /*
141 int timeout; 372 * The local apic timer can be disabled via the kernel commandline.
373 * Register the lapic timer as a dummy clock event source on SMP
374 * systems, so the broadcast mechanism is used. On UP systems simply
375 * ignore it.
376 */
377 if (disable_apic_timer) {
378 printk(KERN_INFO "Disabling APIC timer\n");
379 /* No broadcast on UP ! */
380 if (num_possible_cpus() > 1) {
381 lapic_clockevent.mult = 1;
382 setup_APIC_timer();
383 }
384 return;
385 }
142 386
143 timeout = 0; 387 printk(KERN_INFO "Using local APIC timer interrupts.\n");
144 do { 388 calibrate_APIC_clock();
145 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
146 if (!send_status)
147 break;
148 udelay(100);
149 } while (timeout++ < 1000);
150 389
151 return send_status; 390 /*
391 * Do a sanity check on the APIC calibration result
392 */
393 if (calibration_result < (1000000 / HZ)) {
394 printk(KERN_WARNING
395 "APIC frequency too slow, disabling apic timer\n");
396 /* No broadcast on UP ! */
397 if (num_possible_cpus() > 1)
398 setup_APIC_timer();
399 return;
400 }
401
402 /*
403 * If nmi_watchdog is set to IO_APIC, we need the
404 * PIT/HPET going. Otherwise register lapic as a dummy
405 * device.
406 */
407 if (nmi_watchdog != NMI_IO_APIC)
408 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
409 else
410 printk(KERN_WARNING "APIC timer registered as dummy,"
411 " due to nmi_watchdog=1!\n");
412
413 setup_APIC_timer();
152} 414}
153 415
154void enable_NMI_through_LVT0 (void * dummy) 416/*
417 * AMD C1E enabled CPUs have a real nasty problem: Some BIOSes set the
418 * C1E flag only in the secondary CPU, so when we detect the wreckage
419 * we already have enabled the boot CPU local apic timer. Check, if
420 * disable_apic_timer is set and the DUMMY flag is cleared. If yes,
421 * set the DUMMY flag again and force the broadcast mode in the
422 * clockevents layer.
423 */
424void __cpuinit check_boot_apic_timer_broadcast(void)
155{ 425{
156 unsigned int v; 426 if (!disable_apic_timer ||
427 (lapic_clockevent.features & CLOCK_EVT_FEAT_DUMMY))
428 return;
157 429
158 /* unmask and set to NMI */ 430 printk(KERN_INFO "AMD C1E detected late. Force timer broadcast.\n");
159 v = APIC_DM_NMI; 431 lapic_clockevent.features |= CLOCK_EVT_FEAT_DUMMY;
160 apic_write(APIC_LVT0, v); 432
433 local_irq_enable();
434 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &boot_cpu_id);
435 local_irq_disable();
161} 436}
162 437
163int get_maxlvt(void) 438void __cpuinit setup_secondary_APIC_clock(void)
164{ 439{
165 unsigned int v, maxlvt; 440 check_boot_apic_timer_broadcast();
441 setup_APIC_timer();
442}
166 443
167 v = apic_read(APIC_LVR); 444/*
168 maxlvt = GET_APIC_MAXLVT(v); 445 * The guts of the apic timer interrupt
169 return maxlvt; 446 */
447static void local_apic_timer_interrupt(void)
448{
449 int cpu = smp_processor_id();
450 struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
451
452 /*
453 * Normally we should not be here till LAPIC has been initialized but
454 * in some cases like kdump, its possible that there is a pending LAPIC
455 * timer interrupt from previous kernel's context and is delivered in
456 * new kernel the moment interrupts are enabled.
457 *
458 * Interrupts are enabled early and LAPIC is setup much later, hence
459 * its possible that when we get here evt->event_handler is NULL.
460 * Check for event_handler being NULL and discard the interrupt as
461 * spurious.
462 */
463 if (!evt->event_handler) {
464 printk(KERN_WARNING
465 "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
466 /* Switch it off */
467 lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
468 return;
469 }
470
471 /*
472 * the NMI deadlock-detector uses this.
473 */
474 add_pda(apic_timer_irqs, 1);
475
476 evt->event_handler(evt);
170} 477}
171 478
172/* 479/*
173 * 'what should we do if we get a hw irq event on an illegal vector'. 480 * Local APIC timer interrupt. This is the most natural way for doing
174 * each architecture has to answer this themselves. 481 * local interrupts, but local timer interrupts can be emulated by
482 * broadcast interrupts too. [in case the hw doesn't support APIC timers]
483 *
484 * [ if a single-CPU system runs an SMP kernel then we call the local
485 * interrupt as well. Thus we cannot inline the local irq ... ]
175 */ 486 */
176void ack_bad_irq(unsigned int irq) 487void smp_apic_timer_interrupt(struct pt_regs *regs)
177{ 488{
178 printk("unexpected IRQ trap at vector %02x\n", irq); 489 struct pt_regs *old_regs = set_irq_regs(regs);
490
179 /* 491 /*
180 * Currently unexpected vectors happen only on SMP and APIC. 492 * NOTE! We'd better ACK the irq immediately,
181 * We _must_ ack these because every local APIC has only N 493 * because timer handling can be slow.
182 * irq slots per priority level, and a 'hanging, unacked' IRQ
183 * holds up an irq slot - in excessive cases (when multiple
184 * unexpected vectors occur) that might lock up the APIC
185 * completely.
186 * But don't ack when the APIC is disabled. -AK
187 */ 494 */
188 if (!disable_apic) 495 ack_APIC_irq();
189 ack_APIC_irq(); 496 /*
497 * update_process_times() expects us to have done irq_enter().
498 * Besides, if we don't timer interrupts ignore the global
499 * interrupt lock, which is the WrongThing (tm) to do.
500 */
501 exit_idle();
502 irq_enter();
503 local_apic_timer_interrupt();
504 irq_exit();
505 set_irq_regs(old_regs);
506}
507
508int setup_profiling_timer(unsigned int multiplier)
509{
510 return -EINVAL;
190} 511}
191 512
513
514/*
515 * Local APIC start and shutdown
516 */
517
518/**
519 * clear_local_APIC - shutdown the local APIC
520 *
521 * This is called, when a CPU is disabled and before rebooting, so the state of
522 * the local APIC has no dangling leftovers. Also used to cleanout any BIOS
523 * leftovers during boot.
524 */
192void clear_local_APIC(void) 525void clear_local_APIC(void)
193{ 526{
194 int maxlvt; 527 int maxlvt = lapic_get_maxlvt();
195 unsigned int v; 528 u32 v;
196 529
197 maxlvt = get_maxlvt(); 530 /* APIC hasn't been mapped yet */
531 if (!apic_phys)
532 return;
198 533
534 maxlvt = lapic_get_maxlvt();
199 /* 535 /*
200 * Masking an LVT entry can trigger a local APIC error 536 * Masking an LVT entry can trigger a local APIC error
201 * if the vector is zero. Mask LVTERR first to prevent this. 537 * if the vector is zero. Mask LVTERR first to prevent this.
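
calibrate_APIC_clock() in the hunk above derives the APIC timer frequency from how far the TSC advanced while the APIC countdown ran, then converts it into a clockevents scale factor with div_sc(). A standalone reproduction of that arithmetic with made-up sample numbers (div_sc(freq, NSEC_PER_SEC, 32) is reproduced inline as (freq << 32) / 10^9):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* sample only: 25,000,000 APIC ticks elapsed while a 2 GHz TSC
         * (tsc_khz = 2,000,000) advanced by 200,000,000 cycles */
        uint64_t apic_ticks = 25000000;
        uint64_t tsc_cycles = 200000000;
        uint64_t tsc_khz    = 2000000;

        /* same formula as calibrate_APIC_clock(): APIC timer ticks per second */
        uint64_t result = apic_ticks * 1000 * tsc_khz / tsc_cycles;

        /* clockevents scale factor: ns-to-ticks via (ns * mult) >> 32 */
        uint64_t mult = (result << 32) / 1000000000ULL;

        printf("APIC timer: %llu Hz, clockevent mult = %llu, ticks/jiffy (HZ=250) = %llu\n",
               (unsigned long long)result,
               (unsigned long long)mult,
               (unsigned long long)(result / 250));
        return 0;
}
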
@@ -233,45 +569,9 @@ void clear_local_APIC(void)
233 apic_read(APIC_ESR); 569 apic_read(APIC_ESR);
234} 570}
235 571
236void disconnect_bsp_APIC(int virt_wire_setup) 572/**
237{ 573 * disable_local_APIC - clear and disable the local APIC
238 /* Go back to Virtual Wire compatibility mode */ 574 */
239 unsigned long value;
240
241 /* For the spurious interrupt use vector F, and enable it */
242 value = apic_read(APIC_SPIV);
243 value &= ~APIC_VECTOR_MASK;
244 value |= APIC_SPIV_APIC_ENABLED;
245 value |= 0xf;
246 apic_write(APIC_SPIV, value);
247
248 if (!virt_wire_setup) {
249 /*
250 * For LVT0 make it edge triggered, active high,
251 * external and enabled
252 */
253 value = apic_read(APIC_LVT0);
254 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
255 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
256 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
257 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
258 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
259 apic_write(APIC_LVT0, value);
260 } else {
261 /* Disable LVT0 */
262 apic_write(APIC_LVT0, APIC_LVT_MASKED);
263 }
264
265 /* For LVT1 make it edge triggered, active high, nmi and enabled */
266 value = apic_read(APIC_LVT1);
267 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
268 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
269 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
270 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
271 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
272 apic_write(APIC_LVT1, value);
273}
274
275void disable_local_APIC(void) 575void disable_local_APIC(void)
276{ 576{
277 unsigned int value; 577 unsigned int value;
@@ -333,7 +633,7 @@ int __init verify_local_APIC(void)
333 reg1 = GET_APIC_VERSION(reg0); 633 reg1 = GET_APIC_VERSION(reg0);
334 if (reg1 == 0x00 || reg1 == 0xff) 634 if (reg1 == 0x00 || reg1 == 0xff)
335 return 0; 635 return 0;
336 reg1 = get_maxlvt(); 636 reg1 = lapic_get_maxlvt();
337 if (reg1 < 0x02 || reg1 == 0xff) 637 if (reg1 < 0x02 || reg1 == 0xff)
338 return 0; 638 return 0;
339 639
@@ -355,18 +655,20 @@ int __init verify_local_APIC(void)
355 * compatibility mode, but most boxes are anymore. 655 * compatibility mode, but most boxes are anymore.
356 */ 656 */
357 reg0 = apic_read(APIC_LVT0); 657 reg0 = apic_read(APIC_LVT0);
358 apic_printk(APIC_DEBUG,"Getting LVT0: %x\n", reg0); 658 apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0);
359 reg1 = apic_read(APIC_LVT1); 659 reg1 = apic_read(APIC_LVT1);
360 apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); 660 apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
361 661
362 return 1; 662 return 1;
363} 663}
364 664
665/**
666 * sync_Arb_IDs - synchronize APIC bus arbitration IDs
667 */
365void __init sync_Arb_IDs(void) 668void __init sync_Arb_IDs(void)
366{ 669{
367 /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */ 670 /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */
368 unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); 671 if (modern_apic())
369 if (ver >= 0x14) /* P4 or higher */
370 return; 672 return;
371 673
372 /* 674 /*
@@ -418,9 +720,12 @@ void __init init_bsp_APIC(void)
418 apic_write(APIC_LVT1, value); 720 apic_write(APIC_LVT1, value);
419} 721}
420 722
421void __cpuinit setup_local_APIC (void) 723/**
724 * setup_local_APIC - setup the local APIC
725 */
726void __cpuinit setup_local_APIC(void)
422{ 727{
423 unsigned int value, maxlvt; 728 unsigned int value;
424 int i, j; 729 int i, j;
425 730
426 value = apic_read(APIC_LVR); 731 value = apic_read(APIC_LVR);
@@ -516,30 +821,217 @@ void __cpuinit setup_local_APIC (void)
516 else 821 else
517 value = APIC_DM_NMI | APIC_LVT_MASKED; 822 value = APIC_DM_NMI | APIC_LVT_MASKED;
518 apic_write(APIC_LVT1, value); 823 apic_write(APIC_LVT1, value);
824}
519 825
520 { 826void __cpuinit lapic_setup_esr(void)
521 unsigned oldvalue; 827{
522 maxlvt = get_maxlvt(); 828 unsigned maxlvt = lapic_get_maxlvt();
523 oldvalue = apic_read(APIC_ESR); 829
524 value = ERROR_APIC_VECTOR; // enables sending errors 830 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR);
525 apic_write(APIC_LVTERR, value); 831 /*
526 /* 832 * spec says clear errors after enabling vector.
527 * spec says clear errors after enabling vector. 833 */
528 */ 834 if (maxlvt > 3)
529 if (maxlvt > 3) 835 apic_write(APIC_ESR, 0);
530 apic_write(APIC_ESR, 0); 836}
531 value = apic_read(APIC_ESR);
532 if (value != oldvalue)
533 apic_printk(APIC_VERBOSE,
534 "ESR value after enabling vector: %08x, after %08x\n",
535 oldvalue, value);
536 }
537 837
838void __cpuinit end_local_APIC_setup(void)
839{
840 lapic_setup_esr();
538 nmi_watchdog_default(); 841 nmi_watchdog_default();
539 setup_apic_nmi_watchdog(NULL); 842 setup_apic_nmi_watchdog(NULL);
540 apic_pm_activate(); 843 apic_pm_activate();
541} 844}
542 845
846/*
847 * Detect and enable local APICs on non-SMP boards.
848 * Original code written by Keir Fraser.
849 * On AMD64 we trust the BIOS - if it says no APIC it is likely
850 * not correctly set up (usually the APIC timer won't work etc.)
851 */
852static int __init detect_init_APIC(void)
853{
854 if (!cpu_has_apic) {
855 printk(KERN_INFO "No local APIC present\n");
856 return -1;
857 }
858
859 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
860 boot_cpu_id = 0;
861 return 0;
862}
863
864/**
865 * init_apic_mappings - initialize APIC mappings
866 */
867void __init init_apic_mappings(void)
868{
869 /*
870 * If no local APIC can be found then set up a fake all
871 * zeroes page to simulate the local APIC and another
872 * one for the IO-APIC.
873 */
874 if (!smp_found_config && detect_init_APIC()) {
875 apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
876 apic_phys = __pa(apic_phys);
877 } else
878 apic_phys = mp_lapic_addr;
879
880 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
881 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
882 APIC_BASE, apic_phys);
883
884 /* Put local APIC into the resource map. */
885 lapic_resource.start = apic_phys;
886 lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
887 insert_resource(&iomem_resource, &lapic_resource);
888
889 /*
890 * Fetch the APIC ID of the BSP in case we have a
891 * default configuration (or the MP table is broken).
892 */
893 boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
894}
895
896/*
897 * This initializes the IO-APIC and APIC hardware if this is
898 * a UP kernel.
899 */
900int __init APIC_init_uniprocessor(void)
901{
902 if (disable_apic) {
903 printk(KERN_INFO "Apic disabled\n");
904 return -1;
905 }
906 if (!cpu_has_apic) {
907 disable_apic = 1;
908 printk(KERN_INFO "Apic disabled by BIOS\n");
909 return -1;
910 }
911
912 verify_local_APIC();
913
914 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
915 apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id));
916
917 setup_local_APIC();
918
919 /*
920 * Now enable IO-APICs, actually call clear_IO_APIC
921 * We need clear_IO_APIC before enabling vector on BP
922 */
923 if (!skip_ioapic_setup && nr_ioapics)
924 enable_IO_APIC();
925
926 end_local_APIC_setup();
927
928 if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
929 setup_IO_APIC();
930 else
931 nr_ioapics = 0;
932 setup_boot_APIC_clock();
933 check_nmi_watchdog();
934 return 0;
935}
936
937/*
938 * Local APIC interrupts
939 */
940
941/*
942 * This interrupt should _never_ happen with our APIC/SMP architecture
943 */
944asmlinkage void smp_spurious_interrupt(void)
945{
946 unsigned int v;
947 exit_idle();
948 irq_enter();
949 /*
950 * Check if this really is a spurious interrupt and ACK it
951 * if it is a vectored one. Just in case...
952 * Spurious interrupts should not be ACKed.
953 */
954 v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
955 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
956 ack_APIC_irq();
957
958 add_pda(irq_spurious_count, 1);
959 irq_exit();
960}
961
962/*
963 * This interrupt should never happen with our APIC/SMP architecture
964 */
965asmlinkage void smp_error_interrupt(void)
966{
967 unsigned int v, v1;
968
969 exit_idle();
970 irq_enter();
971 /* First tickle the hardware, only then report what went on. -- REW */
972 v = apic_read(APIC_ESR);
973 apic_write(APIC_ESR, 0);
974 v1 = apic_read(APIC_ESR);
975 ack_APIC_irq();
976 atomic_inc(&irq_err_count);
977
978 /* Here is what the APIC error bits mean:
979 0: Send CS error
980 1: Receive CS error
981 2: Send accept error
982 3: Receive accept error
983 4: Reserved
984 5: Send illegal vector
985 6: Received illegal vector
986 7: Illegal register address
987 */
988 printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
989 smp_processor_id(), v , v1);
990 irq_exit();
991}
992
993void disconnect_bsp_APIC(int virt_wire_setup)
994{
995 /* Go back to Virtual Wire compatibility mode */
996 unsigned long value;
997
998 /* For the spurious interrupt use vector F, and enable it */
999 value = apic_read(APIC_SPIV);
1000 value &= ~APIC_VECTOR_MASK;
1001 value |= APIC_SPIV_APIC_ENABLED;
1002 value |= 0xf;
1003 apic_write(APIC_SPIV, value);
1004
1005 if (!virt_wire_setup) {
1006 /*
1007 * For LVT0 make it edge triggered, active high,
1008 * external and enabled
1009 */
1010 value = apic_read(APIC_LVT0);
1011 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1012 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1013 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1014 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1015 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1016 apic_write(APIC_LVT0, value);
1017 } else {
1018 /* Disable LVT0 */
1019 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1020 }
1021
1022 /* For LVT1 make it edge triggered, active high, nmi and enabled */
1023 value = apic_read(APIC_LVT1);
1024 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1025 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1026 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1027 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1028 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
1029 apic_write(APIC_LVT1, value);
1030}
1031
1032/*
1033 * Power management
1034 */
543#ifdef CONFIG_PM 1035#ifdef CONFIG_PM
544 1036
545static struct { 1037static struct {
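
smp_spurious_interrupt() in the hunk above reads APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1) and tests bit (vector & 0x1f): the in-service state is an array of eight 32-bit registers spaced 0x10 apart, each covering 32 vectors. A standalone sketch of that index arithmetic (the 0x100 base offset is an assumption about the register map):

#include <stdio.h>

#define APIC_ISR        0x100           /* assumed base offset of the ISR register array */

int main(void)
{
        unsigned int vector = 0xff;     /* typical spurious vector */

        unsigned int reg = APIC_ISR + ((vector & ~0x1fu) >> 1);  /* (vector/32) * 0x10 */
        unsigned int bit = vector & 0x1f;

        printf("vector %#x -> ISR register offset %#x, bit %u\n", vector, reg, bit);
        return 0;
}
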
@@ -571,7 +1063,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
571 if (!apic_pm_state.active) 1063 if (!apic_pm_state.active)
572 return 0; 1064 return 0;
573 1065
574 maxlvt = get_maxlvt(); 1066 maxlvt = lapic_get_maxlvt();
575 1067
576 apic_pm_state.apic_id = apic_read(APIC_ID); 1068 apic_pm_state.apic_id = apic_read(APIC_ID);
577 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); 1069 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
@@ -605,7 +1097,7 @@ static int lapic_resume(struct sys_device *dev)
605 if (!apic_pm_state.active) 1097 if (!apic_pm_state.active)
606 return 0; 1098 return 0;
607 1099
608 maxlvt = get_maxlvt(); 1100 maxlvt = lapic_get_maxlvt();
609 1101
610 local_irq_save(flags); 1102 local_irq_save(flags);
611 rdmsr(MSR_IA32_APICBASE, l, h); 1103 rdmsr(MSR_IA32_APICBASE, l, h);
@@ -639,14 +1131,14 @@ static int lapic_resume(struct sys_device *dev)
639} 1131}
640 1132
641static struct sysdev_class lapic_sysclass = { 1133static struct sysdev_class lapic_sysclass = {
642 set_kset_name("lapic"), 1134 .name = "lapic",
643 .resume = lapic_resume, 1135 .resume = lapic_resume,
644 .suspend = lapic_suspend, 1136 .suspend = lapic_suspend,
645}; 1137};
646 1138
647static struct sys_device device_lapic = { 1139static struct sys_device device_lapic = {
648 .id = 0, 1140 .id = 0,
649 .cls = &lapic_sysclass, 1141 .cls = &lapic_sysclass,
650}; 1142};
651 1143
652static void __cpuinit apic_pm_activate(void) 1144static void __cpuinit apic_pm_activate(void)
@@ -657,9 +1149,11 @@ static void __cpuinit apic_pm_activate(void)
657static int __init init_lapic_sysfs(void) 1149static int __init init_lapic_sysfs(void)
658{ 1150{
659 int error; 1151 int error;
1152
660 if (!cpu_has_apic) 1153 if (!cpu_has_apic)
661 return 0; 1154 return 0;
662 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ 1155 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
1156
663 error = sysdev_class_register(&lapic_sysclass); 1157 error = sysdev_class_register(&lapic_sysclass);
664 if (!error) 1158 if (!error)
665 error = sysdev_register(&device_lapic); 1159 error = sysdev_register(&device_lapic);
@@ -673,423 +1167,6 @@ static void apic_pm_activate(void) { }
673 1167
674#endif /* CONFIG_PM */ 1168#endif /* CONFIG_PM */
675 1169
676static int __init apic_set_verbosity(char *str)
677{
678 if (str == NULL) {
679 skip_ioapic_setup = 0;
680 ioapic_force = 1;
681 return 0;
682 }
683 if (strcmp("debug", str) == 0)
684 apic_verbosity = APIC_DEBUG;
685 else if (strcmp("verbose", str) == 0)
686 apic_verbosity = APIC_VERBOSE;
687 else {
688 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
689 " use apic=verbose or apic=debug\n", str);
690 return -EINVAL;
691 }
692
693 return 0;
694}
695early_param("apic", apic_set_verbosity);
696
697/*
698 * Detect and enable local APICs on non-SMP boards.
699 * Original code written by Keir Fraser.
700 * On AMD64 we trust the BIOS - if it says no APIC it is likely
701 * not correctly set up (usually the APIC timer won't work etc.)
702 */
703
704static int __init detect_init_APIC (void)
705{
706 if (!cpu_has_apic) {
707 printk(KERN_INFO "No local APIC present\n");
708 return -1;
709 }
710
711 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
712 boot_cpu_id = 0;
713 return 0;
714}
715
716#ifdef CONFIG_X86_IO_APIC
717static struct resource * __init ioapic_setup_resources(void)
718{
719#define IOAPIC_RESOURCE_NAME_SIZE 11
720 unsigned long n;
721 struct resource *res;
722 char *mem;
723 int i;
724
725 if (nr_ioapics <= 0)
726 return NULL;
727
728 n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
729 n *= nr_ioapics;
730
731 mem = alloc_bootmem(n);
732 res = (void *)mem;
733
734 if (mem != NULL) {
735 memset(mem, 0, n);
736 mem += sizeof(struct resource) * nr_ioapics;
737
738 for (i = 0; i < nr_ioapics; i++) {
739 res[i].name = mem;
740 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
741 sprintf(mem, "IOAPIC %u", i);
742 mem += IOAPIC_RESOURCE_NAME_SIZE;
743 }
744 }
745
746 ioapic_resources = res;
747
748 return res;
749}
750
751static int __init ioapic_insert_resources(void)
752{
753 int i;
754 struct resource *r = ioapic_resources;
755
756 if (!r) {
757 printk("IO APIC resources could be not be allocated.\n");
758 return -1;
759 }
760
761 for (i = 0; i < nr_ioapics; i++) {
762 insert_resource(&iomem_resource, r);
763 r++;
764 }
765
766 return 0;
767}
768
769/* Insert the IO APIC resources after PCI initialization has occured to handle
770 * IO APICS that are mapped in on a BAR in PCI space. */
771late_initcall(ioapic_insert_resources);
772#endif
773
774void __init init_apic_mappings(void)
775{
776 unsigned long apic_phys;
777
778 /*
779 * If no local APIC can be found then set up a fake all
780 * zeroes page to simulate the local APIC and another
781 * one for the IO-APIC.
782 */
783 if (!smp_found_config && detect_init_APIC()) {
784 apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
785 apic_phys = __pa(apic_phys);
786 } else
787 apic_phys = mp_lapic_addr;
788
789 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
790 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
791 APIC_BASE, apic_phys);
792
793 /* Put local APIC into the resource map. */
794 lapic_resource.start = apic_phys;
795 lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
796 insert_resource(&iomem_resource, &lapic_resource);
797
798 /*
799 * Fetch the APIC ID of the BSP in case we have a
800 * default configuration (or the MP table is broken).
801 */
802 boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
803
804 {
805 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
806 int i;
807 struct resource *ioapic_res;
808
809 ioapic_res = ioapic_setup_resources();
810 for (i = 0; i < nr_ioapics; i++) {
811 if (smp_found_config) {
812 ioapic_phys = mp_ioapics[i].mpc_apicaddr;
813 } else {
814 ioapic_phys = (unsigned long)
815 alloc_bootmem_pages(PAGE_SIZE);
816 ioapic_phys = __pa(ioapic_phys);
817 }
818 set_fixmap_nocache(idx, ioapic_phys);
819 apic_printk(APIC_VERBOSE,
820 "mapped IOAPIC to %016lx (%016lx)\n",
821 __fix_to_virt(idx), ioapic_phys);
822 idx++;
823
824 if (ioapic_res != NULL) {
825 ioapic_res->start = ioapic_phys;
826 ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
827 ioapic_res++;
828 }
829 }
830 }
831}
832
833/*
834 * This function sets up the local APIC timer, with a timeout of
835 * 'clocks' APIC bus clock. During calibration we actually call
836 * this function twice on the boot CPU, once with a bogus timeout
837 * value, second time for real. The other (noncalibrating) CPUs
838 * call this function only once, with the real, calibrated value.
839 *
840 * We do reads before writes even if unnecessary, to get around the
841 * P5 APIC double write bug.
842 */
843
844static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
845{
846 unsigned int lvtt_value, tmp_value;
847
848 lvtt_value = LOCAL_TIMER_VECTOR;
849 if (!oneshot)
850 lvtt_value |= APIC_LVT_TIMER_PERIODIC;
851 if (!irqen)
852 lvtt_value |= APIC_LVT_MASKED;
853
854 apic_write(APIC_LVTT, lvtt_value);
855
856 /*
857 * Divide PICLK by 16
858 */
859 tmp_value = apic_read(APIC_TDCR);
860 apic_write(APIC_TDCR, (tmp_value
861 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
862 | APIC_TDR_DIV_16);
863
864 if (!oneshot)
865 apic_write(APIC_TMICT, clocks);
866}
867
868static void setup_APIC_timer(void)
869{
870 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
871
872 memcpy(levt, &lapic_clockevent, sizeof(*levt));
873 levt->cpumask = cpumask_of_cpu(smp_processor_id());
874
875 clockevents_register_device(levt);
876}
877
878/*
879 * In this function we calibrate APIC bus clocks to the external
880 * timer. Unfortunately we cannot use jiffies and the timer irq
881 * to calibrate, since some later bootup code depends on getting
882 * the first irq? Ugh.
883 *
884 * We want to do the calibration only once since we
885 * want to have local timer irqs syncron. CPUs connected
886 * by the same APIC bus have the very same bus frequency.
887 * And we want to have irqs off anyways, no accidental
888 * APIC irq that way.
889 */
890
891#define TICK_COUNT 100000000
892
893static void __init calibrate_APIC_clock(void)
894{
895 unsigned apic, apic_start;
896 unsigned long tsc, tsc_start;
897 int result;
898
899 local_irq_disable();
900
901 /*
902 * Put whatever arbitrary (but long enough) timeout
903 * value into the APIC clock, we just want to get the
904 * counter running for calibration.
905 *
906 * No interrupt enable !
907 */
908 __setup_APIC_LVTT(250000000, 0, 0);
909
910 apic_start = apic_read(APIC_TMCCT);
911#ifdef CONFIG_X86_PM_TIMER
912 if (apic_calibrate_pmtmr && pmtmr_ioport) {
913 pmtimer_wait(5000); /* 5ms wait */
914 apic = apic_read(APIC_TMCCT);
915 result = (apic_start - apic) * 1000L / 5;
916 } else
917#endif
918 {
919 rdtscll(tsc_start);
920
921 do {
922 apic = apic_read(APIC_TMCCT);
923 rdtscll(tsc);
924 } while ((tsc - tsc_start) < TICK_COUNT &&
925 (apic_start - apic) < TICK_COUNT);
926
927 result = (apic_start - apic) * 1000L * tsc_khz /
928 (tsc - tsc_start);
929 }
930
931 local_irq_enable();
932
933 printk(KERN_DEBUG "APIC timer calibration result %d\n", result);
934
935 printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n",
936 result / 1000 / 1000, result / 1000 % 1000);
937
938 /* Calculate the scaled math multiplication factor */
939 lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, 32);
940 lapic_clockevent.max_delta_ns =
941 clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
942 lapic_clockevent.min_delta_ns =
943 clockevent_delta2ns(0xF, &lapic_clockevent);
944
945 calibration_result = result / HZ;
946}
947
948void __init setup_boot_APIC_clock (void)
949{
950 /*
951 * The local apic timer can be disabled via the kernel commandline.
952 * Register the lapic timer as a dummy clock event source on SMP
953 * systems, so the broadcast mechanism is used. On UP systems simply
954 * ignore it.
955 */
956 if (disable_apic_timer) {
957 printk(KERN_INFO "Disabling APIC timer\n");
958 /* No broadcast on UP ! */
959 if (num_possible_cpus() > 1)
960 setup_APIC_timer();
961 return;
962 }
963
964 printk(KERN_INFO "Using local APIC timer interrupts.\n");
965 calibrate_APIC_clock();
966
967 /*
968 * If nmi_watchdog is set to IO_APIC, we need the
969 * PIT/HPET going. Otherwise register lapic as a dummy
970 * device.
971 */
972 if (nmi_watchdog != NMI_IO_APIC)
973 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
974 else
975 printk(KERN_WARNING "APIC timer registered as dummy,"
976 " due to nmi_watchdog=1!\n");
977
978 setup_APIC_timer();
979}
980
981/*
982 * AMD C1E enabled CPUs have a real nasty problem: Some BIOSes set the
983 * C1E flag only in the secondary CPU, so when we detect the wreckage
984 * we already have enabled the boot CPU local apic timer. Check, if
985 * disable_apic_timer is set and the DUMMY flag is cleared. If yes,
986 * set the DUMMY flag again and force the broadcast mode in the
987 * clockevents layer.
988 */
989void __cpuinit check_boot_apic_timer_broadcast(void)
990{
991 if (!disable_apic_timer ||
992 (lapic_clockevent.features & CLOCK_EVT_FEAT_DUMMY))
993 return;
994
995 printk(KERN_INFO "AMD C1E detected late. Force timer broadcast.\n");
996 lapic_clockevent.features |= CLOCK_EVT_FEAT_DUMMY;
997
998 local_irq_enable();
999 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &boot_cpu_id);
1000 local_irq_disable();
1001}
1002
1003void __cpuinit setup_secondary_APIC_clock(void)
1004{
1005 check_boot_apic_timer_broadcast();
1006 setup_APIC_timer();
1007}
1008
1009int setup_profiling_timer(unsigned int multiplier)
1010{
1011 return -EINVAL;
1012}
1013
1014void setup_APIC_extended_lvt(unsigned char lvt_off, unsigned char vector,
1015 unsigned char msg_type, unsigned char mask)
1016{
1017 unsigned long reg = (lvt_off << 4) + K8_APIC_EXT_LVT_BASE;
1018 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
1019 apic_write(reg, v);
1020}
1021
1022/*
1023 * Local timer interrupt handler. It does both profiling and
1024 * process statistics/rescheduling.
1025 *
1026 * We do profiling in every local tick, statistics/rescheduling
1027 * happen only every 'profiling multiplier' ticks. The default
1028 * multiplier is 1 and it can be changed by writing the new multiplier
1029 * value into /proc/profile.
1030 */
1031
1032void smp_local_timer_interrupt(void)
1033{
1034 int cpu = smp_processor_id();
1035 struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
1036
1037 /*
1038 * Normally we should not be here till LAPIC has been initialized but
1039 * in some cases like kdump, its possible that there is a pending LAPIC
1040 * timer interrupt from previous kernel's context and is delivered in
1041 * new kernel the moment interrupts are enabled.
1042 *
1043 * Interrupts are enabled early and LAPIC is setup much later, hence
1044 * its possible that when we get here evt->event_handler is NULL.
1045 * Check for event_handler being NULL and discard the interrupt as
1046 * spurious.
1047 */
1048 if (!evt->event_handler) {
1049 printk(KERN_WARNING
1050 "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
1051 /* Switch it off */
1052 lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
1053 return;
1054 }
1055
1056 /*
1057 * the NMI deadlock-detector uses this.
1058 */
1059 add_pda(apic_timer_irqs, 1);
1060
1061 evt->event_handler(evt);
1062}
1063
1064/*
1065 * Local APIC timer interrupt. This is the most natural way for doing
1066 * local interrupts, but local timer interrupts can be emulated by
1067 * broadcast interrupts too. [in case the hw doesn't support APIC timers]
1068 *
1069 * [ if a single-CPU system runs an SMP kernel then we call the local
1070 * interrupt as well. Thus we cannot inline the local irq ... ]
1071 */
1072void smp_apic_timer_interrupt(struct pt_regs *regs)
1073{
1074 struct pt_regs *old_regs = set_irq_regs(regs);
1075
1076 /*
1077 * NOTE! We'd better ACK the irq immediately,
1078 * because timer handling can be slow.
1079 */
1080 ack_APIC_irq();
1081 /*
1082 * update_process_times() expects us to have done irq_enter().
1083 * Besides, if we don't timer interrupts ignore the global
1084 * interrupt lock, which is the WrongThing (tm) to do.
1085 */
1086 exit_idle();
1087 irq_enter();
1088 smp_local_timer_interrupt();
1089 irq_exit();
1090 set_irq_regs(old_regs);
1091}
1092
1093/* 1170/*
1094 * apic_is_clustered_box() -- Check if we can expect good TSC 1171 * apic_is_clustered_box() -- Check if we can expect good TSC
1095 * 1172 *
@@ -1103,21 +1180,34 @@ __cpuinit int apic_is_clustered_box(void)
1103{ 1180{
1104 int i, clusters, zeros; 1181 int i, clusters, zeros;
1105 unsigned id; 1182 unsigned id;
1183 u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
1106 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); 1184 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
1107 1185
1108 bitmap_zero(clustermap, NUM_APIC_CLUSTERS); 1186 bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
1109 1187
1110 for (i = 0; i < NR_CPUS; i++) { 1188 for (i = 0; i < NR_CPUS; i++) {
1111 id = bios_cpu_apicid[i]; 1189 /* are we being called early in kernel startup? */
1190 if (bios_cpu_apicid) {
1191 id = bios_cpu_apicid[i];
1192 }
1193 else if (i < nr_cpu_ids) {
1194 if (cpu_present(i))
1195 id = per_cpu(x86_bios_cpu_apicid, i);
1196 else
1197 continue;
1198 }
1199 else
1200 break;
1201
1112 if (id != BAD_APICID) 1202 if (id != BAD_APICID)
1113 __set_bit(APIC_CLUSTERID(id), clustermap); 1203 __set_bit(APIC_CLUSTERID(id), clustermap);
1114 } 1204 }
1115 1205
1116 /* Problem: Partially populated chassis may not have CPUs in some of 1206 /* Problem: Partially populated chassis may not have CPUs in some of
1117 * the APIC clusters they have been allocated. Only present CPUs have 1207 * the APIC clusters they have been allocated. Only present CPUs have
1118 * bios_cpu_apicid entries, thus causing zeroes in the bitmap. Since 1208 * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap.
1119 * clusters are allocated sequentially, count zeros only if they are 1209 * Since clusters are allocated sequentially, count zeros only if
1120 * bounded by ones. 1210 * they are bounded by ones.
1121 */ 1211 */
1122 clusters = 0; 1212 clusters = 0;
1123 zeros = 0; 1213 zeros = 0;
@@ -1138,96 +1228,33 @@ __cpuinit int apic_is_clustered_box(void)
1138} 1228}
1139 1229
1140/* 1230/*
1141 * This interrupt should _never_ happen with our APIC/SMP architecture 1231 * APIC command line parameters
1142 */
1143asmlinkage void smp_spurious_interrupt(void)
1144{
1145 unsigned int v;
1146 exit_idle();
1147 irq_enter();
1148 /*
1149 * Check if this really is a spurious interrupt and ACK it
1150 * if it is a vectored one. Just in case...
1151 * Spurious interrupts should not be ACKed.
1152 */
1153 v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
1154 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
1155 ack_APIC_irq();
1156
1157 add_pda(irq_spurious_count, 1);
1158 irq_exit();
1159}
1160
1161/*
1162 * This interrupt should never happen with our APIC/SMP architecture
1163 */ 1232 */
1164 1233static int __init apic_set_verbosity(char *str)
1165asmlinkage void smp_error_interrupt(void)
1166{
1167 unsigned int v, v1;
1168
1169 exit_idle();
1170 irq_enter();
1171 /* First tickle the hardware, only then report what went on. -- REW */
1172 v = apic_read(APIC_ESR);
1173 apic_write(APIC_ESR, 0);
1174 v1 = apic_read(APIC_ESR);
1175 ack_APIC_irq();
1176 atomic_inc(&irq_err_count);
1177
1178 /* Here is what the APIC error bits mean:
1179 0: Send CS error
1180 1: Receive CS error
1181 2: Send accept error
1182 3: Receive accept error
1183 4: Reserved
1184 5: Send illegal vector
1185 6: Received illegal vector
1186 7: Illegal register address
1187 */
1188 printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
1189 smp_processor_id(), v , v1);
1190 irq_exit();
1191}
1192
1193int disable_apic;
1194
1195/*
1196 * This initializes the IO-APIC and APIC hardware if this is
1197 * a UP kernel.
1198 */
1199int __init APIC_init_uniprocessor (void)
1200{ 1234{
1201 if (disable_apic) { 1235 if (str == NULL) {
1202 printk(KERN_INFO "Apic disabled\n"); 1236 skip_ioapic_setup = 0;
1203 return -1; 1237 ioapic_force = 1;
1238 return 0;
1204 } 1239 }
1205 if (!cpu_has_apic) { 1240 if (strcmp("debug", str) == 0)
1206 disable_apic = 1; 1241 apic_verbosity = APIC_DEBUG;
1207 printk(KERN_INFO "Apic disabled by BIOS\n"); 1242 else if (strcmp("verbose", str) == 0)
1208 return -1; 1243 apic_verbosity = APIC_VERBOSE;
1244 else {
1245 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
1246 " use apic=verbose or apic=debug\n", str);
1247 return -EINVAL;
1209 } 1248 }
1210 1249
1211 verify_local_APIC();
1212
1213 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
1214 apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id));
1215
1216 setup_local_APIC();
1217
1218 if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
1219 setup_IO_APIC();
1220 else
1221 nr_ioapics = 0;
1222 setup_boot_APIC_clock();
1223 check_nmi_watchdog();
1224 return 0; 1250 return 0;
1225} 1251}
1252early_param("apic", apic_set_verbosity);
1226 1253
1227static __init int setup_disableapic(char *str) 1254static __init int setup_disableapic(char *str)
1228{ 1255{
1229 disable_apic = 1; 1256 disable_apic = 1;
1230 clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); 1257 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1231 return 0; 1258 return 0;
1232} 1259}
1233early_param("disableapic", setup_disableapic); 1260early_param("disableapic", setup_disableapic);
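The rewritten apic_set_verbosity() and setup_disableapic() above are registered with early_param(), so they run while the kernel command line is parsed. A hedged user-space sketch of the same string matching (the verbosity constants are stand-ins for the kernel's APIC_QUIET/APIC_VERBOSE/APIC_DEBUG and the function name is made up):

#include <stdio.h>
#include <string.h>

enum { QUIET, VERBOSE, DEBUG };         /* stand-in verbosity levels */
static int verbosity = QUIET;

static int parse_apic_option(const char *str)
{
        if (str == NULL)                /* bare "apic": force the IO-APIC on */
                return 0;
        if (strcmp(str, "debug") == 0)
                verbosity = DEBUG;
        else if (strcmp(str, "verbose") == 0)
                verbosity = VERBOSE;
        else
                return -1;              /* unrecognised level */
        return 0;
}

int main(void)
{
        parse_apic_option("verbose");
        printf("verbosity=%d\n", verbosity);    /* 1 */
        return 0;
}

On a real command line this corresponds to booting with "apic", "apic=verbose" or "apic=debug".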
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 17089a041028..d4438ef296d8 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -227,6 +227,7 @@
227#include <linux/dmi.h> 227#include <linux/dmi.h>
228#include <linux/suspend.h> 228#include <linux/suspend.h>
229#include <linux/kthread.h> 229#include <linux/kthread.h>
230#include <linux/jiffies.h>
230 231
231#include <asm/system.h> 232#include <asm/system.h>
232#include <asm/uaccess.h> 233#include <asm/uaccess.h>
@@ -235,8 +236,6 @@
235#include <asm/paravirt.h> 236#include <asm/paravirt.h>
236#include <asm/reboot.h> 237#include <asm/reboot.h>
237 238
238#include "io_ports.h"
239
240#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) 239#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
241extern int (*console_blank_hook)(int); 240extern int (*console_blank_hook)(int);
242#endif 241#endif
@@ -324,7 +323,7 @@ extern int (*console_blank_hook)(int);
324/* 323/*
325 * Ignore suspend events for this amount of time after a resume 324 * Ignore suspend events for this amount of time after a resume
326 */ 325 */
327#define DEFAULT_BOUNCE_INTERVAL (3 * HZ) 326#define DEFAULT_BOUNCE_INTERVAL (3 * HZ)
328 327
329/* 328/*
330 * Maximum number of events stored 329 * Maximum number of events stored
@@ -336,7 +335,7 @@ extern int (*console_blank_hook)(int);
336 */ 335 */
337struct apm_user { 336struct apm_user {
338 int magic; 337 int magic;
339 struct apm_user * next; 338 struct apm_user *next;
340 unsigned int suser: 1; 339 unsigned int suser: 1;
341 unsigned int writer: 1; 340 unsigned int writer: 1;
342 unsigned int reader: 1; 341 unsigned int reader: 1;
@@ -372,44 +371,44 @@ struct apm_user {
372static struct { 371static struct {
373 unsigned long offset; 372 unsigned long offset;
374 unsigned short segment; 373 unsigned short segment;
375} apm_bios_entry; 374} apm_bios_entry;
376static int clock_slowed; 375static int clock_slowed;
377static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD; 376static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD;
378static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD; 377static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD;
379static int set_pm_idle; 378static int set_pm_idle;
380static int suspends_pending; 379static int suspends_pending;
381static int standbys_pending; 380static int standbys_pending;
382static int ignore_sys_suspend; 381static int ignore_sys_suspend;
383static int ignore_normal_resume; 382static int ignore_normal_resume;
384static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL; 383static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL;
385 384
386static int debug __read_mostly; 385static int debug __read_mostly;
387static int smp __read_mostly; 386static int smp __read_mostly;
388static int apm_disabled = -1; 387static int apm_disabled = -1;
389#ifdef CONFIG_SMP 388#ifdef CONFIG_SMP
390static int power_off; 389static int power_off;
391#else 390#else
392static int power_off = 1; 391static int power_off = 1;
393#endif 392#endif
394#ifdef CONFIG_APM_REAL_MODE_POWER_OFF 393#ifdef CONFIG_APM_REAL_MODE_POWER_OFF
395static int realmode_power_off = 1; 394static int realmode_power_off = 1;
396#else 395#else
397static int realmode_power_off; 396static int realmode_power_off;
398#endif 397#endif
399#ifdef CONFIG_APM_ALLOW_INTS 398#ifdef CONFIG_APM_ALLOW_INTS
400static int allow_ints = 1; 399static int allow_ints = 1;
401#else 400#else
402static int allow_ints; 401static int allow_ints;
403#endif 402#endif
404static int broken_psr; 403static int broken_psr;
405 404
406static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); 405static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
407static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); 406static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
408static struct apm_user * user_list; 407static struct apm_user *user_list;
409static DEFINE_SPINLOCK(user_list_lock); 408static DEFINE_SPINLOCK(user_list_lock);
410static const struct desc_struct bad_bios_desc = { 0, 0x00409200 }; 409static const struct desc_struct bad_bios_desc = { { { 0, 0x00409200 } } };
411 410
412static const char driver_version[] = "1.16ac"; /* no spaces */ 411static const char driver_version[] = "1.16ac"; /* no spaces */
413 412
414static struct task_struct *kapmd_task; 413static struct task_struct *kapmd_task;
415 414
@@ -417,7 +416,7 @@ static struct task_struct *kapmd_task;
417 * APM event names taken from the APM 1.2 specification. These are 416 * APM event names taken from the APM 1.2 specification. These are
418 * the message codes that the BIOS uses to tell us about events 417 * the message codes that the BIOS uses to tell us about events
419 */ 418 */
420static const char * const apm_event_name[] = { 419static const char * const apm_event_name[] = {
421 "system standby", 420 "system standby",
422 "system suspend", 421 "system suspend",
423 "normal resume", 422 "normal resume",
@@ -435,14 +434,14 @@ static const char * const apm_event_name[] = {
435 434
436typedef struct lookup_t { 435typedef struct lookup_t {
437 int key; 436 int key;
438 char * msg; 437 char *msg;
439} lookup_t; 438} lookup_t;
440 439
441/* 440/*
442 * The BIOS returns a set of standard error codes in AX when the 441 * The BIOS returns a set of standard error codes in AX when the
443 * carry flag is set. 442 * carry flag is set.
444 */ 443 */
445 444
446static const lookup_t error_table[] = { 445static const lookup_t error_table[] = {
447/* N/A { APM_SUCCESS, "Operation succeeded" }, */ 446/* N/A { APM_SUCCESS, "Operation succeeded" }, */
448 { APM_DISABLED, "Power management disabled" }, 447 { APM_DISABLED, "Power management disabled" },
@@ -472,24 +471,25 @@ static const lookup_t error_table[] = {
472 * Write a meaningful log entry to the kernel log in the event of 471 * Write a meaningful log entry to the kernel log in the event of
473 * an APM error. 472 * an APM error.
474 */ 473 */
475 474
476static void apm_error(char *str, int err) 475static void apm_error(char *str, int err)
477{ 476{
478 int i; 477 int i;
479 478
480 for (i = 0; i < ERROR_COUNT; i++) 479 for (i = 0; i < ERROR_COUNT; i++)
481 if (error_table[i].key == err) break; 480 if (error_table[i].key == err)
481 break;
482 if (i < ERROR_COUNT) 482 if (i < ERROR_COUNT)
483 printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg); 483 printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg);
484 else 484 else
485 printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n", 485 printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n",
486 str, err); 486 str, err);
487} 487}
488 488
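error_table[] and apm_error() above translate the status codes that the APM BIOS returns in AX into readable log messages. A hedged standalone sketch of the same lookup-table pattern (the two entries are placeholders, not a claim about the full APM_* code list):

#include <stdio.h>

struct lookup {
        int key;
        const char *msg;
};

static const struct lookup error_table[] = {
        { 0x01, "Power management disabled" },
        { 0x60, "Unable to enter requested state" },
};
#define ERROR_COUNT (sizeof(error_table) / sizeof(error_table[0]))

static void report_error(const char *what, int err)
{
        size_t i;

        for (i = 0; i < ERROR_COUNT; i++)
                if (error_table[i].key == err)
                        break;
        if (i < ERROR_COUNT)
                printf("apm: %s: %s\n", what, error_table[i].msg);
        else
                printf("apm: %s: unknown error code %#2.2x\n", what, err);
}

int main(void)
{
        report_error("standby", 0x60);
        return 0;
}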
489/* 489/*
490 * Lock APM functionality to physical CPU 0 490 * Lock APM functionality to physical CPU 0
491 */ 491 */
492 492
493#ifdef CONFIG_SMP 493#ifdef CONFIG_SMP
494 494
495static cpumask_t apm_save_cpus(void) 495static cpumask_t apm_save_cpus(void)
@@ -511,7 +511,7 @@ static inline void apm_restore_cpus(cpumask_t mask)
511/* 511/*
512 * No CPU lockdown needed on a uniprocessor 512 * No CPU lockdown needed on a uniprocessor
513 */ 513 */
514 514
515#define apm_save_cpus() (current->cpus_allowed) 515#define apm_save_cpus() (current->cpus_allowed)
516#define apm_restore_cpus(x) (void)(x) 516#define apm_restore_cpus(x) (void)(x)
517 517
@@ -590,7 +590,7 @@ static inline void apm_irq_restore(unsigned long flags)
590 * code is returned in AH (bits 8-15 of eax) and this function 590 * code is returned in AH (bits 8-15 of eax) and this function
591 * returns non-zero. 591 * returns non-zero.
592 */ 592 */
593 593
594static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in, 594static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in,
595 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, u32 *esi) 595 u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, u32 *esi)
596{ 596{
@@ -602,7 +602,7 @@ static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in,
602 struct desc_struct *gdt; 602 struct desc_struct *gdt;
603 603
604 cpus = apm_save_cpus(); 604 cpus = apm_save_cpus();
605 605
606 cpu = get_cpu(); 606 cpu = get_cpu();
607 gdt = get_cpu_gdt_table(cpu); 607 gdt = get_cpu_gdt_table(cpu);
608 save_desc_40 = gdt[0x40 / 8]; 608 save_desc_40 = gdt[0x40 / 8];
@@ -616,7 +616,7 @@ static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in,
616 gdt[0x40 / 8] = save_desc_40; 616 gdt[0x40 / 8] = save_desc_40;
617 put_cpu(); 617 put_cpu();
618 apm_restore_cpus(cpus); 618 apm_restore_cpus(cpus);
619 619
620 return *eax & 0xff; 620 return *eax & 0xff;
621} 621}
622 622
@@ -645,7 +645,7 @@ static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax)
645 struct desc_struct *gdt; 645 struct desc_struct *gdt;
646 646
647 cpus = apm_save_cpus(); 647 cpus = apm_save_cpus();
648 648
649 cpu = get_cpu(); 649 cpu = get_cpu();
650 gdt = get_cpu_gdt_table(cpu); 650 gdt = get_cpu_gdt_table(cpu);
651 save_desc_40 = gdt[0x40 / 8]; 651 save_desc_40 = gdt[0x40 / 8];
@@ -680,7 +680,7 @@ static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax)
680 680
681static int apm_driver_version(u_short *val) 681static int apm_driver_version(u_short *val)
682{ 682{
683 u32 eax; 683 u32 eax;
684 684
685 if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax)) 685 if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax))
686 return (eax >> 8) & 0xff; 686 return (eax >> 8) & 0xff;
@@ -704,16 +704,16 @@ static int apm_driver_version(u_short *val)
704 * that APM 1.2 is in use. If no messges are pending the value 0x80 704 * that APM 1.2 is in use. If no messges are pending the value 0x80
705 * is returned (No power management events pending). 705 * is returned (No power management events pending).
706 */ 706 */
707 707
708static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info) 708static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info)
709{ 709{
710 u32 eax; 710 u32 eax;
711 u32 ebx; 711 u32 ebx;
712 u32 ecx; 712 u32 ecx;
713 u32 dummy; 713 u32 dummy;
714 714
715 if (apm_bios_call(APM_FUNC_GET_EVENT, 0, 0, &eax, &ebx, &ecx, 715 if (apm_bios_call(APM_FUNC_GET_EVENT, 0, 0, &eax, &ebx, &ecx,
716 &dummy, &dummy)) 716 &dummy, &dummy))
717 return (eax >> 8) & 0xff; 717 return (eax >> 8) & 0xff;
718 *event = ebx; 718 *event = ebx;
719 if (apm_info.connection_version < 0x0102) 719 if (apm_info.connection_version < 0x0102)
@@ -736,10 +736,10 @@ static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info)
736 * The state holds the state to transition to, which may in fact 736 * The state holds the state to transition to, which may in fact
737 * be an acceptance of a BIOS requested state change. 737 * be an acceptance of a BIOS requested state change.
738 */ 738 */
739 739
740static int set_power_state(u_short what, u_short state) 740static int set_power_state(u_short what, u_short state)
741{ 741{
742 u32 eax; 742 u32 eax;
743 743
744 if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax)) 744 if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax))
745 return (eax >> 8) & 0xff; 745 return (eax >> 8) & 0xff;
@@ -752,7 +752,7 @@ static int set_power_state(u_short what, u_short state)
752 * 752 *
753 * Transition the entire system into a new APM power state. 753 * Transition the entire system into a new APM power state.
754 */ 754 */
755 755
756static int set_system_power_state(u_short state) 756static int set_system_power_state(u_short state)
757{ 757{
758 return set_power_state(APM_DEVICE_ALL, state); 758 return set_power_state(APM_DEVICE_ALL, state);
@@ -766,13 +766,13 @@ static int set_system_power_state(u_short state)
766 * to handle the idle request. On a success the function returns 1 766 * to handle the idle request. On a success the function returns 1
767 * if the BIOS did clock slowing or 0 otherwise. 767 * if the BIOS did clock slowing or 0 otherwise.
768 */ 768 */
769 769
770static int apm_do_idle(void) 770static int apm_do_idle(void)
771{ 771{
772 u32 eax; 772 u32 eax;
773 u8 ret = 0; 773 u8 ret = 0;
774 int idled = 0; 774 int idled = 0;
775 int polling; 775 int polling;
776 776
777 polling = !!(current_thread_info()->status & TS_POLLING); 777 polling = !!(current_thread_info()->status & TS_POLLING);
778 if (polling) { 778 if (polling) {
@@ -799,10 +799,9 @@ static int apm_do_idle(void)
799 /* This always fails on some SMP boards running UP kernels. 799 /* This always fails on some SMP boards running UP kernels.
800 * Only report the failure the first 5 times. 800 * Only report the failure the first 5 times.
801 */ 801 */
802 if (++t < 5) 802 if (++t < 5) {
803 {
804 printk(KERN_DEBUG "apm_do_idle failed (%d)\n", 803 printk(KERN_DEBUG "apm_do_idle failed (%d)\n",
805 (eax >> 8) & 0xff); 804 (eax >> 8) & 0xff);
806 t = jiffies; 805 t = jiffies;
807 } 806 }
808 return -1; 807 return -1;
@@ -814,15 +813,15 @@ static int apm_do_idle(void)
814/** 813/**
815 * apm_do_busy - inform the BIOS the CPU is busy 814 * apm_do_busy - inform the BIOS the CPU is busy
816 * 815 *
817 * Request that the BIOS brings the CPU back to full performance. 816 * Request that the BIOS brings the CPU back to full performance.
818 */ 817 */
819 818
820static void apm_do_busy(void) 819static void apm_do_busy(void)
821{ 820{
822 u32 dummy; 821 u32 dummy;
823 822
824 if (clock_slowed || ALWAYS_CALL_BUSY) { 823 if (clock_slowed || ALWAYS_CALL_BUSY) {
825 (void) apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy); 824 (void)apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy);
826 clock_slowed = 0; 825 clock_slowed = 0;
827 } 826 }
828} 827}
@@ -833,15 +832,15 @@ static void apm_do_busy(void)
833 * power management - we probably want 832 * power management - we probably want
834 * to conserve power. 833 * to conserve power.
835 */ 834 */
836#define IDLE_CALC_LIMIT (HZ * 100) 835#define IDLE_CALC_LIMIT (HZ * 100)
837#define IDLE_LEAKY_MAX 16 836#define IDLE_LEAKY_MAX 16
838 837
839static void (*original_pm_idle)(void) __read_mostly; 838static void (*original_pm_idle)(void) __read_mostly;
840 839
841/** 840/**
842 * apm_cpu_idle - cpu idling for APM capable Linux 841 * apm_cpu_idle - cpu idling for APM capable Linux
843 * 842 *
844 * This is the idling function the kernel executes when APM is available. It 843 * This is the idling function the kernel executes when APM is available. It
845 * tries to do BIOS powermanagement based on the average system idle time. 844 * tries to do BIOS powermanagement based on the average system idle time.
846 * Furthermore it calls the system default idle routine. 845 * Furthermore it calls the system default idle routine.
847 */ 846 */
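The comment above says apm_cpu_idle() decides from the recent average idle time whether a BIOS idle call is worthwhile. A deliberately simplified, hedged toy of that decision (percentage arithmetic only; the real loop below also keeps a leaky bucket of recent samples so one busy tick does not immediately stop idle calls):

#include <stdio.h>

static int should_call_bios_idle(unsigned int idle_jiffies,
                                 unsigned int total_jiffies,
                                 unsigned int idle_threshold)
{
        unsigned int pct;

        pct = total_jiffies ? idle_jiffies * 100 / total_jiffies : 0;
        return pct >= idle_threshold;
}

int main(void)
{
        printf("%d\n", should_call_bios_idle(97, 100, 95));     /* 1 */
        printf("%d\n", should_call_bios_idle(40, 100, 95));     /* 0 */
        return 0;
}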
@@ -882,7 +881,8 @@ recalc:
882 881
883 t = jiffies; 882 t = jiffies;
884 switch (apm_do_idle()) { 883 switch (apm_do_idle()) {
885 case 0: apm_idle_done = 1; 884 case 0:
885 apm_idle_done = 1;
886 if (t != jiffies) { 886 if (t != jiffies) {
887 if (bucket) { 887 if (bucket) {
888 bucket = IDLE_LEAKY_MAX; 888 bucket = IDLE_LEAKY_MAX;
@@ -893,7 +893,8 @@ recalc:
893 continue; 893 continue;
894 } 894 }
895 break; 895 break;
896 case 1: apm_idle_done = 1; 896 case 1:
897 apm_idle_done = 1;
897 break; 898 break;
898 default: /* BIOS refused */ 899 default: /* BIOS refused */
899 break; 900 break;
@@ -921,10 +922,10 @@ recalc:
921 * the SMP call on CPU0 as some systems will only honour this call 922 * the SMP call on CPU0 as some systems will only honour this call
922 * on their first cpu. 923 * on their first cpu.
923 */ 924 */
924 925
925static void apm_power_off(void) 926static void apm_power_off(void)
926{ 927{
927 unsigned char po_bios_call[] = { 928 unsigned char po_bios_call[] = {
928 0xb8, 0x00, 0x10, /* movw $0x1000,ax */ 929 0xb8, 0x00, 0x10, /* movw $0x1000,ax */
929 0x8e, 0xd0, /* movw ax,ss */ 930 0x8e, 0xd0, /* movw ax,ss */
930 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */ 931 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */
@@ -935,13 +936,12 @@ static void apm_power_off(void)
935 }; 936 };
936 937
937 /* Some bioses don't like being called from CPU != 0 */ 938 /* Some bioses don't like being called from CPU != 0 */
938 if (apm_info.realmode_power_off) 939 if (apm_info.realmode_power_off) {
939 {
940 (void)apm_save_cpus(); 940 (void)apm_save_cpus();
941 machine_real_restart(po_bios_call, sizeof(po_bios_call)); 941 machine_real_restart(po_bios_call, sizeof(po_bios_call));
942 } else {
943 (void)set_system_power_state(APM_STATE_OFF);
942 } 944 }
943 else
944 (void) set_system_power_state(APM_STATE_OFF);
945} 945}
946 946
947#ifdef CONFIG_APM_DO_ENABLE 947#ifdef CONFIG_APM_DO_ENABLE
@@ -950,17 +950,17 @@ static void apm_power_off(void)
950 * apm_enable_power_management - enable BIOS APM power management 950 * apm_enable_power_management - enable BIOS APM power management
951 * @enable: enable yes/no 951 * @enable: enable yes/no
952 * 952 *
953 * Enable or disable the APM BIOS power services. 953 * Enable or disable the APM BIOS power services.
954 */ 954 */
955 955
956static int apm_enable_power_management(int enable) 956static int apm_enable_power_management(int enable)
957{ 957{
958 u32 eax; 958 u32 eax;
959 959
960 if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED)) 960 if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED))
961 return APM_NOT_ENGAGED; 961 return APM_NOT_ENGAGED;
962 if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL, 962 if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL,
963 enable, &eax)) 963 enable, &eax))
964 return (eax >> 8) & 0xff; 964 return (eax >> 8) & 0xff;
965 if (enable) 965 if (enable)
966 apm_info.bios.flags &= ~APM_BIOS_DISABLED; 966 apm_info.bios.flags &= ~APM_BIOS_DISABLED;
@@ -983,19 +983,19 @@ static int apm_enable_power_management(int enable)
983 * if reported is a lifetime in secodnds/minutes at current powwer 983 * if reported is a lifetime in secodnds/minutes at current powwer
984 * consumption. 984 * consumption.
985 */ 985 */
986 986
987static int apm_get_power_status(u_short *status, u_short *bat, u_short *life) 987static int apm_get_power_status(u_short *status, u_short *bat, u_short *life)
988{ 988{
989 u32 eax; 989 u32 eax;
990 u32 ebx; 990 u32 ebx;
991 u32 ecx; 991 u32 ecx;
992 u32 edx; 992 u32 edx;
993 u32 dummy; 993 u32 dummy;
994 994
995 if (apm_info.get_power_status_broken) 995 if (apm_info.get_power_status_broken)
996 return APM_32_UNSUPPORTED; 996 return APM_32_UNSUPPORTED;
997 if (apm_bios_call(APM_FUNC_GET_STATUS, APM_DEVICE_ALL, 0, 997 if (apm_bios_call(APM_FUNC_GET_STATUS, APM_DEVICE_ALL, 0,
998 &eax, &ebx, &ecx, &edx, &dummy)) 998 &eax, &ebx, &ecx, &edx, &dummy))
999 return (eax >> 8) & 0xff; 999 return (eax >> 8) & 0xff;
1000 *status = ebx; 1000 *status = ebx;
1001 *bat = ecx; 1001 *bat = ecx;
@@ -1011,11 +1011,11 @@ static int apm_get_power_status(u_short *status, u_short *bat, u_short *life)
1011static int apm_get_battery_status(u_short which, u_short *status, 1011static int apm_get_battery_status(u_short which, u_short *status,
1012 u_short *bat, u_short *life, u_short *nbat) 1012 u_short *bat, u_short *life, u_short *nbat)
1013{ 1013{
1014 u32 eax; 1014 u32 eax;
1015 u32 ebx; 1015 u32 ebx;
1016 u32 ecx; 1016 u32 ecx;
1017 u32 edx; 1017 u32 edx;
1018 u32 esi; 1018 u32 esi;
1019 1019
1020 if (apm_info.connection_version < 0x0102) { 1020 if (apm_info.connection_version < 0x0102) {
1021 /* pretend we only have one battery. */ 1021 /* pretend we only have one battery. */
@@ -1026,7 +1026,7 @@ static int apm_get_battery_status(u_short which, u_short *status,
1026 } 1026 }
1027 1027
1028 if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax, 1028 if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax,
1029 &ebx, &ecx, &edx, &esi)) 1029 &ebx, &ecx, &edx, &esi))
1030 return (eax >> 8) & 0xff; 1030 return (eax >> 8) & 0xff;
1031 *status = ebx; 1031 *status = ebx;
1032 *bat = ecx; 1032 *bat = ecx;
@@ -1044,10 +1044,10 @@ static int apm_get_battery_status(u_short which, u_short *status,
1044 * Activate or deactive power management on either a specific device 1044 * Activate or deactive power management on either a specific device
1045 * or the entire system (%APM_DEVICE_ALL). 1045 * or the entire system (%APM_DEVICE_ALL).
1046 */ 1046 */
1047 1047
1048static int apm_engage_power_management(u_short device, int enable) 1048static int apm_engage_power_management(u_short device, int enable)
1049{ 1049{
1050 u32 eax; 1050 u32 eax;
1051 1051
1052 if ((enable == 0) && (device == APM_DEVICE_ALL) 1052 if ((enable == 0) && (device == APM_DEVICE_ALL)
1053 && (apm_info.bios.flags & APM_BIOS_DISABLED)) 1053 && (apm_info.bios.flags & APM_BIOS_DISABLED))
@@ -1074,7 +1074,7 @@ static int apm_engage_power_management(u_short device, int enable)
1074 * all video devices. Typically the BIOS will do laptop backlight and 1074 * all video devices. Typically the BIOS will do laptop backlight and
1075 * monitor powerdown for us. 1075 * monitor powerdown for us.
1076 */ 1076 */
1077 1077
1078static int apm_console_blank(int blank) 1078static int apm_console_blank(int blank)
1079{ 1079{
1080 int error = APM_NOT_ENGAGED; /* silence gcc */ 1080 int error = APM_NOT_ENGAGED; /* silence gcc */
@@ -1126,7 +1126,7 @@ static apm_event_t get_queued_event(struct apm_user *as)
1126 1126
1127static void queue_event(apm_event_t event, struct apm_user *sender) 1127static void queue_event(apm_event_t event, struct apm_user *sender)
1128{ 1128{
1129 struct apm_user * as; 1129 struct apm_user *as;
1130 1130
1131 spin_lock(&user_list_lock); 1131 spin_lock(&user_list_lock);
1132 if (user_list == NULL) 1132 if (user_list == NULL)
@@ -1174,11 +1174,11 @@ static void reinit_timer(void)
1174 1174
1175 spin_lock_irqsave(&i8253_lock, flags); 1175 spin_lock_irqsave(&i8253_lock, flags);
1176 /* set the clock to HZ */ 1176 /* set the clock to HZ */
1177 outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ 1177 outb_pit(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
1178 udelay(10); 1178 udelay(10);
1179 outb_p(LATCH & 0xff, PIT_CH0); /* LSB */ 1179 outb_pit(LATCH & 0xff, PIT_CH0); /* LSB */
1180 udelay(10); 1180 udelay(10);
1181 outb(LATCH >> 8, PIT_CH0); /* MSB */ 1181 outb_pit(LATCH >> 8, PIT_CH0); /* MSB */
1182 udelay(10); 1182 udelay(10);
1183 spin_unlock_irqrestore(&i8253_lock, flags); 1183 spin_unlock_irqrestore(&i8253_lock, flags);
1184#endif 1184#endif
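The reinit_timer() hunk above reprograms PIT channel 0 with command byte 0x34 (binary, mode 2, LSB then MSB) and the LATCH reload value. As a rough, hedged illustration of where that value comes from (1193182 Hz is the conventional i8253 input clock; nothing here reads kernel configuration):

#include <stdio.h>

#define PIT_TICK_RATE   1193182UL

static unsigned int pit_latch(unsigned int hz)
{
        return (PIT_TICK_RATE + hz / 2) / hz;   /* rounded divisor */
}

int main(void)
{
        unsigned int latch = pit_latch(100);    /* HZ=100 -> 11932 */

        printf("latch=%u lsb=0x%02x msb=0x%02x\n",
               latch, latch & 0xff, latch >> 8);
        return 0;
}

In mode 2 the counter reloads itself every LATCH input ticks, which is what yields HZ timer interrupts per second.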
@@ -1186,7 +1186,7 @@ static void reinit_timer(void)
1186 1186
1187static int suspend(int vetoable) 1187static int suspend(int vetoable)
1188{ 1188{
1189 int err; 1189 int err;
1190 struct apm_user *as; 1190 struct apm_user *as;
1191 1191
1192 if (pm_send_all(PM_SUSPEND, (void *)3)) { 1192 if (pm_send_all(PM_SUSPEND, (void *)3)) {
@@ -1239,7 +1239,7 @@ static int suspend(int vetoable)
1239 1239
1240static void standby(void) 1240static void standby(void)
1241{ 1241{
1242 int err; 1242 int err;
1243 1243
1244 local_irq_disable(); 1244 local_irq_disable();
1245 device_power_down(PMSG_SUSPEND); 1245 device_power_down(PMSG_SUSPEND);
@@ -1256,8 +1256,8 @@ static void standby(void)
1256 1256
1257static apm_event_t get_event(void) 1257static apm_event_t get_event(void)
1258{ 1258{
1259 int error; 1259 int error;
1260 apm_event_t event = APM_NO_EVENTS; /* silence gcc */ 1260 apm_event_t event = APM_NO_EVENTS; /* silence gcc */
1261 apm_eventinfo_t info; 1261 apm_eventinfo_t info;
1262 1262
1263 static int notified; 1263 static int notified;
@@ -1275,9 +1275,9 @@ static apm_event_t get_event(void)
1275 1275
1276static void check_events(void) 1276static void check_events(void)
1277{ 1277{
1278 apm_event_t event; 1278 apm_event_t event;
1279 static unsigned long last_resume; 1279 static unsigned long last_resume;
1280 static int ignore_bounce; 1280 static int ignore_bounce;
1281 1281
1282 while ((event = get_event()) != 0) { 1282 while ((event = get_event()) != 0) {
1283 if (debug) { 1283 if (debug) {
@@ -1289,7 +1289,7 @@ static void check_events(void)
1289 "event 0x%02x\n", event); 1289 "event 0x%02x\n", event);
1290 } 1290 }
1291 if (ignore_bounce 1291 if (ignore_bounce
1292 && ((jiffies - last_resume) > bounce_interval)) 1292 && (time_after(jiffies, last_resume + bounce_interval)))
1293 ignore_bounce = 0; 1293 ignore_bounce = 0;
1294 1294
1295 switch (event) { 1295 switch (event) {
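The hunk above switches an open-coded jiffies comparison to time_after(), the kernel's wraparound-safe way of asking whether one tick count is later than another. A hedged standalone sketch of the signed-difference idiom behind it (the typedef and names are illustrative, not the kernel macro itself):

#include <stdio.h>

typedef unsigned long jiffies_t;

static int after(jiffies_t a, jiffies_t b)
{
        return (long)(b - a) < 0;       /* true if a is later than b */
}

int main(void)
{
        jiffies_t last = (jiffies_t)-10;        /* just before wraparound */
        jiffies_t now = 5;                      /* just after wraparound */

        printf("%d\n", after(now, last + 3));   /* 1: interval has expired */
        return 0;
}

Comparing the raw values ("now > deadline") would give the wrong answer once the counter wraps between the two samples; the signed difference does not.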
@@ -1357,7 +1357,7 @@ static void check_events(void)
1357 /* 1357 /*
1358 * We are not allowed to reject a critical suspend. 1358 * We are not allowed to reject a critical suspend.
1359 */ 1359 */
1360 (void) suspend(0); 1360 (void)suspend(0);
1361 break; 1361 break;
1362 } 1362 }
1363 } 1363 }
@@ -1365,12 +1365,12 @@ static void check_events(void)
1365 1365
1366static void apm_event_handler(void) 1366static void apm_event_handler(void)
1367{ 1367{
1368 static int pending_count = 4; 1368 static int pending_count = 4;
1369 int err; 1369 int err;
1370 1370
1371 if ((standbys_pending > 0) || (suspends_pending > 0)) { 1371 if ((standbys_pending > 0) || (suspends_pending > 0)) {
1372 if ((apm_info.connection_version > 0x100) && 1372 if ((apm_info.connection_version > 0x100) &&
1373 (pending_count-- <= 0)) { 1373 (pending_count-- <= 0)) {
1374 pending_count = 4; 1374 pending_count = 4;
1375 if (debug) 1375 if (debug)
1376 printk(KERN_DEBUG "apm: setting state busy\n"); 1376 printk(KERN_DEBUG "apm: setting state busy\n");
@@ -1418,9 +1418,9 @@ static int check_apm_user(struct apm_user *as, const char *func)
1418 1418
1419static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos) 1419static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos)
1420{ 1420{
1421 struct apm_user * as; 1421 struct apm_user *as;
1422 int i; 1422 int i;
1423 apm_event_t event; 1423 apm_event_t event;
1424 1424
1425 as = fp->private_data; 1425 as = fp->private_data;
1426 if (check_apm_user(as, "read")) 1426 if (check_apm_user(as, "read"))
@@ -1459,9 +1459,9 @@ static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *
1459 return 0; 1459 return 0;
1460} 1460}
1461 1461
1462static unsigned int do_poll(struct file *fp, poll_table * wait) 1462static unsigned int do_poll(struct file *fp, poll_table *wait)
1463{ 1463{
1464 struct apm_user * as; 1464 struct apm_user *as;
1465 1465
1466 as = fp->private_data; 1466 as = fp->private_data;
1467 if (check_apm_user(as, "poll")) 1467 if (check_apm_user(as, "poll"))
@@ -1472,10 +1472,10 @@ static unsigned int do_poll(struct file *fp, poll_table * wait)
1472 return 0; 1472 return 0;
1473} 1473}
1474 1474
1475static int do_ioctl(struct inode * inode, struct file *filp, 1475static int do_ioctl(struct inode *inode, struct file *filp,
1476 u_int cmd, u_long arg) 1476 u_int cmd, u_long arg)
1477{ 1477{
1478 struct apm_user * as; 1478 struct apm_user *as;
1479 1479
1480 as = filp->private_data; 1480 as = filp->private_data;
1481 if (check_apm_user(as, "ioctl")) 1481 if (check_apm_user(as, "ioctl"))
@@ -1515,9 +1515,9 @@ static int do_ioctl(struct inode * inode, struct file *filp,
1515 return 0; 1515 return 0;
1516} 1516}
1517 1517
1518static int do_release(struct inode * inode, struct file * filp) 1518static int do_release(struct inode *inode, struct file *filp)
1519{ 1519{
1520 struct apm_user * as; 1520 struct apm_user *as;
1521 1521
1522 as = filp->private_data; 1522 as = filp->private_data;
1523 if (check_apm_user(as, "release")) 1523 if (check_apm_user(as, "release"))
@@ -1533,11 +1533,11 @@ static int do_release(struct inode * inode, struct file * filp)
1533 if (suspends_pending <= 0) 1533 if (suspends_pending <= 0)
1534 (void) suspend(1); 1534 (void) suspend(1);
1535 } 1535 }
1536 spin_lock(&user_list_lock); 1536 spin_lock(&user_list_lock);
1537 if (user_list == as) 1537 if (user_list == as)
1538 user_list = as->next; 1538 user_list = as->next;
1539 else { 1539 else {
1540 struct apm_user * as1; 1540 struct apm_user *as1;
1541 1541
1542 for (as1 = user_list; 1542 for (as1 = user_list;
1543 (as1 != NULL) && (as1->next != as); 1543 (as1 != NULL) && (as1->next != as);
@@ -1553,9 +1553,9 @@ static int do_release(struct inode * inode, struct file * filp)
1553 return 0; 1553 return 0;
1554} 1554}
1555 1555
1556static int do_open(struct inode * inode, struct file * filp) 1556static int do_open(struct inode *inode, struct file *filp)
1557{ 1557{
1558 struct apm_user * as; 1558 struct apm_user *as;
1559 1559
1560 as = kmalloc(sizeof(*as), GFP_KERNEL); 1560 as = kmalloc(sizeof(*as), GFP_KERNEL);
1561 if (as == NULL) { 1561 if (as == NULL) {
@@ -1569,7 +1569,7 @@ static int do_open(struct inode * inode, struct file * filp)
1569 as->suspends_read = as->standbys_read = 0; 1569 as->suspends_read = as->standbys_read = 0;
1570 /* 1570 /*
1571 * XXX - this is a tiny bit broken, when we consider BSD 1571 * XXX - this is a tiny bit broken, when we consider BSD
1572 * process accounting. If the device is opened by root, we 1572 * process accounting. If the device is opened by root, we
1573 * instantly flag that we used superuser privs. Who knows, 1573 * instantly flag that we used superuser privs. Who knows,
1574 * we might close the device immediately without doing a 1574 * we might close the device immediately without doing a
1575 * privileged operation -- cevans 1575 * privileged operation -- cevans
@@ -1652,16 +1652,16 @@ static int proc_apm_show(struct seq_file *m, void *v)
1652 8) min = minutes; sec = seconds */ 1652 8) min = minutes; sec = seconds */
1653 1653
1654 seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n", 1654 seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n",
1655 driver_version, 1655 driver_version,
1656 (apm_info.bios.version >> 8) & 0xff, 1656 (apm_info.bios.version >> 8) & 0xff,
1657 apm_info.bios.version & 0xff, 1657 apm_info.bios.version & 0xff,
1658 apm_info.bios.flags, 1658 apm_info.bios.flags,
1659 ac_line_status, 1659 ac_line_status,
1660 battery_status, 1660 battery_status,
1661 battery_flag, 1661 battery_flag,
1662 percentage, 1662 percentage,
1663 time_units, 1663 time_units,
1664 units); 1664 units);
1665 return 0; 1665 return 0;
1666} 1666}
1667 1667
@@ -1684,8 +1684,8 @@ static int apm(void *unused)
1684 unsigned short cx; 1684 unsigned short cx;
1685 unsigned short dx; 1685 unsigned short dx;
1686 int error; 1686 int error;
1687 char * power_stat; 1687 char *power_stat;
1688 char * bat_stat; 1688 char *bat_stat;
1689 1689
1690#ifdef CONFIG_SMP 1690#ifdef CONFIG_SMP
1691 /* 2002/08/01 - WT 1691 /* 2002/08/01 - WT
@@ -1744,23 +1744,41 @@ static int apm(void *unused)
1744 } 1744 }
1745 } 1745 }
1746 1746
1747 if (debug && (num_online_cpus() == 1 || smp )) { 1747 if (debug && (num_online_cpus() == 1 || smp)) {
1748 error = apm_get_power_status(&bx, &cx, &dx); 1748 error = apm_get_power_status(&bx, &cx, &dx);
1749 if (error) 1749 if (error)
1750 printk(KERN_INFO "apm: power status not available\n"); 1750 printk(KERN_INFO "apm: power status not available\n");
1751 else { 1751 else {
1752 switch ((bx >> 8) & 0xff) { 1752 switch ((bx >> 8) & 0xff) {
1753 case 0: power_stat = "off line"; break; 1753 case 0:
1754 case 1: power_stat = "on line"; break; 1754 power_stat = "off line";
1755 case 2: power_stat = "on backup power"; break; 1755 break;
1756 default: power_stat = "unknown"; break; 1756 case 1:
1757 power_stat = "on line";
1758 break;
1759 case 2:
1760 power_stat = "on backup power";
1761 break;
1762 default:
1763 power_stat = "unknown";
1764 break;
1757 } 1765 }
1758 switch (bx & 0xff) { 1766 switch (bx & 0xff) {
1759 case 0: bat_stat = "high"; break; 1767 case 0:
1760 case 1: bat_stat = "low"; break; 1768 bat_stat = "high";
1761 case 2: bat_stat = "critical"; break; 1769 break;
1762 case 3: bat_stat = "charging"; break; 1770 case 1:
1763 default: bat_stat = "unknown"; break; 1771 bat_stat = "low";
1772 break;
1773 case 2:
1774 bat_stat = "critical";
1775 break;
1776 case 3:
1777 bat_stat = "charging";
1778 break;
1779 default:
1780 bat_stat = "unknown";
1781 break;
1764 } 1782 }
1765 printk(KERN_INFO 1783 printk(KERN_INFO
1766 "apm: AC %s, battery status %s, battery life ", 1784 "apm: AC %s, battery status %s, battery life ",
@@ -1777,8 +1795,8 @@ static int apm(void *unused)
1777 printk("unknown\n"); 1795 printk("unknown\n");
1778 else 1796 else
1779 printk("%d %s\n", dx & 0x7fff, 1797 printk("%d %s\n", dx & 0x7fff,
1780 (dx & 0x8000) ? 1798 (dx & 0x8000) ?
1781 "minutes" : "seconds"); 1799 "minutes" : "seconds");
1782 } 1800 }
1783 } 1801 }
1784 } 1802 }
@@ -1803,7 +1821,7 @@ static int apm(void *unused)
1803#ifndef MODULE 1821#ifndef MODULE
1804static int __init apm_setup(char *str) 1822static int __init apm_setup(char *str)
1805{ 1823{
1806 int invert; 1824 int invert;
1807 1825
1808 while ((str != NULL) && (*str != '\0')) { 1826 while ((str != NULL) && (*str != '\0')) {
1809 if (strncmp(str, "off", 3) == 0) 1827 if (strncmp(str, "off", 3) == 0)
@@ -1828,14 +1846,13 @@ static int __init apm_setup(char *str)
1828 if ((strncmp(str, "power-off", 9) == 0) || 1846 if ((strncmp(str, "power-off", 9) == 0) ||
1829 (strncmp(str, "power_off", 9) == 0)) 1847 (strncmp(str, "power_off", 9) == 0))
1830 power_off = !invert; 1848 power_off = !invert;
1831 if (strncmp(str, "smp", 3) == 0) 1849 if (strncmp(str, "smp", 3) == 0) {
1832 {
1833 smp = !invert; 1850 smp = !invert;
1834 idle_threshold = 100; 1851 idle_threshold = 100;
1835 } 1852 }
1836 if ((strncmp(str, "allow-ints", 10) == 0) || 1853 if ((strncmp(str, "allow-ints", 10) == 0) ||
1837 (strncmp(str, "allow_ints", 10) == 0)) 1854 (strncmp(str, "allow_ints", 10) == 0))
1838 apm_info.allow_ints = !invert; 1855 apm_info.allow_ints = !invert;
1839 if ((strncmp(str, "broken-psr", 10) == 0) || 1856 if ((strncmp(str, "broken-psr", 10) == 0) ||
1840 (strncmp(str, "broken_psr", 10) == 0)) 1857 (strncmp(str, "broken_psr", 10) == 0))
1841 apm_info.get_power_status_broken = !invert; 1858 apm_info.get_power_status_broken = !invert;
@@ -1881,7 +1898,8 @@ static int __init print_if_true(const struct dmi_system_id *d)
1881 */ 1898 */
1882static int __init broken_ps2_resume(const struct dmi_system_id *d) 1899static int __init broken_ps2_resume(const struct dmi_system_id *d)
1883{ 1900{
1884 printk(KERN_INFO "%s machine detected. Mousepad Resume Bug workaround hopefully not needed.\n", d->ident); 1901 printk(KERN_INFO "%s machine detected. Mousepad Resume Bug "
1902 "workaround hopefully not needed.\n", d->ident);
1885 return 0; 1903 return 0;
1886} 1904}
1887 1905
@@ -1890,7 +1908,8 @@ static int __init set_realmode_power_off(const struct dmi_system_id *d)
1890{ 1908{
1891 if (apm_info.realmode_power_off == 0) { 1909 if (apm_info.realmode_power_off == 0) {
1892 apm_info.realmode_power_off = 1; 1910 apm_info.realmode_power_off = 1;
1893 printk(KERN_INFO "%s bios detected. Using realmode poweroff only.\n", d->ident); 1911 printk(KERN_INFO "%s bios detected. "
1912 "Using realmode poweroff only.\n", d->ident);
1894 } 1913 }
1895 return 0; 1914 return 0;
1896} 1915}
@@ -1900,7 +1919,8 @@ static int __init set_apm_ints(const struct dmi_system_id *d)
1900{ 1919{
1901 if (apm_info.allow_ints == 0) { 1920 if (apm_info.allow_ints == 0) {
1902 apm_info.allow_ints = 1; 1921 apm_info.allow_ints = 1;
1903 printk(KERN_INFO "%s machine detected. Enabling interrupts during APM calls.\n", d->ident); 1922 printk(KERN_INFO "%s machine detected. "
1923 "Enabling interrupts during APM calls.\n", d->ident);
1904 } 1924 }
1905 return 0; 1925 return 0;
1906} 1926}
@@ -1910,7 +1930,8 @@ static int __init apm_is_horked(const struct dmi_system_id *d)
1910{ 1930{
1911 if (apm_info.disabled == 0) { 1931 if (apm_info.disabled == 0) {
1912 apm_info.disabled = 1; 1932 apm_info.disabled = 1;
1913 printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident); 1933 printk(KERN_INFO "%s machine detected. "
1934 "Disabling APM.\n", d->ident);
1914 } 1935 }
1915 return 0; 1936 return 0;
1916} 1937}
@@ -1919,7 +1940,8 @@ static int __init apm_is_horked_d850md(const struct dmi_system_id *d)
1919{ 1940{
1920 if (apm_info.disabled == 0) { 1941 if (apm_info.disabled == 0) {
1921 apm_info.disabled = 1; 1942 apm_info.disabled = 1;
1922 printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident); 1943 printk(KERN_INFO "%s machine detected. "
1944 "Disabling APM.\n", d->ident);
1923 printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n"); 1945 printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n");
1924 printk(KERN_INFO "download from support.intel.com \n"); 1946 printk(KERN_INFO "download from support.intel.com \n");
1925 } 1947 }
@@ -1931,7 +1953,8 @@ static int __init apm_likes_to_melt(const struct dmi_system_id *d)
1931{ 1953{
1932 if (apm_info.forbid_idle == 0) { 1954 if (apm_info.forbid_idle == 0) {
1933 apm_info.forbid_idle = 1; 1955 apm_info.forbid_idle = 1;
1934 printk(KERN_INFO "%s machine detected. Disabling APM idle calls.\n", d->ident); 1956 printk(KERN_INFO "%s machine detected. "
1957 "Disabling APM idle calls.\n", d->ident);
1935 } 1958 }
1936 return 0; 1959 return 0;
1937} 1960}
@@ -1954,7 +1977,8 @@ static int __init apm_likes_to_melt(const struct dmi_system_id *d)
1954static int __init broken_apm_power(const struct dmi_system_id *d) 1977static int __init broken_apm_power(const struct dmi_system_id *d)
1955{ 1978{
1956 apm_info.get_power_status_broken = 1; 1979 apm_info.get_power_status_broken = 1;
1957 printk(KERN_WARNING "BIOS strings suggest APM bugs, disabling power status reporting.\n"); 1980 printk(KERN_WARNING "BIOS strings suggest APM bugs, "
1981 "disabling power status reporting.\n");
1958 return 0; 1982 return 0;
1959} 1983}
1960 1984
@@ -1965,7 +1989,8 @@ static int __init broken_apm_power(const struct dmi_system_id *d)
1965static int __init swab_apm_power_in_minutes(const struct dmi_system_id *d) 1989static int __init swab_apm_power_in_minutes(const struct dmi_system_id *d)
1966{ 1990{
1967 apm_info.get_power_status_swabinminutes = 1; 1991 apm_info.get_power_status_swabinminutes = 1;
1968 printk(KERN_WARNING "BIOS strings suggest APM reports battery life in minutes and wrong byte order.\n"); 1992 printk(KERN_WARNING "BIOS strings suggest APM reports battery life "
1993 "in minutes and wrong byte order.\n");
1969 return 0; 1994 return 0;
1970} 1995}
1971 1996
@@ -1990,8 +2015,8 @@ static struct dmi_system_id __initdata apm_dmi_table[] = {
1990 apm_is_horked, "Dell Inspiron 2500", 2015 apm_is_horked, "Dell Inspiron 2500",
1991 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), 2016 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
1992 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"), 2017 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"),
1993 DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"), 2018 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
1994 DMI_MATCH(DMI_BIOS_VERSION,"A11"), }, 2019 DMI_MATCH(DMI_BIOS_VERSION, "A11"), },
1995 }, 2020 },
1996 { /* Allow interrupts during suspend on Dell Inspiron laptops*/ 2021 { /* Allow interrupts during suspend on Dell Inspiron laptops*/
1997 set_apm_ints, "Dell Inspiron", { 2022 set_apm_ints, "Dell Inspiron", {
@@ -2014,15 +2039,15 @@ static struct dmi_system_id __initdata apm_dmi_table[] = {
2014 apm_is_horked, "Dell Dimension 4100", 2039 apm_is_horked, "Dell Dimension 4100",
2015 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), 2040 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
2016 DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"), 2041 DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"),
2017 DMI_MATCH(DMI_BIOS_VENDOR,"Intel Corp."), 2042 DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."),
2018 DMI_MATCH(DMI_BIOS_VERSION,"A11"), }, 2043 DMI_MATCH(DMI_BIOS_VERSION, "A11"), },
2019 }, 2044 },
2020 { /* Allow interrupts during suspend on Compaq Laptops*/ 2045 { /* Allow interrupts during suspend on Compaq Laptops*/
2021 set_apm_ints, "Compaq 12XL125", 2046 set_apm_ints, "Compaq 12XL125",
2022 { DMI_MATCH(DMI_SYS_VENDOR, "Compaq"), 2047 { DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
2023 DMI_MATCH(DMI_PRODUCT_NAME, "Compaq PC"), 2048 DMI_MATCH(DMI_PRODUCT_NAME, "Compaq PC"),
2024 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), 2049 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2025 DMI_MATCH(DMI_BIOS_VERSION,"4.06"), }, 2050 DMI_MATCH(DMI_BIOS_VERSION, "4.06"), },
2026 }, 2051 },
2027 { /* Allow interrupts during APM or the clock goes slow */ 2052 { /* Allow interrupts during APM or the clock goes slow */
2028 set_apm_ints, "ASUSTeK", 2053 set_apm_ints, "ASUSTeK",
@@ -2064,15 +2089,15 @@ static struct dmi_system_id __initdata apm_dmi_table[] = {
2064 apm_is_horked, "Sharp PC-PJ/AX", 2089 apm_is_horked, "Sharp PC-PJ/AX",
2065 { DMI_MATCH(DMI_SYS_VENDOR, "SHARP"), 2090 { DMI_MATCH(DMI_SYS_VENDOR, "SHARP"),
2066 DMI_MATCH(DMI_PRODUCT_NAME, "PC-PJ/AX"), 2091 DMI_MATCH(DMI_PRODUCT_NAME, "PC-PJ/AX"),
2067 DMI_MATCH(DMI_BIOS_VENDOR,"SystemSoft"), 2092 DMI_MATCH(DMI_BIOS_VENDOR, "SystemSoft"),
2068 DMI_MATCH(DMI_BIOS_VERSION,"Version R2.08"), }, 2093 DMI_MATCH(DMI_BIOS_VERSION, "Version R2.08"), },
2069 }, 2094 },
2070 { /* APM crashes */ 2095 { /* APM crashes */
2071 apm_is_horked, "Dell Inspiron 2500", 2096 apm_is_horked, "Dell Inspiron 2500",
2072 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), 2097 { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
2073 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"), 2098 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"),
2074 DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"), 2099 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"),
2075 DMI_MATCH(DMI_BIOS_VERSION,"A11"), }, 2100 DMI_MATCH(DMI_BIOS_VERSION, "A11"), },
2076 }, 2101 },
2077 { /* APM idle hangs */ 2102 { /* APM idle hangs */
2078 apm_likes_to_melt, "Jabil AMD", 2103 apm_likes_to_melt, "Jabil AMD",
@@ -2203,11 +2228,11 @@ static int __init apm_init(void)
2203 return -ENODEV; 2228 return -ENODEV;
2204 } 2229 }
2205 printk(KERN_INFO 2230 printk(KERN_INFO
2206 "apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n", 2231 "apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n",
2207 ((apm_info.bios.version >> 8) & 0xff), 2232 ((apm_info.bios.version >> 8) & 0xff),
2208 (apm_info.bios.version & 0xff), 2233 (apm_info.bios.version & 0xff),
2209 apm_info.bios.flags, 2234 apm_info.bios.flags,
2210 driver_version); 2235 driver_version);
2211 if ((apm_info.bios.flags & APM_32_BIT_SUPPORT) == 0) { 2236 if ((apm_info.bios.flags & APM_32_BIT_SUPPORT) == 0) {
2212 printk(KERN_INFO "apm: no 32 bit BIOS support\n"); 2237 printk(KERN_INFO "apm: no 32 bit BIOS support\n");
2213 return -ENODEV; 2238 return -ENODEV;
@@ -2256,14 +2281,12 @@ static int __init apm_init(void)
2256 apm_info.disabled = 1; 2281 apm_info.disabled = 1;
2257 return -ENODEV; 2282 return -ENODEV;
2258 } 2283 }
2259 if (PM_IS_ACTIVE()) { 2284 if (pm_flags & PM_ACPI) {
2260 printk(KERN_NOTICE "apm: overridden by ACPI.\n"); 2285 printk(KERN_NOTICE "apm: overridden by ACPI.\n");
2261 apm_info.disabled = 1; 2286 apm_info.disabled = 1;
2262 return -ENODEV; 2287 return -ENODEV;
2263 } 2288 }
2264#ifdef CONFIG_PM_LEGACY 2289 pm_flags |= PM_APM;
2265 pm_active = 1;
2266#endif
2267 2290
2268 /* 2291 /*
2269 * Set up a segment that references the real mode segment 0x40 2292 * Set up a segment that references the real mode segment 0x40
@@ -2314,9 +2337,9 @@ static int __init apm_init(void)
2314 } 2337 }
2315 wake_up_process(kapmd_task); 2338 wake_up_process(kapmd_task);
2316 2339
2317 if (num_online_cpus() > 1 && !smp ) { 2340 if (num_online_cpus() > 1 && !smp) {
2318 printk(KERN_NOTICE 2341 printk(KERN_NOTICE
2319 "apm: disabled - APM is not SMP safe (power off active).\n"); 2342 "apm: disabled - APM is not SMP safe (power off active).\n");
2320 return 0; 2343 return 0;
2321 } 2344 }
2322 2345
@@ -2341,7 +2364,7 @@ static int __init apm_init(void)
2341 2364
2342static void __exit apm_exit(void) 2365static void __exit apm_exit(void)
2343{ 2366{
2344 int error; 2367 int error;
2345 2368
2346 if (set_pm_idle) { 2369 if (set_pm_idle) {
2347 pm_idle = original_pm_idle; 2370 pm_idle = original_pm_idle;
@@ -2366,9 +2389,7 @@ static void __exit apm_exit(void)
2366 kthread_stop(kapmd_task); 2389 kthread_stop(kapmd_task);
2367 kapmd_task = NULL; 2390 kapmd_task = NULL;
2368 } 2391 }
2369#ifdef CONFIG_PM_LEGACY 2392 pm_flags &= ~PM_APM;
2370 pm_active = 0;
2371#endif
2372} 2393}
2373 2394
2374module_init(apm_init); 2395module_init(apm_init);
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 0e45981b2dd7..afd84463b712 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -38,15 +38,15 @@ void foo(void);
38 38
39void foo(void) 39void foo(void)
40{ 40{
41 OFFSET(SIGCONTEXT_eax, sigcontext, eax); 41 OFFSET(IA32_SIGCONTEXT_ax, sigcontext, ax);
42 OFFSET(SIGCONTEXT_ebx, sigcontext, ebx); 42 OFFSET(IA32_SIGCONTEXT_bx, sigcontext, bx);
43 OFFSET(SIGCONTEXT_ecx, sigcontext, ecx); 43 OFFSET(IA32_SIGCONTEXT_cx, sigcontext, cx);
44 OFFSET(SIGCONTEXT_edx, sigcontext, edx); 44 OFFSET(IA32_SIGCONTEXT_dx, sigcontext, dx);
45 OFFSET(SIGCONTEXT_esi, sigcontext, esi); 45 OFFSET(IA32_SIGCONTEXT_si, sigcontext, si);
46 OFFSET(SIGCONTEXT_edi, sigcontext, edi); 46 OFFSET(IA32_SIGCONTEXT_di, sigcontext, di);
47 OFFSET(SIGCONTEXT_ebp, sigcontext, ebp); 47 OFFSET(IA32_SIGCONTEXT_bp, sigcontext, bp);
48 OFFSET(SIGCONTEXT_esp, sigcontext, esp); 48 OFFSET(IA32_SIGCONTEXT_sp, sigcontext, sp);
49 OFFSET(SIGCONTEXT_eip, sigcontext, eip); 49 OFFSET(IA32_SIGCONTEXT_ip, sigcontext, ip);
50 BLANK(); 50 BLANK();
51 51
52 OFFSET(CPUINFO_x86, cpuinfo_x86, x86); 52 OFFSET(CPUINFO_x86, cpuinfo_x86, x86);
@@ -70,39 +70,38 @@ void foo(void)
70 OFFSET(TI_cpu, thread_info, cpu); 70 OFFSET(TI_cpu, thread_info, cpu);
71 BLANK(); 71 BLANK();
72 72
73 OFFSET(GDS_size, Xgt_desc_struct, size); 73 OFFSET(GDS_size, desc_ptr, size);
74 OFFSET(GDS_address, Xgt_desc_struct, address); 74 OFFSET(GDS_address, desc_ptr, address);
75 OFFSET(GDS_pad, Xgt_desc_struct, pad);
76 BLANK(); 75 BLANK();
77 76
78 OFFSET(PT_EBX, pt_regs, ebx); 77 OFFSET(PT_EBX, pt_regs, bx);
79 OFFSET(PT_ECX, pt_regs, ecx); 78 OFFSET(PT_ECX, pt_regs, cx);
80 OFFSET(PT_EDX, pt_regs, edx); 79 OFFSET(PT_EDX, pt_regs, dx);
81 OFFSET(PT_ESI, pt_regs, esi); 80 OFFSET(PT_ESI, pt_regs, si);
82 OFFSET(PT_EDI, pt_regs, edi); 81 OFFSET(PT_EDI, pt_regs, di);
83 OFFSET(PT_EBP, pt_regs, ebp); 82 OFFSET(PT_EBP, pt_regs, bp);
84 OFFSET(PT_EAX, pt_regs, eax); 83 OFFSET(PT_EAX, pt_regs, ax);
85 OFFSET(PT_DS, pt_regs, xds); 84 OFFSET(PT_DS, pt_regs, ds);
86 OFFSET(PT_ES, pt_regs, xes); 85 OFFSET(PT_ES, pt_regs, es);
87 OFFSET(PT_FS, pt_regs, xfs); 86 OFFSET(PT_FS, pt_regs, fs);
88 OFFSET(PT_ORIG_EAX, pt_regs, orig_eax); 87 OFFSET(PT_ORIG_EAX, pt_regs, orig_ax);
89 OFFSET(PT_EIP, pt_regs, eip); 88 OFFSET(PT_EIP, pt_regs, ip);
90 OFFSET(PT_CS, pt_regs, xcs); 89 OFFSET(PT_CS, pt_regs, cs);
91 OFFSET(PT_EFLAGS, pt_regs, eflags); 90 OFFSET(PT_EFLAGS, pt_regs, flags);
92 OFFSET(PT_OLDESP, pt_regs, esp); 91 OFFSET(PT_OLDESP, pt_regs, sp);
93 OFFSET(PT_OLDSS, pt_regs, xss); 92 OFFSET(PT_OLDSS, pt_regs, ss);
94 BLANK(); 93 BLANK();
95 94
96 OFFSET(EXEC_DOMAIN_handler, exec_domain, handler); 95 OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
97 OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); 96 OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
98 BLANK(); 97 BLANK();
99 98
100 OFFSET(pbe_address, pbe, address); 99 OFFSET(pbe_address, pbe, address);
101 OFFSET(pbe_orig_address, pbe, orig_address); 100 OFFSET(pbe_orig_address, pbe, orig_address);
102 OFFSET(pbe_next, pbe, next); 101 OFFSET(pbe_next, pbe, next);
103 102
104 /* Offset from the sysenter stack to tss.esp0 */ 103 /* Offset from the sysenter stack to tss.sp0 */
105 DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, x86_tss.esp0) - 104 DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
106 sizeof(struct tss_struct)); 105 sizeof(struct tss_struct));
107 106
108 DEFINE(PAGE_SIZE_asm, PAGE_SIZE); 107 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
@@ -111,8 +110,6 @@ void foo(void)
111 DEFINE(PTRS_PER_PMD, PTRS_PER_PMD); 110 DEFINE(PTRS_PER_PMD, PTRS_PER_PMD);
112 DEFINE(PTRS_PER_PGD, PTRS_PER_PGD); 111 DEFINE(PTRS_PER_PGD, PTRS_PER_PGD);
113 112
114 DEFINE(VDSO_PRELINK_asm, VDSO_PRELINK);
115
116 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); 113 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
117 114
118#ifdef CONFIG_PARAVIRT 115#ifdef CONFIG_PARAVIRT
@@ -123,7 +120,7 @@ void foo(void)
123 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); 120 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
124 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); 121 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
125 OFFSET(PV_CPU_iret, pv_cpu_ops, iret); 122 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
126 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); 123 OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret);
127 OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); 124 OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
128#endif 125#endif
129 126
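Both asm-offsets files in this diff only invoke the OFFSET()/DEFINE() helpers; the named constants are scraped out of the compiler's assembly output at build time and turned into a header that entry code includes. A simplified, hedged sketch of that mechanism (macro bodies follow the usual kbuild trick but are reproduced from memory, and the structure is a stand-in):

#include <stddef.h>

#define DEFINE(sym, val) \
        asm volatile("\n->" #sym " %0 " #val : : "i" (val))
#define BLANK() asm volatile("\n->" : :)
#define OFFSET(sym, str, mem) \
        DEFINE(sym, offsetof(struct str, mem))

struct pt_regs_example {        /* stand-in structure for the sketch */
        long bx, cx, dx;
};

void foo(void)
{
        OFFSET(PT_EBX_example, pt_regs_example, bx);
        OFFSET(PT_ECX_example, pt_regs_example, cx);
        BLANK();
        DEFINE(PAGE_SIZE_example, 4096);
}

Compiling this with "gcc -S" and grepping the output for "->" shows one marker line per constant; the build then rewrites those markers into #define lines that assembly sources can use.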
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index d1b6ed98774e..494e1e096ee6 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -38,7 +38,6 @@ int main(void)
38#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry)) 38#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry))
39 ENTRY(state); 39 ENTRY(state);
40 ENTRY(flags); 40 ENTRY(flags);
41 ENTRY(thread);
42 ENTRY(pid); 41 ENTRY(pid);
43 BLANK(); 42 BLANK();
44#undef ENTRY 43#undef ENTRY
@@ -47,6 +46,9 @@ int main(void)
47 ENTRY(addr_limit); 46 ENTRY(addr_limit);
48 ENTRY(preempt_count); 47 ENTRY(preempt_count);
49 ENTRY(status); 48 ENTRY(status);
49#ifdef CONFIG_IA32_EMULATION
50 ENTRY(sysenter_return);
51#endif
50 BLANK(); 52 BLANK();
51#undef ENTRY 53#undef ENTRY
52#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry)) 54#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
@@ -59,17 +61,31 @@ int main(void)
59 ENTRY(data_offset); 61 ENTRY(data_offset);
60 BLANK(); 62 BLANK();
61#undef ENTRY 63#undef ENTRY
64#ifdef CONFIG_PARAVIRT
65 BLANK();
66 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
67 OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
68 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
69 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
70 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
71 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
72 OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret);
73 OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
74 OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
75#endif
76
77
62#ifdef CONFIG_IA32_EMULATION 78#ifdef CONFIG_IA32_EMULATION
63#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry)) 79#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry))
64 ENTRY(eax); 80 ENTRY(ax);
65 ENTRY(ebx); 81 ENTRY(bx);
66 ENTRY(ecx); 82 ENTRY(cx);
67 ENTRY(edx); 83 ENTRY(dx);
68 ENTRY(esi); 84 ENTRY(si);
69 ENTRY(edi); 85 ENTRY(di);
70 ENTRY(ebp); 86 ENTRY(bp);
71 ENTRY(esp); 87 ENTRY(sp);
72 ENTRY(eip); 88 ENTRY(ip);
73 BLANK(); 89 BLANK();
74#undef ENTRY 90#undef ENTRY
75 DEFINE(IA32_RT_SIGFRAME_sigcontext, 91 DEFINE(IA32_RT_SIGFRAME_sigcontext,
@@ -81,14 +97,14 @@ int main(void)
81 DEFINE(pbe_next, offsetof(struct pbe, next)); 97 DEFINE(pbe_next, offsetof(struct pbe, next));
82 BLANK(); 98 BLANK();
83#define ENTRY(entry) DEFINE(pt_regs_ ## entry, offsetof(struct pt_regs, entry)) 99#define ENTRY(entry) DEFINE(pt_regs_ ## entry, offsetof(struct pt_regs, entry))
84 ENTRY(rbx); 100 ENTRY(bx);
85 ENTRY(rbx); 101 ENTRY(bx);
86 ENTRY(rcx); 102 ENTRY(cx);
87 ENTRY(rdx); 103 ENTRY(dx);
88 ENTRY(rsp); 104 ENTRY(sp);
89 ENTRY(rbp); 105 ENTRY(bp);
90 ENTRY(rsi); 106 ENTRY(si);
91 ENTRY(rdi); 107 ENTRY(di);
92 ENTRY(r8); 108 ENTRY(r8);
93 ENTRY(r9); 109 ENTRY(r9);
94 ENTRY(r10); 110 ENTRY(r10);
@@ -97,7 +113,7 @@ int main(void)
97 ENTRY(r13); 113 ENTRY(r13);
98 ENTRY(r14); 114 ENTRY(r14);
99 ENTRY(r15); 115 ENTRY(r15);
100 ENTRY(eflags); 116 ENTRY(flags);
101 BLANK(); 117 BLANK();
102#undef ENTRY 118#undef ENTRY
103#define ENTRY(entry) DEFINE(saved_context_ ## entry, offsetof(struct saved_context, entry)) 119#define ENTRY(entry) DEFINE(saved_context_ ## entry, offsetof(struct saved_context, entry))
@@ -108,7 +124,7 @@ int main(void)
108 ENTRY(cr8); 124 ENTRY(cr8);
109 BLANK(); 125 BLANK();
110#undef ENTRY 126#undef ENTRY
111 DEFINE(TSS_ist, offsetof(struct tss_struct, ist)); 127 DEFINE(TSS_ist, offsetof(struct tss_struct, x86_tss.ist));
112 BLANK(); 128 BLANK();
113 DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx)); 129 DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx));
114 BLANK(); 130 BLANK();
diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c
index 0b9860530a6b..30f25a75fe28 100644
--- a/arch/x86/kernel/bootflag.c
+++ b/arch/x86/kernel/bootflag.c
@@ -1,8 +1,6 @@
1/* 1/*
2 * Implement 'Simple Boot Flag Specification 2.0' 2 * Implement 'Simple Boot Flag Specification 2.0'
3 */ 3 */
4
5
6#include <linux/types.h> 4#include <linux/types.h>
7#include <linux/kernel.h> 5#include <linux/kernel.h>
8#include <linux/init.h> 6#include <linux/init.h>
@@ -14,40 +12,38 @@
14 12
15#include <linux/mc146818rtc.h> 13#include <linux/mc146818rtc.h>
16 14
17
18#define SBF_RESERVED (0x78) 15#define SBF_RESERVED (0x78)
19#define SBF_PNPOS (1<<0) 16#define SBF_PNPOS (1<<0)
20#define SBF_BOOTING (1<<1) 17#define SBF_BOOTING (1<<1)
21#define SBF_DIAG (1<<2) 18#define SBF_DIAG (1<<2)
22#define SBF_PARITY (1<<7) 19#define SBF_PARITY (1<<7)
23 20
24
25int sbf_port __initdata = -1; /* set via acpi_boot_init() */ 21int sbf_port __initdata = -1; /* set via acpi_boot_init() */
26 22
27
28static int __init parity(u8 v) 23static int __init parity(u8 v)
29{ 24{
30 int x = 0; 25 int x = 0;
31 int i; 26 int i;
32 27
33 for(i=0;i<8;i++) 28 for (i = 0; i < 8; i++) {
34 { 29 x ^= (v & 1);
35 x^=(v&1); 30 v >>= 1;
36 v>>=1;
37 } 31 }
32
38 return x; 33 return x;
39} 34}
40 35
41static void __init sbf_write(u8 v) 36static void __init sbf_write(u8 v)
42{ 37{
43 unsigned long flags; 38 unsigned long flags;
44 if(sbf_port != -1) 39
45 { 40 if (sbf_port != -1) {
46 v &= ~SBF_PARITY; 41 v &= ~SBF_PARITY;
47 if(!parity(v)) 42 if (!parity(v))
48 v|=SBF_PARITY; 43 v |= SBF_PARITY;
49 44
50 printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n", sbf_port, v); 45 printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n",
46 sbf_port, v);
51 47
52 spin_lock_irqsave(&rtc_lock, flags); 48 spin_lock_irqsave(&rtc_lock, flags);
53 CMOS_WRITE(v, sbf_port); 49 CMOS_WRITE(v, sbf_port);
@@ -57,33 +53,41 @@ static void __init sbf_write(u8 v)
57 53
58static u8 __init sbf_read(void) 54static u8 __init sbf_read(void)
59{ 55{
60 u8 v;
61 unsigned long flags; 56 unsigned long flags;
62 if(sbf_port == -1) 57 u8 v;
58
59 if (sbf_port == -1)
63 return 0; 60 return 0;
61
64 spin_lock_irqsave(&rtc_lock, flags); 62 spin_lock_irqsave(&rtc_lock, flags);
65 v = CMOS_READ(sbf_port); 63 v = CMOS_READ(sbf_port);
66 spin_unlock_irqrestore(&rtc_lock, flags); 64 spin_unlock_irqrestore(&rtc_lock, flags);
65
67 return v; 66 return v;
68} 67}
69 68
70static int __init sbf_value_valid(u8 v) 69static int __init sbf_value_valid(u8 v)
71{ 70{
72 if(v&SBF_RESERVED) /* Reserved bits */ 71 if (v & SBF_RESERVED) /* Reserved bits */
73 return 0; 72 return 0;
74 if(!parity(v)) 73 if (!parity(v))
75 return 0; 74 return 0;
75
76 return 1; 76 return 1;
77} 77}
78 78
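Taken together, sbf_write(), sbf_read() and sbf_value_valid() above keep the Simple Boot Flag byte odd-parity with the reserved bits clear: the writer sets bit 7 whenever the payload bits alone have even parity, and the validator rejects anything else. A hedged standalone sketch with a worked value (constants copied from the #defines above; the XOR fold is equivalent to the bit-by-bit loop in parity()):

#include <stdio.h>

#define SBF_RESERVED    0x78
#define SBF_PARITY      0x80

static int parity8(unsigned char v)
{
        v ^= v >> 4;
        v ^= v >> 2;
        v ^= v >> 1;
        return v & 1;
}

static unsigned char sbf_encode(unsigned char v)
{
        v &= ~SBF_PARITY;
        if (!parity8(v))
                v |= SBF_PARITY;
        return v;
}

static int sbf_valid(unsigned char v)
{
        return !(v & SBF_RESERVED) && parity8(v);
}

int main(void)
{
        unsigned char v = sbf_encode(0x03);     /* PNPOS|BOOTING -> 0x83 */

        printf("stored=0x%02x valid=%d\n", v, sbf_valid(v));
        return 0;
}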
79static int __init sbf_init(void) 79static int __init sbf_init(void)
80{ 80{
81 u8 v; 81 u8 v;
82 if(sbf_port == -1) 82
83 if (sbf_port == -1)
83 return 0; 84 return 0;
85
84 v = sbf_read(); 86 v = sbf_read();
85 if(!sbf_value_valid(v)) 87 if (!sbf_value_valid(v)) {
86 printk(KERN_WARNING "Simple Boot Flag value 0x%x read from CMOS RAM was invalid\n",v); 88 printk(KERN_WARNING "Simple Boot Flag value 0x%x read from "
89 "CMOS RAM was invalid\n", v);
90 }
87 91
88 v &= ~SBF_RESERVED; 92 v &= ~SBF_RESERVED;
89 v &= ~SBF_BOOTING; 93 v &= ~SBF_BOOTING;
@@ -92,7 +96,7 @@ static int __init sbf_init(void)
92 v |= SBF_PNPOS; 96 v |= SBF_PNPOS;
93#endif 97#endif
94 sbf_write(v); 98 sbf_write(v);
99
95 return 0; 100 return 0;
96} 101}
97
98module_init(sbf_init); 102module_init(sbf_init);
diff --git a/arch/x86/kernel/bugs_64.c b/arch/x86/kernel/bugs_64.c
index 9a189cef6404..8f520f93ffd4 100644
--- a/arch/x86/kernel/bugs_64.c
+++ b/arch/x86/kernel/bugs_64.c
@@ -13,7 +13,6 @@
13void __init check_bugs(void) 13void __init check_bugs(void)
14{ 14{
15 identify_cpu(&boot_cpu_data); 15 identify_cpu(&boot_cpu_data);
16 mtrr_bp_init();
17#if !defined(CONFIG_SMP) 16#if !defined(CONFIG_SMP)
18 printk("CPU: "); 17 printk("CPU: ");
19 print_cpu_info(&boot_cpu_data); 18 print_cpu_info(&boot_cpu_data);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index cfdb2f3bd763..a0c4d7c5dbd7 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -3,6 +3,7 @@
3# 3#
4 4
5obj-y := intel_cacheinfo.o addon_cpuid_features.o 5obj-y := intel_cacheinfo.o addon_cpuid_features.o
6obj-y += feature_names.o
6 7
7obj-$(CONFIG_X86_32) += common.o proc.o bugs.o 8obj-$(CONFIG_X86_32) += common.o proc.o bugs.o
8obj-$(CONFIG_X86_32) += amd.o 9obj-$(CONFIG_X86_32) += amd.o
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 3e91d3ee26ec..238468ae1993 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -45,6 +45,6 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
45 &regs[CR_ECX], &regs[CR_EDX]); 45 &regs[CR_ECX], &regs[CR_EDX]);
46 46
47 if (regs[cb->reg] & (1 << cb->bit)) 47 if (regs[cb->reg] & (1 << cb->bit))
48 set_bit(cb->feature, c->x86_capability); 48 set_cpu_cap(c, cb->feature);
49 } 49 }
50} 50}
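
The hunk above switches from a raw set_bit() on x86_capability to set_cpu_cap(). A rough userspace analogue of that helper style is sketched below, assuming a plain bitmap of NCAPINTS 32-bit words; the real definitions live in <asm/cpufeature.h> and differ in detail (they use the kernel's atomic bitops).

/* Illustrative analogue of set_cpu_cap()/cpu_has() over a capability
 * bitmap. Names and layout are simplified, not the kernel's exact code. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define NCAPINTS 8			/* number of 32-bit capability words */

struct cpuinfo {
	uint32_t x86_capability[NCAPINTS];
};

static void set_cpu_cap(struct cpuinfo *c, int feature)
{
	c->x86_capability[feature / 32] |= 1u << (feature % 32);
}

static int cpu_has(const struct cpuinfo *c, int feature)
{
	return (c->x86_capability[feature / 32] >> (feature % 32)) & 1;
}

int main(void)
{
	struct cpuinfo c;
	int feature = 4 * 32 + 9;	/* e.g. word 4, bit 9 */

	memset(&c, 0, sizeof(c));
	set_cpu_cap(&c, feature);
	printf("has feature %d: %d\n", feature, cpu_has(&c, feature));
	return 0;
}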
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 1ff88c7f45cf..693e353999cd 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -63,6 +63,15 @@ static __cpuinit int amd_apic_timer_broken(void)
63 63
64int force_mwait __cpuinitdata; 64int force_mwait __cpuinitdata;
65 65
66void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
67{
68 if (cpuid_eax(0x80000000) >= 0x80000007) {
69 c->x86_power = cpuid_edx(0x80000007);
70 if (c->x86_power & (1<<8))
71 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
72 }
73}
74
66static void __cpuinit init_amd(struct cpuinfo_x86 *c) 75static void __cpuinit init_amd(struct cpuinfo_x86 *c)
67{ 76{
68 u32 l, h; 77 u32 l, h;
@@ -85,6 +94,8 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
85 } 94 }
86#endif 95#endif
87 96
97 early_init_amd(c);
98
88 /* 99 /*
89 * FIXME: We should handle the K5 here. Set up the write 100 * FIXME: We should handle the K5 here. Set up the write
90 * range and also turn on MSR 83 bits 4 and 31 (write alloc, 101 * range and also turn on MSR 83 bits 4 and 31 (write alloc,
@@ -257,12 +268,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
257 c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; 268 c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
258 } 269 }
259 270
260 if (cpuid_eax(0x80000000) >= 0x80000007) {
261 c->x86_power = cpuid_edx(0x80000007);
262 if (c->x86_power & (1<<8))
263 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
264 }
265
266#ifdef CONFIG_X86_HT 271#ifdef CONFIG_X86_HT
267 /* 272 /*
268 * On an AMD multi core setup the lower bits of the APIC id 273 * On an AMD multi core setup the lower bits of the APIC id
@@ -295,12 +300,12 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
295 local_apic_timer_disabled = 1; 300 local_apic_timer_disabled = 1;
296#endif 301#endif
297 302
298 if (c->x86 == 0x10 && !force_mwait)
299 clear_bit(X86_FEATURE_MWAIT, c->x86_capability);
300
301 /* K6s report MCEs but don't actually have all the MSRs */ 303 /* K6s report MCEs but don't actually have all the MSRs */
302 if (c->x86 < 6) 304 if (c->x86 < 6)
303 clear_bit(X86_FEATURE_MCE, c->x86_capability); 305 clear_bit(X86_FEATURE_MCE, c->x86_capability);
306
307 if (cpu_has_xmm2)
308 set_bit(X86_FEATURE_MFENCE_RDTSC, c->x86_capability);
304} 309}
305 310
306static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size) 311static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size)
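
The new early_init_amd() probes CPUID leaf 0x80000007 and treats EDX bit 8 as the constant/invariant TSC flag. Below is a small userspace sketch of the same probe using GCC's <cpuid.h>; it is illustrative only, not kernel code.

/* Userspace version of the check early_init_amd() performs at boot:
 * CPUID leaf 0x80000007, EDX bit 8 ("invariant TSC"). */
#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0x80000000, &eax, &ebx, &ecx, &edx))
		return 1;
	if (eax < 0x80000007) {
		printf("no advanced power management leaf\n");
		return 0;
	}
	__cpuid(0x80000007, eax, ebx, ecx, edx);
	printf("x86_power = 0x%08x, constant TSC: %s\n",
	       edx, (edx & (1 << 8)) ? "yes" : "no");
	return 0;
}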
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 205fd5ba57f7..9b95edcfc6ae 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -11,6 +11,7 @@
11#include <linux/utsname.h> 11#include <linux/utsname.h>
12#include <asm/bugs.h> 12#include <asm/bugs.h>
13#include <asm/processor.h> 13#include <asm/processor.h>
14#include <asm/processor-flags.h>
14#include <asm/i387.h> 15#include <asm/i387.h>
15#include <asm/msr.h> 16#include <asm/msr.h>
16#include <asm/paravirt.h> 17#include <asm/paravirt.h>
@@ -35,7 +36,7 @@ __setup("mca-pentium", mca_pentium);
35static int __init no_387(char *s) 36static int __init no_387(char *s)
36{ 37{
37 boot_cpu_data.hard_math = 0; 38 boot_cpu_data.hard_math = 0;
38 write_cr0(0xE | read_cr0()); 39 write_cr0(X86_CR0_TS | X86_CR0_EM | X86_CR0_MP | read_cr0());
39 return 1; 40 return 1;
40} 41}
41 42
@@ -153,7 +154,7 @@ static void __init check_config(void)
153 * If we configured ourselves for a TSC, we'd better have one! 154 * If we configured ourselves for a TSC, we'd better have one!
154 */ 155 */
155#ifdef CONFIG_X86_TSC 156#ifdef CONFIG_X86_TSC
156 if (!cpu_has_tsc && !tsc_disable) 157 if (!cpu_has_tsc)
157 panic("Kernel compiled for Pentium+, requires TSC feature!"); 158 panic("Kernel compiled for Pentium+, requires TSC feature!");
158#endif 159#endif
159 160
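
The no_387() change replaces the magic constant 0xE with named CR0 bits. The trivial check below, with bit positions taken from the x86 manuals (the kernel's values live in <asm/processor-flags.h>), confirms the mask is unchanged.

/* MP is CR0 bit 1, EM bit 2, TS bit 3; together they equal the old 0xE. */
#include <stdio.h>

#define X86_CR0_MP (1 << 1)	/* Monitor Coprocessor */
#define X86_CR0_EM (1 << 2)	/* Emulate coprocessor */
#define X86_CR0_TS (1 << 3)	/* Task Switched */

int main(void)
{
	unsigned long mask = X86_CR0_TS | X86_CR0_EM | X86_CR0_MP;

	printf("mask = %#lx (old constant was 0xE)\n", mask);
	return 0;
}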
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e2fcf2051bdb..f86a3c4a2669 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -22,43 +22,48 @@
22#include "cpu.h" 22#include "cpu.h"
23 23
24DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { 24DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
25 [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 }, 25 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
26 [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 }, 26 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
27 [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 }, 27 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
28 [GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 }, 28 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } },
29 /* 29 /*
30 * Segments used for calling PnP BIOS have byte granularity. 30 * Segments used for calling PnP BIOS have byte granularity.
31 * The code segments and data segments have fixed 64k limits, 31 * The code segments and data segments have fixed 64k limits,
32 * the transfer segment sizes are set at run time. 32 * the transfer segment sizes are set at run time.
33 */ 33 */
34 [GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */ 34 /* 32-bit code */
35 [GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */ 35 [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } },
36 [GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */ 36 /* 16-bit code */
37 [GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */ 37 [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } },
38 [GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */ 38 /* 16-bit data */
39 [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } },
40 /* 16-bit data */
41 [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } },
42 /* 16-bit data */
43 [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } },
39 /* 44 /*
40 * The APM segments have byte granularity and their bases 45 * The APM segments have byte granularity and their bases
41 * are set at run time. All have 64k limits. 46 * are set at run time. All have 64k limits.
42 */ 47 */
43 [GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */ 48 /* 32-bit code */
49 [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } },
44 /* 16-bit code */ 50 /* 16-bit code */
45 [GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 }, 51 [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } },
46 [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */ 52 /* data */
53 [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
47 54
48 [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 }, 55 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
49 [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 }, 56 [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
50} }; 57} };
51EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); 58EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
52 59
60__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
61
53static int cachesize_override __cpuinitdata = -1; 62static int cachesize_override __cpuinitdata = -1;
54static int disable_x86_fxsr __cpuinitdata;
55static int disable_x86_serial_nr __cpuinitdata = 1; 63static int disable_x86_serial_nr __cpuinitdata = 1;
56static int disable_x86_sep __cpuinitdata;
57 64
58struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {}; 65struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
59 66
60extern int disable_pse;
61
62static void __cpuinit default_init(struct cpuinfo_x86 * c) 67static void __cpuinit default_init(struct cpuinfo_x86 * c)
63{ 68{
64 /* Not much we can do here... */ 69 /* Not much we can do here... */
@@ -207,16 +212,8 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early)
207 212
208static int __init x86_fxsr_setup(char * s) 213static int __init x86_fxsr_setup(char * s)
209{ 214{
210 /* Tell all the other CPUs to not use it... */ 215 setup_clear_cpu_cap(X86_FEATURE_FXSR);
211 disable_x86_fxsr = 1; 216 setup_clear_cpu_cap(X86_FEATURE_XMM);
212
213 /*
214 * ... and clear the bits early in the boot_cpu_data
215 * so that the bootup process doesn't try to do this
216 * either.
217 */
218 clear_bit(X86_FEATURE_FXSR, boot_cpu_data.x86_capability);
219 clear_bit(X86_FEATURE_XMM, boot_cpu_data.x86_capability);
220 return 1; 217 return 1;
221} 218}
222__setup("nofxsr", x86_fxsr_setup); 219__setup("nofxsr", x86_fxsr_setup);
@@ -224,7 +221,7 @@ __setup("nofxsr", x86_fxsr_setup);
224 221
225static int __init x86_sep_setup(char * s) 222static int __init x86_sep_setup(char * s)
226{ 223{
227 disable_x86_sep = 1; 224 setup_clear_cpu_cap(X86_FEATURE_SEP);
228 return 1; 225 return 1;
229} 226}
230__setup("nosep", x86_sep_setup); 227__setup("nosep", x86_sep_setup);
@@ -261,10 +258,10 @@ static int __cpuinit have_cpuid_p(void)
261void __init cpu_detect(struct cpuinfo_x86 *c) 258void __init cpu_detect(struct cpuinfo_x86 *c)
262{ 259{
263 /* Get vendor name */ 260 /* Get vendor name */
264 cpuid(0x00000000, &c->cpuid_level, 261 cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
265 (int *)&c->x86_vendor_id[0], 262 (unsigned int *)&c->x86_vendor_id[0],
266 (int *)&c->x86_vendor_id[8], 263 (unsigned int *)&c->x86_vendor_id[8],
267 (int *)&c->x86_vendor_id[4]); 264 (unsigned int *)&c->x86_vendor_id[4]);
268 265
269 c->x86 = 4; 266 c->x86 = 4;
270 if (c->cpuid_level >= 0x00000001) { 267 if (c->cpuid_level >= 0x00000001) {
@@ -277,10 +274,39 @@ void __init cpu_detect(struct cpuinfo_x86 *c)
277 if (c->x86 >= 0x6) 274 if (c->x86 >= 0x6)
278 c->x86_model += ((tfms >> 16) & 0xF) << 4; 275 c->x86_model += ((tfms >> 16) & 0xF) << 4;
279 c->x86_mask = tfms & 15; 276 c->x86_mask = tfms & 15;
280 if (cap0 & (1<<19)) 277 if (cap0 & (1<<19)) {
281 c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8; 278 c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
279 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
280 }
282 } 281 }
283} 282}
283static void __cpuinit early_get_cap(struct cpuinfo_x86 *c)
284{
285 u32 tfms, xlvl;
286 unsigned int ebx;
287
288 memset(&c->x86_capability, 0, sizeof c->x86_capability);
289 if (have_cpuid_p()) {
290 /* Intel-defined flags: level 0x00000001 */
291 if (c->cpuid_level >= 0x00000001) {
292 u32 capability, excap;
293 cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
294 c->x86_capability[0] = capability;
295 c->x86_capability[4] = excap;
296 }
297
298 /* AMD-defined flags: level 0x80000001 */
299 xlvl = cpuid_eax(0x80000000);
300 if ((xlvl & 0xffff0000) == 0x80000000) {
301 if (xlvl >= 0x80000001) {
302 c->x86_capability[1] = cpuid_edx(0x80000001);
303 c->x86_capability[6] = cpuid_ecx(0x80000001);
304 }
305 }
306
307 }
308
309}
284 310
285/* Do minimum CPU detection early. 311/* Do minimum CPU detection early.
286 Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment. 312 Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
@@ -293,6 +319,7 @@ static void __init early_cpu_detect(void)
293 struct cpuinfo_x86 *c = &boot_cpu_data; 319 struct cpuinfo_x86 *c = &boot_cpu_data;
294 320
295 c->x86_cache_alignment = 32; 321 c->x86_cache_alignment = 32;
322 c->x86_clflush_size = 32;
296 323
297 if (!have_cpuid_p()) 324 if (!have_cpuid_p())
298 return; 325 return;
@@ -300,19 +327,30 @@ static void __init early_cpu_detect(void)
300 cpu_detect(c); 327 cpu_detect(c);
301 328
302 get_cpu_vendor(c, 1); 329 get_cpu_vendor(c, 1);
330
331 switch (c->x86_vendor) {
332 case X86_VENDOR_AMD:
333 early_init_amd(c);
334 break;
335 case X86_VENDOR_INTEL:
336 early_init_intel(c);
337 break;
338 }
339
340 early_get_cap(c);
303} 341}
304 342
305static void __cpuinit generic_identify(struct cpuinfo_x86 * c) 343static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
306{ 344{
307 u32 tfms, xlvl; 345 u32 tfms, xlvl;
308 int ebx; 346 unsigned int ebx;
309 347
310 if (have_cpuid_p()) { 348 if (have_cpuid_p()) {
311 /* Get vendor name */ 349 /* Get vendor name */
312 cpuid(0x00000000, &c->cpuid_level, 350 cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
313 (int *)&c->x86_vendor_id[0], 351 (unsigned int *)&c->x86_vendor_id[0],
314 (int *)&c->x86_vendor_id[8], 352 (unsigned int *)&c->x86_vendor_id[8],
315 (int *)&c->x86_vendor_id[4]); 353 (unsigned int *)&c->x86_vendor_id[4]);
316 354
317 get_cpu_vendor(c, 0); 355 get_cpu_vendor(c, 0);
318 /* Initialize the standard set of capabilities */ 356 /* Initialize the standard set of capabilities */
@@ -357,8 +395,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
357 init_scattered_cpuid_features(c); 395 init_scattered_cpuid_features(c);
358 } 396 }
359 397
360 early_intel_workaround(c);
361
362#ifdef CONFIG_X86_HT 398#ifdef CONFIG_X86_HT
363 c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; 399 c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
364#endif 400#endif
@@ -392,7 +428,7 @@ __setup("serialnumber", x86_serial_nr_setup);
392/* 428/*
393 * This does the hard work of actually picking apart the CPU stuff... 429 * This does the hard work of actually picking apart the CPU stuff...
394 */ 430 */
395static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) 431void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
396{ 432{
397 int i; 433 int i;
398 434
@@ -418,20 +454,9 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
418 454
419 generic_identify(c); 455 generic_identify(c);
420 456
421 printk(KERN_DEBUG "CPU: After generic identify, caps:"); 457 if (this_cpu->c_identify)
422 for (i = 0; i < NCAPINTS; i++)
423 printk(" %08lx", c->x86_capability[i]);
424 printk("\n");
425
426 if (this_cpu->c_identify) {
427 this_cpu->c_identify(c); 458 this_cpu->c_identify(c);
428 459
429 printk(KERN_DEBUG "CPU: After vendor identify, caps:");
430 for (i = 0; i < NCAPINTS; i++)
431 printk(" %08lx", c->x86_capability[i]);
432 printk("\n");
433 }
434
435 /* 460 /*
436 * Vendor-specific initialization. In this section we 461 * Vendor-specific initialization. In this section we
437 * canonicalize the feature flags, meaning if there are 462 * canonicalize the feature flags, meaning if there are
@@ -453,23 +478,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
453 * we do "generic changes." 478 * we do "generic changes."
454 */ 479 */
455 480
456 /* TSC disabled? */
457 if ( tsc_disable )
458 clear_bit(X86_FEATURE_TSC, c->x86_capability);
459
460 /* FXSR disabled? */
461 if (disable_x86_fxsr) {
462 clear_bit(X86_FEATURE_FXSR, c->x86_capability);
463 clear_bit(X86_FEATURE_XMM, c->x86_capability);
464 }
465
466 /* SEP disabled? */
467 if (disable_x86_sep)
468 clear_bit(X86_FEATURE_SEP, c->x86_capability);
469
470 if (disable_pse)
471 clear_bit(X86_FEATURE_PSE, c->x86_capability);
472
473 /* If the model name is still unset, do table lookup. */ 481 /* If the model name is still unset, do table lookup. */
474 if ( !c->x86_model_id[0] ) { 482 if ( !c->x86_model_id[0] ) {
475 char *p; 483 char *p;
@@ -482,13 +490,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
482 c->x86, c->x86_model); 490 c->x86, c->x86_model);
483 } 491 }
484 492
485 /* Now the feature flags better reflect actual CPU features! */
486
487 printk(KERN_DEBUG "CPU: After all inits, caps:");
488 for (i = 0; i < NCAPINTS; i++)
489 printk(" %08lx", c->x86_capability[i]);
490 printk("\n");
491
492 /* 493 /*
493 * On SMP, boot_cpu_data holds the common feature set between 494 * On SMP, boot_cpu_data holds the common feature set between
494 * all CPUs; so make sure that we indicate which features are 495 * all CPUs; so make sure that we indicate which features are
@@ -501,8 +502,14 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
501 boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; 502 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
502 } 503 }
503 504
505 /* Clear all flags overridden by options */
506 for (i = 0; i < NCAPINTS; i++)
507 c->x86_capability[i] ^= cleared_cpu_caps[i];
508
504 /* Init Machine Check Exception if available. */ 509 /* Init Machine Check Exception if available. */
505 mcheck_init(c); 510 mcheck_init(c);
511
512 select_idle_routine(c);
506} 513}
507 514
508void __init identify_boot_cpu(void) 515void __init identify_boot_cpu(void)
@@ -510,7 +517,6 @@ void __init identify_boot_cpu(void)
510 identify_cpu(&boot_cpu_data); 517 identify_cpu(&boot_cpu_data);
511 sysenter_setup(); 518 sysenter_setup();
512 enable_sep_cpu(); 519 enable_sep_cpu();
513 mtrr_bp_init();
514} 520}
515 521
516void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 522void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
@@ -567,6 +573,13 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
567} 573}
568#endif 574#endif
569 575
576static __init int setup_noclflush(char *arg)
577{
578 setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
579 return 1;
580}
581__setup("noclflush", setup_noclflush);
582
570void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) 583void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
571{ 584{
572 char *vendor = NULL; 585 char *vendor = NULL;
@@ -590,6 +603,17 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
590 printk("\n"); 603 printk("\n");
591} 604}
592 605
606static __init int setup_disablecpuid(char *arg)
607{
608 int bit;
609 if (get_option(&arg, &bit) && bit < NCAPINTS*32)
610 setup_clear_cpu_cap(bit);
611 else
612 return 0;
613 return 1;
614}
615__setup("clearcpuid=", setup_disablecpuid);
616
593cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; 617cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
594 618
595/* This is hacky. :) 619/* This is hacky. :)
@@ -599,16 +623,6 @@ cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
599 * They will insert themselves into the cpu_devs structure. 623 * They will insert themselves into the cpu_devs structure.
600 * Then, when cpu_init() is called, we can just iterate over that array. 624 * Then, when cpu_init() is called, we can just iterate over that array.
601 */ 625 */
602
603extern int intel_cpu_init(void);
604extern int cyrix_init_cpu(void);
605extern int nsc_init_cpu(void);
606extern int amd_init_cpu(void);
607extern int centaur_init_cpu(void);
608extern int transmeta_init_cpu(void);
609extern int nexgen_init_cpu(void);
610extern int umc_init_cpu(void);
611
612void __init early_cpu_init(void) 626void __init early_cpu_init(void)
613{ 627{
614 intel_cpu_init(); 628 intel_cpu_init();
@@ -620,21 +634,13 @@ void __init early_cpu_init(void)
620 nexgen_init_cpu(); 634 nexgen_init_cpu();
621 umc_init_cpu(); 635 umc_init_cpu();
622 early_cpu_detect(); 636 early_cpu_detect();
623
624#ifdef CONFIG_DEBUG_PAGEALLOC
625 /* pse is not compatible with on-the-fly unmapping,
626 * disable it even if the cpus claim to support it.
627 */
628 clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
629 disable_pse = 1;
630#endif
631} 637}
632 638
633/* Make sure %fs is initialized properly in idle threads */ 639/* Make sure %fs is initialized properly in idle threads */
634struct pt_regs * __devinit idle_regs(struct pt_regs *regs) 640struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
635{ 641{
636 memset(regs, 0, sizeof(struct pt_regs)); 642 memset(regs, 0, sizeof(struct pt_regs));
637 regs->xfs = __KERNEL_PERCPU; 643 regs->fs = __KERNEL_PERCPU;
638 return regs; 644 return regs;
639} 645}
640 646
@@ -642,7 +648,7 @@ struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
642 * it's on the real one. */ 648 * it's on the real one. */
643void switch_to_new_gdt(void) 649void switch_to_new_gdt(void)
644{ 650{
645 struct Xgt_desc_struct gdt_descr; 651 struct desc_ptr gdt_descr;
646 652
647 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); 653 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
648 gdt_descr.size = GDT_SIZE - 1; 654 gdt_descr.size = GDT_SIZE - 1;
@@ -672,12 +678,6 @@ void __cpuinit cpu_init(void)
672 678
673 if (cpu_has_vme || cpu_has_tsc || cpu_has_de) 679 if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
674 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); 680 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
675 if (tsc_disable && cpu_has_tsc) {
676 printk(KERN_NOTICE "Disabling TSC...\n");
677 /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
678 clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
679 set_in_cr4(X86_CR4_TSD);
680 }
681 681
682 load_idt(&idt_descr); 682 load_idt(&idt_descr);
683 switch_to_new_gdt(); 683 switch_to_new_gdt();
@@ -691,7 +691,7 @@ void __cpuinit cpu_init(void)
691 BUG(); 691 BUG();
692 enter_lazy_tlb(&init_mm, curr); 692 enter_lazy_tlb(&init_mm, curr);
693 693
694 load_esp0(t, thread); 694 load_sp0(t, thread);
695 set_tss_desc(cpu,t); 695 set_tss_desc(cpu,t);
696 load_TR_desc(); 696 load_TR_desc();
697 load_LDT(&init_mm.context); 697 load_LDT(&init_mm.context);
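
Several hunks above introduce cleared_cpu_caps[] plus setup_clear_cpu_cap(), so boot options such as noclflush or clearcpuid= record capability bits once and identify_cpu() strips them later. A simplified userspace sketch of that flow follows, with hypothetical helper bodies; note the patch applies the mask with ^= (equivalent when the bit was actually set), while the sketch uses the plainer &= ~ form.

/* Sketch of the "record cleared caps early, apply them later" pattern. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define NCAPINTS 8

static uint32_t cleared_cpu_caps[NCAPINTS];

static void setup_clear_cpu_cap(int bit)
{
	cleared_cpu_caps[bit / 32] |= 1u << (bit % 32);
}

int main(void)
{
	uint32_t x86_capability[NCAPINTS];
	int i, bit = 19;			/* e.g. word 0 bit 19, "clflush" */

	memset(x86_capability, 0xff, sizeof(x86_capability));	/* pretend all set */
	setup_clear_cpu_cap(bit);		/* what "noclflush" would record */

	/* what identify_cpu() does after the per-vendor init has run */
	for (i = 0; i < NCAPINTS; i++)
		x86_capability[i] &= ~cleared_cpu_caps[i];

	printf("bit %d now: %d\n", bit,
	       (x86_capability[bit / 32] >> (bit % 32)) & 1);
	return 0;
}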
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 2f6432cef6ff..e0b38c33d842 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -24,5 +24,15 @@ extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM];
24extern int get_model_name(struct cpuinfo_x86 *c); 24extern int get_model_name(struct cpuinfo_x86 *c);
25extern void display_cacheinfo(struct cpuinfo_x86 *c); 25extern void display_cacheinfo(struct cpuinfo_x86 *c);
26 26
27extern void early_intel_workaround(struct cpuinfo_x86 *c); 27extern void early_init_intel(struct cpuinfo_x86 *c);
28 28extern void early_init_amd(struct cpuinfo_x86 *c);
29
30/* Specific CPU type init functions */
31int intel_cpu_init(void);
32int amd_init_cpu(void);
33int cyrix_init_cpu(void);
34int nsc_init_cpu(void);
35int centaur_init_cpu(void);
36int transmeta_init_cpu(void);
37int nexgen_init_cpu(void);
38int umc_init_cpu(void);
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index fea0af0476b9..a962dcb9c408 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -67,7 +67,8 @@ struct acpi_cpufreq_data {
67 unsigned int cpu_feature; 67 unsigned int cpu_feature;
68}; 68};
69 69
70static struct acpi_cpufreq_data *drv_data[NR_CPUS]; 70static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);
71
71/* acpi_perf_data is a pointer to percpu data. */ 72/* acpi_perf_data is a pointer to percpu data. */
72static struct acpi_processor_performance *acpi_perf_data; 73static struct acpi_processor_performance *acpi_perf_data;
73 74
@@ -218,14 +219,14 @@ static u32 get_cur_val(cpumask_t mask)
218 if (unlikely(cpus_empty(mask))) 219 if (unlikely(cpus_empty(mask)))
219 return 0; 220 return 0;
220 221
221 switch (drv_data[first_cpu(mask)]->cpu_feature) { 222 switch (per_cpu(drv_data, first_cpu(mask))->cpu_feature) {
222 case SYSTEM_INTEL_MSR_CAPABLE: 223 case SYSTEM_INTEL_MSR_CAPABLE:
223 cmd.type = SYSTEM_INTEL_MSR_CAPABLE; 224 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
224 cmd.addr.msr.reg = MSR_IA32_PERF_STATUS; 225 cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
225 break; 226 break;
226 case SYSTEM_IO_CAPABLE: 227 case SYSTEM_IO_CAPABLE:
227 cmd.type = SYSTEM_IO_CAPABLE; 228 cmd.type = SYSTEM_IO_CAPABLE;
228 perf = drv_data[first_cpu(mask)]->acpi_data; 229 perf = per_cpu(drv_data, first_cpu(mask))->acpi_data;
229 cmd.addr.io.port = perf->control_register.address; 230 cmd.addr.io.port = perf->control_register.address;
230 cmd.addr.io.bit_width = perf->control_register.bit_width; 231 cmd.addr.io.bit_width = perf->control_register.bit_width;
231 break; 232 break;
@@ -325,7 +326,7 @@ static unsigned int get_measured_perf(unsigned int cpu)
325 326
326#endif 327#endif
327 328
328 retval = drv_data[cpu]->max_freq * perf_percent / 100; 329 retval = per_cpu(drv_data, cpu)->max_freq * perf_percent / 100;
329 330
330 put_cpu(); 331 put_cpu();
331 set_cpus_allowed(current, saved_mask); 332 set_cpus_allowed(current, saved_mask);
@@ -336,7 +337,7 @@ static unsigned int get_measured_perf(unsigned int cpu)
336 337
337static unsigned int get_cur_freq_on_cpu(unsigned int cpu) 338static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
338{ 339{
339 struct acpi_cpufreq_data *data = drv_data[cpu]; 340 struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu);
340 unsigned int freq; 341 unsigned int freq;
341 342
342 dprintk("get_cur_freq_on_cpu (%d)\n", cpu); 343 dprintk("get_cur_freq_on_cpu (%d)\n", cpu);
@@ -370,7 +371,7 @@ static unsigned int check_freqs(cpumask_t mask, unsigned int freq,
370static int acpi_cpufreq_target(struct cpufreq_policy *policy, 371static int acpi_cpufreq_target(struct cpufreq_policy *policy,
371 unsigned int target_freq, unsigned int relation) 372 unsigned int target_freq, unsigned int relation)
372{ 373{
373 struct acpi_cpufreq_data *data = drv_data[policy->cpu]; 374 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
374 struct acpi_processor_performance *perf; 375 struct acpi_processor_performance *perf;
375 struct cpufreq_freqs freqs; 376 struct cpufreq_freqs freqs;
376 cpumask_t online_policy_cpus; 377 cpumask_t online_policy_cpus;
@@ -466,7 +467,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
466 467
467static int acpi_cpufreq_verify(struct cpufreq_policy *policy) 468static int acpi_cpufreq_verify(struct cpufreq_policy *policy)
468{ 469{
469 struct acpi_cpufreq_data *data = drv_data[policy->cpu]; 470 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
470 471
471 dprintk("acpi_cpufreq_verify\n"); 472 dprintk("acpi_cpufreq_verify\n");
472 473
@@ -570,7 +571,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
570 return -ENOMEM; 571 return -ENOMEM;
571 572
572 data->acpi_data = percpu_ptr(acpi_perf_data, cpu); 573 data->acpi_data = percpu_ptr(acpi_perf_data, cpu);
573 drv_data[cpu] = data; 574 per_cpu(drv_data, cpu) = data;
574 575
575 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) 576 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
576 acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS; 577 acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS;
@@ -714,20 +715,20 @@ err_unreg:
714 acpi_processor_unregister_performance(perf, cpu); 715 acpi_processor_unregister_performance(perf, cpu);
715err_free: 716err_free:
716 kfree(data); 717 kfree(data);
717 drv_data[cpu] = NULL; 718 per_cpu(drv_data, cpu) = NULL;
718 719
719 return result; 720 return result;
720} 721}
721 722
722static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) 723static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
723{ 724{
724 struct acpi_cpufreq_data *data = drv_data[policy->cpu]; 725 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
725 726
726 dprintk("acpi_cpufreq_cpu_exit\n"); 727 dprintk("acpi_cpufreq_cpu_exit\n");
727 728
728 if (data) { 729 if (data) {
729 cpufreq_frequency_table_put_attr(policy->cpu); 730 cpufreq_frequency_table_put_attr(policy->cpu);
730 drv_data[policy->cpu] = NULL; 731 per_cpu(drv_data, policy->cpu) = NULL;
731 acpi_processor_unregister_performance(data->acpi_data, 732 acpi_processor_unregister_performance(data->acpi_data,
732 policy->cpu); 733 policy->cpu);
733 kfree(data); 734 kfree(data);
@@ -738,7 +739,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
738 739
739static int acpi_cpufreq_resume(struct cpufreq_policy *policy) 740static int acpi_cpufreq_resume(struct cpufreq_policy *policy)
740{ 741{
741 struct acpi_cpufreq_data *data = drv_data[policy->cpu]; 742 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
742 743
743 dprintk("acpi_cpufreq_resume\n"); 744 dprintk("acpi_cpufreq_resume\n");
744 745
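
acpi-cpufreq.c above converts a drv_data[NR_CPUS] array into a DEFINE_PER_CPU pointer accessed through per_cpu(). The minimal kernel-module sketch below shows that declaration and access pattern with a hypothetical structure and values; it only illustrates the idiom, not the driver itself.

/* Sketch: per-CPU pointer instead of an NR_CPUS-sized array. */
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/smp.h>

struct drv_state {
	unsigned int max_freq;
};

static DEFINE_PER_CPU(struct drv_state *, drv_state);

static int __init percpu_demo_init(void)
{
	static struct drv_state boot_state = { .max_freq = 2000000 };
	int cpu = get_cpu();

	per_cpu(drv_state, cpu) = &boot_state;	/* was: drv_data[cpu] = data */
	printk(KERN_INFO "cpu%d max_freq %u\n",
	       cpu, per_cpu(drv_state, cpu)->max_freq);
	put_cpu();
	return 0;
}

static void __exit percpu_demo_exit(void)
{
	int cpu = get_cpu();

	per_cpu(drv_state, cpu) = NULL;
	put_cpu();
}

module_init(percpu_demo_init);
module_exit(percpu_demo_exit);
MODULE_LICENSE("GPL");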
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index 749d00cb2ebd..06fcce516d51 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -694,7 +694,7 @@ static acpi_status longhaul_walk_callback(acpi_handle obj_handle,
694 if ( acpi_bus_get_device(obj_handle, &d) ) { 694 if ( acpi_bus_get_device(obj_handle, &d) ) {
695 return 0; 695 return 0;
696 } 696 }
697 *return_value = (void *)acpi_driver_data(d); 697 *return_value = acpi_driver_data(d);
698 return 1; 698 return 1;
699} 699}
700 700
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 99e1ef9939be..5affe91ca1e5 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -52,7 +52,7 @@
52/* serialize freq changes */ 52/* serialize freq changes */
53static DEFINE_MUTEX(fidvid_mutex); 53static DEFINE_MUTEX(fidvid_mutex);
54 54
55static struct powernow_k8_data *powernow_data[NR_CPUS]; 55static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
56 56
57static int cpu_family = CPU_OPTERON; 57static int cpu_family = CPU_OPTERON;
58 58
@@ -827,7 +827,6 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpuf
827 827
828 for (i = 0; i < data->acpi_data.state_count; i++) { 828 for (i = 0; i < data->acpi_data.state_count; i++) {
829 u32 index; 829 u32 index;
830 u32 hi = 0, lo = 0;
831 830
832 index = data->acpi_data.states[i].control & HW_PSTATE_MASK; 831 index = data->acpi_data.states[i].control & HW_PSTATE_MASK;
833 if (index > data->max_hw_pstate) { 832 if (index > data->max_hw_pstate) {
@@ -1018,7 +1017,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
1018static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation) 1017static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation)
1019{ 1018{
1020 cpumask_t oldmask = CPU_MASK_ALL; 1019 cpumask_t oldmask = CPU_MASK_ALL;
1021 struct powernow_k8_data *data = powernow_data[pol->cpu]; 1020 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1022 u32 checkfid; 1021 u32 checkfid;
1023 u32 checkvid; 1022 u32 checkvid;
1024 unsigned int newstate; 1023 unsigned int newstate;
@@ -1094,7 +1093,7 @@ err_out:
1094/* Driver entry point to verify the policy and range of frequencies */ 1093/* Driver entry point to verify the policy and range of frequencies */
1095static int powernowk8_verify(struct cpufreq_policy *pol) 1094static int powernowk8_verify(struct cpufreq_policy *pol)
1096{ 1095{
1097 struct powernow_k8_data *data = powernow_data[pol->cpu]; 1096 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1098 1097
1099 if (!data) 1098 if (!data)
1100 return -EINVAL; 1099 return -EINVAL;
@@ -1202,7 +1201,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1202 dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n", 1201 dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n",
1203 data->currfid, data->currvid); 1202 data->currfid, data->currvid);
1204 1203
1205 powernow_data[pol->cpu] = data; 1204 per_cpu(powernow_data, pol->cpu) = data;
1206 1205
1207 return 0; 1206 return 0;
1208 1207
@@ -1216,7 +1215,7 @@ err_out:
1216 1215
1217static int __devexit powernowk8_cpu_exit (struct cpufreq_policy *pol) 1216static int __devexit powernowk8_cpu_exit (struct cpufreq_policy *pol)
1218{ 1217{
1219 struct powernow_k8_data *data = powernow_data[pol->cpu]; 1218 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1220 1219
1221 if (!data) 1220 if (!data)
1222 return -EINVAL; 1221 return -EINVAL;
@@ -1237,7 +1236,7 @@ static unsigned int powernowk8_get (unsigned int cpu)
1237 cpumask_t oldmask = current->cpus_allowed; 1236 cpumask_t oldmask = current->cpus_allowed;
1238 unsigned int khz = 0; 1237 unsigned int khz = 0;
1239 1238
1240 data = powernow_data[first_cpu(per_cpu(cpu_core_map, cpu))]; 1239 data = per_cpu(powernow_data, first_cpu(per_cpu(cpu_core_map, cpu)));
1241 1240
1242 if (!data) 1241 if (!data)
1243 return -EINVAL; 1242 return -EINVAL;
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
index 76c3ab0da468..98d4fdb7dc04 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
@@ -189,10 +189,7 @@ static unsigned int pentium4_get_frequency(void)
189 printk(KERN_DEBUG "speedstep-lib: couldn't detect FSB speed. Please send an e-mail to <linux@brodo.de>\n"); 189 printk(KERN_DEBUG "speedstep-lib: couldn't detect FSB speed. Please send an e-mail to <linux@brodo.de>\n");
190 190
191 /* Multiplier. */ 191 /* Multiplier. */
192 if (c->x86_model < 2) 192 mult = msr_lo >> 24;
193 mult = msr_lo >> 27;
194 else
195 mult = msr_lo >> 24;
196 193
197 dprintk("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n", fsb, mult, (fsb * mult)); 194 dprintk("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n", fsb, mult, (fsb * mult));
198 195
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 88d66fb8411d..7139b0262703 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -5,6 +5,7 @@
5#include <asm/dma.h> 5#include <asm/dma.h>
6#include <asm/io.h> 6#include <asm/io.h>
7#include <asm/processor-cyrix.h> 7#include <asm/processor-cyrix.h>
8#include <asm/processor-flags.h>
8#include <asm/timer.h> 9#include <asm/timer.h>
9#include <asm/pci-direct.h> 10#include <asm/pci-direct.h>
10#include <asm/tsc.h> 11#include <asm/tsc.h>
@@ -82,8 +83,6 @@ static char cyrix_model_mult2[] __cpuinitdata = "12233445";
82 * FIXME: our newer udelay uses the tsc. We don't need to frob with SLOP 83 * FIXME: our newer udelay uses the tsc. We don't need to frob with SLOP
83 */ 84 */
84 85
85extern void calibrate_delay(void) __init;
86
87static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c) 86static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c)
88{ 87{
89 unsigned long flags; 88 unsigned long flags;
@@ -126,15 +125,12 @@ static void __cpuinit set_cx86_reorder(void)
126 125
127static void __cpuinit set_cx86_memwb(void) 126static void __cpuinit set_cx86_memwb(void)
128{ 127{
129 u32 cr0;
130
131 printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); 128 printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
132 129
133 /* CCR2 bit 2: unlock NW bit */ 130 /* CCR2 bit 2: unlock NW bit */
134 setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04); 131 setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04);
135 /* set 'Not Write-through' */ 132 /* set 'Not Write-through' */
136 cr0 = 0x20000000; 133 write_cr0(read_cr0() | X86_CR0_NW);
137 write_cr0(read_cr0() | cr0);
138 /* CCR2 bit 2: lock NW bit and set WT1 */ 134 /* CCR2 bit 2: lock NW bit and set WT1 */
139 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14 ); 135 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14 );
140} 136}
diff --git a/arch/x86/kernel/cpu/feature_names.c b/arch/x86/kernel/cpu/feature_names.c
new file mode 100644
index 000000000000..ee975ac6bbcb
--- /dev/null
+++ b/arch/x86/kernel/cpu/feature_names.c
@@ -0,0 +1,83 @@
1/*
2 * Strings for the various x86 capability flags.
3 *
4 * This file must not contain any executable code.
5 */
6
7#include "asm/cpufeature.h"
8
9/*
10 * These flag bits must match the definitions in <asm/cpufeature.h>.
11 * NULL means this bit is undefined or reserved; either way it doesn't
12 * have meaning as far as Linux is concerned. Note that it's important
13 * to realize there is a difference between this table and CPUID -- if
14 * applications want to get the raw CPUID data, they should access
15 * /dev/cpu/<cpu_nr>/cpuid instead.
16 */
17const char * const x86_cap_flags[NCAPINTS*32] = {
18 /* Intel-defined */
19 "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
20 "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
21 "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
22 "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
23
24 /* AMD-defined */
25 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
26 NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
27 NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL,
28 NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
29 "3dnowext", "3dnow",
30
31 /* Transmeta-defined */
32 "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
33 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
34 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
35 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
36
37 /* Other (Linux-defined) */
38 "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
39 NULL, NULL, NULL, NULL,
40 "constant_tsc", "up", NULL, "arch_perfmon",
41 "pebs", "bts", NULL, NULL,
42 "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
43 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
44
45 /* Intel-defined (#2) */
46 "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
47 "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
48 NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt",
49 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
50
51 /* VIA/Cyrix/Centaur-defined */
52 NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
53 "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
54 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
55 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
56
57 /* AMD-defined (#2) */
58 "lahf_lm", "cmp_legacy", "svm", "extapic",
59 "cr8_legacy", "abm", "sse4a", "misalignsse",
60 "3dnowprefetch", "osvw", "ibs", "sse5",
61 "skinit", "wdt", NULL, NULL,
62 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
63 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
64
65 /* Auxiliary (Linux-defined) */
66 "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
67 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
68 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
69 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
70};
71
72const char *const x86_power_flags[32] = {
73 "ts", /* temperature sensor */
74 "fid", /* frequency id control */
75 "vid", /* voltage id control */
76 "ttp", /* thermal trip */
77 "tm",
78 "stc",
79 "100mhzsteps",
80 "hwpstate",
81 "", /* tsc invariant mapped to constant_tsc */
82 /* nothing */
83};
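
feature_names.c only provides the string table; consumers such as the /proc/cpuinfo "flags" line walk every capability bit and print the name when the bit is set and the table entry is non-NULL. A tiny userspace sketch of that walk, with a made-up table and capability words, follows.

/* Sketch of how a name table like x86_cap_flags is consumed. */
#include <stdio.h>

#define NCAPINTS 2

static const char * const cap_flags[NCAPINTS * 32] = {
	[0] = "fpu", [4] = "tsc", [11] = "sep", [25] = "sse",
	[32 + 11] = "syscall", [32 + 29] = "lm",
};

int main(void)
{
	unsigned int capability[NCAPINTS] = {
		(1u << 0) | (1u << 4) | (1u << 25),
		(1u << 11) | (1u << 29),
	};
	int i;

	printf("flags\t\t:");
	for (i = 0; i < NCAPINTS * 32; i++)
		if ((capability[i / 32] & (1u << (i % 32))) && cap_flags[i])
			printf(" %s", cap_flags[i]);
	printf("\n");
	return 0;
}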
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index cc8c501b9f39..fae31ce747bd 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -11,6 +11,9 @@
11#include <asm/pgtable.h> 11#include <asm/pgtable.h>
12#include <asm/msr.h> 12#include <asm/msr.h>
13#include <asm/uaccess.h> 13#include <asm/uaccess.h>
14#include <asm/ptrace.h>
15#include <asm/ds.h>
16#include <asm/bugs.h>
14 17
15#include "cpu.h" 18#include "cpu.h"
16 19
@@ -27,13 +30,14 @@
27struct movsl_mask movsl_mask __read_mostly; 30struct movsl_mask movsl_mask __read_mostly;
28#endif 31#endif
29 32
30void __cpuinit early_intel_workaround(struct cpuinfo_x86 *c) 33void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
31{ 34{
32 if (c->x86_vendor != X86_VENDOR_INTEL)
33 return;
34 /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */ 35 /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
35 if (c->x86 == 15 && c->x86_cache_alignment == 64) 36 if (c->x86 == 15 && c->x86_cache_alignment == 64)
36 c->x86_cache_alignment = 128; 37 c->x86_cache_alignment = 128;
38 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
39 (c->x86 == 0x6 && c->x86_model >= 0x0e))
40 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
37} 41}
38 42
39/* 43/*
@@ -113,6 +117,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
113 unsigned int l2 = 0; 117 unsigned int l2 = 0;
114 char *p = NULL; 118 char *p = NULL;
115 119
120 early_init_intel(c);
121
116#ifdef CONFIG_X86_F00F_BUG 122#ifdef CONFIG_X86_F00F_BUG
117 /* 123 /*
118 * All current models of Pentium and Pentium with MMX technology CPUs 124 * All current models of Pentium and Pentium with MMX technology CPUs
@@ -132,7 +138,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
132 } 138 }
133#endif 139#endif
134 140
135 select_idle_routine(c);
136 l2 = init_intel_cacheinfo(c); 141 l2 = init_intel_cacheinfo(c);
137 if (c->cpuid_level > 9 ) { 142 if (c->cpuid_level > 9 ) {
138 unsigned eax = cpuid_eax(10); 143 unsigned eax = cpuid_eax(10);
@@ -201,16 +206,13 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
201 } 206 }
202#endif 207#endif
203 208
209 if (cpu_has_xmm2)
210 set_bit(X86_FEATURE_LFENCE_RDTSC, c->x86_capability);
204 if (c->x86 == 15) { 211 if (c->x86 == 15) {
205 set_bit(X86_FEATURE_P4, c->x86_capability); 212 set_bit(X86_FEATURE_P4, c->x86_capability);
206 set_bit(X86_FEATURE_SYNC_RDTSC, c->x86_capability);
207 } 213 }
208 if (c->x86 == 6) 214 if (c->x86 == 6)
209 set_bit(X86_FEATURE_P3, c->x86_capability); 215 set_bit(X86_FEATURE_P3, c->x86_capability);
210 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
211 (c->x86 == 0x6 && c->x86_model >= 0x0e))
212 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
213
214 if (cpu_has_ds) { 216 if (cpu_has_ds) {
215 unsigned int l1; 217 unsigned int l1;
216 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); 218 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
@@ -219,6 +221,9 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
219 if (!(l1 & (1<<12))) 221 if (!(l1 & (1<<12)))
220 set_bit(X86_FEATURE_PEBS, c->x86_capability); 222 set_bit(X86_FEATURE_PEBS, c->x86_capability);
221 } 223 }
224
225 if (cpu_has_bts)
226 ds_init_intel(c);
222} 227}
223 228
224static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size) 229static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size)
@@ -342,5 +347,22 @@ unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
342EXPORT_SYMBOL(cmpxchg_386_u32); 347EXPORT_SYMBOL(cmpxchg_386_u32);
343#endif 348#endif
344 349
350#ifndef CONFIG_X86_CMPXCHG64
351unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
352{
353 u64 prev;
354 unsigned long flags;
355
356 /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
357 local_irq_save(flags);
358 prev = *(u64 *)ptr;
359 if (prev == old)
360 *(u64 *)ptr = new;
361 local_irq_restore(flags);
362 return prev;
363}
364EXPORT_SYMBOL(cmpxchg_486_u64);
365#endif
366
345// arch_initcall(intel_cpu_init); 367// arch_initcall(intel_cpu_init);
346 368
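
The new cmpxchg_486_u64() fallback returns the previous value and only stores on a match, which is how callers build compare-and-swap retry loops. The userspace sketch below shows that usage; the helper merely mimics the semantics and is not atomic.

/* Sketch of a cmpxchg64-style retry loop. Illustration only. */
#include <stdio.h>
#include <stdint.h>

static uint64_t cmpxchg64(volatile uint64_t *ptr, uint64_t old, uint64_t new)
{
	uint64_t prev = *ptr;

	if (prev == old)
		*ptr = new;
	return prev;
}

int main(void)
{
	volatile uint64_t counter = 41;
	uint64_t old, new;

	do {				/* classic cmpxchg retry loop */
		old = counter;
		new = old + 1;
	} while (cmpxchg64(&counter, old, new) != old);

	printf("counter = %llu\n", (unsigned long long)counter);
	return 0;
}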
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 9f530ff43c21..1b889860eb73 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -352,8 +352,8 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
352 */ 352 */
353 if ((num_cache_leaves == 0 || c->x86 == 15) && c->cpuid_level > 1) { 353 if ((num_cache_leaves == 0 || c->x86 == 15) && c->cpuid_level > 1) {
354 /* supports eax=2 call */ 354 /* supports eax=2 call */
355 int i, j, n; 355 int j, n;
356 int regs[4]; 356 unsigned int regs[4];
357 unsigned char *dp = (unsigned char *)regs; 357 unsigned char *dp = (unsigned char *)regs;
358 int only_trace = 0; 358 int only_trace = 0;
359 359
@@ -368,7 +368,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
368 368
369 /* If bit 31 is set, this is an unknown format */ 369 /* If bit 31 is set, this is an unknown format */
370 for ( j = 0 ; j < 3 ; j++ ) { 370 for ( j = 0 ; j < 3 ; j++ ) {
371 if ( regs[j] < 0 ) regs[j] = 0; 371 if (regs[j] & (1 << 31)) regs[j] = 0;
372 } 372 }
373 373
374 /* Byte 0 is level count, not a descriptor */ 374 /* Byte 0 is level count, not a descriptor */
@@ -733,10 +733,8 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
733 if (unlikely(retval < 0)) 733 if (unlikely(retval < 0))
734 return retval; 734 return retval;
735 735
736 cache_kobject[cpu]->parent = &sys_dev->kobj; 736 retval = kobject_init_and_add(cache_kobject[cpu], &ktype_percpu_entry,
737 kobject_set_name(cache_kobject[cpu], "%s", "cache"); 737 &sys_dev->kobj, "%s", "cache");
738 cache_kobject[cpu]->ktype = &ktype_percpu_entry;
739 retval = kobject_register(cache_kobject[cpu]);
740 if (retval < 0) { 738 if (retval < 0) {
741 cpuid4_cache_sysfs_exit(cpu); 739 cpuid4_cache_sysfs_exit(cpu);
742 return retval; 740 return retval;
@@ -746,23 +744,23 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
746 this_object = INDEX_KOBJECT_PTR(cpu,i); 744 this_object = INDEX_KOBJECT_PTR(cpu,i);
747 this_object->cpu = cpu; 745 this_object->cpu = cpu;
748 this_object->index = i; 746 this_object->index = i;
749 this_object->kobj.parent = cache_kobject[cpu]; 747 retval = kobject_init_and_add(&(this_object->kobj),
750 kobject_set_name(&(this_object->kobj), "index%1lu", i); 748 &ktype_cache, cache_kobject[cpu],
751 this_object->kobj.ktype = &ktype_cache; 749 "index%1lu", i);
752 retval = kobject_register(&(this_object->kobj));
753 if (unlikely(retval)) { 750 if (unlikely(retval)) {
754 for (j = 0; j < i; j++) { 751 for (j = 0; j < i; j++) {
755 kobject_unregister( 752 kobject_put(&(INDEX_KOBJECT_PTR(cpu,j)->kobj));
756 &(INDEX_KOBJECT_PTR(cpu,j)->kobj));
757 } 753 }
758 kobject_unregister(cache_kobject[cpu]); 754 kobject_put(cache_kobject[cpu]);
759 cpuid4_cache_sysfs_exit(cpu); 755 cpuid4_cache_sysfs_exit(cpu);
760 break; 756 break;
761 } 757 }
758 kobject_uevent(&(this_object->kobj), KOBJ_ADD);
762 } 759 }
763 if (!retval) 760 if (!retval)
764 cpu_set(cpu, cache_dev_map); 761 cpu_set(cpu, cache_dev_map);
765 762
763 kobject_uevent(cache_kobject[cpu], KOBJ_ADD);
766 return retval; 764 return retval;
767} 765}
768 766
@@ -778,8 +776,8 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
778 cpu_clear(cpu, cache_dev_map); 776 cpu_clear(cpu, cache_dev_map);
779 777
780 for (i = 0; i < num_cache_leaves; i++) 778 for (i = 0; i < num_cache_leaves; i++)
781 kobject_unregister(&(INDEX_KOBJECT_PTR(cpu,i)->kobj)); 779 kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj));
782 kobject_unregister(cache_kobject[cpu]); 780 kobject_put(cache_kobject[cpu]);
783 cpuid4_cache_sysfs_exit(cpu); 781 cpuid4_cache_sysfs_exit(cpu);
784} 782}
785 783
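
The intel_cacheinfo.c hunks replace kobject_register()/kobject_unregister() with kobject_init_and_add(), an explicit KOBJ_ADD uevent, and kobject_put(). The bare kernel-module sketch below (hypothetical names, no sysfs attributes) only shows that call order and cleanup rule.

/* Sketch of the kobject lifecycle the hunks above convert to. */
#include <linux/module.h>
#include <linux/kobject.h>
#include <linux/slab.h>

static struct kobject *demo_kobj;

static void demo_release(struct kobject *kobj)
{
	kfree(kobj);
}

static struct kobj_type demo_ktype = {
	.release = demo_release,
};

static int __init kobj_demo_init(void)
{
	int ret;

	demo_kobj = kzalloc(sizeof(*demo_kobj), GFP_KERNEL);
	if (!demo_kobj)
		return -ENOMEM;

	ret = kobject_init_and_add(demo_kobj, &demo_ktype, kernel_kobj,
				   "%s", "demo");
	if (ret) {
		kobject_put(demo_kobj);	/* drops the ref, calls release */
		return ret;
	}
	kobject_uevent(demo_kobj, KOBJ_ADD);
	return 0;
}

static void __exit kobj_demo_exit(void)
{
	kobject_put(demo_kobj);
}

module_init(kobj_demo_init);
module_exit(kobj_demo_exit);
MODULE_LICENSE("GPL");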
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
index eef63e3630c2..e633c9c2b764 100644
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ b/arch/x86/kernel/cpu/mcheck/k7.c
@@ -16,7 +16,7 @@
16#include "mce.h" 16#include "mce.h"
17 17
18/* Machine Check Handler For AMD Athlon/Duron */ 18/* Machine Check Handler For AMD Athlon/Duron */
19static fastcall void k7_machine_check(struct pt_regs * regs, long error_code) 19static void k7_machine_check(struct pt_regs * regs, long error_code)
20{ 20{
21 int recover=1; 21 int recover=1;
22 u32 alow, ahigh, high, low; 22 u32 alow, ahigh, high, low;
@@ -27,29 +27,32 @@ static fastcall void k7_machine_check(struct pt_regs * regs, long error_code)
27 if (mcgstl & (1<<0)) /* Recoverable ? */ 27 if (mcgstl & (1<<0)) /* Recoverable ? */
28 recover=0; 28 recover=0;
29 29
30 printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", 30 printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
31 smp_processor_id(), mcgsth, mcgstl); 31 smp_processor_id(), mcgsth, mcgstl);
32 32
33 for (i=1; i<nr_mce_banks; i++) { 33 for (i = 1; i < nr_mce_banks; i++) {
34 rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high); 34 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
35 if (high&(1<<31)) { 35 if (high&(1<<31)) {
36 char misc[20];
37 char addr[24];
38 misc[0] = addr[0] = '\0';
36 if (high & (1<<29)) 39 if (high & (1<<29))
37 recover |= 1; 40 recover |= 1;
38 if (high & (1<<25)) 41 if (high & (1<<25))
39 recover |= 2; 42 recover |= 2;
40 printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
41 high &= ~(1<<31); 43 high &= ~(1<<31);
42 if (high & (1<<27)) { 44 if (high & (1<<27)) {
43 rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh); 45 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
44 printk ("[%08x%08x]", ahigh, alow); 46 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
45 } 47 }
46 if (high & (1<<26)) { 48 if (high & (1<<26)) {
47 rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh); 49 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
48 printk (" at %08x%08x", ahigh, alow); 50 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
49 } 51 }
50 printk ("\n"); 52 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
53 smp_processor_id(), i, high, low, misc, addr);
51 /* Clear it */ 54 /* Clear it */
52 wrmsr (MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); 55 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
53 /* Serialize */ 56 /* Serialize */
54 wmb(); 57 wmb();
55 add_taint(TAINT_MACHINE_CHECK); 58 add_taint(TAINT_MACHINE_CHECK);
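
The k7.c change formats the optional MISC/ADDR parts into small buffers with snprintf() and then emits one complete line per bank, so the report cannot be split across printk calls. The userspace sketch below shows the same buffering pattern with made-up register values.

/* Buffer optional pieces, then print one complete line. */
#include <stdio.h>

int main(void)
{
	char misc[20];
	char addr[24];
	unsigned int high = 0x94000000, low = 0x1234;
	int have_misc = 1, have_addr = 0;	/* example status bits */

	misc[0] = addr[0] = '\0';
	if (have_misc)
		snprintf(misc, sizeof(misc), "[%08x%08x]", 0xdeadbeefu, 0xcafef00du);
	if (have_addr)
		snprintf(addr, sizeof(addr), " at %08x%08x", 0u, 0u);

	printf("CPU 0: Bank 1: %08x%08x%s%s\n", high, low, misc, addr);
	return 0;
}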
diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h
index 81fb6e2d35f3..ae9f628838f1 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.h
+++ b/arch/x86/kernel/cpu/mcheck/mce.h
@@ -8,7 +8,7 @@ void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
8void winchip_mcheck_init(struct cpuinfo_x86 *c); 8void winchip_mcheck_init(struct cpuinfo_x86 *c);
9 9
10/* Call the installed machine check handler for this CPU setup. */ 10/* Call the installed machine check handler for this CPU setup. */
11extern fastcall void (*machine_check_vector)(struct pt_regs *, long error_code); 11extern void (*machine_check_vector)(struct pt_regs *, long error_code);
12 12
13extern int nr_mce_banks; 13extern int nr_mce_banks;
14 14
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index 34c781eddee4..a5182dcd94ae 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -22,13 +22,13 @@ int nr_mce_banks;
22EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ 22EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
23 23
24/* Handle unconfigured int18 (should never happen) */ 24/* Handle unconfigured int18 (should never happen) */
25static fastcall void unexpected_machine_check(struct pt_regs * regs, long error_code) 25static void unexpected_machine_check(struct pt_regs * regs, long error_code)
26{ 26{
27 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id()); 27 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id());
28} 28}
29 29
30/* Call the installed machine check handler for this CPU setup. */ 30/* Call the installed machine check handler for this CPU setup. */
31void fastcall (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; 31void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check;
32 32
33/* This has to be run for each processor */ 33/* This has to be run for each processor */
34void mcheck_init(struct cpuinfo_x86 *c) 34void mcheck_init(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 4b21d29fb5aa..9a699ed03598 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -63,7 +63,7 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
63 * separate MCEs from kernel messages to avoid bogus bug reports. 63 * separate MCEs from kernel messages to avoid bogus bug reports.
64 */ 64 */
65 65
66struct mce_log mcelog = { 66static struct mce_log mcelog = {
67 MCE_LOG_SIGNATURE, 67 MCE_LOG_SIGNATURE,
68 MCE_LOG_LEN, 68 MCE_LOG_LEN,
69}; 69};
@@ -80,7 +80,7 @@ void mce_log(struct mce *mce)
80 /* When the buffer fills up discard new entries. Assume 80 /* When the buffer fills up discard new entries. Assume
81 that the earlier errors are the more interesting. */ 81 that the earlier errors are the more interesting. */
82 if (entry >= MCE_LOG_LEN) { 82 if (entry >= MCE_LOG_LEN) {
83 set_bit(MCE_OVERFLOW, &mcelog.flags); 83 set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
84 return; 84 return;
85 } 85 }
86 /* Old left over entry. Skip. */ 86 /* Old left over entry. Skip. */
@@ -110,12 +110,12 @@ static void print_mce(struct mce *m)
110 KERN_EMERG 110 KERN_EMERG
111 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", 111 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
112 m->cpu, m->mcgstatus, m->bank, m->status); 112 m->cpu, m->mcgstatus, m->bank, m->status);
113 if (m->rip) { 113 if (m->ip) {
114 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", 114 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
115 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", 115 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
116 m->cs, m->rip); 116 m->cs, m->ip);
117 if (m->cs == __KERNEL_CS) 117 if (m->cs == __KERNEL_CS)
118 print_symbol("{%s}", m->rip); 118 print_symbol("{%s}", m->ip);
119 printk("\n"); 119 printk("\n");
120 } 120 }
121 printk(KERN_EMERG "TSC %Lx ", m->tsc); 121 printk(KERN_EMERG "TSC %Lx ", m->tsc);
@@ -156,16 +156,16 @@ static int mce_available(struct cpuinfo_x86 *c)
156static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) 156static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
157{ 157{
158 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { 158 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
159 m->rip = regs->rip; 159 m->ip = regs->ip;
160 m->cs = regs->cs; 160 m->cs = regs->cs;
161 } else { 161 } else {
162 m->rip = 0; 162 m->ip = 0;
163 m->cs = 0; 163 m->cs = 0;
164 } 164 }
165 if (rip_msr) { 165 if (rip_msr) {
166 /* Assume the RIP in the MSR is exact. Is this true? */ 166 /* Assume the RIP in the MSR is exact. Is this true? */
167 m->mcgstatus |= MCG_STATUS_EIPV; 167 m->mcgstatus |= MCG_STATUS_EIPV;
168 rdmsrl(rip_msr, m->rip); 168 rdmsrl(rip_msr, m->ip);
169 m->cs = 0; 169 m->cs = 0;
170 } 170 }
171} 171}
@@ -192,10 +192,10 @@ void do_machine_check(struct pt_regs * regs, long error_code)
192 192
193 atomic_inc(&mce_entry); 193 atomic_inc(&mce_entry);
194 194
195 if (regs) 195 if ((regs
196 notify_die(DIE_NMI, "machine check", regs, error_code, 18, 196 && notify_die(DIE_NMI, "machine check", regs, error_code,
197 SIGKILL); 197 18, SIGKILL) == NOTIFY_STOP)
198 if (!banks) 198 || !banks)
199 goto out2; 199 goto out2;
200 200
201 memset(&m, 0, sizeof(struct mce)); 201 memset(&m, 0, sizeof(struct mce));
@@ -288,7 +288,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
288 * instruction which caused the MCE. 288 * instruction which caused the MCE.
289 */ 289 */
290 if (m.mcgstatus & MCG_STATUS_EIPV) 290 if (m.mcgstatus & MCG_STATUS_EIPV)
291 user_space = panicm.rip && (panicm.cs & 3); 291 user_space = panicm.ip && (panicm.cs & 3);
292 292
293 /* 293 /*
294 * If we know that the error was in user space, send a 294 * If we know that the error was in user space, send a
@@ -564,7 +564,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
564 loff_t *off) 564 loff_t *off)
565{ 565{
566 unsigned long *cpu_tsc; 566 unsigned long *cpu_tsc;
567 static DECLARE_MUTEX(mce_read_sem); 567 static DEFINE_MUTEX(mce_read_mutex);
568 unsigned next; 568 unsigned next;
569 char __user *buf = ubuf; 569 char __user *buf = ubuf;
570 int i, err; 570 int i, err;
@@ -573,12 +573,12 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
573 if (!cpu_tsc) 573 if (!cpu_tsc)
574 return -ENOMEM; 574 return -ENOMEM;
575 575
576 down(&mce_read_sem); 576 mutex_lock(&mce_read_mutex);
577 next = rcu_dereference(mcelog.next); 577 next = rcu_dereference(mcelog.next);
578 578
579 /* Only supports full reads right now */ 579 /* Only supports full reads right now */
580 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { 580 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
581 up(&mce_read_sem); 581 mutex_unlock(&mce_read_mutex);
582 kfree(cpu_tsc); 582 kfree(cpu_tsc);
583 return -EINVAL; 583 return -EINVAL;
584 } 584 }
@@ -621,7 +621,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
621 memset(&mcelog.entry[i], 0, sizeof(struct mce)); 621 memset(&mcelog.entry[i], 0, sizeof(struct mce));
622 } 622 }
623 } 623 }
624 up(&mce_read_sem); 624 mutex_unlock(&mce_read_mutex);
625 kfree(cpu_tsc); 625 kfree(cpu_tsc);
626 return err ? -EFAULT : buf - ubuf; 626 return err ? -EFAULT : buf - ubuf;
627} 627}
@@ -634,8 +634,7 @@ static unsigned int mce_poll(struct file *file, poll_table *wait)
634 return 0; 634 return 0;
635} 635}
636 636
637static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, 637static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
638 unsigned long arg)
639{ 638{
640 int __user *p = (int __user *)arg; 639 int __user *p = (int __user *)arg;
641 640
@@ -664,7 +663,7 @@ static const struct file_operations mce_chrdev_ops = {
664 .release = mce_release, 663 .release = mce_release,
665 .read = mce_read, 664 .read = mce_read,
666 .poll = mce_poll, 665 .poll = mce_poll,
667 .ioctl = mce_ioctl, 666 .unlocked_ioctl = mce_ioctl,
668}; 667};
669 668
670static struct miscdevice mce_log_device = { 669static struct miscdevice mce_log_device = {
@@ -745,7 +744,7 @@ static void mce_restart(void)
745 744
746static struct sysdev_class mce_sysclass = { 745static struct sysdev_class mce_sysclass = {
747 .resume = mce_resume, 746 .resume = mce_resume,
748 set_kset_name("machinecheck"), 747 .name = "machinecheck",
749}; 748};
750 749
751DEFINE_PER_CPU(struct sys_device, device_mce); 750DEFINE_PER_CPU(struct sys_device, device_mce);
@@ -855,8 +854,8 @@ static void mce_remove_device(unsigned int cpu)
855} 854}
856 855
857/* Get notified when a cpu comes on/off. Be hotplug friendly. */ 856/* Get notified when a cpu comes on/off. Be hotplug friendly. */
858static int 857static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
859mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 858 unsigned long action, void *hcpu)
860{ 859{
861 unsigned int cpu = (unsigned long)hcpu; 860 unsigned int cpu = (unsigned long)hcpu;
862 861
@@ -873,7 +872,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
873 return NOTIFY_OK; 872 return NOTIFY_OK;
874} 873}
875 874
876static struct notifier_block mce_cpu_notifier = { 875static struct notifier_block mce_cpu_notifier __cpuinitdata = {
877 .notifier_call = mce_cpu_callback, 876 .notifier_call = mce_cpu_callback,
878}; 877};
879 878
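The mce_64.c hunks above drop the old `.ioctl` file operation, which took an extra inode argument and always ran under the big kernel lock, in favour of `.unlocked_ioctl`, whose handler does its own locking; the DECLARE_MUTEX semaphore guarding mce_read() is likewise replaced by a real DEFINE_MUTEX with mutex_lock()/mutex_unlock(). A minimal sketch of the resulting character-device pattern, using hypothetical example_* names rather than the mce ones:

#include <linux/fs.h>
#include <linux/module.h>
#include <linux/miscdevice.h>

static long example_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	/* No inode parameter and no implicit BKL; take a local mutex here
	 * if the handler touches shared state. */
	switch (cmd) {
	default:
		return -ENOTTY;
	}
}

static const struct file_operations example_fops = {
	.owner          = THIS_MODULE,
	.unlocked_ioctl = example_ioctl,
};

static struct miscdevice example_dev = {
	.minor = MISC_DYNAMIC_MINOR,
	.name  = "example",
	.fops  = &example_fops,
};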
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 752fb16a817d..32671da8184e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -65,7 +65,7 @@ static struct threshold_block threshold_defaults = {
65}; 65};
66 66
67struct threshold_bank { 67struct threshold_bank {
68 struct kobject kobj; 68 struct kobject *kobj;
69 struct threshold_block *blocks; 69 struct threshold_block *blocks;
70 cpumask_t cpus; 70 cpumask_t cpus;
71}; 71};
@@ -118,6 +118,7 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
118{ 118{
119 unsigned int bank, block; 119 unsigned int bank, block;
120 unsigned int cpu = smp_processor_id(); 120 unsigned int cpu = smp_processor_id();
121 u8 lvt_off;
121 u32 low = 0, high = 0, address = 0; 122 u32 low = 0, high = 0, address = 0;
122 123
123 for (bank = 0; bank < NR_BANKS; ++bank) { 124 for (bank = 0; bank < NR_BANKS; ++bank) {
@@ -153,14 +154,13 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
153 if (shared_bank[bank] && c->cpu_core_id) 154 if (shared_bank[bank] && c->cpu_core_id)
154 break; 155 break;
155#endif 156#endif
157 lvt_off = setup_APIC_eilvt_mce(THRESHOLD_APIC_VECTOR,
158 APIC_EILVT_MSG_FIX, 0);
159
156 high &= ~MASK_LVTOFF_HI; 160 high &= ~MASK_LVTOFF_HI;
157 high |= K8_APIC_EXT_LVT_ENTRY_THRESHOLD << 20; 161 high |= lvt_off << 20;
158 wrmsr(address, low, high); 162 wrmsr(address, low, high);
159 163
160 setup_APIC_extended_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD,
161 THRESHOLD_APIC_VECTOR,
162 K8_APIC_EXT_INT_MSG_FIX, 0);
163
164 threshold_defaults.address = address; 164 threshold_defaults.address = address;
165 threshold_restart_bank(&threshold_defaults, 0, 0); 165 threshold_restart_bank(&threshold_defaults, 0, 0);
166 } 166 }
@@ -432,10 +432,9 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
432 else 432 else
433 per_cpu(threshold_banks, cpu)[bank]->blocks = b; 433 per_cpu(threshold_banks, cpu)[bank]->blocks = b;
434 434
435 kobject_set_name(&b->kobj, "misc%i", block); 435 err = kobject_init_and_add(&b->kobj, &threshold_ktype,
436 b->kobj.parent = &per_cpu(threshold_banks, cpu)[bank]->kobj; 436 per_cpu(threshold_banks, cpu)[bank]->kobj,
437 b->kobj.ktype = &threshold_ktype; 437 "misc%i", block);
438 err = kobject_register(&b->kobj);
439 if (err) 438 if (err)
440 goto out_free; 439 goto out_free;
441recurse: 440recurse:
@@ -451,11 +450,14 @@ recurse:
451 if (err) 450 if (err)
452 goto out_free; 451 goto out_free;
453 452
453 if (b)
454 kobject_uevent(&b->kobj, KOBJ_ADD);
455
454 return err; 456 return err;
455 457
456out_free: 458out_free:
457 if (b) { 459 if (b) {
458 kobject_unregister(&b->kobj); 460 kobject_put(&b->kobj);
459 kfree(b); 461 kfree(b);
460 } 462 }
461 return err; 463 return err;
@@ -489,7 +491,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
489 goto out; 491 goto out;
490 492
491 err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj, 493 err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj,
492 &b->kobj, name); 494 b->kobj, name);
493 if (err) 495 if (err)
494 goto out; 496 goto out;
495 497
@@ -505,16 +507,15 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
505 goto out; 507 goto out;
506 } 508 }
507 509
508 kobject_set_name(&b->kobj, "threshold_bank%i", bank); 510 b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj);
509 b->kobj.parent = &per_cpu(device_mce, cpu).kobj; 511 if (!b->kobj)
512 goto out_free;
513
510#ifndef CONFIG_SMP 514#ifndef CONFIG_SMP
511 b->cpus = CPU_MASK_ALL; 515 b->cpus = CPU_MASK_ALL;
512#else 516#else
513 b->cpus = per_cpu(cpu_core_map, cpu); 517 b->cpus = per_cpu(cpu_core_map, cpu);
514#endif 518#endif
515 err = kobject_register(&b->kobj);
516 if (err)
517 goto out_free;
518 519
519 per_cpu(threshold_banks, cpu)[bank] = b; 520 per_cpu(threshold_banks, cpu)[bank] = b;
520 521
@@ -531,7 +532,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
531 continue; 532 continue;
532 533
533 err = sysfs_create_link(&per_cpu(device_mce, i).kobj, 534 err = sysfs_create_link(&per_cpu(device_mce, i).kobj,
534 &b->kobj, name); 535 b->kobj, name);
535 if (err) 536 if (err)
536 goto out; 537 goto out;
537 538
@@ -554,7 +555,7 @@ static __cpuinit int threshold_create_device(unsigned int cpu)
554 int err = 0; 555 int err = 0;
555 556
556 for (bank = 0; bank < NR_BANKS; ++bank) { 557 for (bank = 0; bank < NR_BANKS; ++bank) {
557 if (!(per_cpu(bank_map, cpu) & 1 << bank)) 558 if (!(per_cpu(bank_map, cpu) & (1 << bank)))
558 continue; 559 continue;
559 err = threshold_create_bank(cpu, bank); 560 err = threshold_create_bank(cpu, bank);
560 if (err) 561 if (err)
@@ -581,7 +582,7 @@ static void deallocate_threshold_block(unsigned int cpu,
581 return; 582 return;
582 583
583 list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) { 584 list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) {
584 kobject_unregister(&pos->kobj); 585 kobject_put(&pos->kobj);
585 list_del(&pos->miscj); 586 list_del(&pos->miscj);
586 kfree(pos); 587 kfree(pos);
587 } 588 }
@@ -627,7 +628,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
627 deallocate_threshold_block(cpu, bank); 628 deallocate_threshold_block(cpu, bank);
628 629
629free_out: 630free_out:
630 kobject_unregister(&b->kobj); 631 kobject_put(b->kobj);
631 kfree(b); 632 kfree(b);
632 per_cpu(threshold_banks, cpu)[bank] = NULL; 633 per_cpu(threshold_banks, cpu)[bank] = NULL;
633} 634}
@@ -637,14 +638,14 @@ static void threshold_remove_device(unsigned int cpu)
637 unsigned int bank; 638 unsigned int bank;
638 639
639 for (bank = 0; bank < NR_BANKS; ++bank) { 640 for (bank = 0; bank < NR_BANKS; ++bank) {
640 if (!(per_cpu(bank_map, cpu) & 1 << bank)) 641 if (!(per_cpu(bank_map, cpu) & (1 << bank)))
641 continue; 642 continue;
642 threshold_remove_bank(cpu, bank); 643 threshold_remove_bank(cpu, bank);
643 } 644 }
644} 645}
645 646
646/* get notified when a cpu comes on/off */ 647/* get notified when a cpu comes on/off */
647static int threshold_cpu_callback(struct notifier_block *nfb, 648static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb,
648 unsigned long action, void *hcpu) 649 unsigned long action, void *hcpu)
649{ 650{
650 /* cpu was unsigned int to begin with */ 651 /* cpu was unsigned int to begin with */
@@ -669,7 +670,7 @@ static int threshold_cpu_callback(struct notifier_block *nfb,
669 return NOTIFY_OK; 670 return NOTIFY_OK;
670} 671}
671 672
672static struct notifier_block threshold_cpu_notifier = { 673static struct notifier_block threshold_cpu_notifier __cpuinitdata = {
673 .notifier_call = threshold_cpu_callback, 674 .notifier_call = threshold_cpu_callback,
674}; 675};
675 676
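The threshold-bank hunks above follow the newer kobject API: the open-coded kobject_set_name()/kobject_register() pair becomes kobject_init_and_add() for kobjects embedded in a larger structure (struct threshold_block) and kobject_create_and_add() for the stand-alone per-bank directory, kobject_unregister() becomes kobject_put(), and the KOBJ_ADD uevent is now sent explicitly. A hedged sketch of both patterns, with hypothetical example_* names and an example_ktype assumed to provide the release() that frees the item:

#include <linux/kobject.h>
#include <linux/slab.h>

struct example_item {
	struct kobject kobj;		/* embedded kobject */
	int value;
};

extern struct kobj_type example_ktype;	/* assumed: has release() + sysfs_ops */

static int example_create(struct kobject *parent, int i)
{
	struct example_item *item;
	struct kobject *dir;
	int err;

	/* stand-alone directory kobject, replaces set_name + register */
	dir = kobject_create_and_add("example_dir", parent);
	if (!dir)
		return -ENOMEM;

	item = kzalloc(sizeof(*item), GFP_KERNEL);
	if (!item) {
		kobject_put(dir);
		return -ENOMEM;
	}

	/* embedded kobject: init and add in one call, then announce it */
	err = kobject_init_and_add(&item->kobj, &example_ktype, dir, "item%d", i);
	if (err) {
		kobject_put(&item->kobj);	/* freed by example_ktype's release */
		kobject_put(dir);
		return err;
	}
	kobject_uevent(&item->kobj, KOBJ_ADD);
	return 0;
}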
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index be4dabfee1f5..cb03345554a5 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -57,7 +57,7 @@ static void intel_thermal_interrupt(struct pt_regs *regs)
57/* Thermal interrupt handler for this CPU setup */ 57/* Thermal interrupt handler for this CPU setup */
58static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt; 58static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt;
59 59
60fastcall void smp_thermal_interrupt(struct pt_regs *regs) 60void smp_thermal_interrupt(struct pt_regs *regs)
61{ 61{
62 irq_enter(); 62 irq_enter();
63 vendor_thermal_interrupt(regs); 63 vendor_thermal_interrupt(regs);
@@ -141,7 +141,7 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
141 rdmsr (MSR_IA32_MCG_EIP, r->eip, h); 141 rdmsr (MSR_IA32_MCG_EIP, r->eip, h);
142} 142}
143 143
144static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) 144static void intel_machine_check(struct pt_regs * regs, long error_code)
145{ 145{
146 int recover=1; 146 int recover=1;
147 u32 alow, ahigh, high, low; 147 u32 alow, ahigh, high, low;
@@ -152,38 +152,41 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
152 if (mcgstl & (1<<0)) /* Recoverable ? */ 152 if (mcgstl & (1<<0)) /* Recoverable ? */
153 recover=0; 153 recover=0;
154 154
155 printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", 155 printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
156 smp_processor_id(), mcgsth, mcgstl); 156 smp_processor_id(), mcgsth, mcgstl);
157 157
158 if (mce_num_extended_msrs > 0) { 158 if (mce_num_extended_msrs > 0) {
159 struct intel_mce_extended_msrs dbg; 159 struct intel_mce_extended_msrs dbg;
160 intel_get_extended_msrs(&dbg); 160 intel_get_extended_msrs(&dbg);
161 printk (KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n", 161 printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n"
162 smp_processor_id(), dbg.eip, dbg.eflags); 162 "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n"
163 printk (KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n", 163 "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n",
164 dbg.eax, dbg.ebx, dbg.ecx, dbg.edx); 164 smp_processor_id(), dbg.eip, dbg.eflags,
165 printk (KERN_DEBUG "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", 165 dbg.eax, dbg.ebx, dbg.ecx, dbg.edx,
166 dbg.esi, dbg.edi, dbg.ebp, dbg.esp); 166 dbg.esi, dbg.edi, dbg.ebp, dbg.esp);
167 } 167 }
168 168
169 for (i=0; i<nr_mce_banks; i++) { 169 for (i = 0; i < nr_mce_banks; i++) {
170 rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high); 170 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
171 if (high & (1<<31)) { 171 if (high & (1<<31)) {
172 char misc[20];
173 char addr[24];
174 misc[0] = addr[0] = '\0';
172 if (high & (1<<29)) 175 if (high & (1<<29))
173 recover |= 1; 176 recover |= 1;
174 if (high & (1<<25)) 177 if (high & (1<<25))
175 recover |= 2; 178 recover |= 2;
176 printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
177 high &= ~(1<<31); 179 high &= ~(1<<31);
178 if (high & (1<<27)) { 180 if (high & (1<<27)) {
179 rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh); 181 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
180 printk ("[%08x%08x]", ahigh, alow); 182 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
181 } 183 }
182 if (high & (1<<26)) { 184 if (high & (1<<26)) {
183 rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh); 185 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
184 printk (" at %08x%08x", ahigh, alow); 186 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
185 } 187 }
186 printk ("\n"); 188 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
189 smp_processor_id(), i, high, low, misc, addr);
187 } 190 }
188 } 191 }
189 192
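The reworked bank-reporting loop above (and the identical change to p6.c further down) replaces a chain of continuation printk()s with snprintf() into small local buffers followed by a single KERN_EMERG line, so a bank record cannot be interleaved with output from another CPU and never loses its log level. A sketch of the idea in isolation, not the kernel code itself:

#include <linux/kernel.h>
#include <linux/smp.h>

/* Illustration only: build the optional fields first, then log one
 * complete line per bank. */
static void example_report_bank(int bank, u32 high, u32 low,
				u32 ahigh, u32 alow,
				int have_misc, int have_addr)
{
	char misc[20], addr[24];

	misc[0] = addr[0] = '\0';
	if (have_misc)
		snprintf(misc, sizeof(misc), "[%08x%08x]", ahigh, alow);
	if (have_addr)
		snprintf(addr, sizeof(addr), " at %08x%08x", ahigh, alow);
	printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
	       smp_processor_id(), bank, high, low, misc, addr);
}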
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index 94bc43d950cf..a18310aaae0c 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -16,7 +16,7 @@
16#include "mce.h" 16#include "mce.h"
17 17
18/* Machine check handler for Pentium class Intel */ 18/* Machine check handler for Pentium class Intel */
19static fastcall void pentium_machine_check(struct pt_regs * regs, long error_code) 19static void pentium_machine_check(struct pt_regs * regs, long error_code)
20{ 20{
21 u32 loaddr, hi, lotype; 21 u32 loaddr, hi, lotype;
22 rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); 22 rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c
index deeae42ce199..74342604d30e 100644
--- a/arch/x86/kernel/cpu/mcheck/p6.c
+++ b/arch/x86/kernel/cpu/mcheck/p6.c
@@ -16,7 +16,7 @@
16#include "mce.h" 16#include "mce.h"
17 17
18/* Machine Check Handler For PII/PIII */ 18/* Machine Check Handler For PII/PIII */
19static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) 19static void intel_machine_check(struct pt_regs * regs, long error_code)
20{ 20{
21 int recover=1; 21 int recover=1;
22 u32 alow, ahigh, high, low; 22 u32 alow, ahigh, high, low;
@@ -27,27 +27,30 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
27 if (mcgstl & (1<<0)) /* Recoverable ? */ 27 if (mcgstl & (1<<0)) /* Recoverable ? */
28 recover=0; 28 recover=0;
29 29
30 printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", 30 printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
31 smp_processor_id(), mcgsth, mcgstl); 31 smp_processor_id(), mcgsth, mcgstl);
32 32
33 for (i=0; i<nr_mce_banks; i++) { 33 for (i = 0; i < nr_mce_banks; i++) {
34 rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high); 34 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
35 if (high & (1<<31)) { 35 if (high & (1<<31)) {
36 char misc[20];
37 char addr[24];
38 misc[0] = addr[0] = '\0';
36 if (high & (1<<29)) 39 if (high & (1<<29))
37 recover |= 1; 40 recover |= 1;
38 if (high & (1<<25)) 41 if (high & (1<<25))
39 recover |= 2; 42 recover |= 2;
40 printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
41 high &= ~(1<<31); 43 high &= ~(1<<31);
42 if (high & (1<<27)) { 44 if (high & (1<<27)) {
43 rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh); 45 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
44 printk ("[%08x%08x]", ahigh, alow); 46 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
45 } 47 }
46 if (high & (1<<26)) { 48 if (high & (1<<26)) {
47 rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh); 49 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
48 printk (" at %08x%08x", ahigh, alow); 50 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
49 } 51 }
50 printk ("\n"); 52 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
53 smp_processor_id(), i, high, low, misc, addr);
51 } 54 }
52 } 55 }
53 56
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 9e424b6c293d..3d428d5afc52 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -15,7 +15,7 @@
15#include "mce.h" 15#include "mce.h"
16 16
17/* Machine check handler for WinChip C6 */ 17/* Machine check handler for WinChip C6 */
18static fastcall void winchip_machine_check(struct pt_regs * regs, long error_code) 18static void winchip_machine_check(struct pt_regs * regs, long error_code)
19{ 19{
20 printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); 20 printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
21 add_taint(TAINT_MACHINE_CHECK); 21 add_taint(TAINT_MACHINE_CHECK);
diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c
index 0949cdbf848a..ee2331b0e58f 100644
--- a/arch/x86/kernel/cpu/mtrr/amd.c
+++ b/arch/x86/kernel/cpu/mtrr/amd.c
@@ -53,8 +53,6 @@ static void amd_set_mtrr(unsigned int reg, unsigned long base,
53 <base> The base address of the region. 53 <base> The base address of the region.
54 <size> The size of the region. If this is 0 the region is disabled. 54 <size> The size of the region. If this is 0 the region is disabled.
55 <type> The type of the region. 55 <type> The type of the region.
56 <do_safe> If TRUE, do the change safely. If FALSE, safety measures should
57 be done externally.
58 [RETURNS] Nothing. 56 [RETURNS] Nothing.
59*/ 57*/
60{ 58{
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
index 9964be3de2b7..ff14c320040c 100644
--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -4,10 +4,9 @@
4#include <asm/msr.h> 4#include <asm/msr.h>
5#include <asm/io.h> 5#include <asm/io.h>
6#include <asm/processor-cyrix.h> 6#include <asm/processor-cyrix.h>
7#include <asm/processor-flags.h>
7#include "mtrr.h" 8#include "mtrr.h"
8 9
9int arr3_protected;
10
11static void 10static void
12cyrix_get_arr(unsigned int reg, unsigned long *base, 11cyrix_get_arr(unsigned int reg, unsigned long *base,
13 unsigned long *size, mtrr_type * type) 12 unsigned long *size, mtrr_type * type)
@@ -98,8 +97,6 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg)
98 case 4: 97 case 4:
99 return replace_reg; 98 return replace_reg;
100 case 3: 99 case 3:
101 if (arr3_protected)
102 break;
103 case 2: 100 case 2:
104 case 1: 101 case 1:
105 case 0: 102 case 0:
@@ -114,8 +111,6 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg)
114 } else { 111 } else {
115 for (i = 0; i < 7; i++) { 112 for (i = 0; i < 7; i++) {
116 cyrix_get_arr(i, &lbase, &lsize, &ltype); 113 cyrix_get_arr(i, &lbase, &lsize, &ltype);
117 if ((i == 3) && arr3_protected)
118 continue;
119 if (lsize == 0) 114 if (lsize == 0)
120 return i; 115 return i;
121 } 116 }
@@ -142,7 +137,7 @@ static void prepare_set(void)
142 137
143 /* Disable and flush caches. Note that wbinvd flushes the TLBs as 138 /* Disable and flush caches. Note that wbinvd flushes the TLBs as
144 a side-effect */ 139 a side-effect */
145 cr0 = read_cr0() | 0x40000000; 140 cr0 = read_cr0() | X86_CR0_CD;
146 wbinvd(); 141 wbinvd();
147 write_cr0(cr0); 142 write_cr0(cr0);
148 wbinvd(); 143 wbinvd();
@@ -259,107 +254,6 @@ static void cyrix_set_all(void)
259 post_set(); 254 post_set();
260} 255}
261 256
262#if 0
263/*
264 * On Cyrix 6x86(MX) and M II the ARR3 is special: it has connection
265 * with the SMM (System Management Mode) mode. So we need the following:
266 * Check whether SMI_LOCK (CCR3 bit 0) is set
267 * if it is set, write a warning message: ARR3 cannot be changed!
268 * (it cannot be changed until the next processor reset)
269 * if it is reset, then we can change it, set all the needed bits:
270 * - disable access to SMM memory through ARR3 range (CCR1 bit 7 reset)
271 * - disable access to SMM memory (CCR1 bit 2 reset)
272 * - disable SMM mode (CCR1 bit 1 reset)
273 * - disable write protection of ARR3 (CCR6 bit 1 reset)
274 * - (maybe) disable ARR3
275 * Just to be sure, we enable ARR usage by the processor (CCR5 bit 5 set)
276 */
277static void __init
278cyrix_arr_init(void)
279{
280 struct set_mtrr_context ctxt;
281 unsigned char ccr[7];
282 int ccrc[7] = { 0, 0, 0, 0, 0, 0, 0 };
283#ifdef CONFIG_SMP
284 int i;
285#endif
286
287 /* flush cache and enable MAPEN */
288 set_mtrr_prepare_save(&ctxt);
289 set_mtrr_cache_disable(&ctxt);
290
291 /* Save all CCRs locally */
292 ccr[0] = getCx86(CX86_CCR0);
293 ccr[1] = getCx86(CX86_CCR1);
294 ccr[2] = getCx86(CX86_CCR2);
295 ccr[3] = ctxt.ccr3;
296 ccr[4] = getCx86(CX86_CCR4);
297 ccr[5] = getCx86(CX86_CCR5);
298 ccr[6] = getCx86(CX86_CCR6);
299
300 if (ccr[3] & 1) {
301 ccrc[3] = 1;
302 arr3_protected = 1;
303 } else {
304 /* Disable SMM mode (bit 1), access to SMM memory (bit 2) and
305 * access to SMM memory through ARR3 (bit 7).
306 */
307 if (ccr[1] & 0x80) {
308 ccr[1] &= 0x7f;
309 ccrc[1] |= 0x80;
310 }
311 if (ccr[1] & 0x04) {
312 ccr[1] &= 0xfb;
313 ccrc[1] |= 0x04;
314 }
315 if (ccr[1] & 0x02) {
316 ccr[1] &= 0xfd;
317 ccrc[1] |= 0x02;
318 }
319 arr3_protected = 0;
320 if (ccr[6] & 0x02) {
321 ccr[6] &= 0xfd;
322 ccrc[6] = 1; /* Disable write protection of ARR3 */
323 setCx86(CX86_CCR6, ccr[6]);
324 }
325 /* Disable ARR3. This is safe now that we disabled SMM. */
326 /* cyrix_set_arr_up (3, 0, 0, 0, FALSE); */
327 }
328 /* If we changed CCR1 in memory, change it in the processor, too. */
329 if (ccrc[1])
330 setCx86(CX86_CCR1, ccr[1]);
331
332 /* Enable ARR usage by the processor */
333 if (!(ccr[5] & 0x20)) {
334 ccr[5] |= 0x20;
335 ccrc[5] = 1;
336 setCx86(CX86_CCR5, ccr[5]);
337 }
338#ifdef CONFIG_SMP
339 for (i = 0; i < 7; i++)
340 ccr_state[i] = ccr[i];
341 for (i = 0; i < 8; i++)
342 cyrix_get_arr(i,
343 &arr_state[i].base, &arr_state[i].size,
344 &arr_state[i].type);
345#endif
346
347 set_mtrr_done(&ctxt); /* flush cache and disable MAPEN */
348
349 if (ccrc[5])
350 printk(KERN_INFO "mtrr: ARR usage was not enabled, enabled manually\n");
351 if (ccrc[3])
352 printk(KERN_INFO "mtrr: ARR3 cannot be changed\n");
353/*
354 if ( ccrc[1] & 0x80) printk ("mtrr: SMM memory access through ARR3 disabled\n");
355 if ( ccrc[1] & 0x04) printk ("mtrr: SMM memory access disabled\n");
356 if ( ccrc[1] & 0x02) printk ("mtrr: SMM mode disabled\n");
357*/
358 if (ccrc[6])
359 printk(KERN_INFO "mtrr: ARR3 was write protected, unprotected\n");
360}
361#endif
362
363static struct mtrr_ops cyrix_mtrr_ops = { 257static struct mtrr_ops cyrix_mtrr_ops = {
364 .vendor = X86_VENDOR_CYRIX, 258 .vendor = X86_VENDOR_CYRIX,
365// .init = cyrix_arr_init, 259// .init = cyrix_arr_init,
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 992f08dfbb6c..103d61a59b19 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -9,11 +9,12 @@
9#include <asm/msr.h> 9#include <asm/msr.h>
10#include <asm/system.h> 10#include <asm/system.h>
11#include <asm/cpufeature.h> 11#include <asm/cpufeature.h>
12#include <asm/processor-flags.h>
12#include <asm/tlbflush.h> 13#include <asm/tlbflush.h>
13#include "mtrr.h" 14#include "mtrr.h"
14 15
15struct mtrr_state { 16struct mtrr_state {
16 struct mtrr_var_range *var_ranges; 17 struct mtrr_var_range var_ranges[MAX_VAR_RANGES];
17 mtrr_type fixed_ranges[NUM_FIXED_RANGES]; 18 mtrr_type fixed_ranges[NUM_FIXED_RANGES];
18 unsigned char enabled; 19 unsigned char enabled;
19 unsigned char have_fixed; 20 unsigned char have_fixed;
@@ -85,12 +86,6 @@ void __init get_mtrr_state(void)
85 struct mtrr_var_range *vrs; 86 struct mtrr_var_range *vrs;
86 unsigned lo, dummy; 87 unsigned lo, dummy;
87 88
88 if (!mtrr_state.var_ranges) {
89 mtrr_state.var_ranges = kmalloc(num_var_ranges * sizeof (struct mtrr_var_range),
90 GFP_KERNEL);
91 if (!mtrr_state.var_ranges)
92 return;
93 }
94 vrs = mtrr_state.var_ranges; 89 vrs = mtrr_state.var_ranges;
95 90
96 rdmsr(MTRRcap_MSR, lo, dummy); 91 rdmsr(MTRRcap_MSR, lo, dummy);
@@ -188,7 +183,7 @@ static inline void k8_enable_fixed_iorrs(void)
188 * \param changed pointer which indicates whether the MTRR needed to be changed 183 * \param changed pointer which indicates whether the MTRR needed to be changed
189 * \param msrwords pointer to the MSR values which the MSR should have 184 * \param msrwords pointer to the MSR values which the MSR should have
190 */ 185 */
191static void set_fixed_range(int msr, int * changed, unsigned int * msrwords) 186static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
192{ 187{
193 unsigned lo, hi; 188 unsigned lo, hi;
194 189
@@ -200,7 +195,7 @@ static void set_fixed_range(int msr, int * changed, unsigned int * msrwords)
200 ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK)) 195 ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK))
201 k8_enable_fixed_iorrs(); 196 k8_enable_fixed_iorrs();
202 mtrr_wrmsr(msr, msrwords[0], msrwords[1]); 197 mtrr_wrmsr(msr, msrwords[0], msrwords[1]);
203 *changed = TRUE; 198 *changed = true;
204 } 199 }
205} 200}
206 201
@@ -260,7 +255,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
260static int set_fixed_ranges(mtrr_type * frs) 255static int set_fixed_ranges(mtrr_type * frs)
261{ 256{
262 unsigned long long *saved = (unsigned long long *) frs; 257 unsigned long long *saved = (unsigned long long *) frs;
263 int changed = FALSE; 258 bool changed = false;
264 int block=-1, range; 259 int block=-1, range;
265 260
266 while (fixed_range_blocks[++block].ranges) 261 while (fixed_range_blocks[++block].ranges)
@@ -273,17 +268,17 @@ static int set_fixed_ranges(mtrr_type * frs)
273 268
274/* Set the MSR pair relating to a var range. Returns TRUE if 269/* Set the MSR pair relating to a var range. Returns TRUE if
275 changes are made */ 270 changes are made */
276static int set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) 271static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
277{ 272{
278 unsigned int lo, hi; 273 unsigned int lo, hi;
279 int changed = FALSE; 274 bool changed = false;
280 275
281 rdmsr(MTRRphysBase_MSR(index), lo, hi); 276 rdmsr(MTRRphysBase_MSR(index), lo, hi);
282 if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL) 277 if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL)
283 || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) != 278 || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) !=
284 (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) { 279 (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) {
285 mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); 280 mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi);
286 changed = TRUE; 281 changed = true;
287 } 282 }
288 283
289 rdmsr(MTRRphysMask_MSR(index), lo, hi); 284 rdmsr(MTRRphysMask_MSR(index), lo, hi);
@@ -292,7 +287,7 @@ static int set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
292 || (vr->mask_hi & (size_and_mask >> (32 - PAGE_SHIFT))) != 287 || (vr->mask_hi & (size_and_mask >> (32 - PAGE_SHIFT))) !=
293 (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) { 288 (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) {
294 mtrr_wrmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); 289 mtrr_wrmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
295 changed = TRUE; 290 changed = true;
296 } 291 }
297 return changed; 292 return changed;
298} 293}
@@ -350,7 +345,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
350 spin_lock(&set_atomicity_lock); 345 spin_lock(&set_atomicity_lock);
351 346
352 /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ 347 /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
353 cr0 = read_cr0() | 0x40000000; /* set CD flag */ 348 cr0 = read_cr0() | X86_CR0_CD;
354 write_cr0(cr0); 349 write_cr0(cr0);
355 wbinvd(); 350 wbinvd();
356 351
@@ -417,8 +412,6 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base,
417 <base> The base address of the region. 412 <base> The base address of the region.
418 <size> The size of the region. If this is 0 the region is disabled. 413 <size> The size of the region. If this is 0 the region is disabled.
419 <type> The type of the region. 414 <type> The type of the region.
420 <do_safe> If TRUE, do the change safely. If FALSE, safety measures should
421 be done externally.
422 [RETURNS] Nothing. 415 [RETURNS] Nothing.
423*/ 416*/
424{ 417{
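Several files in this series (cyrix.c above, generic.c here, state.c below) replace the bare constant 0x40000000 with X86_CR0_CD from <asm/processor-flags.h>; the value is simply bit 30 of CR0, the Cache Disable flag used to enter the no-fill cache mode while the MTRRs are rewritten. A hedged sketch of that sequence, not the exact kernel code:

#include <asm/processor-flags.h>
#include <asm/system.h>		/* read_cr0/write_cr0/wbinvd in this era */

static void example_nofill_cache_mode(void)
{
	unsigned long cr0;

	cr0 = read_cr0() | X86_CR0_CD;	/* set Cache Disable, bit 30 = 0x40000000 */
	write_cr0(cr0);
	wbinvd();			/* flush caches; wbinvd also flushes TLBs */

	/* ... reprogram the MTRR/ARR registers here ... */

	write_cr0(cr0 & ~X86_CR0_CD);	/* restore caching */
	wbinvd();
}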
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index c7d8f1756745..91e150acb46c 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -11,10 +11,6 @@
11#include <asm/mtrr.h> 11#include <asm/mtrr.h>
12#include "mtrr.h" 12#include "mtrr.h"
13 13
14/* RED-PEN: this is accessed without any locking */
15extern unsigned int *usage_table;
16
17
18#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private) 14#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private)
19 15
20static const char *const mtrr_strings[MTRR_NUM_TYPES] = 16static const char *const mtrr_strings[MTRR_NUM_TYPES] =
@@ -37,7 +33,7 @@ const char *mtrr_attrib_to_str(int x)
37 33
38static int 34static int
39mtrr_file_add(unsigned long base, unsigned long size, 35mtrr_file_add(unsigned long base, unsigned long size,
40 unsigned int type, char increment, struct file *file, int page) 36 unsigned int type, bool increment, struct file *file, int page)
41{ 37{
42 int reg, max; 38 int reg, max;
43 unsigned int *fcount = FILE_FCOUNT(file); 39 unsigned int *fcount = FILE_FCOUNT(file);
@@ -55,7 +51,7 @@ mtrr_file_add(unsigned long base, unsigned long size,
55 base >>= PAGE_SHIFT; 51 base >>= PAGE_SHIFT;
56 size >>= PAGE_SHIFT; 52 size >>= PAGE_SHIFT;
57 } 53 }
58 reg = mtrr_add_page(base, size, type, 1); 54 reg = mtrr_add_page(base, size, type, true);
59 if (reg >= 0) 55 if (reg >= 0)
60 ++fcount[reg]; 56 ++fcount[reg];
61 return reg; 57 return reg;
@@ -141,7 +137,7 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
141 size >>= PAGE_SHIFT; 137 size >>= PAGE_SHIFT;
142 err = 138 err =
143 mtrr_add_page((unsigned long) base, (unsigned long) size, i, 139 mtrr_add_page((unsigned long) base, (unsigned long) size, i,
144 1); 140 true);
145 if (err < 0) 141 if (err < 0)
146 return err; 142 return err;
147 return len; 143 return len;
@@ -217,7 +213,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
217 if (!capable(CAP_SYS_ADMIN)) 213 if (!capable(CAP_SYS_ADMIN))
218 return -EPERM; 214 return -EPERM;
219 err = 215 err =
220 mtrr_file_add(sentry.base, sentry.size, sentry.type, 1, 216 mtrr_file_add(sentry.base, sentry.size, sentry.type, true,
221 file, 0); 217 file, 0);
222 break; 218 break;
223 case MTRRIOC_SET_ENTRY: 219 case MTRRIOC_SET_ENTRY:
@@ -226,7 +222,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
226#endif 222#endif
227 if (!capable(CAP_SYS_ADMIN)) 223 if (!capable(CAP_SYS_ADMIN))
228 return -EPERM; 224 return -EPERM;
229 err = mtrr_add(sentry.base, sentry.size, sentry.type, 0); 225 err = mtrr_add(sentry.base, sentry.size, sentry.type, false);
230 break; 226 break;
231 case MTRRIOC_DEL_ENTRY: 227 case MTRRIOC_DEL_ENTRY:
232#ifdef CONFIG_COMPAT 228#ifdef CONFIG_COMPAT
@@ -270,7 +266,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
270 if (!capable(CAP_SYS_ADMIN)) 266 if (!capable(CAP_SYS_ADMIN))
271 return -EPERM; 267 return -EPERM;
272 err = 268 err =
273 mtrr_file_add(sentry.base, sentry.size, sentry.type, 1, 269 mtrr_file_add(sentry.base, sentry.size, sentry.type, true,
274 file, 1); 270 file, 1);
275 break; 271 break;
276 case MTRRIOC_SET_PAGE_ENTRY: 272 case MTRRIOC_SET_PAGE_ENTRY:
@@ -279,7 +275,8 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
279#endif 275#endif
280 if (!capable(CAP_SYS_ADMIN)) 276 if (!capable(CAP_SYS_ADMIN))
281 return -EPERM; 277 return -EPERM;
282 err = mtrr_add_page(sentry.base, sentry.size, sentry.type, 0); 278 err =
279 mtrr_add_page(sentry.base, sentry.size, sentry.type, false);
283 break; 280 break;
284 case MTRRIOC_DEL_PAGE_ENTRY: 281 case MTRRIOC_DEL_PAGE_ENTRY:
285#ifdef CONFIG_COMPAT 282#ifdef CONFIG_COMPAT
@@ -396,7 +393,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
396 for (i = 0; i < max; i++) { 393 for (i = 0; i < max; i++) {
397 mtrr_if->get(i, &base, &size, &type); 394 mtrr_if->get(i, &base, &size, &type);
398 if (size == 0) 395 if (size == 0)
399 usage_table[i] = 0; 396 mtrr_usage_table[i] = 0;
400 else { 397 else {
401 if (size < (0x100000 >> PAGE_SHIFT)) { 398 if (size < (0x100000 >> PAGE_SHIFT)) {
402 /* less than 1MB */ 399 /* less than 1MB */
@@ -410,7 +407,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
410 len += seq_printf(seq, 407 len += seq_printf(seq,
411 "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n", 408 "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n",
412 i, base, base >> (20 - PAGE_SHIFT), size, factor, 409 i, base, base >> (20 - PAGE_SHIFT), size, factor,
413 mtrr_attrib_to_str(type), usage_table[i]); 410 mtrr_attrib_to_str(type), mtrr_usage_table[i]);
414 } 411 }
415 } 412 }
416 return 0; 413 return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 3b20613325dc..b6e136f23d3d 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -38,8 +38,8 @@
38#include <linux/cpu.h> 38#include <linux/cpu.h>
39#include <linux/mutex.h> 39#include <linux/mutex.h>
40 40
41#include <asm/e820.h>
41#include <asm/mtrr.h> 42#include <asm/mtrr.h>
42
43#include <asm/uaccess.h> 43#include <asm/uaccess.h>
44#include <asm/processor.h> 44#include <asm/processor.h>
45#include <asm/msr.h> 45#include <asm/msr.h>
@@ -47,7 +47,7 @@
47 47
48u32 num_var_ranges = 0; 48u32 num_var_ranges = 0;
49 49
50unsigned int *usage_table; 50unsigned int mtrr_usage_table[MAX_VAR_RANGES];
51static DEFINE_MUTEX(mtrr_mutex); 51static DEFINE_MUTEX(mtrr_mutex);
52 52
53u64 size_or_mask, size_and_mask; 53u64 size_or_mask, size_and_mask;
@@ -59,12 +59,6 @@ struct mtrr_ops * mtrr_if = NULL;
59static void set_mtrr(unsigned int reg, unsigned long base, 59static void set_mtrr(unsigned int reg, unsigned long base,
60 unsigned long size, mtrr_type type); 60 unsigned long size, mtrr_type type);
61 61
62#ifndef CONFIG_X86_64
63extern int arr3_protected;
64#else
65#define arr3_protected 0
66#endif
67
68void set_mtrr_ops(struct mtrr_ops * ops) 62void set_mtrr_ops(struct mtrr_ops * ops)
69{ 63{
70 if (ops->vendor && ops->vendor < X86_VENDOR_NUM) 64 if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
@@ -121,13 +115,8 @@ static void __init init_table(void)
121 int i, max; 115 int i, max;
122 116
123 max = num_var_ranges; 117 max = num_var_ranges;
124 if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
125 == NULL) {
126 printk(KERN_ERR "mtrr: could not allocate\n");
127 return;
128 }
129 for (i = 0; i < max; i++) 118 for (i = 0; i < max; i++)
130 usage_table[i] = 1; 119 mtrr_usage_table[i] = 1;
131} 120}
132 121
133struct set_mtrr_data { 122struct set_mtrr_data {
@@ -311,7 +300,7 @@ static void set_mtrr(unsigned int reg, unsigned long base,
311 */ 300 */
312 301
313int mtrr_add_page(unsigned long base, unsigned long size, 302int mtrr_add_page(unsigned long base, unsigned long size,
314 unsigned int type, char increment) 303 unsigned int type, bool increment)
315{ 304{
316 int i, replace, error; 305 int i, replace, error;
317 mtrr_type ltype; 306 mtrr_type ltype;
@@ -349,7 +338,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
349 replace = -1; 338 replace = -1;
350 339
351 /* No CPU hotplug when we change MTRR entries */ 340 /* No CPU hotplug when we change MTRR entries */
352 lock_cpu_hotplug(); 341 get_online_cpus();
353 /* Search for existing MTRR */ 342 /* Search for existing MTRR */
354 mutex_lock(&mtrr_mutex); 343 mutex_lock(&mtrr_mutex);
355 for (i = 0; i < num_var_ranges; ++i) { 344 for (i = 0; i < num_var_ranges; ++i) {
@@ -383,7 +372,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
383 goto out; 372 goto out;
384 } 373 }
385 if (increment) 374 if (increment)
386 ++usage_table[i]; 375 ++mtrr_usage_table[i];
387 error = i; 376 error = i;
388 goto out; 377 goto out;
389 } 378 }
@@ -391,13 +380,15 @@ int mtrr_add_page(unsigned long base, unsigned long size,
391 i = mtrr_if->get_free_region(base, size, replace); 380 i = mtrr_if->get_free_region(base, size, replace);
392 if (i >= 0) { 381 if (i >= 0) {
393 set_mtrr(i, base, size, type); 382 set_mtrr(i, base, size, type);
394 if (likely(replace < 0)) 383 if (likely(replace < 0)) {
395 usage_table[i] = 1; 384 mtrr_usage_table[i] = 1;
396 else { 385 } else {
397 usage_table[i] = usage_table[replace] + !!increment; 386 mtrr_usage_table[i] = mtrr_usage_table[replace];
387 if (increment)
388 mtrr_usage_table[i]++;
398 if (unlikely(replace != i)) { 389 if (unlikely(replace != i)) {
399 set_mtrr(replace, 0, 0, 0); 390 set_mtrr(replace, 0, 0, 0);
400 usage_table[replace] = 0; 391 mtrr_usage_table[replace] = 0;
401 } 392 }
402 } 393 }
403 } else 394 } else
@@ -405,7 +396,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
405 error = i; 396 error = i;
406 out: 397 out:
407 mutex_unlock(&mtrr_mutex); 398 mutex_unlock(&mtrr_mutex);
408 unlock_cpu_hotplug(); 399 put_online_cpus();
409 return error; 400 return error;
410} 401}
411 402
@@ -460,7 +451,7 @@ static int mtrr_check(unsigned long base, unsigned long size)
460 451
461int 452int
462mtrr_add(unsigned long base, unsigned long size, unsigned int type, 453mtrr_add(unsigned long base, unsigned long size, unsigned int type,
463 char increment) 454 bool increment)
464{ 455{
465 if (mtrr_check(base, size)) 456 if (mtrr_check(base, size))
466 return -EINVAL; 457 return -EINVAL;
@@ -495,7 +486,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
495 486
496 max = num_var_ranges; 487 max = num_var_ranges;
497 /* No CPU hotplug when we change MTRR entries */ 488 /* No CPU hotplug when we change MTRR entries */
498 lock_cpu_hotplug(); 489 get_online_cpus();
499 mutex_lock(&mtrr_mutex); 490 mutex_lock(&mtrr_mutex);
500 if (reg < 0) { 491 if (reg < 0) {
501 /* Search for existing MTRR */ 492 /* Search for existing MTRR */
@@ -516,27 +507,21 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
516 printk(KERN_WARNING "mtrr: register: %d too big\n", reg); 507 printk(KERN_WARNING "mtrr: register: %d too big\n", reg);
517 goto out; 508 goto out;
518 } 509 }
519 if (is_cpu(CYRIX) && !use_intel()) {
520 if ((reg == 3) && arr3_protected) {
521 printk(KERN_WARNING "mtrr: ARR3 cannot be changed\n");
522 goto out;
523 }
524 }
525 mtrr_if->get(reg, &lbase, &lsize, &ltype); 510 mtrr_if->get(reg, &lbase, &lsize, &ltype);
526 if (lsize < 1) { 511 if (lsize < 1) {
527 printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg); 512 printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg);
528 goto out; 513 goto out;
529 } 514 }
530 if (usage_table[reg] < 1) { 515 if (mtrr_usage_table[reg] < 1) {
531 printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg); 516 printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
532 goto out; 517 goto out;
533 } 518 }
534 if (--usage_table[reg] < 1) 519 if (--mtrr_usage_table[reg] < 1)
535 set_mtrr(reg, 0, 0, 0); 520 set_mtrr(reg, 0, 0, 0);
536 error = reg; 521 error = reg;
537 out: 522 out:
538 mutex_unlock(&mtrr_mutex); 523 mutex_unlock(&mtrr_mutex);
539 unlock_cpu_hotplug(); 524 put_online_cpus();
540 return error; 525 return error;
541} 526}
542/** 527/**
@@ -569,10 +554,6 @@ EXPORT_SYMBOL(mtrr_del);
569 * These should be called implicitly, but we can't yet until all the initcall 554 * These should be called implicitly, but we can't yet until all the initcall
570 * stuff is done... 555 * stuff is done...
571 */ 556 */
572extern void amd_init_mtrr(void);
573extern void cyrix_init_mtrr(void);
574extern void centaur_init_mtrr(void);
575
576static void __init init_ifs(void) 557static void __init init_ifs(void)
577{ 558{
578#ifndef CONFIG_X86_64 559#ifndef CONFIG_X86_64
@@ -591,16 +572,11 @@ struct mtrr_value {
591 unsigned long lsize; 572 unsigned long lsize;
592}; 573};
593 574
594static struct mtrr_value * mtrr_state; 575static struct mtrr_value mtrr_state[MAX_VAR_RANGES];
595 576
596static int mtrr_save(struct sys_device * sysdev, pm_message_t state) 577static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
597{ 578{
598 int i; 579 int i;
599 int size = num_var_ranges * sizeof(struct mtrr_value);
600
601 mtrr_state = kzalloc(size,GFP_ATOMIC);
602 if (!mtrr_state)
603 return -ENOMEM;
604 580
605 for (i = 0; i < num_var_ranges; i++) { 581 for (i = 0; i < num_var_ranges; i++) {
606 mtrr_if->get(i, 582 mtrr_if->get(i,
@@ -622,7 +598,6 @@ static int mtrr_restore(struct sys_device * sysdev)
622 mtrr_state[i].lsize, 598 mtrr_state[i].lsize,
623 mtrr_state[i].ltype); 599 mtrr_state[i].ltype);
624 } 600 }
625 kfree(mtrr_state);
626 return 0; 601 return 0;
627} 602}
628 603
@@ -633,6 +608,111 @@ static struct sysdev_driver mtrr_sysdev_driver = {
633 .resume = mtrr_restore, 608 .resume = mtrr_restore,
634}; 609};
635 610
611static int disable_mtrr_trim;
612
613static int __init disable_mtrr_trim_setup(char *str)
614{
615 disable_mtrr_trim = 1;
616 return 0;
617}
618early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
619
620/*
621 * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
622 * for memory >4GB. Check for that here.
 623 * Note this won't check whether the MTRRs below 4GB (where the magic bit
 624 * doesn't apply) are wrong, but so far we don't know of any such case in the wild.
625 */
626#define Tom2Enabled (1U << 21)
627#define Tom2ForceMemTypeWB (1U << 22)
628
629static __init int amd_special_default_mtrr(void)
630{
631 u32 l, h;
632
633 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
634 return 0;
635 if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
636 return 0;
637 /* In case some hypervisor doesn't pass SYSCFG through */
638 if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
639 return 0;
640 /*
641 * Memory between 4GB and top of mem is forced WB by this magic bit.
642 * Reserved before K8RevF, but should be zero there.
643 */
644 if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
645 (Tom2Enabled | Tom2ForceMemTypeWB))
646 return 1;
647 return 0;
648}
649
650/**
651 * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
652 *
653 * Some buggy BIOSes don't setup the MTRRs properly for systems with certain
654 * memory configurations. This routine checks that the highest MTRR matches
655 * the end of memory, to make sure the MTRRs having a write back type cover
656 * all of the memory the kernel is intending to use. If not, it'll trim any
657 * memory off the end by adjusting end_pfn, removing it from the kernel's
658 * allocation pools, warning the user with an obnoxious message.
659 */
660int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
661{
662 unsigned long i, base, size, highest_pfn = 0, def, dummy;
663 mtrr_type type;
664 u64 trim_start, trim_size;
665
666 /*
667 * Make sure we only trim uncachable memory on machines that
668 * support the Intel MTRR architecture:
669 */
670 if (!is_cpu(INTEL) || disable_mtrr_trim)
671 return 0;
672 rdmsr(MTRRdefType_MSR, def, dummy);
673 def &= 0xff;
674 if (def != MTRR_TYPE_UNCACHABLE)
675 return 0;
676
677 if (amd_special_default_mtrr())
678 return 0;
679
680 /* Find highest cached pfn */
681 for (i = 0; i < num_var_ranges; i++) {
682 mtrr_if->get(i, &base, &size, &type);
683 if (type != MTRR_TYPE_WRBACK)
684 continue;
685 if (highest_pfn < base + size)
686 highest_pfn = base + size;
687 }
688
689 /* kvm/qemu doesn't have mtrr set right, don't trim them all */
690 if (!highest_pfn) {
691 printk(KERN_WARNING "WARNING: strange, CPU MTRRs all blank?\n");
692 WARN_ON(1);
693 return 0;
694 }
695
696 if (highest_pfn < end_pfn) {
697 printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
698 " all of memory, losing %luMB of RAM.\n",
699 (end_pfn - highest_pfn) >> (20 - PAGE_SHIFT));
700
701 WARN_ON(1);
702
703 printk(KERN_INFO "update e820 for mtrr\n");
704 trim_start = highest_pfn;
705 trim_start <<= PAGE_SHIFT;
706 trim_size = end_pfn;
707 trim_size <<= PAGE_SHIFT;
708 trim_size -= trim_start;
709 add_memory_region(trim_start, trim_size, E820_RESERVED);
710 update_e820();
711 return 1;
712 }
713
714 return 0;
715}
636 716
637/** 717/**
638 * mtrr_bp_init - initialize mtrrs on the boot CPU 718 * mtrr_bp_init - initialize mtrrs on the boot CPU
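The new mtrr_trim_uncached_memory() added above compares the highest page frame covered by a write-back MTRR with end_pfn and, if the BIOS left a tail uncovered, reserves it in the e820 map rather than letting it be used uncached. A worked fragment mirroring that arithmetic, with hypothetical numbers (PAGE_SHIFT = 12):

/* Hypothetical machine: the BIOS reports 3 GB of RAM, but write-back
 * MTRRs only cover the first 2 GB. */
unsigned long highest_pfn = 0x80000;		/* 2 GB >> PAGE_SHIFT */
unsigned long end_pfn     = 0xc0000;		/* 3 GB >> PAGE_SHIFT */
u64 trim_start, trim_size;

trim_start = (u64)highest_pfn << PAGE_SHIFT;			/* 0x80000000 */
trim_size  = ((u64)end_pfn << PAGE_SHIFT) - trim_start;		/* 1 GB */
/* add_memory_region(trim_start, trim_size, E820_RESERVED) then marks the
 * final gigabyte reserved: roughly 1024 MB of RAM is lost, as the warning
 * in the function reports, but it is never mapped with the wrong type. */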
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 289dfe6030e3..2cc77eb6fea3 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -2,10 +2,8 @@
2 * local mtrr defines. 2 * local mtrr defines.
3 */ 3 */
4 4
5#ifndef TRUE 5#include <linux/types.h>
6#define TRUE 1 6#include <linux/stddef.h>
7#define FALSE 0
8#endif
9 7
10#define MTRRcap_MSR 0x0fe 8#define MTRRcap_MSR 0x0fe
11#define MTRRdefType_MSR 0x2ff 9#define MTRRdefType_MSR 0x2ff
@@ -14,6 +12,7 @@
14#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1) 12#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
15 13
16#define NUM_FIXED_RANGES 88 14#define NUM_FIXED_RANGES 88
15#define MAX_VAR_RANGES 256
17#define MTRRfix64K_00000_MSR 0x250 16#define MTRRfix64K_00000_MSR 0x250
18#define MTRRfix16K_80000_MSR 0x258 17#define MTRRfix16K_80000_MSR 0x258
19#define MTRRfix16K_A0000_MSR 0x259 18#define MTRRfix16K_A0000_MSR 0x259
@@ -34,6 +33,8 @@
34 an 8 bit field: */ 33 an 8 bit field: */
35typedef u8 mtrr_type; 34typedef u8 mtrr_type;
36 35
36extern unsigned int mtrr_usage_table[MAX_VAR_RANGES];
37
37struct mtrr_ops { 38struct mtrr_ops {
38 u32 vendor; 39 u32 vendor;
39 u32 use_intel_if; 40 u32 use_intel_if;
@@ -96,3 +97,7 @@ void mtrr_state_warn(void);
96const char *mtrr_attrib_to_str(int x); 97const char *mtrr_attrib_to_str(int x);
97void mtrr_wrmsr(unsigned, unsigned, unsigned); 98void mtrr_wrmsr(unsigned, unsigned, unsigned);
98 99
100/* CPU specific mtrr init functions */
101int amd_init_mtrr(void);
102int cyrix_init_mtrr(void);
103int centaur_init_mtrr(void);
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c
index 49e20c2afcdf..9f8ba923d1c9 100644
--- a/arch/x86/kernel/cpu/mtrr/state.c
+++ b/arch/x86/kernel/cpu/mtrr/state.c
@@ -4,6 +4,7 @@
4#include <asm/mtrr.h> 4#include <asm/mtrr.h>
5#include <asm/msr.h> 5#include <asm/msr.h>
6#include <asm/processor-cyrix.h> 6#include <asm/processor-cyrix.h>
7#include <asm/processor-flags.h>
7#include "mtrr.h" 8#include "mtrr.h"
8 9
9 10
@@ -25,7 +26,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
25 26
26 /* Disable and flush caches. Note that wbinvd flushes the TLBs as 27 /* Disable and flush caches. Note that wbinvd flushes the TLBs as
27 a side-effect */ 28 a side-effect */
28 cr0 = read_cr0() | 0x40000000; 29 cr0 = read_cr0() | X86_CR0_CD;
29 wbinvd(); 30 wbinvd();
30 write_cr0(cr0); 31 write_cr0(cr0);
31 wbinvd(); 32 wbinvd();
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index c02541e6e653..9b838324b818 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -167,7 +167,6 @@ void release_evntsel_nmi(unsigned int msr)
167 clear_bit(counter, evntsel_nmi_owner); 167 clear_bit(counter, evntsel_nmi_owner);
168} 168}
169 169
170EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
171EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); 170EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
172EXPORT_SYMBOL(reserve_perfctr_nmi); 171EXPORT_SYMBOL(reserve_perfctr_nmi);
173EXPORT_SYMBOL(release_perfctr_nmi); 172EXPORT_SYMBOL(release_perfctr_nmi);
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 3900e46d66db..af11d31dce0a 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -10,80 +10,6 @@
10 */ 10 */
11static int show_cpuinfo(struct seq_file *m, void *v) 11static int show_cpuinfo(struct seq_file *m, void *v)
12{ 12{
13 /*
14 * These flag bits must match the definitions in <asm/cpufeature.h>.
15 * NULL means this bit is undefined or reserved; either way it doesn't
16 * have meaning as far as Linux is concerned. Note that it's important
17 * to realize there is a difference between this table and CPUID -- if
18 * applications want to get the raw CPUID data, they should access
19 * /dev/cpu/<cpu_nr>/cpuid instead.
20 */
21 static const char * const x86_cap_flags[] = {
22 /* Intel-defined */
23 "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
24 "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
25 "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
26 "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
27
28 /* AMD-defined */
29 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
30 NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
31 NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL,
32 NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
33 "3dnowext", "3dnow",
34
35 /* Transmeta-defined */
36 "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
37 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
38 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
39 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
40
41 /* Other (Linux-defined) */
42 "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
43 NULL, NULL, NULL, NULL,
44 "constant_tsc", "up", NULL, "arch_perfmon",
45 "pebs", "bts", NULL, "sync_rdtsc",
46 "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
47 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
48
49 /* Intel-defined (#2) */
50 "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
51 "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
52 NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt",
53 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
54
55 /* VIA/Cyrix/Centaur-defined */
56 NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
57 "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
58 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
59 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
60
61 /* AMD-defined (#2) */
62 "lahf_lm", "cmp_legacy", "svm", "extapic",
63 "cr8_legacy", "abm", "sse4a", "misalignsse",
64 "3dnowprefetch", "osvw", "ibs", "sse5",
65 "skinit", "wdt", NULL, NULL,
66 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
67 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
68
69 /* Auxiliary (Linux-defined) */
70 "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
71 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
72 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
73 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
74 };
75 static const char * const x86_power_flags[] = {
76 "ts", /* temperature sensor */
77 "fid", /* frequency id control */
78 "vid", /* voltage id control */
79 "ttp", /* thermal trip */
80 "tm",
81 "stc",
82 "100mhzsteps",
83 "hwpstate",
84 "", /* constant_tsc - moved to flags */
85 /* nothing */
86 };
87 struct cpuinfo_x86 *c = v; 13 struct cpuinfo_x86 *c = v;
88 int i, n = 0; 14 int i, n = 0;
89 int fpu_exception; 15 int fpu_exception;
@@ -188,7 +114,7 @@ static void *c_next(struct seq_file *m, void *v, loff_t *pos)
188static void c_stop(struct seq_file *m, void *v) 114static void c_stop(struct seq_file *m, void *v)
189{ 115{
190} 116}
191struct seq_operations cpuinfo_op = { 117const struct seq_operations cpuinfo_op = {
192 .start = c_start, 118 .start = c_start,
193 .next = c_next, 119 .next = c_next,
194 .stop = c_stop, 120 .stop = c_stop,
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 05c9936a16cc..288e7a6598ac 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -1,6 +1,6 @@
1/* ----------------------------------------------------------------------- * 1/* ----------------------------------------------------------------------- *
2 * 2 *
3 * Copyright 2000 H. Peter Anvin - All Rights Reserved 3 * Copyright 2000-2008 H. Peter Anvin - All Rights Reserved
4 * 4 *
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
@@ -17,6 +17,10 @@
17 * and then read in chunks of 16 bytes. A larger size means multiple 17 * and then read in chunks of 16 bytes. A larger size means multiple
18 * reads of consecutive levels. 18 * reads of consecutive levels.
19 * 19 *
20 * The lower 32 bits of the file position is used as the incoming %eax,
21 * and the upper 32 bits of the file position as the incoming %ecx,
22 * the latter intended for "counting" eax levels like eax=4.
23 *
20 * This driver uses /dev/cpu/%d/cpuid where %d is the minor number, and on 24 * This driver uses /dev/cpu/%d/cpuid where %d is the minor number, and on
21 * an SMP box will direct the access to CPU %d. 25 * an SMP box will direct the access to CPU %d.
22 */ 26 */
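With this change the cpuid character device gains a sub-leaf selector: user space encodes %eax in the low 32 bits of the file offset and %ecx in the high 32 bits, then reads 16-byte eax/ebx/ecx/edx records. A small user-space illustration, assuming the /dev/cpu/0/cpuid node exists, the driver is loaded, and 64-bit file offsets are available (build with -D_FILE_OFFSET_BITS=64 on 32-bit hosts):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uint32_t regs[4];			/* eax, ebx, ecx, edx on return */
	uint32_t leaf = 4, subleaf = 0;		/* e.g. cache parameters, index 0 */
	off_t pos = ((off_t)subleaf << 32) | leaf;
	int fd = open("/dev/cpu/0/cpuid", O_RDONLY);

	if (fd < 0 || pread(fd, regs, sizeof(regs), pos) != sizeof(regs)) {
		perror("cpuid");
		return 1;
	}
	printf("eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
	       regs[0], regs[1], regs[2], regs[3]);
	close(fd);
	return 0;
}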
@@ -43,35 +47,24 @@
43 47
44static struct class *cpuid_class; 48static struct class *cpuid_class;
45 49
46struct cpuid_command { 50struct cpuid_regs {
47 u32 reg; 51 u32 eax, ebx, ecx, edx;
48 u32 *data;
49}; 52};
50 53
51static void cpuid_smp_cpuid(void *cmd_block) 54static void cpuid_smp_cpuid(void *cmd_block)
52{ 55{
53 struct cpuid_command *cmd = (struct cpuid_command *)cmd_block; 56 struct cpuid_regs *cmd = (struct cpuid_regs *)cmd_block;
54
55 cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2],
56 &cmd->data[3]);
57}
58
59static inline void do_cpuid(int cpu, u32 reg, u32 * data)
60{
61 struct cpuid_command cmd;
62
63 cmd.reg = reg;
64 cmd.data = data;
65 57
66 smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1); 58 cpuid_count(cmd->eax, cmd->ecx,
59 &cmd->eax, &cmd->ebx, &cmd->ecx, &cmd->edx);
67} 60}
68 61
69static loff_t cpuid_seek(struct file *file, loff_t offset, int orig) 62static loff_t cpuid_seek(struct file *file, loff_t offset, int orig)
70{ 63{
71 loff_t ret; 64 loff_t ret;
65 struct inode *inode = file->f_mapping->host;
72 66
73 lock_kernel(); 67 mutex_lock(&inode->i_mutex);
74
75 switch (orig) { 68 switch (orig) {
76 case 0: 69 case 0:
77 file->f_pos = offset; 70 file->f_pos = offset;
@@ -84,8 +77,7 @@ static loff_t cpuid_seek(struct file *file, loff_t offset, int orig)
84 default: 77 default:
85 ret = -EINVAL; 78 ret = -EINVAL;
86 } 79 }
87 80 mutex_unlock(&inode->i_mutex);
88 unlock_kernel();
89 return ret; 81 return ret;
90} 82}
91 83
@@ -93,19 +85,21 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
93 size_t count, loff_t * ppos) 85 size_t count, loff_t * ppos)
94{ 86{
95 char __user *tmp = buf; 87 char __user *tmp = buf;
96 u32 data[4]; 88 struct cpuid_regs cmd;
97 u32 reg = *ppos;
98 int cpu = iminor(file->f_path.dentry->d_inode); 89 int cpu = iminor(file->f_path.dentry->d_inode);
90 u64 pos = *ppos;
99 91
100 if (count % 16) 92 if (count % 16)
101 return -EINVAL; /* Invalid chunk size */ 93 return -EINVAL; /* Invalid chunk size */
102 94
103 for (; count; count -= 16) { 95 for (; count; count -= 16) {
104 do_cpuid(cpu, reg, data); 96 cmd.eax = pos;
105 if (copy_to_user(tmp, &data, 16)) 97 cmd.ecx = pos >> 32;
98 smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1);
99 if (copy_to_user(tmp, &cmd, 16))
106 return -EFAULT; 100 return -EFAULT;
107 tmp += 16; 101 tmp += 16;
108 *ppos = reg++; 102 *ppos = ++pos;
109 } 103 }
110 104
111 return tmp - buf; 105 return tmp - buf;
@@ -157,20 +151,20 @@ static int __cpuinit cpuid_class_cpu_callback(struct notifier_block *nfb,
157 151
158 switch (action) { 152 switch (action) {
159 case CPU_UP_PREPARE: 153 case CPU_UP_PREPARE:
160 case CPU_UP_PREPARE_FROZEN:
161 err = cpuid_device_create(cpu); 154 err = cpuid_device_create(cpu);
162 break; 155 break;
163 case CPU_UP_CANCELED: 156 case CPU_UP_CANCELED:
164 case CPU_UP_CANCELED_FROZEN:
165 case CPU_DEAD: 157 case CPU_DEAD:
166 case CPU_DEAD_FROZEN:
167 cpuid_device_destroy(cpu); 158 cpuid_device_destroy(cpu);
168 break; 159 break;
160 case CPU_UP_CANCELED_FROZEN:
161 destroy_suspended_device(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
162 break;
169 } 163 }
170 return err ? NOTIFY_BAD : NOTIFY_OK; 164 return err ? NOTIFY_BAD : NOTIFY_OK;
171} 165}
172 166
173static struct notifier_block __cpuinitdata cpuid_class_cpu_notifier = 167static struct notifier_block __refdata cpuid_class_cpu_notifier =
174{ 168{
175 .notifier_call = cpuid_class_cpu_callback, 169 .notifier_call = cpuid_class_cpu_callback,
176}; 170};
@@ -193,7 +187,7 @@ static int __init cpuid_init(void)
193 } 187 }
194 for_each_online_cpu(i) { 188 for_each_online_cpu(i) {
195 err = cpuid_device_create(i); 189 err = cpuid_device_create(i);
196 if (err != 0) 190 if (err != 0)
197 goto out_class; 191 goto out_class;
198 } 192 }
199 register_hotcpu_notifier(&cpuid_class_cpu_notifier); 193 register_hotcpu_notifier(&cpuid_class_cpu_notifier);
@@ -208,7 +202,7 @@ out_class:
208 } 202 }
209 class_destroy(cpuid_class); 203 class_destroy(cpuid_class);
210out_chrdev: 204out_chrdev:
211 unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); 205 unregister_chrdev(CPUID_MAJOR, "cpu/cpuid");
212out: 206out:
213 return err; 207 return err;
214} 208}
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
index 40978af630e7..a47798b59f07 100644
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -17,7 +17,7 @@ static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
17 17
18static void doublefault_fn(void) 18static void doublefault_fn(void)
19{ 19{
20 struct Xgt_desc_struct gdt_desc = {0, 0}; 20 struct desc_ptr gdt_desc = {0, 0};
21 unsigned long gdt, tss; 21 unsigned long gdt, tss;
22 22
23 store_gdt(&gdt_desc); 23 store_gdt(&gdt_desc);
@@ -33,14 +33,15 @@ static void doublefault_fn(void)
33 printk(KERN_EMERG "double fault, tss at %08lx\n", tss); 33 printk(KERN_EMERG "double fault, tss at %08lx\n", tss);
34 34
35 if (ptr_ok(tss)) { 35 if (ptr_ok(tss)) {
36 struct i386_hw_tss *t = (struct i386_hw_tss *)tss; 36 struct x86_hw_tss *t = (struct x86_hw_tss *)tss;
37 37
38 printk(KERN_EMERG "eip = %08lx, esp = %08lx\n", t->eip, t->esp); 38 printk(KERN_EMERG "eip = %08lx, esp = %08lx\n",
39 t->ip, t->sp);
39 40
40 printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n", 41 printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
41 t->eax, t->ebx, t->ecx, t->edx); 42 t->ax, t->bx, t->cx, t->dx);
42 printk(KERN_EMERG "esi = %08lx, edi = %08lx\n", 43 printk(KERN_EMERG "esi = %08lx, edi = %08lx\n",
43 t->esi, t->edi); 44 t->si, t->di);
44 } 45 }
45 } 46 }
46 47
@@ -50,15 +51,15 @@ static void doublefault_fn(void)
50 51
51struct tss_struct doublefault_tss __cacheline_aligned = { 52struct tss_struct doublefault_tss __cacheline_aligned = {
52 .x86_tss = { 53 .x86_tss = {
53 .esp0 = STACK_START, 54 .sp0 = STACK_START,
54 .ss0 = __KERNEL_DS, 55 .ss0 = __KERNEL_DS,
55 .ldt = 0, 56 .ldt = 0,
56 .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, 57 .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
57 58
58 .eip = (unsigned long) doublefault_fn, 59 .ip = (unsigned long) doublefault_fn,
59 /* 0x2 bit is always set */ 60 /* 0x2 bit is always set */
60 .eflags = X86_EFLAGS_SF | 0x2, 61 .flags = X86_EFLAGS_SF | 0x2,
61 .esp = STACK_START, 62 .sp = STACK_START,
62 .es = __USER_DS, 63 .es = __USER_DS,
63 .cs = __KERNEL_CS, 64 .cs = __KERNEL_CS,
64 .ss = __KERNEL_DS, 65 .ss = __KERNEL_DS,
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
new file mode 100644
index 000000000000..dcd918c1580d
--- /dev/null
+++ b/arch/x86/kernel/ds.c
@@ -0,0 +1,464 @@
1/*
2 * Debug Store support
3 *
4 * This provides a low-level interface to the hardware's Debug Store
5 * feature that is used for last branch recording (LBR) and
6 * precise-event based sampling (PEBS).
7 *
8 * Different architectures use a different DS layout/pointer size.
9 * The below functions therefore work on a void*.
10 *
11 *
12 * Since there is no user for PEBS yet, only LBR (or branch
13 * trace store, BTS) is supported.
14 *
15 *
16 * Copyright (C) 2007 Intel Corporation.
17 * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
18 */
19
20#include <asm/ds.h>
21
22#include <linux/errno.h>
23#include <linux/string.h>
24#include <linux/slab.h>
25
26
27/*
28 * Debug Store (DS) save area configuration (see Intel64 and IA32
29 * Architectures Software Developer's Manual, section 18.5)
30 *
31 * The DS configuration consists of the following fields; different
32 * architectures vary in the size of those fields.
33 * - double-word aligned base linear address of the BTS buffer
34 * - write pointer into the BTS buffer
35 * - end linear address of the BTS buffer (one byte beyond the end of
36 * the buffer)
37 * - interrupt pointer into BTS buffer
38 * (interrupt occurs when write pointer passes interrupt pointer)
39 * - double-word aligned base linear address of the PEBS buffer
40 * - write pointer into the PEBS buffer
41 * - end linear address of the PEBS buffer (one byte beyond the end of
42 * the buffer)
43 * - interrupt pointer into PEBS buffer
44 * (interrupt occurs when write pointer passes interrupt pointer)
45 * - value to which counter is reset following counter overflow
46 *
47 * On later architectures, the last branch recording hardware uses
48 * 64bit pointers even in 32bit mode.
49 *
50 *
51 * Branch Trace Store (BTS) records store information about control
52 * flow changes. They at least provide the following information:
53 * - source linear address
54 * - destination linear address
55 *
56 * Netburst supported a predicted bit that had been dropped in later
57 * architectures. We do not support it.
58 *
59 *
60 * In order to abstract from the actual DS and BTS layout, we describe
61 * the access to the relevant fields.
62 * Thanks to Andi Kleen for proposing this design.
63 *
64 * The implementation, however, is not as general as it might seem. In
65 * order to stay somewhat simple and efficient, we assume an
66 * underlying unsigned type (mostly a pointer type) and we expect the
67 * field to be at least as big as that type.
68 */
69
70/*
71 * A special from_ip address to indicate that the BTS record is an
72 * info record that needs to be interpreted or skipped.
73 */
74#define BTS_ESCAPE_ADDRESS (-1)
75
76/*
77 * A field access descriptor
78 */
79struct access_desc {
80 unsigned char offset;
81 unsigned char size;
82};
83
84/*
85 * The configuration for a particular DS/BTS hardware implementation.
86 */
87struct ds_configuration {
88 /* the DS configuration */
89 unsigned char sizeof_ds;
90 struct access_desc bts_buffer_base;
91 struct access_desc bts_index;
92 struct access_desc bts_absolute_maximum;
93 struct access_desc bts_interrupt_threshold;
94 /* the BTS configuration */
95 unsigned char sizeof_bts;
96 struct access_desc from_ip;
97 struct access_desc to_ip;
98 /* BTS variants used to store additional information like
99 timestamps */
100 struct access_desc info_type;
101 struct access_desc info_data;
102 unsigned long debugctl_mask;
103};
104
105/*
106 * The global configuration used by the below accessor functions
107 */
108static struct ds_configuration ds_cfg;
109
110/*
111 * Accessor functions for some DS and BTS fields using the above
112 * global ds_cfg.
113 */
114static inline unsigned long get_bts_buffer_base(char *base)
115{
116 return *(unsigned long *)(base + ds_cfg.bts_buffer_base.offset);
117}
118static inline void set_bts_buffer_base(char *base, unsigned long value)
119{
120 (*(unsigned long *)(base + ds_cfg.bts_buffer_base.offset)) = value;
121}
122static inline unsigned long get_bts_index(char *base)
123{
124 return *(unsigned long *)(base + ds_cfg.bts_index.offset);
125}
126static inline void set_bts_index(char *base, unsigned long value)
127{
128 (*(unsigned long *)(base + ds_cfg.bts_index.offset)) = value;
129}
130static inline unsigned long get_bts_absolute_maximum(char *base)
131{
132 return *(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset);
133}
134static inline void set_bts_absolute_maximum(char *base, unsigned long value)
135{
136 (*(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset)) = value;
137}
138static inline unsigned long get_bts_interrupt_threshold(char *base)
139{
140 return *(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset);
141}
142static inline void set_bts_interrupt_threshold(char *base, unsigned long value)
143{
144 (*(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset)) = value;
145}
146static inline unsigned long get_from_ip(char *base)
147{
148 return *(unsigned long *)(base + ds_cfg.from_ip.offset);
149}
150static inline void set_from_ip(char *base, unsigned long value)
151{
152 (*(unsigned long *)(base + ds_cfg.from_ip.offset)) = value;
153}
154static inline unsigned long get_to_ip(char *base)
155{
156 return *(unsigned long *)(base + ds_cfg.to_ip.offset);
157}
158static inline void set_to_ip(char *base, unsigned long value)
159{
160 (*(unsigned long *)(base + ds_cfg.to_ip.offset)) = value;
161}
162static inline unsigned char get_info_type(char *base)
163{
164 return *(unsigned char *)(base + ds_cfg.info_type.offset);
165}
166static inline void set_info_type(char *base, unsigned char value)
167{
168 (*(unsigned char *)(base + ds_cfg.info_type.offset)) = value;
169}
170static inline unsigned long get_info_data(char *base)
171{
172 return *(unsigned long *)(base + ds_cfg.info_data.offset);
173}
174static inline void set_info_data(char *base, unsigned long value)
175{
176 (*(unsigned long *)(base + ds_cfg.info_data.offset)) = value;
177}
178
179
180int ds_allocate(void **dsp, size_t bts_size_in_bytes)
181{
182 size_t bts_size_in_records;
183 unsigned long bts;
184 void *ds;
185
186 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
187 return -EOPNOTSUPP;
188
189 if (bts_size_in_bytes < 0)
190 return -EINVAL;
191
192 bts_size_in_records =
193 bts_size_in_bytes / ds_cfg.sizeof_bts;
194 bts_size_in_bytes =
195 bts_size_in_records * ds_cfg.sizeof_bts;
196
197 if (bts_size_in_bytes <= 0)
198 return -EINVAL;
199
200 bts = (unsigned long)kzalloc(bts_size_in_bytes, GFP_KERNEL);
201
202 if (!bts)
203 return -ENOMEM;
204
205 ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
206
207 if (!ds) {
208 kfree((void *)bts);
209 return -ENOMEM;
210 }
211
212 set_bts_buffer_base(ds, bts);
213 set_bts_index(ds, bts);
214 set_bts_absolute_maximum(ds, bts + bts_size_in_bytes);
215 set_bts_interrupt_threshold(ds, bts + bts_size_in_bytes + 1);
216
217 *dsp = ds;
218 return 0;
219}
220
221int ds_free(void **dsp)
222{
223 if (*dsp)
224 kfree((void *)get_bts_buffer_base(*dsp));
225 kfree(*dsp);
226 *dsp = NULL;
227
228 return 0;
229}
230
231int ds_get_bts_size(void *ds)
232{
233 int size_in_bytes;
234
235 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
236 return -EOPNOTSUPP;
237
238 if (!ds)
239 return 0;
240
241 size_in_bytes =
242 get_bts_absolute_maximum(ds) -
243 get_bts_buffer_base(ds);
244 return size_in_bytes;
245}
246
247int ds_get_bts_end(void *ds)
248{
249 int size_in_bytes = ds_get_bts_size(ds);
250
251 if (size_in_bytes <= 0)
252 return size_in_bytes;
253
254 return size_in_bytes / ds_cfg.sizeof_bts;
255}
256
257int ds_get_bts_index(void *ds)
258{
259 int index_offset_in_bytes;
260
261 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
262 return -EOPNOTSUPP;
263
264 index_offset_in_bytes =
265 get_bts_index(ds) -
266 get_bts_buffer_base(ds);
267
268 return index_offset_in_bytes / ds_cfg.sizeof_bts;
269}
270
271int ds_set_overflow(void *ds, int method)
272{
273 switch (method) {
274 case DS_O_SIGNAL:
275 return -EOPNOTSUPP;
276 case DS_O_WRAP:
277 return 0;
278 default:
279 return -EINVAL;
280 }
281}
282
283int ds_get_overflow(void *ds)
284{
285 return DS_O_WRAP;
286}
287
288int ds_clear(void *ds)
289{
290 int bts_size = ds_get_bts_size(ds);
291 unsigned long bts_base;
292
293 if (bts_size <= 0)
294 return bts_size;
295
296 bts_base = get_bts_buffer_base(ds);
297 memset((void *)bts_base, 0, bts_size);
298
299 set_bts_index(ds, bts_base);
300 return 0;
301}
302
303int ds_read_bts(void *ds, int index, struct bts_struct *out)
304{
305 void *bts;
306
307 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
308 return -EOPNOTSUPP;
309
310 if (index < 0)
311 return -EINVAL;
312
313 if (index >= ds_get_bts_size(ds))
314 return -EINVAL;
315
316 bts = (void *)(get_bts_buffer_base(ds) + (index * ds_cfg.sizeof_bts));
317
318 memset(out, 0, sizeof(*out));
319 if (get_from_ip(bts) == BTS_ESCAPE_ADDRESS) {
320 out->qualifier = get_info_type(bts);
321 out->variant.jiffies = get_info_data(bts);
322 } else {
323 out->qualifier = BTS_BRANCH;
324 out->variant.lbr.from_ip = get_from_ip(bts);
325 out->variant.lbr.to_ip = get_to_ip(bts);
326 }
327
328	return sizeof(*out);
329}
330
331int ds_write_bts(void *ds, const struct bts_struct *in)
332{
333 unsigned long bts;
334
335 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
336 return -EOPNOTSUPP;
337
338 if (ds_get_bts_size(ds) <= 0)
339 return -ENXIO;
340
341 bts = get_bts_index(ds);
342
343 memset((void *)bts, 0, ds_cfg.sizeof_bts);
344 switch (in->qualifier) {
345 case BTS_INVALID:
346 break;
347
348 case BTS_BRANCH:
349 set_from_ip((void *)bts, in->variant.lbr.from_ip);
350 set_to_ip((void *)bts, in->variant.lbr.to_ip);
351 break;
352
353 case BTS_TASK_ARRIVES:
354 case BTS_TASK_DEPARTS:
355 set_from_ip((void *)bts, BTS_ESCAPE_ADDRESS);
356 set_info_type((void *)bts, in->qualifier);
357 set_info_data((void *)bts, in->variant.jiffies);
358 break;
359
360 default:
361 return -EINVAL;
362 }
363
364 bts = bts + ds_cfg.sizeof_bts;
365 if (bts >= get_bts_absolute_maximum(ds))
366 bts = get_bts_buffer_base(ds);
367 set_bts_index(ds, bts);
368
369 return ds_cfg.sizeof_bts;
370}
371
372unsigned long ds_debugctl_mask(void)
373{
374 return ds_cfg.debugctl_mask;
375}
376
377#ifdef __i386__
378static const struct ds_configuration ds_cfg_netburst = {
379 .sizeof_ds = 9 * 4,
380 .bts_buffer_base = { 0, 4 },
381 .bts_index = { 4, 4 },
382 .bts_absolute_maximum = { 8, 4 },
383 .bts_interrupt_threshold = { 12, 4 },
384 .sizeof_bts = 3 * 4,
385 .from_ip = { 0, 4 },
386 .to_ip = { 4, 4 },
387 .info_type = { 4, 1 },
388 .info_data = { 8, 4 },
389 .debugctl_mask = (1<<2)|(1<<3)
390};
391
392static const struct ds_configuration ds_cfg_pentium_m = {
393 .sizeof_ds = 9 * 4,
394 .bts_buffer_base = { 0, 4 },
395 .bts_index = { 4, 4 },
396 .bts_absolute_maximum = { 8, 4 },
397 .bts_interrupt_threshold = { 12, 4 },
398 .sizeof_bts = 3 * 4,
399 .from_ip = { 0, 4 },
400 .to_ip = { 4, 4 },
401 .info_type = { 4, 1 },
402 .info_data = { 8, 4 },
403 .debugctl_mask = (1<<6)|(1<<7)
404};
405#endif /* __i386__ */
406
407static const struct ds_configuration ds_cfg_core2 = {
408 .sizeof_ds = 9 * 8,
409 .bts_buffer_base = { 0, 8 },
410 .bts_index = { 8, 8 },
411 .bts_absolute_maximum = { 16, 8 },
412 .bts_interrupt_threshold = { 24, 8 },
413 .sizeof_bts = 3 * 8,
414 .from_ip = { 0, 8 },
415 .to_ip = { 8, 8 },
416 .info_type = { 8, 1 },
417 .info_data = { 16, 8 },
418 .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
419};
420
421static inline void
422ds_configure(const struct ds_configuration *cfg)
423{
424 ds_cfg = *cfg;
425}
426
427void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
428{
429 switch (c->x86) {
430 case 0x6:
431 switch (c->x86_model) {
432#ifdef __i386__
433 case 0xD:
434 case 0xE: /* Pentium M */
435 ds_configure(&ds_cfg_pentium_m);
436 break;
437#endif /* __i386__ */
438 case 0xF: /* Core2 */
439 ds_configure(&ds_cfg_core2);
440 break;
441 default:
442 /* sorry, don't know about them */
443 break;
444 }
445 break;
446 case 0xF:
447 switch (c->x86_model) {
448#ifdef __i386__
449 case 0x0:
450 case 0x1:
451 case 0x2: /* Netburst */
452 ds_configure(&ds_cfg_netburst);
453 break;
454#endif /* __i386__ */
455 default:
456 /* sorry, don't know about them */
457 break;
458 }
459 break;
460 default:
461 /* sorry, don't know about them */
462 break;
463 }
464}
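
A sketch of how this interface is meant to be driven from elsewhere in the kernel: allocate a DS context, append one BTS record and read it back. It assumes struct bts_struct, the BTS_* qualifiers and the prototypes come from the <asm/ds.h> header added in the same series; the record values and the function below are invented for illustration, and every call fails with -EOPNOTSUPP on CPUs without Debug Store support:

#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/init.h>
#include <asm/ds.h>

static int __init bts_smoke_test(void)
{
	struct bts_struct rec, out;
	void *ds = NULL;
	int err;

	memset(&rec, 0, sizeof(rec));
	rec.qualifier = BTS_BRANCH;
	rec.variant.lbr.from_ip = 0x1000;
	rec.variant.lbr.to_ip = 0x2000;

	/* the size is rounded down to a whole number of BTS records */
	err = ds_allocate(&ds, 256);
	if (err)
		return err;

	err = ds_write_bts(ds, &rec);	/* returns the record size on success */
	if (err < 0)
		goto out;

	err = ds_read_bts(ds, 0, &out);	/* record 0 is the one just written */
	if (err < 0)
		goto out;

	printk(KERN_DEBUG "bts: %lx -> %lx\n",
	       (unsigned long)out.variant.lbr.from_ip,
	       (unsigned long)out.variant.lbr.to_ip);
	err = 0;
out:
	ds_free(&ds);
	return err;
}
late_initcall(bts_smoke_test);
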
diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c
index 18f500d185a2..4e16ef4a2659 100644
--- a/arch/x86/kernel/e820_32.c
+++ b/arch/x86/kernel/e820_32.c
@@ -7,7 +7,6 @@
7#include <linux/kexec.h> 7#include <linux/kexec.h>
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/efi.h>
11#include <linux/pfn.h> 10#include <linux/pfn.h>
12#include <linux/uaccess.h> 11#include <linux/uaccess.h>
13#include <linux/suspend.h> 12#include <linux/suspend.h>
@@ -17,11 +16,6 @@
17#include <asm/e820.h> 16#include <asm/e820.h>
18#include <asm/setup.h> 17#include <asm/setup.h>
19 18
20#ifdef CONFIG_EFI
21int efi_enabled = 0;
22EXPORT_SYMBOL(efi_enabled);
23#endif
24
25struct e820map e820; 19struct e820map e820;
26struct change_member { 20struct change_member {
27 struct e820entry *pbios; /* pointer to original bios entry */ 21 struct e820entry *pbios; /* pointer to original bios entry */
@@ -37,26 +31,6 @@ unsigned long pci_mem_start = 0x10000000;
37EXPORT_SYMBOL(pci_mem_start); 31EXPORT_SYMBOL(pci_mem_start);
38#endif 32#endif
39extern int user_defined_memmap; 33extern int user_defined_memmap;
40struct resource data_resource = {
41 .name = "Kernel data",
42 .start = 0,
43 .end = 0,
44 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
45};
46
47struct resource code_resource = {
48 .name = "Kernel code",
49 .start = 0,
50 .end = 0,
51 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
52};
53
54struct resource bss_resource = {
55 .name = "Kernel bss",
56 .start = 0,
57 .end = 0,
58 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
59};
60 34
61static struct resource system_rom_resource = { 35static struct resource system_rom_resource = {
62 .name = "System ROM", 36 .name = "System ROM",
@@ -111,60 +85,6 @@ static struct resource video_rom_resource = {
111 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM 85 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
112}; 86};
113 87
114static struct resource video_ram_resource = {
115 .name = "Video RAM area",
116 .start = 0xa0000,
117 .end = 0xbffff,
118 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
119};
120
121static struct resource standard_io_resources[] = { {
122 .name = "dma1",
123 .start = 0x0000,
124 .end = 0x001f,
125 .flags = IORESOURCE_BUSY | IORESOURCE_IO
126}, {
127 .name = "pic1",
128 .start = 0x0020,
129 .end = 0x0021,
130 .flags = IORESOURCE_BUSY | IORESOURCE_IO
131}, {
132 .name = "timer0",
133 .start = 0x0040,
134 .end = 0x0043,
135 .flags = IORESOURCE_BUSY | IORESOURCE_IO
136}, {
137 .name = "timer1",
138 .start = 0x0050,
139 .end = 0x0053,
140 .flags = IORESOURCE_BUSY | IORESOURCE_IO
141}, {
142 .name = "keyboard",
143 .start = 0x0060,
144 .end = 0x006f,
145 .flags = IORESOURCE_BUSY | IORESOURCE_IO
146}, {
147 .name = "dma page reg",
148 .start = 0x0080,
149 .end = 0x008f,
150 .flags = IORESOURCE_BUSY | IORESOURCE_IO
151}, {
152 .name = "pic2",
153 .start = 0x00a0,
154 .end = 0x00a1,
155 .flags = IORESOURCE_BUSY | IORESOURCE_IO
156}, {
157 .name = "dma2",
158 .start = 0x00c0,
159 .end = 0x00df,
160 .flags = IORESOURCE_BUSY | IORESOURCE_IO
161}, {
162 .name = "fpu",
163 .start = 0x00f0,
164 .end = 0x00ff,
165 .flags = IORESOURCE_BUSY | IORESOURCE_IO
166} };
167
168#define ROMSIGNATURE 0xaa55 88#define ROMSIGNATURE 0xaa55
169 89
170static int __init romsignature(const unsigned char *rom) 90static int __init romsignature(const unsigned char *rom)
@@ -260,10 +180,9 @@ static void __init probe_roms(void)
260 * Request address space for all standard RAM and ROM resources 180 * Request address space for all standard RAM and ROM resources
261 * and also for regions reported as reserved by the e820. 181 * and also for regions reported as reserved by the e820.
262 */ 182 */
263static void __init 183void __init init_iomem_resources(struct resource *code_resource,
264legacy_init_iomem_resources(struct resource *code_resource, 184 struct resource *data_resource,
265 struct resource *data_resource, 185 struct resource *bss_resource)
266 struct resource *bss_resource)
267{ 186{
268 int i; 187 int i;
269 188
@@ -305,35 +224,6 @@ legacy_init_iomem_resources(struct resource *code_resource,
305 } 224 }
306} 225}
307 226
308/*
309 * Request address space for all standard resources
310 *
311 * This is called just before pcibios_init(), which is also a
312 * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
313 */
314static int __init request_standard_resources(void)
315{
316 int i;
317
318 printk("Setting up standard PCI resources\n");
319 if (efi_enabled)
320 efi_initialize_iomem_resources(&code_resource,
321 &data_resource, &bss_resource);
322 else
323 legacy_init_iomem_resources(&code_resource,
324 &data_resource, &bss_resource);
325
326 /* EFI systems may still have VGA */
327 request_resource(&iomem_resource, &video_ram_resource);
328
329 /* request I/O space for devices used on all i[345]86 PCs */
330 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
331 request_resource(&ioport_resource, &standard_io_resources[i]);
332 return 0;
333}
334
335subsys_initcall(request_standard_resources);
336
337#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION) 227#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
338/** 228/**
339 * e820_mark_nosave_regions - Find the ranges of physical addresses that do not 229 * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
@@ -370,19 +260,17 @@ void __init add_memory_region(unsigned long long start,
370{ 260{
371 int x; 261 int x;
372 262
373 if (!efi_enabled) { 263 x = e820.nr_map;
374 x = e820.nr_map;
375
376 if (x == E820MAX) {
377 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
378 return;
379 }
380 264
381 e820.map[x].addr = start; 265 if (x == E820MAX) {
382 e820.map[x].size = size; 266 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
383 e820.map[x].type = type; 267 return;
384 e820.nr_map++;
385 } 268 }
269
270 e820.map[x].addr = start;
271 e820.map[x].size = size;
272 e820.map[x].type = type;
273 e820.nr_map++;
386} /* add_memory_region */ 274} /* add_memory_region */
387 275
388/* 276/*
@@ -598,29 +486,6 @@ int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
598} 486}
599 487
600/* 488/*
601 * Callback for efi_memory_walk.
602 */
603static int __init
604efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
605{
606 unsigned long *max_pfn = arg, pfn;
607
608 if (start < end) {
609 pfn = PFN_UP(end -1);
610 if (pfn > *max_pfn)
611 *max_pfn = pfn;
612 }
613 return 0;
614}
615
616static int __init
617efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
618{
619 memory_present(0, PFN_UP(start), PFN_DOWN(end));
620 return 0;
621}
622
623/*
624 * Find the highest page frame number we have available 489 * Find the highest page frame number we have available
625 */ 490 */
626void __init find_max_pfn(void) 491void __init find_max_pfn(void)
@@ -628,11 +493,6 @@ void __init find_max_pfn(void)
628 int i; 493 int i;
629 494
630 max_pfn = 0; 495 max_pfn = 0;
631 if (efi_enabled) {
632 efi_memmap_walk(efi_find_max_pfn, &max_pfn);
633 efi_memmap_walk(efi_memory_present_wrapper, NULL);
634 return;
635 }
636 496
637 for (i = 0; i < e820.nr_map; i++) { 497 for (i = 0; i < e820.nr_map; i++) {
638 unsigned long start, end; 498 unsigned long start, end;
@@ -650,34 +510,12 @@ void __init find_max_pfn(void)
650} 510}
651 511
652/* 512/*
653 * Free all available memory for boot time allocation. Used
654 * as a callback function by efi_memory_walk()
655 */
656
657static int __init
658free_available_memory(unsigned long start, unsigned long end, void *arg)
659{
660 /* check max_low_pfn */
661 if (start >= (max_low_pfn << PAGE_SHIFT))
662 return 0;
663 if (end >= (max_low_pfn << PAGE_SHIFT))
664 end = max_low_pfn << PAGE_SHIFT;
665 if (start < end)
666 free_bootmem(start, end - start);
667
668 return 0;
669}
670/*
671 * Register fully available low RAM pages with the bootmem allocator. 513 * Register fully available low RAM pages with the bootmem allocator.
672 */ 514 */
673void __init register_bootmem_low_pages(unsigned long max_low_pfn) 515void __init register_bootmem_low_pages(unsigned long max_low_pfn)
674{ 516{
675 int i; 517 int i;
676 518
677 if (efi_enabled) {
678 efi_memmap_walk(free_available_memory, NULL);
679 return;
680 }
681 for (i = 0; i < e820.nr_map; i++) { 519 for (i = 0; i < e820.nr_map; i++) {
682 unsigned long curr_pfn, last_pfn, size; 520 unsigned long curr_pfn, last_pfn, size;
683 /* 521 /*
@@ -785,56 +623,12 @@ void __init print_memory_map(char *who)
785 } 623 }
786} 624}
787 625
788static __init __always_inline void efi_limit_regions(unsigned long long size)
789{
790 unsigned long long current_addr = 0;
791 efi_memory_desc_t *md, *next_md;
792 void *p, *p1;
793 int i, j;
794
795 j = 0;
796 p1 = memmap.map;
797 for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
798 md = p;
799 next_md = p1;
800 current_addr = md->phys_addr +
801 PFN_PHYS(md->num_pages);
802 if (is_available_memory(md)) {
803 if (md->phys_addr >= size) continue;
804 memcpy(next_md, md, memmap.desc_size);
805 if (current_addr >= size) {
806 next_md->num_pages -=
807 PFN_UP(current_addr-size);
808 }
809 p1 += memmap.desc_size;
810 next_md = p1;
811 j++;
812 } else if ((md->attribute & EFI_MEMORY_RUNTIME) ==
813 EFI_MEMORY_RUNTIME) {
814 /* In order to make runtime services
815 * available we have to include runtime
816 * memory regions in memory map */
817 memcpy(next_md, md, memmap.desc_size);
818 p1 += memmap.desc_size;
819 next_md = p1;
820 j++;
821 }
822 }
823 memmap.nr_map = j;
824 memmap.map_end = memmap.map +
825 (memmap.nr_map * memmap.desc_size);
826}
827
828void __init limit_regions(unsigned long long size) 626void __init limit_regions(unsigned long long size)
829{ 627{
830 unsigned long long current_addr; 628 unsigned long long current_addr;
831 int i; 629 int i;
832 630
833 print_memory_map("limit_regions start"); 631 print_memory_map("limit_regions start");
834 if (efi_enabled) {
835 efi_limit_regions(size);
836 return;
837 }
838 for (i = 0; i < e820.nr_map; i++) { 632 for (i = 0; i < e820.nr_map; i++) {
839 current_addr = e820.map[i].addr + e820.map[i].size; 633 current_addr = e820.map[i].addr + e820.map[i].size;
840 if (current_addr < size) 634 if (current_addr < size)
@@ -955,3 +749,14 @@ static int __init parse_memmap(char *arg)
955 return 0; 749 return 0;
956} 750}
957early_param("memmap", parse_memmap); 751early_param("memmap", parse_memmap);
752void __init update_e820(void)
753{
754 u8 nr_map;
755
756 nr_map = e820.nr_map;
757 if (sanitize_e820_map(e820.map, &nr_map))
758 return;
759 e820.nr_map = nr_map;
760 printk(KERN_INFO "modified physical RAM map:\n");
761 print_memory_map("modified");
762}
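
The new update_e820() above mirrors the 64-bit version added below: callers that append entries after the initial map has been parsed use it to re-sanitize and reprint the map. A hypothetical caller sketch, assuming the prototypes are visible through <asm/e820.h>; the address range and the function name are made up:

#include <linux/init.h>
#include <asm/e820.h>

/* Hide a firmware-owned window from the kernel: mark it reserved in the
 * e820 map, then let update_e820() merge overlaps and print the result. */
static void __init reserve_firmware_window(void)
{
	add_memory_region(0x7f000000ULL, 0x00100000ULL, E820_RESERVED);
	update_e820();
}
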
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
index 04698e0b056c..9f65b4cc323c 100644
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820_64.c
@@ -1,4 +1,4 @@
1/* 1/*
2 * Handle the memory map. 2 * Handle the memory map.
3 * The functions here do the job until bootmem takes over. 3 * The functions here do the job until bootmem takes over.
4 * 4 *
@@ -26,80 +26,92 @@
26#include <asm/proto.h> 26#include <asm/proto.h>
27#include <asm/setup.h> 27#include <asm/setup.h>
28#include <asm/sections.h> 28#include <asm/sections.h>
29#include <asm/kdebug.h>
29 30
30struct e820map e820; 31struct e820map e820;
31 32
32/* 33/*
33 * PFN of last memory page. 34 * PFN of last memory page.
34 */ 35 */
35unsigned long end_pfn; 36unsigned long end_pfn;
36EXPORT_SYMBOL(end_pfn);
37 37
38/* 38/*
39 * end_pfn only includes RAM, while end_pfn_map includes all e820 entries. 39 * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
40 * The direct mapping extends to end_pfn_map, so that we can directly access 40 * The direct mapping extends to end_pfn_map, so that we can directly access
41 * apertures, ACPI and other tables without having to play with fixmaps. 41 * apertures, ACPI and other tables without having to play with fixmaps.
42 */ 42 */
43unsigned long end_pfn_map; 43unsigned long end_pfn_map;
44 44
45/* 45/*
46 * Last pfn which the user wants to use. 46 * Last pfn which the user wants to use.
47 */ 47 */
48static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT; 48static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
49 49
50extern struct resource code_resource, data_resource, bss_resource; 50/*
51 51 * Early reserved memory areas.
52/* Check for some hardcoded bad areas that early boot is not allowed to touch */ 52 */
53static inline int bad_addr(unsigned long *addrp, unsigned long size) 53#define MAX_EARLY_RES 20
54{ 54
55 unsigned long addr = *addrp, last = addr + size; 55struct early_res {
56 56 unsigned long start, end;
57 /* various gunk below that needed for SMP startup */ 57 char name[16];
58 if (addr < 0x8000) { 58};
59 *addrp = PAGE_ALIGN(0x8000); 59static struct early_res early_res[MAX_EARLY_RES] __initdata = {
60 return 1; 60 { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
61 } 61#ifdef CONFIG_SMP
62 62 { SMP_TRAMPOLINE_BASE, SMP_TRAMPOLINE_BASE + 2*PAGE_SIZE, "SMP_TRAMPOLINE" },
63 /* direct mapping tables of the kernel */
64 if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) {
65 *addrp = PAGE_ALIGN(table_end << PAGE_SHIFT);
66 return 1;
67 }
68
69 /* initrd */
70#ifdef CONFIG_BLK_DEV_INITRD
71 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
72 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
73 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
74 unsigned long ramdisk_end = ramdisk_image+ramdisk_size;
75
76 if (last >= ramdisk_image && addr < ramdisk_end) {
77 *addrp = PAGE_ALIGN(ramdisk_end);
78 return 1;
79 }
80 }
81#endif 63#endif
82 /* kernel code */ 64 {}
83 if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) { 65};
84 *addrp = PAGE_ALIGN(__pa_symbol(&_end)); 66
85 return 1; 67void __init reserve_early(unsigned long start, unsigned long end, char *name)
68{
69 int i;
70 struct early_res *r;
71 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
72 r = &early_res[i];
73 if (end > r->start && start < r->end)
74 panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n",
75 start, end - 1, name?name:"", r->start, r->end - 1, r->name);
86 } 76 }
77 if (i >= MAX_EARLY_RES)
78 panic("Too many early reservations");
79 r = &early_res[i];
80 r->start = start;
81 r->end = end;
82 if (name)
83 strncpy(r->name, name, sizeof(r->name) - 1);
84}
87 85
88 if (last >= ebda_addr && addr < ebda_addr + ebda_size) { 86void __init early_res_to_bootmem(void)
89 *addrp = PAGE_ALIGN(ebda_addr + ebda_size); 87{
90 return 1; 88 int i;
89 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
90 struct early_res *r = &early_res[i];
91 printk(KERN_INFO "early res: %d [%lx-%lx] %s\n", i,
92 r->start, r->end - 1, r->name);
93 reserve_bootmem_generic(r->start, r->end - r->start);
91 } 94 }
95}
92 96
93#ifdef CONFIG_NUMA 97/* Check for already reserved areas */
94 /* NUMA memory to node map */ 98static inline int bad_addr(unsigned long *addrp, unsigned long size)
95 if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) { 99{
96 *addrp = nodemap_addr + nodemap_size; 100 int i;
97 return 1; 101 unsigned long addr = *addrp, last;
102 int changed = 0;
103again:
104 last = addr + size;
105 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
106 struct early_res *r = &early_res[i];
107 if (last >= r->start && addr < r->end) {
108 *addrp = addr = r->end;
109 changed = 1;
110 goto again;
111 }
98 } 112 }
99#endif 113 return changed;
100 /* XXX ramdisk image here? */ 114}
101 return 0;
102}
103 115
104/* 116/*
105 * This function checks if any part of the range <start,end> is mapped 117 * This function checks if any part of the range <start,end> is mapped
@@ -107,16 +119,18 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size)
107 */ 119 */
108int 120int
109e820_any_mapped(unsigned long start, unsigned long end, unsigned type) 121e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
110{ 122{
111 int i; 123 int i;
112 for (i = 0; i < e820.nr_map; i++) { 124
113 struct e820entry *ei = &e820.map[i]; 125 for (i = 0; i < e820.nr_map; i++) {
114 if (type && ei->type != type) 126 struct e820entry *ei = &e820.map[i];
127
128 if (type && ei->type != type)
115 continue; 129 continue;
116 if (ei->addr >= end || ei->addr + ei->size <= start) 130 if (ei->addr >= end || ei->addr + ei->size <= start)
117 continue; 131 continue;
118 return 1; 132 return 1;
119 } 133 }
120 return 0; 134 return 0;
121} 135}
122EXPORT_SYMBOL_GPL(e820_any_mapped); 136EXPORT_SYMBOL_GPL(e820_any_mapped);
@@ -127,11 +141,14 @@ EXPORT_SYMBOL_GPL(e820_any_mapped);
127 * Note: this function only works correctly if the e820 table is sorted and 141 * Note: this function only works correctly if the e820 table is sorted and
128 * not-overlapping, which is the case 142 * not-overlapping, which is the case
129 */ 143 */
130int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type) 144int __init e820_all_mapped(unsigned long start, unsigned long end,
145 unsigned type)
131{ 146{
132 int i; 147 int i;
148
133 for (i = 0; i < e820.nr_map; i++) { 149 for (i = 0; i < e820.nr_map; i++) {
134 struct e820entry *ei = &e820.map[i]; 150 struct e820entry *ei = &e820.map[i];
151
135 if (type && ei->type != type) 152 if (type && ei->type != type)
136 continue; 153 continue;
137 /* is the region (part) in overlap with the current region ?*/ 154 /* is the region (part) in overlap with the current region ?*/
@@ -143,65 +160,75 @@ int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type
143 */ 160 */
144 if (ei->addr <= start) 161 if (ei->addr <= start)
145 start = ei->addr + ei->size; 162 start = ei->addr + ei->size;
146 /* if start is now at or beyond end, we're done, full coverage */ 163 /*
164 * if start is now at or beyond end, we're done, full
165 * coverage
166 */
147 if (start >= end) 167 if (start >= end)
148 return 1; /* we're done */ 168 return 1;
149 } 169 }
150 return 0; 170 return 0;
151} 171}
152 172
153/* 173/*
154 * Find a free area in a specific range. 174 * Find a free area with specified alignment in a specific range.
155 */ 175 */
156unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size) 176unsigned long __init find_e820_area(unsigned long start, unsigned long end,
157{ 177 unsigned size, unsigned long align)
158 int i; 178{
159 for (i = 0; i < e820.nr_map; i++) { 179 int i;
160 struct e820entry *ei = &e820.map[i]; 180 unsigned long mask = ~(align - 1);
161 unsigned long addr = ei->addr, last; 181
162 if (ei->type != E820_RAM) 182 for (i = 0; i < e820.nr_map; i++) {
163 continue; 183 struct e820entry *ei = &e820.map[i];
164 if (addr < start) 184 unsigned long addr = ei->addr, last;
185
186 if (ei->type != E820_RAM)
187 continue;
188 if (addr < start)
165 addr = start; 189 addr = start;
166 if (addr > ei->addr + ei->size) 190 if (addr > ei->addr + ei->size)
167 continue; 191 continue;
168 while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size) 192 while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
169 ; 193 ;
170 last = PAGE_ALIGN(addr) + size; 194 addr = (addr + align - 1) & mask;
195 last = addr + size;
171 if (last > ei->addr + ei->size) 196 if (last > ei->addr + ei->size)
172 continue; 197 continue;
173 if (last > end) 198 if (last > end)
174 continue; 199 continue;
175 return addr; 200 return addr;
176 } 201 }
177 return -1UL; 202 return -1UL;
178} 203}
179 204
180/* 205/*
181 * Find the highest page frame number we have available 206 * Find the highest page frame number we have available
182 */ 207 */
183unsigned long __init e820_end_of_ram(void) 208unsigned long __init e820_end_of_ram(void)
184{ 209{
185 unsigned long end_pfn = 0; 210 unsigned long end_pfn;
211
186 end_pfn = find_max_pfn_with_active_regions(); 212 end_pfn = find_max_pfn_with_active_regions();
187 213
188 if (end_pfn > end_pfn_map) 214 if (end_pfn > end_pfn_map)
189 end_pfn_map = end_pfn; 215 end_pfn_map = end_pfn;
190 if (end_pfn_map > MAXMEM>>PAGE_SHIFT) 216 if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
191 end_pfn_map = MAXMEM>>PAGE_SHIFT; 217 end_pfn_map = MAXMEM>>PAGE_SHIFT;
192 if (end_pfn > end_user_pfn) 218 if (end_pfn > end_user_pfn)
193 end_pfn = end_user_pfn; 219 end_pfn = end_user_pfn;
194 if (end_pfn > end_pfn_map) 220 if (end_pfn > end_pfn_map)
195 end_pfn = end_pfn_map; 221 end_pfn = end_pfn_map;
196 222
197 printk("end_pfn_map = %lu\n", end_pfn_map); 223 printk(KERN_INFO "end_pfn_map = %lu\n", end_pfn_map);
198 return end_pfn; 224 return end_pfn;
199} 225}
200 226
201/* 227/*
202 * Mark e820 reserved areas as busy for the resource manager. 228 * Mark e820 reserved areas as busy for the resource manager.
203 */ 229 */
204void __init e820_reserve_resources(void) 230void __init e820_reserve_resources(struct resource *code_resource,
231 struct resource *data_resource, struct resource *bss_resource)
205{ 232{
206 int i; 233 int i;
207 for (i = 0; i < e820.nr_map; i++) { 234 for (i = 0; i < e820.nr_map; i++) {
@@ -219,13 +246,13 @@ void __init e820_reserve_resources(void)
219 request_resource(&iomem_resource, res); 246 request_resource(&iomem_resource, res);
220 if (e820.map[i].type == E820_RAM) { 247 if (e820.map[i].type == E820_RAM) {
221 /* 248 /*
222 * We don't know which RAM region contains kernel data, 249 * We don't know which RAM region contains kernel data,
223 * so we try it repeatedly and let the resource manager 250 * so we try it repeatedly and let the resource manager
224 * test it. 251 * test it.
225 */ 252 */
226 request_resource(res, &code_resource); 253 request_resource(res, code_resource);
227 request_resource(res, &data_resource); 254 request_resource(res, data_resource);
228 request_resource(res, &bss_resource); 255 request_resource(res, bss_resource);
229#ifdef CONFIG_KEXEC 256#ifdef CONFIG_KEXEC
230 if (crashk_res.start != crashk_res.end) 257 if (crashk_res.start != crashk_res.end)
231 request_resource(res, &crashk_res); 258 request_resource(res, &crashk_res);
@@ -322,9 +349,9 @@ e820_register_active_regions(int nid, unsigned long start_pfn,
322 add_active_range(nid, ei_startpfn, ei_endpfn); 349 add_active_range(nid, ei_startpfn, ei_endpfn);
323} 350}
324 351
325/* 352/*
326 * Add a memory region to the kernel e820 map. 353 * Add a memory region to the kernel e820 map.
327 */ 354 */
328void __init add_memory_region(unsigned long start, unsigned long size, int type) 355void __init add_memory_region(unsigned long start, unsigned long size, int type)
329{ 356{
330 int x = e820.nr_map; 357 int x = e820.nr_map;
@@ -349,9 +376,7 @@ unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
349{ 376{
350 unsigned long start_pfn = start >> PAGE_SHIFT; 377 unsigned long start_pfn = start >> PAGE_SHIFT;
351 unsigned long end_pfn = end >> PAGE_SHIFT; 378 unsigned long end_pfn = end >> PAGE_SHIFT;
352 unsigned long ei_startpfn; 379 unsigned long ei_startpfn, ei_endpfn, ram = 0;
353 unsigned long ei_endpfn;
354 unsigned long ram = 0;
355 int i; 380 int i;
356 381
357 for (i = 0; i < e820.nr_map; i++) { 382 for (i = 0; i < e820.nr_map; i++) {
@@ -363,28 +388,31 @@ unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
363 return end - start - (ram << PAGE_SHIFT); 388 return end - start - (ram << PAGE_SHIFT);
364} 389}
365 390
366void __init e820_print_map(char *who) 391static void __init e820_print_map(char *who)
367{ 392{
368 int i; 393 int i;
369 394
370 for (i = 0; i < e820.nr_map; i++) { 395 for (i = 0; i < e820.nr_map; i++) {
371 printk(KERN_INFO " %s: %016Lx - %016Lx ", who, 396 printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
372 (unsigned long long) e820.map[i].addr, 397 (unsigned long long) e820.map[i].addr,
373 (unsigned long long) (e820.map[i].addr + e820.map[i].size)); 398 (unsigned long long)
399 (e820.map[i].addr + e820.map[i].size));
374 switch (e820.map[i].type) { 400 switch (e820.map[i].type) {
375 case E820_RAM: printk("(usable)\n"); 401 case E820_RAM:
376 break; 402 printk(KERN_CONT "(usable)\n");
403 break;
377 case E820_RESERVED: 404 case E820_RESERVED:
378 printk("(reserved)\n"); 405 printk(KERN_CONT "(reserved)\n");
379 break; 406 break;
380 case E820_ACPI: 407 case E820_ACPI:
381 printk("(ACPI data)\n"); 408 printk(KERN_CONT "(ACPI data)\n");
382 break; 409 break;
383 case E820_NVS: 410 case E820_NVS:
384 printk("(ACPI NVS)\n"); 411 printk(KERN_CONT "(ACPI NVS)\n");
385 break; 412 break;
386 default: printk("type %u\n", e820.map[i].type); 413 default:
387 break; 414 printk(KERN_CONT "type %u\n", e820.map[i].type);
415 break;
388 } 416 }
389 } 417 }
390} 418}
@@ -392,11 +420,11 @@ void __init e820_print_map(char *who)
392/* 420/*
393 * Sanitize the BIOS e820 map. 421 * Sanitize the BIOS e820 map.
394 * 422 *
395 * Some e820 responses include overlapping entries. The following 423 * Some e820 responses include overlapping entries. The following
396 * replaces the original e820 map with a new one, removing overlaps. 424 * replaces the original e820 map with a new one, removing overlaps.
397 * 425 *
398 */ 426 */
399static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) 427static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
400{ 428{
401 struct change_member { 429 struct change_member {
402 struct e820entry *pbios; /* pointer to original bios entry */ 430 struct e820entry *pbios; /* pointer to original bios entry */
@@ -416,7 +444,8 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
416 int i; 444 int i;
417 445
418 /* 446 /*
419 Visually we're performing the following (1,2,3,4 = memory types)... 447 Visually we're performing the following
448 (1,2,3,4 = memory types)...
420 449
421 Sample memory map (w/overlaps): 450 Sample memory map (w/overlaps):
422 ____22__________________ 451 ____22__________________
@@ -458,22 +487,23 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
458 old_nr = *pnr_map; 487 old_nr = *pnr_map;
459 488
460 /* bail out if we find any unreasonable addresses in bios map */ 489 /* bail out if we find any unreasonable addresses in bios map */
461 for (i=0; i<old_nr; i++) 490 for (i = 0; i < old_nr; i++)
462 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) 491 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
463 return -1; 492 return -1;
464 493
465 /* create pointers for initial change-point information (for sorting) */ 494 /* create pointers for initial change-point information (for sorting) */
466 for (i=0; i < 2*old_nr; i++) 495 for (i = 0; i < 2 * old_nr; i++)
467 change_point[i] = &change_point_list[i]; 496 change_point[i] = &change_point_list[i];
468 497
469 /* record all known change-points (starting and ending addresses), 498 /* record all known change-points (starting and ending addresses),
470 omitting those that are for empty memory regions */ 499 omitting those that are for empty memory regions */
471 chgidx = 0; 500 chgidx = 0;
472 for (i=0; i < old_nr; i++) { 501 for (i = 0; i < old_nr; i++) {
473 if (biosmap[i].size != 0) { 502 if (biosmap[i].size != 0) {
474 change_point[chgidx]->addr = biosmap[i].addr; 503 change_point[chgidx]->addr = biosmap[i].addr;
475 change_point[chgidx++]->pbios = &biosmap[i]; 504 change_point[chgidx++]->pbios = &biosmap[i];
476 change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; 505 change_point[chgidx]->addr = biosmap[i].addr +
506 biosmap[i].size;
477 change_point[chgidx++]->pbios = &biosmap[i]; 507 change_point[chgidx++]->pbios = &biosmap[i];
478 } 508 }
479 } 509 }
@@ -483,75 +513,106 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
483 still_changing = 1; 513 still_changing = 1;
484 while (still_changing) { 514 while (still_changing) {
485 still_changing = 0; 515 still_changing = 0;
486 for (i=1; i < chg_nr; i++) { 516 for (i = 1; i < chg_nr; i++) {
487 /* if <current_addr> > <last_addr>, swap */ 517 unsigned long long curaddr, lastaddr;
488 /* or, if current=<start_addr> & last=<end_addr>, swap */ 518 unsigned long long curpbaddr, lastpbaddr;
489 if ((change_point[i]->addr < change_point[i-1]->addr) || 519
490 ((change_point[i]->addr == change_point[i-1]->addr) && 520 curaddr = change_point[i]->addr;
491 (change_point[i]->addr == change_point[i]->pbios->addr) && 521 lastaddr = change_point[i - 1]->addr;
492 (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) 522 curpbaddr = change_point[i]->pbios->addr;
493 ) 523 lastpbaddr = change_point[i - 1]->pbios->addr;
494 { 524
525 /*
526 * swap entries, when:
527 *
528 * curaddr > lastaddr or
529 * curaddr == lastaddr and curaddr == curpbaddr and
530 * lastaddr != lastpbaddr
531 */
532 if (curaddr < lastaddr ||
533 (curaddr == lastaddr && curaddr == curpbaddr &&
534 lastaddr != lastpbaddr)) {
495 change_tmp = change_point[i]; 535 change_tmp = change_point[i];
496 change_point[i] = change_point[i-1]; 536 change_point[i] = change_point[i-1];
497 change_point[i-1] = change_tmp; 537 change_point[i-1] = change_tmp;
498 still_changing=1; 538 still_changing = 1;
499 } 539 }
500 } 540 }
501 } 541 }
502 542
503 /* create a new bios memory map, removing overlaps */ 543 /* create a new bios memory map, removing overlaps */
504 overlap_entries=0; /* number of entries in the overlap table */ 544 overlap_entries = 0; /* number of entries in the overlap table */
505 new_bios_entry=0; /* index for creating new bios map entries */ 545 new_bios_entry = 0; /* index for creating new bios map entries */
506 last_type = 0; /* start with undefined memory type */ 546 last_type = 0; /* start with undefined memory type */
507 last_addr = 0; /* start with 0 as last starting address */ 547 last_addr = 0; /* start with 0 as last starting address */
548
508 /* loop through change-points, determining affect on the new bios map */ 549 /* loop through change-points, determining affect on the new bios map */
509 for (chgidx=0; chgidx < chg_nr; chgidx++) 550 for (chgidx = 0; chgidx < chg_nr; chgidx++) {
510 {
511 /* keep track of all overlapping bios entries */ 551 /* keep track of all overlapping bios entries */
512 if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) 552 if (change_point[chgidx]->addr ==
513 { 553 change_point[chgidx]->pbios->addr) {
514 /* add map entry to overlap list (> 1 entry implies an overlap) */ 554 /*
515 overlap_list[overlap_entries++]=change_point[chgidx]->pbios; 555 * add map entry to overlap list (> 1 entry
516 } 556 * implies an overlap)
517 else 557 */
518 { 558 overlap_list[overlap_entries++] =
519 /* remove entry from list (order independent, so swap with last) */ 559 change_point[chgidx]->pbios;
520 for (i=0; i<overlap_entries; i++) 560 } else {
521 { 561 /*
522 if (overlap_list[i] == change_point[chgidx]->pbios) 562 * remove entry from list (order independent,
523 overlap_list[i] = overlap_list[overlap_entries-1]; 563 * so swap with last)
564 */
565 for (i = 0; i < overlap_entries; i++) {
566 if (overlap_list[i] ==
567 change_point[chgidx]->pbios)
568 overlap_list[i] =
569 overlap_list[overlap_entries-1];
524 } 570 }
525 overlap_entries--; 571 overlap_entries--;
526 } 572 }
527 /* if there are overlapping entries, decide which "type" to use */ 573 /*
528 /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ 574 * if there are overlapping entries, decide which
575 * "type" to use (larger value takes precedence --
576 * 1=usable, 2,3,4,4+=unusable)
577 */
529 current_type = 0; 578 current_type = 0;
530 for (i=0; i<overlap_entries; i++) 579 for (i = 0; i < overlap_entries; i++)
531 if (overlap_list[i]->type > current_type) 580 if (overlap_list[i]->type > current_type)
532 current_type = overlap_list[i]->type; 581 current_type = overlap_list[i]->type;
533 /* continue building up new bios map based on this information */ 582 /*
583 * continue building up new bios map based on this
584 * information
585 */
534 if (current_type != last_type) { 586 if (current_type != last_type) {
535 if (last_type != 0) { 587 if (last_type != 0) {
536 new_bios[new_bios_entry].size = 588 new_bios[new_bios_entry].size =
537 change_point[chgidx]->addr - last_addr; 589 change_point[chgidx]->addr - last_addr;
538 /* move forward only if the new size was non-zero */ 590 /*
591 * move forward only if the new size
592 * was non-zero
593 */
539 if (new_bios[new_bios_entry].size != 0) 594 if (new_bios[new_bios_entry].size != 0)
595 /*
596 * no more space left for new
597 * bios entries ?
598 */
540 if (++new_bios_entry >= E820MAX) 599 if (++new_bios_entry >= E820MAX)
541 break; /* no more space left for new bios entries */ 600 break;
542 } 601 }
543 if (current_type != 0) { 602 if (current_type != 0) {
544 new_bios[new_bios_entry].addr = change_point[chgidx]->addr; 603 new_bios[new_bios_entry].addr =
604 change_point[chgidx]->addr;
545 new_bios[new_bios_entry].type = current_type; 605 new_bios[new_bios_entry].type = current_type;
546 last_addr=change_point[chgidx]->addr; 606 last_addr = change_point[chgidx]->addr;
547 } 607 }
548 last_type = current_type; 608 last_type = current_type;
549 } 609 }
550 } 610 }
551 new_nr = new_bios_entry; /* retain count for new bios entries */ 611 /* retain count for new bios entries */
612 new_nr = new_bios_entry;
552 613
553 /* copy new bios mapping into original location */ 614 /* copy new bios mapping into original location */
554 memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); 615 memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
555 *pnr_map = new_nr; 616 *pnr_map = new_nr;
556 617
557 return 0; 618 return 0;
@@ -566,7 +627,7 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
566 * will have given us a memory map that we can use to properly 627 * will have given us a memory map that we can use to properly
567 * set up memory. If we aren't, we'll fake a memory map. 628 * set up memory. If we aren't, we'll fake a memory map.
568 */ 629 */
569static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) 630static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
570{ 631{
571 /* Only one memory region (or negative)? Ignore it */ 632 /* Only one memory region (or negative)? Ignore it */
572 if (nr_map < 2) 633 if (nr_map < 2)
@@ -583,18 +644,20 @@ static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
583 return -1; 644 return -1;
584 645
585 add_memory_region(start, size, type); 646 add_memory_region(start, size, type);
586 } while (biosmap++,--nr_map); 647 } while (biosmap++, --nr_map);
587 return 0; 648 return 0;
588} 649}
589 650
590void early_panic(char *msg) 651static void early_panic(char *msg)
591{ 652{
592 early_printk(msg); 653 early_printk(msg);
593 panic(msg); 654 panic(msg);
594} 655}
595 656
596void __init setup_memory_region(void) 657/* The return type is char * rather than void only for x86 32-bit compat */
658char * __init machine_specific_memory_setup(void)
597{ 659{
660 char *who = "BIOS-e820";
598 /* 661 /*
599 * Try to copy the BIOS-supplied E820-map. 662 * Try to copy the BIOS-supplied E820-map.
600 * 663 *
@@ -605,7 +668,10 @@ void __init setup_memory_region(void)
605 if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) 668 if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
606 early_panic("Cannot find a valid memory map"); 669 early_panic("Cannot find a valid memory map");
607 printk(KERN_INFO "BIOS-provided physical RAM map:\n"); 670 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
608 e820_print_map("BIOS-e820"); 671 e820_print_map(who);
672
673 /* In case someone cares... */
674 return who;
609} 675}
610 676
611static int __init parse_memopt(char *p) 677static int __init parse_memopt(char *p)
@@ -613,9 +679,9 @@ static int __init parse_memopt(char *p)
613 if (!p) 679 if (!p)
614 return -EINVAL; 680 return -EINVAL;
615 end_user_pfn = memparse(p, &p); 681 end_user_pfn = memparse(p, &p);
616 end_user_pfn >>= PAGE_SHIFT; 682 end_user_pfn >>= PAGE_SHIFT;
617 return 0; 683 return 0;
618} 684}
619early_param("mem", parse_memopt); 685early_param("mem", parse_memopt);
620 686
621static int userdef __initdata; 687static int userdef __initdata;
@@ -627,9 +693,9 @@ static int __init parse_memmap_opt(char *p)
627 693
628 if (!strcmp(p, "exactmap")) { 694 if (!strcmp(p, "exactmap")) {
629#ifdef CONFIG_CRASH_DUMP 695#ifdef CONFIG_CRASH_DUMP
630 /* If we are doing a crash dump, we 696 /*
631 * still need to know the real mem 697 * If we are doing a crash dump, we still need to know
632 * size before original memory map is 698 * the real mem size before original memory map is
633 * reset. 699 * reset.
634 */ 700 */
635 e820_register_active_regions(0, 0, -1UL); 701 e820_register_active_regions(0, 0, -1UL);
@@ -646,6 +712,8 @@ static int __init parse_memmap_opt(char *p)
646 mem_size = memparse(p, &p); 712 mem_size = memparse(p, &p);
647 if (p == oldp) 713 if (p == oldp)
648 return -EINVAL; 714 return -EINVAL;
715
716 userdef = 1;
649 if (*p == '@') { 717 if (*p == '@') {
650 start_at = memparse(p+1, &p); 718 start_at = memparse(p+1, &p);
651 add_memory_region(start_at, mem_size, E820_RAM); 719 add_memory_region(start_at, mem_size, E820_RAM);
@@ -665,11 +733,29 @@ early_param("memmap", parse_memmap_opt);
665void __init finish_e820_parsing(void) 733void __init finish_e820_parsing(void)
666{ 734{
667 if (userdef) { 735 if (userdef) {
736 char nr = e820.nr_map;
737
738 if (sanitize_e820_map(e820.map, &nr) < 0)
739 early_panic("Invalid user supplied memory map");
740 e820.nr_map = nr;
741
668 printk(KERN_INFO "user-defined physical RAM map:\n"); 742 printk(KERN_INFO "user-defined physical RAM map:\n");
669 e820_print_map("user"); 743 e820_print_map("user");
670 } 744 }
671} 745}
672 746
747void __init update_e820(void)
748{
749 u8 nr_map;
750
751 nr_map = e820.nr_map;
752 if (sanitize_e820_map(e820.map, &nr_map))
753 return;
754 e820.nr_map = nr_map;
755 printk(KERN_INFO "modified physical RAM map:\n");
756 e820_print_map("modified");
757}
758
673unsigned long pci_mem_start = 0xaeedbabe; 759unsigned long pci_mem_start = 0xaeedbabe;
674EXPORT_SYMBOL(pci_mem_start); 760EXPORT_SYMBOL(pci_mem_start);
675 761
@@ -713,8 +799,10 @@ __init void e820_setup_gap(void)
713 799
714 if (!found) { 800 if (!found) {
715 gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024; 801 gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
716 printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n" 802 printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
717 KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n"); 803 "address range\n"
804 KERN_ERR "PCI: Unassigned devices with 32bit resource "
805 "registers may break!\n");
718 } 806 }
719 807
720 /* 808 /*
@@ -727,8 +815,9 @@ __init void e820_setup_gap(void)
727 /* Fun with two's complement */ 815 /* Fun with two's complement */
728 pci_mem_start = (gapstart + round) & -round; 816 pci_mem_start = (gapstart + round) & -round;
729 817
730 printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", 818 printk(KERN_INFO
731 pci_mem_start, gapstart, gapsize); 819 "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
820 pci_mem_start, gapstart, gapsize);
732} 821}
733 822
734int __init arch_get_ram_range(int slot, u64 *addr, u64 *size) 823int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
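
A sketch of how the new early-reservation helpers fit together: find_e820_area() now takes an explicit alignment, and any range handed out this way is expected to be registered with reserve_early() so that later bad_addr() lookups and early_res_to_bootmem() know about it. The size, search window and label below are made up for illustration:

#include <linux/init.h>
#include <asm/e820.h>
#include <asm/page.h>

static unsigned long __init alloc_early_scratch(void)
{
	unsigned long addr;

	/* look for 64KB of usable RAM between 1MB and 4GB, page aligned */
	addr = find_e820_area(0x100000, 0xffffffffUL, 0x10000, PAGE_SIZE);
	if (addr == -1UL)
		return 0;

	/* keep later early allocations (and bootmem) away from it */
	reserve_early(addr, addr + 0x10000, "early scratch");
	return addr;
}
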
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 88bb83ec895f..9f51e1ea9e82 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -21,7 +21,33 @@
21#include <asm/gart.h> 21#include <asm/gart.h>
22#endif 22#endif
23 23
24static void __init via_bugs(void) 24static void __init fix_hypertransport_config(int num, int slot, int func)
25{
26 u32 htcfg;
27 /*
28 * we found a hypertransport bus
29 * make sure that we are broadcasting
30 * interrupts to all cpus on the ht bus
31 * if we're using extended apic ids
32 */
33 htcfg = read_pci_config(num, slot, func, 0x68);
34 if (htcfg & (1 << 18)) {
35 printk(KERN_INFO "Detected use of extended apic ids "
36 "on hypertransport bus\n");
37 if ((htcfg & (1 << 17)) == 0) {
38 printk(KERN_INFO "Enabling hypertransport extended "
39 "apic interrupt broadcast\n");
40 printk(KERN_INFO "Note this is a bios bug, "
41 "please contact your hw vendor\n");
42 htcfg |= (1 << 17);
43 write_pci_config(num, slot, func, 0x68, htcfg);
44 }
45 }
46
47
48}
49
50static void __init via_bugs(int num, int slot, int func)
25{ 51{
26#ifdef CONFIG_GART_IOMMU 52#ifdef CONFIG_GART_IOMMU
27 if ((end_pfn > MAX_DMA32_PFN || force_iommu) && 53 if ((end_pfn > MAX_DMA32_PFN || force_iommu) &&
@@ -44,7 +70,7 @@ static int __init nvidia_hpet_check(struct acpi_table_header *header)
44#endif /* CONFIG_X86_IO_APIC */ 70#endif /* CONFIG_X86_IO_APIC */
45#endif /* CONFIG_ACPI */ 71#endif /* CONFIG_ACPI */
46 72
47static void __init nvidia_bugs(void) 73static void __init nvidia_bugs(int num, int slot, int func)
48{ 74{
49#ifdef CONFIG_ACPI 75#ifdef CONFIG_ACPI
50#ifdef CONFIG_X86_IO_APIC 76#ifdef CONFIG_X86_IO_APIC
@@ -72,7 +98,7 @@ static void __init nvidia_bugs(void)
72 98
73} 99}
74 100
75static void __init ati_bugs(void) 101static void __init ati_bugs(int num, int slot, int func)
76{ 102{
77#ifdef CONFIG_X86_IO_APIC 103#ifdef CONFIG_X86_IO_APIC
78 if (timer_over_8254 == 1) { 104 if (timer_over_8254 == 1) {
@@ -83,18 +109,67 @@ static void __init ati_bugs(void)
83#endif 109#endif
84} 110}
85 111
112#define QFLAG_APPLY_ONCE 0x1
113#define QFLAG_APPLIED 0x2
114#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
86struct chipset { 115struct chipset {
87 u16 vendor; 116 u32 vendor;
88 void (*f)(void); 117 u32 device;
118 u32 class;
119 u32 class_mask;
120 u32 flags;
121 void (*f)(int num, int slot, int func);
89}; 122};
90 123
91static struct chipset early_qrk[] __initdata = { 124static struct chipset early_qrk[] __initdata = {
92 { PCI_VENDOR_ID_NVIDIA, nvidia_bugs }, 125 { PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID,
93 { PCI_VENDOR_ID_VIA, via_bugs }, 126 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs },
94 { PCI_VENDOR_ID_ATI, ati_bugs }, 127 { PCI_VENDOR_ID_VIA, PCI_ANY_ID,
128 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs },
129 { PCI_VENDOR_ID_ATI, PCI_ANY_ID,
130 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, ati_bugs },
131 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
132 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config },
95 {} 133 {}
96}; 134};
97 135
136static void __init check_dev_quirk(int num, int slot, int func)
137{
138 u16 class;
139 u16 vendor;
140 u16 device;
141 u8 type;
142 int i;
143
144 class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE);
145
146 if (class == 0xffff)
147 return;
148
149 vendor = read_pci_config_16(num, slot, func, PCI_VENDOR_ID);
150
151 device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID);
152
153 for (i = 0; early_qrk[i].f != NULL; i++) {
154 if (((early_qrk[i].vendor == PCI_ANY_ID) ||
155 (early_qrk[i].vendor == vendor)) &&
156 ((early_qrk[i].device == PCI_ANY_ID) ||
157 (early_qrk[i].device == device)) &&
158 (!((early_qrk[i].class ^ class) &
159 early_qrk[i].class_mask))) {
160 if ((early_qrk[i].flags &
161 QFLAG_DONE) != QFLAG_DONE)
162 early_qrk[i].f(num, slot, func);
163 early_qrk[i].flags |= QFLAG_APPLIED;
164 }
165 }
166
167 type = read_pci_config_byte(num, slot, func,
168 PCI_HEADER_TYPE);
169 if (!(type & 0x80))
170 return;
171}
172
98void __init early_quirks(void) 173void __init early_quirks(void)
99{ 174{
100 int num, slot, func; 175 int num, slot, func;
@@ -103,36 +178,8 @@ void __init early_quirks(void)
103 return; 178 return;
104 179
105 /* Poor man's PCI discovery */ 180 /* Poor man's PCI discovery */
106 for (num = 0; num < 32; num++) { 181 for (num = 0; num < 32; num++)
107 for (slot = 0; slot < 32; slot++) { 182 for (slot = 0; slot < 32; slot++)
108 for (func = 0; func < 8; func++) { 183 for (func = 0; func < 8; func++)
109 u32 class; 184 check_dev_quirk(num, slot, func);
110 u32 vendor;
111 u8 type;
112 int i;
113 class = read_pci_config(num,slot,func,
114 PCI_CLASS_REVISION);
115 if (class == 0xffffffff)
116 break;
117
118 if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
119 continue;
120
121 vendor = read_pci_config(num, slot, func,
122 PCI_VENDOR_ID);
123 vendor &= 0xffff;
124
125 for (i = 0; early_qrk[i].f; i++)
126 if (early_qrk[i].vendor == vendor) {
127 early_qrk[i].f();
128 return;
129 }
130
131 type = read_pci_config_byte(num, slot, func,
132 PCI_HEADER_TYPE);
133 if (!(type & 0x80))
134 break;
135 }
136 }
137 }
138} 185}
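
Editor's note: the reworked early-quirks table above matches on vendor, device and a masked class instead of vendor alone, and the QFLAG bits let a quirk declare itself apply-once. A minimal user-space sketch (not kernel code) of the same match predicate, with ANY_ID standing in for PCI_ANY_ID and the device/class values chosen purely for illustration:

/* Mirrors the vendor/device/class test used in check_dev_quirk() above. */
#include <stdio.h>
#include <stdint.h>

#define ANY_ID 0xffffffffu

struct quirk {
	uint32_t vendor, device, class, class_mask;
};

static int quirk_matches(const struct quirk *q,
			 uint16_t vendor, uint16_t device, uint16_t class)
{
	return (q->vendor == ANY_ID || q->vendor == vendor) &&
	       (q->device == ANY_ID || q->device == device) &&
	       !((q->class ^ class) & q->class_mask);
}

int main(void)
{
	/* Hypothetical entry: any NVIDIA device whose class is a
	 * PCI-to-PCI bridge (0x0604), matched on every class bit. */
	struct quirk q = { 0x10de, ANY_ID, 0x0604, ANY_ID };

	printf("%d\n", quirk_matches(&q, 0x10de, 0x005e, 0x0604)); /* 1: bridge */
	printf("%d\n", quirk_matches(&q, 0x10de, 0x005e, 0x0300)); /* 0: VGA class */
	return 0;
}
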
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index b7d6c23f2871..cff84cd9987f 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -193,7 +193,7 @@ static struct console simnow_console = {
193}; 193};
194 194
195/* Direct interface for emergencies */ 195/* Direct interface for emergencies */
196struct console *early_console = &early_vga_console; 196static struct console *early_console = &early_vga_console;
197static int early_console_initialized = 0; 197static int early_console_initialized = 0;
198 198
199void early_printk(const char *fmt, ...) 199void early_printk(const char *fmt, ...)
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
new file mode 100644
index 000000000000..32dd62b36ff7
--- /dev/null
+++ b/arch/x86/kernel/efi.c
@@ -0,0 +1,515 @@
1/*
2 * Common EFI (Extensible Firmware Interface) support functions
3 * Based on Extensible Firmware Interface Specification version 1.0
4 *
5 * Copyright (C) 1999 VA Linux Systems
6 * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
7 * Copyright (C) 1999-2002 Hewlett-Packard Co.
8 * David Mosberger-Tang <davidm@hpl.hp.com>
9 * Stephane Eranian <eranian@hpl.hp.com>
10 * Copyright (C) 2005-2008 Intel Co.
11 * Fenghua Yu <fenghua.yu@intel.com>
12 * Bibo Mao <bibo.mao@intel.com>
13 * Chandramouli Narayanan <mouli@linux.intel.com>
14 * Huang Ying <ying.huang@intel.com>
15 *
16 * Copied from efi_32.c to eliminate the duplicated code between EFI
17 * 32/64 support code. --ying 2007-10-26
18 *
19 * All EFI Runtime Services are not implemented yet as EFI only
20 * supports physical mode addressing on SoftSDV. This is to be fixed
21 * in a future version. --drummond 1999-07-20
22 *
23 * Implemented EFI runtime services and virtual mode calls. --davidm
24 *
25 * Goutham Rao: <goutham.rao@intel.com>
26 * Skip non-WB memory and ignore empty memory ranges.
27 */
28
29#include <linux/kernel.h>
30#include <linux/init.h>
31#include <linux/efi.h>
32#include <linux/bootmem.h>
33#include <linux/spinlock.h>
34#include <linux/uaccess.h>
35#include <linux/time.h>
36#include <linux/io.h>
37#include <linux/reboot.h>
38#include <linux/bcd.h>
39
40#include <asm/setup.h>
41#include <asm/efi.h>
42#include <asm/time.h>
43#include <asm/cacheflush.h>
44#include <asm/tlbflush.h>
45
46#define EFI_DEBUG 1
47#define PFX "EFI: "
48
49int efi_enabled;
50EXPORT_SYMBOL(efi_enabled);
51
52struct efi efi;
53EXPORT_SYMBOL(efi);
54
55struct efi_memory_map memmap;
56
57struct efi efi_phys __initdata;
58static efi_system_table_t efi_systab __initdata;
59
60static int __init setup_noefi(char *arg)
61{
62 efi_enabled = 0;
63 return 0;
64}
65early_param("noefi", setup_noefi);
66
67static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
68{
69 return efi_call_virt2(get_time, tm, tc);
70}
71
72static efi_status_t virt_efi_set_time(efi_time_t *tm)
73{
74 return efi_call_virt1(set_time, tm);
75}
76
77static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled,
78 efi_bool_t *pending,
79 efi_time_t *tm)
80{
81 return efi_call_virt3(get_wakeup_time,
82 enabled, pending, tm);
83}
84
85static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm)
86{
87 return efi_call_virt2(set_wakeup_time,
88 enabled, tm);
89}
90
91static efi_status_t virt_efi_get_variable(efi_char16_t *name,
92 efi_guid_t *vendor,
93 u32 *attr,
94 unsigned long *data_size,
95 void *data)
96{
97 return efi_call_virt5(get_variable,
98 name, vendor, attr,
99 data_size, data);
100}
101
102static efi_status_t virt_efi_get_next_variable(unsigned long *name_size,
103 efi_char16_t *name,
104 efi_guid_t *vendor)
105{
106 return efi_call_virt3(get_next_variable,
107 name_size, name, vendor);
108}
109
110static efi_status_t virt_efi_set_variable(efi_char16_t *name,
111 efi_guid_t *vendor,
112 unsigned long attr,
113 unsigned long data_size,
114 void *data)
115{
116 return efi_call_virt5(set_variable,
117 name, vendor, attr,
118 data_size, data);
119}
120
121static efi_status_t virt_efi_get_next_high_mono_count(u32 *count)
122{
123 return efi_call_virt1(get_next_high_mono_count, count);
124}
125
126static void virt_efi_reset_system(int reset_type,
127 efi_status_t status,
128 unsigned long data_size,
129 efi_char16_t *data)
130{
131 efi_call_virt4(reset_system, reset_type, status,
132 data_size, data);
133}
134
135static efi_status_t virt_efi_set_virtual_address_map(
136 unsigned long memory_map_size,
137 unsigned long descriptor_size,
138 u32 descriptor_version,
139 efi_memory_desc_t *virtual_map)
140{
141 return efi_call_virt4(set_virtual_address_map,
142 memory_map_size, descriptor_size,
143 descriptor_version, virtual_map);
144}
145
146static efi_status_t __init phys_efi_set_virtual_address_map(
147 unsigned long memory_map_size,
148 unsigned long descriptor_size,
149 u32 descriptor_version,
150 efi_memory_desc_t *virtual_map)
151{
152 efi_status_t status;
153
154 efi_call_phys_prelog();
155 status = efi_call_phys4(efi_phys.set_virtual_address_map,
156 memory_map_size, descriptor_size,
157 descriptor_version, virtual_map);
158 efi_call_phys_epilog();
159 return status;
160}
161
162static efi_status_t __init phys_efi_get_time(efi_time_t *tm,
163 efi_time_cap_t *tc)
164{
165 efi_status_t status;
166
167 efi_call_phys_prelog();
168 status = efi_call_phys2(efi_phys.get_time, tm, tc);
169 efi_call_phys_epilog();
170 return status;
171}
172
173int efi_set_rtc_mmss(unsigned long nowtime)
174{
175 int real_seconds, real_minutes;
176 efi_status_t status;
177 efi_time_t eft;
178 efi_time_cap_t cap;
179
180 status = efi.get_time(&eft, &cap);
181 if (status != EFI_SUCCESS) {
182 printk(KERN_ERR "Oops: efitime: can't read time!\n");
183 return -1;
184 }
185
186 real_seconds = nowtime % 60;
187 real_minutes = nowtime / 60;
188 if (((abs(real_minutes - eft.minute) + 15)/30) & 1)
189 real_minutes += 30;
190 real_minutes %= 60;
191 eft.minute = real_minutes;
192 eft.second = real_seconds;
193
194 status = efi.set_time(&eft);
195 if (status != EFI_SUCCESS) {
196 printk(KERN_ERR "Oops: efitime: can't write time!\n");
197 return -1;
198 }
199 return 0;
200}
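
Editor's note: the minute adjustment in efi_set_rtc_mmss() above mirrors the classic CMOS logic: only minutes and seconds are written, and the ((abs(delta) + 15)/30 & 1) test shifts the value by 30 when the firmware clock runs on a half-hour-offset zone. A standalone sketch (not kernel code) with purely illustrative sample values:

#include <stdio.h>
#include <stdlib.h>

static int rtc_minutes(long nowtime, int rtc_minute)
{
	int real_minutes = nowtime / 60;

	if (((abs(real_minutes - rtc_minute) + 15) / 30) & 1)
		real_minutes += 30;	/* correct for a half-hour time zone */
	return real_minutes % 60;
}

int main(void)
{
	/* Wanted wall time 02:59:40. */
	long now = 2 * 3600 + 59 * 60 + 40;

	printf("%d\n", rtc_minutes(now, 59)); /* 59: RTC in the same half-hour */
	printf("%d\n", rtc_minutes(now, 27)); /* 29: RTC offset by ~30 minutes */
	return 0;
}
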
201
202unsigned long efi_get_time(void)
203{
204 efi_status_t status;
205 efi_time_t eft;
206 efi_time_cap_t cap;
207
208 status = efi.get_time(&eft, &cap);
209 if (status != EFI_SUCCESS)
210 printk(KERN_ERR "Oops: efitime: can't read time!\n");
211
212 return mktime(eft.year, eft.month, eft.day, eft.hour,
213 eft.minute, eft.second);
214}
215
216#if EFI_DEBUG
217static void __init print_efi_memmap(void)
218{
219 efi_memory_desc_t *md;
220 void *p;
221 int i;
222
223 for (p = memmap.map, i = 0;
224 p < memmap.map_end;
225 p += memmap.desc_size, i++) {
226 md = p;
227 printk(KERN_INFO PFX "mem%02u: type=%u, attr=0x%llx, "
228 "range=[0x%016llx-0x%016llx) (%lluMB)\n",
229 i, md->type, md->attribute, md->phys_addr,
230 md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
231 (md->num_pages >> (20 - EFI_PAGE_SHIFT)));
232 }
233}
234#endif /* EFI_DEBUG */
235
236void __init efi_init(void)
237{
238 efi_config_table_t *config_tables;
239 efi_runtime_services_t *runtime;
240 efi_char16_t *c16;
241 char vendor[100] = "unknown";
242 int i = 0;
243 void *tmp;
244
245#ifdef CONFIG_X86_32
246 efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab;
247 memmap.phys_map = (void *)boot_params.efi_info.efi_memmap;
248#else
249 efi_phys.systab = (efi_system_table_t *)
250 (boot_params.efi_info.efi_systab |
251 ((__u64)boot_params.efi_info.efi_systab_hi<<32));
252 memmap.phys_map = (void *)
253 (boot_params.efi_info.efi_memmap |
254 ((__u64)boot_params.efi_info.efi_memmap_hi<<32));
255#endif
256 memmap.nr_map = boot_params.efi_info.efi_memmap_size /
257 boot_params.efi_info.efi_memdesc_size;
258 memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
259 memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
260
261 efi.systab = early_ioremap((unsigned long)efi_phys.systab,
262 sizeof(efi_system_table_t));
263 if (efi.systab == NULL)
264 printk(KERN_ERR "Couldn't map the EFI system table!\n");
265 memcpy(&efi_systab, efi.systab, sizeof(efi_system_table_t));
266 early_iounmap(efi.systab, sizeof(efi_system_table_t));
267 efi.systab = &efi_systab;
268
269 /*
270 * Verify the EFI Table
271 */
272 if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
273 printk(KERN_ERR "EFI system table signature incorrect!\n");
274 if ((efi.systab->hdr.revision >> 16) == 0)
275 printk(KERN_ERR "Warning: EFI system table version "
276 "%d.%02d, expected 1.00 or greater!\n",
277 efi.systab->hdr.revision >> 16,
278 efi.systab->hdr.revision & 0xffff);
279
280 /*
281 * Show what we know for posterity
282 */
283 c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2);
284 if (c16) {
285 for (i = 0; i < sizeof(vendor) && *c16; ++i)
286 vendor[i] = *c16++;
287 vendor[i] = '\0';
288 } else
289 printk(KERN_ERR PFX "Could not map the firmware vendor!\n");
290 early_iounmap(tmp, 2);
291
292 printk(KERN_INFO "EFI v%u.%.02u by %s \n",
293 efi.systab->hdr.revision >> 16,
294 efi.systab->hdr.revision & 0xffff, vendor);
295
296 /*
297 * Let's see what config tables the firmware passed to us.
298 */
299 config_tables = early_ioremap(
300 efi.systab->tables,
301 efi.systab->nr_tables * sizeof(efi_config_table_t));
302 if (config_tables == NULL)
303 printk(KERN_ERR "Could not map EFI Configuration Table!\n");
304
305 printk(KERN_INFO);
306 for (i = 0; i < efi.systab->nr_tables; i++) {
307 if (!efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID)) {
308 efi.mps = config_tables[i].table;
309 printk(" MPS=0x%lx ", config_tables[i].table);
310 } else if (!efi_guidcmp(config_tables[i].guid,
311 ACPI_20_TABLE_GUID)) {
312 efi.acpi20 = config_tables[i].table;
313 printk(" ACPI 2.0=0x%lx ", config_tables[i].table);
314 } else if (!efi_guidcmp(config_tables[i].guid,
315 ACPI_TABLE_GUID)) {
316 efi.acpi = config_tables[i].table;
317 printk(" ACPI=0x%lx ", config_tables[i].table);
318 } else if (!efi_guidcmp(config_tables[i].guid,
319 SMBIOS_TABLE_GUID)) {
320 efi.smbios = config_tables[i].table;
321 printk(" SMBIOS=0x%lx ", config_tables[i].table);
322 } else if (!efi_guidcmp(config_tables[i].guid,
323 HCDP_TABLE_GUID)) {
324 efi.hcdp = config_tables[i].table;
325 printk(" HCDP=0x%lx ", config_tables[i].table);
326 } else if (!efi_guidcmp(config_tables[i].guid,
327 UGA_IO_PROTOCOL_GUID)) {
328 efi.uga = config_tables[i].table;
329 printk(" UGA=0x%lx ", config_tables[i].table);
330 }
331 }
332 printk("\n");
333 early_iounmap(config_tables,
334 efi.systab->nr_tables * sizeof(efi_config_table_t));
335
336 /*
337 * Check out the runtime services table. We need to map
338 * the runtime services table so that we can grab the physical
339 * address of several of the EFI runtime functions, needed to
340 * set the firmware into virtual mode.
341 */
342 runtime = early_ioremap((unsigned long)efi.systab->runtime,
343 sizeof(efi_runtime_services_t));
344 if (runtime != NULL) {
345 /*
346 * We will only need *early* access to the following
347 * two EFI runtime services before set_virtual_address_map
348 * is invoked.
349 */
350 efi_phys.get_time = (efi_get_time_t *)runtime->get_time;
351 efi_phys.set_virtual_address_map =
352 (efi_set_virtual_address_map_t *)
353 runtime->set_virtual_address_map;
 354 /*
 355 * Make efi_get_time callable before entering
 356 * virtual mode.
 357 */
358 efi.get_time = phys_efi_get_time;
359 } else
360 printk(KERN_ERR "Could not map the EFI runtime service "
361 "table!\n");
362 early_iounmap(runtime, sizeof(efi_runtime_services_t));
363
364 /* Map the EFI memory map */
365 memmap.map = early_ioremap((unsigned long)memmap.phys_map,
366 memmap.nr_map * memmap.desc_size);
367 if (memmap.map == NULL)
368 printk(KERN_ERR "Could not map the EFI memory map!\n");
369 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
370 if (memmap.desc_size != sizeof(efi_memory_desc_t))
371 printk(KERN_WARNING "Kernel-defined memdesc"
372 "doesn't match the one from EFI!\n");
373
374 /* Setup for EFI runtime service */
375 reboot_type = BOOT_EFI;
376
377#if EFI_DEBUG
378 print_efi_memmap();
379#endif
380}
381
382static void __init runtime_code_page_mkexec(void)
383{
384 efi_memory_desc_t *md;
385 void *p;
386
387 if (!(__supported_pte_mask & _PAGE_NX))
388 return;
389
390 /* Make EFI runtime service code area executable */
391 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
392 md = p;
393
394 if (md->type != EFI_RUNTIME_SERVICES_CODE)
395 continue;
396
397 set_memory_x(md->virt_addr, md->num_pages << EFI_PAGE_SHIFT);
398 }
399}
400
401/*
402 * This function will switch the EFI runtime services to virtual mode.
403 * Essentially, look through the EFI memmap and map every region that
404 * has the runtime attribute bit set in its memory descriptor and update
405 * that memory descriptor with the virtual address obtained from ioremap().
406 * This enables the runtime services to be called without having to
407 * thunk back into physical mode for every invocation.
408 */
409void __init efi_enter_virtual_mode(void)
410{
411 efi_memory_desc_t *md;
412 efi_status_t status;
413 unsigned long size;
414 u64 end, systab;
415 void *p, *va;
416
417 efi.systab = NULL;
418 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
419 md = p;
420 if (!(md->attribute & EFI_MEMORY_RUNTIME))
421 continue;
422
423 size = md->num_pages << EFI_PAGE_SHIFT;
424 end = md->phys_addr + size;
425
426 if ((end >> PAGE_SHIFT) <= max_pfn_mapped)
427 va = __va(md->phys_addr);
428 else
429 va = efi_ioremap(md->phys_addr, size);
430
431 if (md->attribute & EFI_MEMORY_WB)
432 set_memory_uc(md->virt_addr, size);
433
434 md->virt_addr = (u64) (unsigned long) va;
435
436 if (!va) {
437 printk(KERN_ERR PFX "ioremap of 0x%llX failed!\n",
438 (unsigned long long)md->phys_addr);
439 continue;
440 }
441
442 systab = (u64) (unsigned long) efi_phys.systab;
443 if (md->phys_addr <= systab && systab < end) {
444 systab += md->virt_addr - md->phys_addr;
445 efi.systab = (efi_system_table_t *) (unsigned long) systab;
446 }
447 }
448
449 BUG_ON(!efi.systab);
450
451 status = phys_efi_set_virtual_address_map(
452 memmap.desc_size * memmap.nr_map,
453 memmap.desc_size,
454 memmap.desc_version,
455 memmap.phys_map);
456
457 if (status != EFI_SUCCESS) {
458 printk(KERN_ALERT "Unable to switch EFI into virtual mode "
459 "(status=%lx)!\n", status);
460 panic("EFI call to SetVirtualAddressMap() failed!");
461 }
462
463 /*
464 * Now that EFI is in virtual mode, update the function
465 * pointers in the runtime service table to the new virtual addresses.
466 *
467 * Call EFI services through wrapper functions.
468 */
469 efi.get_time = virt_efi_get_time;
470 efi.set_time = virt_efi_set_time;
471 efi.get_wakeup_time = virt_efi_get_wakeup_time;
472 efi.set_wakeup_time = virt_efi_set_wakeup_time;
473 efi.get_variable = virt_efi_get_variable;
474 efi.get_next_variable = virt_efi_get_next_variable;
475 efi.set_variable = virt_efi_set_variable;
476 efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
477 efi.reset_system = virt_efi_reset_system;
478 efi.set_virtual_address_map = virt_efi_set_virtual_address_map;
479 runtime_code_page_mkexec();
480 early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size);
481 memmap.map = NULL;
482}
483
484/*
485 * Convenience functions to obtain memory types and attributes
486 */
487u32 efi_mem_type(unsigned long phys_addr)
488{
489 efi_memory_desc_t *md;
490 void *p;
491
492 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
493 md = p;
494 if ((md->phys_addr <= phys_addr) &&
495 (phys_addr < (md->phys_addr +
496 (md->num_pages << EFI_PAGE_SHIFT))))
497 return md->type;
498 }
499 return 0;
500}
501
502u64 efi_mem_attributes(unsigned long phys_addr)
503{
504 efi_memory_desc_t *md;
505 void *p;
506
507 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
508 md = p;
509 if ((md->phys_addr <= phys_addr) &&
510 (phys_addr < (md->phys_addr +
511 (md->num_pages << EFI_PAGE_SHIFT))))
512 return md->attribute;
513 }
514 return 0;
515}
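
Editor's note: both lookup helpers at the end of efi.c walk the cached memory map and treat each descriptor as a half-open byte range derived from its page count. A standalone sketch (not kernel code) of that containment test, assuming the usual 4 KiB EFI page size and an invented descriptor:

#include <stdio.h>
#include <stdint.h>

#define EFI_PAGE_SHIFT 12	/* EFI descriptors count 4 KiB pages */

struct desc {
	uint32_t type;
	uint64_t phys_addr;
	uint64_t num_pages;
};

static int desc_contains(const struct desc *md, uint64_t phys_addr)
{
	uint64_t size = md->num_pages << EFI_PAGE_SHIFT;

	return md->phys_addr <= phys_addr && phys_addr < md->phys_addr + size;
}

int main(void)
{
	/* Hypothetical descriptor: 16 pages (64 KiB) starting at 1 MiB. */
	struct desc md = { 7 /* conventional memory */, 0x100000, 16 };

	printf("%d\n", desc_contains(&md, 0x100000)); /* 1: first byte */
	printf("%d\n", desc_contains(&md, 0x110000)); /* 0: one past the end */
	return 0;
}
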
diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c
index e2be78f49399..cb91f985b4a1 100644
--- a/arch/x86/kernel/efi_32.c
+++ b/arch/x86/kernel/efi_32.c
@@ -20,40 +20,15 @@
20 */ 20 */
21 21
22#include <linux/kernel.h> 22#include <linux/kernel.h>
23#include <linux/init.h>
24#include <linux/mm.h>
25#include <linux/types.h> 23#include <linux/types.h>
26#include <linux/time.h>
27#include <linux/spinlock.h>
28#include <linux/bootmem.h>
29#include <linux/ioport.h> 24#include <linux/ioport.h>
30#include <linux/module.h>
31#include <linux/efi.h> 25#include <linux/efi.h>
32#include <linux/kexec.h>
33 26
34#include <asm/setup.h>
35#include <asm/io.h> 27#include <asm/io.h>
36#include <asm/page.h> 28#include <asm/page.h>
37#include <asm/pgtable.h> 29#include <asm/pgtable.h>
38#include <asm/processor.h>
39#include <asm/desc.h>
40#include <asm/tlbflush.h> 30#include <asm/tlbflush.h>
41 31
42#define EFI_DEBUG 0
43#define PFX "EFI: "
44
45extern efi_status_t asmlinkage efi_call_phys(void *, ...);
46
47struct efi efi;
48EXPORT_SYMBOL(efi);
49static struct efi efi_phys;
50struct efi_memory_map memmap;
51
52/*
53 * We require an early boot_ioremap mapping mechanism initially
54 */
55extern void * boot_ioremap(unsigned long, unsigned long);
56
57/* 32/*
58 * To make EFI call EFI runtime service in physical addressing mode we need 33 * To make EFI call EFI runtime service in physical addressing mode we need
59 * prelog/epilog before/after the invocation to disable interrupt, to 34 * prelog/epilog before/after the invocation to disable interrupt, to
@@ -62,16 +37,14 @@ extern void * boot_ioremap(unsigned long, unsigned long);
62 */ 37 */
63 38
64static unsigned long efi_rt_eflags; 39static unsigned long efi_rt_eflags;
65static DEFINE_SPINLOCK(efi_rt_lock);
66static pgd_t efi_bak_pg_dir_pointer[2]; 40static pgd_t efi_bak_pg_dir_pointer[2];
67 41
68static void efi_call_phys_prelog(void) __acquires(efi_rt_lock) 42void efi_call_phys_prelog(void)
69{ 43{
70 unsigned long cr4; 44 unsigned long cr4;
71 unsigned long temp; 45 unsigned long temp;
72 struct Xgt_desc_struct gdt_descr; 46 struct desc_ptr gdt_descr;
73 47
74 spin_lock(&efi_rt_lock);
75 local_irq_save(efi_rt_eflags); 48 local_irq_save(efi_rt_eflags);
76 49
77 /* 50 /*
@@ -101,17 +74,17 @@ static void efi_call_phys_prelog(void) __acquires(efi_rt_lock)
101 /* 74 /*
102 * After the lock is released, the original page table is restored. 75 * After the lock is released, the original page table is restored.
103 */ 76 */
104 local_flush_tlb(); 77 __flush_tlb_all();
105 78
106 gdt_descr.address = __pa(get_cpu_gdt_table(0)); 79 gdt_descr.address = __pa(get_cpu_gdt_table(0));
107 gdt_descr.size = GDT_SIZE - 1; 80 gdt_descr.size = GDT_SIZE - 1;
108 load_gdt(&gdt_descr); 81 load_gdt(&gdt_descr);
109} 82}
110 83
111static void efi_call_phys_epilog(void) __releases(efi_rt_lock) 84void efi_call_phys_epilog(void)
112{ 85{
113 unsigned long cr4; 86 unsigned long cr4;
114 struct Xgt_desc_struct gdt_descr; 87 struct desc_ptr gdt_descr;
115 88
116 gdt_descr.address = (unsigned long)get_cpu_gdt_table(0); 89 gdt_descr.address = (unsigned long)get_cpu_gdt_table(0);
117 gdt_descr.size = GDT_SIZE - 1; 90 gdt_descr.size = GDT_SIZE - 1;
@@ -132,586 +105,7 @@ static void efi_call_phys_epilog(void) __releases(efi_rt_lock)
132 /* 105 /*
133 * After the lock is released, the original page table is restored. 106 * After the lock is released, the original page table is restored.
134 */ 107 */
135 local_flush_tlb(); 108 __flush_tlb_all();
136 109
137 local_irq_restore(efi_rt_eflags); 110 local_irq_restore(efi_rt_eflags);
138 spin_unlock(&efi_rt_lock);
139}
140
141static efi_status_t
142phys_efi_set_virtual_address_map(unsigned long memory_map_size,
143 unsigned long descriptor_size,
144 u32 descriptor_version,
145 efi_memory_desc_t *virtual_map)
146{
147 efi_status_t status;
148
149 efi_call_phys_prelog();
150 status = efi_call_phys(efi_phys.set_virtual_address_map,
151 memory_map_size, descriptor_size,
152 descriptor_version, virtual_map);
153 efi_call_phys_epilog();
154 return status;
155}
156
157static efi_status_t
158phys_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
159{
160 efi_status_t status;
161
162 efi_call_phys_prelog();
163 status = efi_call_phys(efi_phys.get_time, tm, tc);
164 efi_call_phys_epilog();
165 return status;
166}
167
168inline int efi_set_rtc_mmss(unsigned long nowtime)
169{
170 int real_seconds, real_minutes;
171 efi_status_t status;
172 efi_time_t eft;
173 efi_time_cap_t cap;
174
175 spin_lock(&efi_rt_lock);
176 status = efi.get_time(&eft, &cap);
177 spin_unlock(&efi_rt_lock);
178 if (status != EFI_SUCCESS)
179 panic("Ooops, efitime: can't read time!\n");
180 real_seconds = nowtime % 60;
181 real_minutes = nowtime / 60;
182
183 if (((abs(real_minutes - eft.minute) + 15)/30) & 1)
184 real_minutes += 30;
185 real_minutes %= 60;
186
187 eft.minute = real_minutes;
188 eft.second = real_seconds;
189
190 if (status != EFI_SUCCESS) {
191 printk("Ooops: efitime: can't read time!\n");
192 return -1;
193 }
194 return 0;
195}
196/*
197 * This is used during kernel init before runtime
198 * services have been remapped and also during suspend, therefore,
199 * we'll need to call both in physical and virtual modes.
200 */
201inline unsigned long efi_get_time(void)
202{
203 efi_status_t status;
204 efi_time_t eft;
205 efi_time_cap_t cap;
206
207 if (efi.get_time) {
208 /* if we are in virtual mode use remapped function */
209 status = efi.get_time(&eft, &cap);
210 } else {
211 /* we are in physical mode */
212 status = phys_efi_get_time(&eft, &cap);
213 }
214
215 if (status != EFI_SUCCESS)
216 printk("Oops: efitime: can't read time status: 0x%lx\n",status);
217
218 return mktime(eft.year, eft.month, eft.day, eft.hour,
219 eft.minute, eft.second);
220}
221
222int is_available_memory(efi_memory_desc_t * md)
223{
224 if (!(md->attribute & EFI_MEMORY_WB))
225 return 0;
226
227 switch (md->type) {
228 case EFI_LOADER_CODE:
229 case EFI_LOADER_DATA:
230 case EFI_BOOT_SERVICES_CODE:
231 case EFI_BOOT_SERVICES_DATA:
232 case EFI_CONVENTIONAL_MEMORY:
233 return 1;
234 }
235 return 0;
236}
237
238/*
239 * We need to map the EFI memory map again after paging_init().
240 */
241void __init efi_map_memmap(void)
242{
243 memmap.map = NULL;
244
245 memmap.map = bt_ioremap((unsigned long) memmap.phys_map,
246 (memmap.nr_map * memmap.desc_size));
247 if (memmap.map == NULL)
248 printk(KERN_ERR PFX "Could not remap the EFI memmap!\n");
249
250 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
251}
252
253#if EFI_DEBUG
254static void __init print_efi_memmap(void)
255{
256 efi_memory_desc_t *md;
257 void *p;
258 int i;
259
260 for (p = memmap.map, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
261 md = p;
262 printk(KERN_INFO "mem%02u: type=%u, attr=0x%llx, "
263 "range=[0x%016llx-0x%016llx) (%lluMB)\n",
264 i, md->type, md->attribute, md->phys_addr,
265 md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
266 (md->num_pages >> (20 - EFI_PAGE_SHIFT)));
267 }
268}
269#endif /* EFI_DEBUG */
270
271/*
272 * Walks the EFI memory map and calls CALLBACK once for each EFI
273 * memory descriptor that has memory that is available for kernel use.
274 */
275void efi_memmap_walk(efi_freemem_callback_t callback, void *arg)
276{
277 int prev_valid = 0;
278 struct range {
279 unsigned long start;
280 unsigned long end;
281 } uninitialized_var(prev), curr;
282 efi_memory_desc_t *md;
283 unsigned long start, end;
284 void *p;
285
286 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
287 md = p;
288
289 if ((md->num_pages == 0) || (!is_available_memory(md)))
290 continue;
291
292 curr.start = md->phys_addr;
293 curr.end = curr.start + (md->num_pages << EFI_PAGE_SHIFT);
294
295 if (!prev_valid) {
296 prev = curr;
297 prev_valid = 1;
298 } else {
299 if (curr.start < prev.start)
300 printk(KERN_INFO PFX "Unordered memory map\n");
301 if (prev.end == curr.start)
302 prev.end = curr.end;
303 else {
304 start =
305 (unsigned long) (PAGE_ALIGN(prev.start));
306 end = (unsigned long) (prev.end & PAGE_MASK);
307 if ((end > start)
308 && (*callback) (start, end, arg) < 0)
309 return;
310 prev = curr;
311 }
312 }
313 }
314 if (prev_valid) {
315 start = (unsigned long) PAGE_ALIGN(prev.start);
316 end = (unsigned long) (prev.end & PAGE_MASK);
317 if (end > start)
318 (*callback) (start, end, arg);
319 }
320}
321
322void __init efi_init(void)
323{
324 efi_config_table_t *config_tables;
325 efi_runtime_services_t *runtime;
326 efi_char16_t *c16;
327 char vendor[100] = "unknown";
328 unsigned long num_config_tables;
329 int i = 0;
330
331 memset(&efi, 0, sizeof(efi) );
332 memset(&efi_phys, 0, sizeof(efi_phys));
333
334 efi_phys.systab =
335 (efi_system_table_t *)boot_params.efi_info.efi_systab;
336 memmap.phys_map = (void *)boot_params.efi_info.efi_memmap;
337 memmap.nr_map = boot_params.efi_info.efi_memmap_size/
338 boot_params.efi_info.efi_memdesc_size;
339 memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
340 memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
341
342 efi.systab = (efi_system_table_t *)
343 boot_ioremap((unsigned long) efi_phys.systab,
344 sizeof(efi_system_table_t));
345 /*
346 * Verify the EFI Table
347 */
348 if (efi.systab == NULL)
349 printk(KERN_ERR PFX "Woah! Couldn't map the EFI system table.\n");
350 if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
351 printk(KERN_ERR PFX "Woah! EFI system table signature incorrect\n");
352 if ((efi.systab->hdr.revision >> 16) == 0)
353 printk(KERN_ERR PFX "Warning: EFI system table version "
354 "%d.%02d, expected 1.00 or greater\n",
355 efi.systab->hdr.revision >> 16,
356 efi.systab->hdr.revision & 0xffff);
357
358 /*
359 * Grab some details from the system table
360 */
361 num_config_tables = efi.systab->nr_tables;
362 config_tables = (efi_config_table_t *)efi.systab->tables;
363 runtime = efi.systab->runtime;
364
365 /*
366 * Show what we know for posterity
367 */
368 c16 = (efi_char16_t *) boot_ioremap(efi.systab->fw_vendor, 2);
369 if (c16) {
370 for (i = 0; i < (sizeof(vendor) - 1) && *c16; ++i)
371 vendor[i] = *c16++;
372 vendor[i] = '\0';
373 } else
374 printk(KERN_ERR PFX "Could not map the firmware vendor!\n");
375
376 printk(KERN_INFO PFX "EFI v%u.%.02u by %s \n",
377 efi.systab->hdr.revision >> 16,
378 efi.systab->hdr.revision & 0xffff, vendor);
379
380 /*
381 * Let's see what config tables the firmware passed to us.
382 */
383 config_tables = (efi_config_table_t *)
384 boot_ioremap((unsigned long) config_tables,
385 num_config_tables * sizeof(efi_config_table_t));
386
387 if (config_tables == NULL)
388 printk(KERN_ERR PFX "Could not map EFI Configuration Table!\n");
389
390 efi.mps = EFI_INVALID_TABLE_ADDR;
391 efi.acpi = EFI_INVALID_TABLE_ADDR;
392 efi.acpi20 = EFI_INVALID_TABLE_ADDR;
393 efi.smbios = EFI_INVALID_TABLE_ADDR;
394 efi.sal_systab = EFI_INVALID_TABLE_ADDR;
395 efi.boot_info = EFI_INVALID_TABLE_ADDR;
396 efi.hcdp = EFI_INVALID_TABLE_ADDR;
397 efi.uga = EFI_INVALID_TABLE_ADDR;
398
399 for (i = 0; i < num_config_tables; i++) {
400 if (efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID) == 0) {
401 efi.mps = config_tables[i].table;
402 printk(KERN_INFO " MPS=0x%lx ", config_tables[i].table);
403 } else
404 if (efi_guidcmp(config_tables[i].guid, ACPI_20_TABLE_GUID) == 0) {
405 efi.acpi20 = config_tables[i].table;
406 printk(KERN_INFO " ACPI 2.0=0x%lx ", config_tables[i].table);
407 } else
408 if (efi_guidcmp(config_tables[i].guid, ACPI_TABLE_GUID) == 0) {
409 efi.acpi = config_tables[i].table;
410 printk(KERN_INFO " ACPI=0x%lx ", config_tables[i].table);
411 } else
412 if (efi_guidcmp(config_tables[i].guid, SMBIOS_TABLE_GUID) == 0) {
413 efi.smbios = config_tables[i].table;
414 printk(KERN_INFO " SMBIOS=0x%lx ", config_tables[i].table);
415 } else
416 if (efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID) == 0) {
417 efi.hcdp = config_tables[i].table;
418 printk(KERN_INFO " HCDP=0x%lx ", config_tables[i].table);
419 } else
420 if (efi_guidcmp(config_tables[i].guid, UGA_IO_PROTOCOL_GUID) == 0) {
421 efi.uga = config_tables[i].table;
422 printk(KERN_INFO " UGA=0x%lx ", config_tables[i].table);
423 }
424 }
425 printk("\n");
426
427 /*
428 * Check out the runtime services table. We need to map
429 * the runtime services table so that we can grab the physical
430 * address of several of the EFI runtime functions, needed to
431 * set the firmware into virtual mode.
432 */
433
434 runtime = (efi_runtime_services_t *) boot_ioremap((unsigned long)
435 runtime,
436 sizeof(efi_runtime_services_t));
437 if (runtime != NULL) {
438 /*
439 * We will only need *early* access to the following
440 * two EFI runtime services before set_virtual_address_map
441 * is invoked.
442 */
443 efi_phys.get_time = (efi_get_time_t *) runtime->get_time;
444 efi_phys.set_virtual_address_map =
445 (efi_set_virtual_address_map_t *)
446 runtime->set_virtual_address_map;
447 } else
448 printk(KERN_ERR PFX "Could not map the runtime service table!\n");
449
450 /* Map the EFI memory map for use until paging_init() */
451 memmap.map = boot_ioremap(boot_params.efi_info.efi_memmap,
452 boot_params.efi_info.efi_memmap_size);
453 if (memmap.map == NULL)
454 printk(KERN_ERR PFX "Could not map the EFI memory map!\n");
455
456 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
457
458#if EFI_DEBUG
459 print_efi_memmap();
460#endif
461}
462
463static inline void __init check_range_for_systab(efi_memory_desc_t *md)
464{
465 if (((unsigned long)md->phys_addr <= (unsigned long)efi_phys.systab) &&
466 ((unsigned long)efi_phys.systab < md->phys_addr +
467 ((unsigned long)md->num_pages << EFI_PAGE_SHIFT))) {
468 unsigned long addr;
469
470 addr = md->virt_addr - md->phys_addr +
471 (unsigned long)efi_phys.systab;
472 efi.systab = (efi_system_table_t *)addr;
473 }
474}
475
476/*
477 * Wrap all the virtual calls in a way that forces the parameters on the stack.
478 */
479
480#define efi_call_virt(f, args...) \
481 ((efi_##f##_t __attribute__((regparm(0)))*)efi.systab->runtime->f)(args)
482
483static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
484{
485 return efi_call_virt(get_time, tm, tc);
486}
487
488static efi_status_t virt_efi_set_time (efi_time_t *tm)
489{
490 return efi_call_virt(set_time, tm);
491}
492
493static efi_status_t virt_efi_get_wakeup_time (efi_bool_t *enabled,
494 efi_bool_t *pending,
495 efi_time_t *tm)
496{
497 return efi_call_virt(get_wakeup_time, enabled, pending, tm);
498}
499
500static efi_status_t virt_efi_set_wakeup_time (efi_bool_t enabled,
501 efi_time_t *tm)
502{
503 return efi_call_virt(set_wakeup_time, enabled, tm);
504}
505
506static efi_status_t virt_efi_get_variable (efi_char16_t *name,
507 efi_guid_t *vendor, u32 *attr,
508 unsigned long *data_size, void *data)
509{
510 return efi_call_virt(get_variable, name, vendor, attr, data_size, data);
511}
512
513static efi_status_t virt_efi_get_next_variable (unsigned long *name_size,
514 efi_char16_t *name,
515 efi_guid_t *vendor)
516{
517 return efi_call_virt(get_next_variable, name_size, name, vendor);
518}
519
520static efi_status_t virt_efi_set_variable (efi_char16_t *name,
521 efi_guid_t *vendor,
522 unsigned long attr,
523 unsigned long data_size, void *data)
524{
525 return efi_call_virt(set_variable, name, vendor, attr, data_size, data);
526}
527
528static efi_status_t virt_efi_get_next_high_mono_count (u32 *count)
529{
530 return efi_call_virt(get_next_high_mono_count, count);
531}
532
533static void virt_efi_reset_system (int reset_type, efi_status_t status,
534 unsigned long data_size,
535 efi_char16_t *data)
536{
537 efi_call_virt(reset_system, reset_type, status, data_size, data);
538}
539
540/*
541 * This function will switch the EFI runtime services to virtual mode.
542 * Essentially, look through the EFI memmap and map every region that
543 * has the runtime attribute bit set in its memory descriptor and update
544 * that memory descriptor with the virtual address obtained from ioremap().
545 * This enables the runtime services to be called without having to
546 * thunk back into physical mode for every invocation.
547 */
548
549void __init efi_enter_virtual_mode(void)
550{
551 efi_memory_desc_t *md;
552 efi_status_t status;
553 void *p;
554
555 efi.systab = NULL;
556
557 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
558 md = p;
559
560 if (!(md->attribute & EFI_MEMORY_RUNTIME))
561 continue;
562
563 md->virt_addr = (unsigned long)ioremap(md->phys_addr,
564 md->num_pages << EFI_PAGE_SHIFT);
565 if (!(unsigned long)md->virt_addr) {
566 printk(KERN_ERR PFX "ioremap of 0x%lX failed\n",
567 (unsigned long)md->phys_addr);
568 }
569 /* update the virtual address of the EFI system table */
570 check_range_for_systab(md);
571 }
572
573 BUG_ON(!efi.systab);
574
575 status = phys_efi_set_virtual_address_map(
576 memmap.desc_size * memmap.nr_map,
577 memmap.desc_size,
578 memmap.desc_version,
579 memmap.phys_map);
580
581 if (status != EFI_SUCCESS) {
582 printk (KERN_ALERT "You are screwed! "
583 "Unable to switch EFI into virtual mode "
584 "(status=%lx)\n", status);
585 panic("EFI call to SetVirtualAddressMap() failed!");
586 }
587
588 /*
589 * Now that EFI is in virtual mode, update the function
590 * pointers in the runtime service table to the new virtual addresses.
591 */
592
593 efi.get_time = virt_efi_get_time;
594 efi.set_time = virt_efi_set_time;
595 efi.get_wakeup_time = virt_efi_get_wakeup_time;
596 efi.set_wakeup_time = virt_efi_set_wakeup_time;
597 efi.get_variable = virt_efi_get_variable;
598 efi.get_next_variable = virt_efi_get_next_variable;
599 efi.set_variable = virt_efi_set_variable;
600 efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
601 efi.reset_system = virt_efi_reset_system;
602}
603
604void __init
605efi_initialize_iomem_resources(struct resource *code_resource,
606 struct resource *data_resource,
607 struct resource *bss_resource)
608{
609 struct resource *res;
610 efi_memory_desc_t *md;
611 void *p;
612
613 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
614 md = p;
615
616 if ((md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >
617 0x100000000ULL)
618 continue;
619 res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
620 switch (md->type) {
621 case EFI_RESERVED_TYPE:
622 res->name = "Reserved Memory";
623 break;
624 case EFI_LOADER_CODE:
625 res->name = "Loader Code";
626 break;
627 case EFI_LOADER_DATA:
628 res->name = "Loader Data";
629 break;
630 case EFI_BOOT_SERVICES_DATA:
631 res->name = "BootServices Data";
632 break;
633 case EFI_BOOT_SERVICES_CODE:
634 res->name = "BootServices Code";
635 break;
636 case EFI_RUNTIME_SERVICES_CODE:
637 res->name = "Runtime Service Code";
638 break;
639 case EFI_RUNTIME_SERVICES_DATA:
640 res->name = "Runtime Service Data";
641 break;
642 case EFI_CONVENTIONAL_MEMORY:
643 res->name = "Conventional Memory";
644 break;
645 case EFI_UNUSABLE_MEMORY:
646 res->name = "Unusable Memory";
647 break;
648 case EFI_ACPI_RECLAIM_MEMORY:
649 res->name = "ACPI Reclaim";
650 break;
651 case EFI_ACPI_MEMORY_NVS:
652 res->name = "ACPI NVS";
653 break;
654 case EFI_MEMORY_MAPPED_IO:
655 res->name = "Memory Mapped IO";
656 break;
657 case EFI_MEMORY_MAPPED_IO_PORT_SPACE:
658 res->name = "Memory Mapped IO Port Space";
659 break;
660 default:
661 res->name = "Reserved";
662 break;
663 }
664 res->start = md->phys_addr;
665 res->end = res->start + ((md->num_pages << EFI_PAGE_SHIFT) - 1);
666 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
667 if (request_resource(&iomem_resource, res) < 0)
668 printk(KERN_ERR PFX "Failed to allocate res %s : "
669 "0x%llx-0x%llx\n", res->name,
670 (unsigned long long)res->start,
671 (unsigned long long)res->end);
672 /*
673 * We don't know which region contains kernel data so we try
674 * it repeatedly and let the resource manager test it.
675 */
676 if (md->type == EFI_CONVENTIONAL_MEMORY) {
677 request_resource(res, code_resource);
678 request_resource(res, data_resource);
679 request_resource(res, bss_resource);
680#ifdef CONFIG_KEXEC
681 request_resource(res, &crashk_res);
682#endif
683 }
684 }
685}
686
687/*
688 * Convenience functions to obtain memory types and attributes
689 */
690
691u32 efi_mem_type(unsigned long phys_addr)
692{
693 efi_memory_desc_t *md;
694 void *p;
695
696 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
697 md = p;
698 if ((md->phys_addr <= phys_addr) && (phys_addr <
699 (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) ))
700 return md->type;
701 }
702 return 0;
703}
704
705u64 efi_mem_attributes(unsigned long phys_addr)
706{
707 efi_memory_desc_t *md;
708 void *p;
709
710 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
711 md = p;
712 if ((md->phys_addr <= phys_addr) && (phys_addr <
713 (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) ))
714 return md->attribute;
715 }
716 return 0;
717} 111}
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
new file mode 100644
index 000000000000..09d5c2330934
--- /dev/null
+++ b/arch/x86/kernel/efi_64.c
@@ -0,0 +1,134 @@
1/*
2 * x86_64 specific EFI support functions
3 * Based on Extensible Firmware Interface Specification version 1.0
4 *
5 * Copyright (C) 2005-2008 Intel Co.
6 * Fenghua Yu <fenghua.yu@intel.com>
7 * Bibo Mao <bibo.mao@intel.com>
8 * Chandramouli Narayanan <mouli@linux.intel.com>
9 * Huang Ying <ying.huang@intel.com>
10 *
11 * Code to convert EFI to E820 map has been implemented in elilo bootloader
12 * based on a EFI patch by Edgar Hucek. Based on the E820 map, the page table
13 * is setup appropriately for EFI runtime code.
14 * - mouli 06/14/2007.
15 *
16 */
17
18#include <linux/kernel.h>
19#include <linux/init.h>
20#include <linux/mm.h>
21#include <linux/types.h>
22#include <linux/spinlock.h>
23#include <linux/bootmem.h>
24#include <linux/ioport.h>
25#include <linux/module.h>
26#include <linux/efi.h>
27#include <linux/uaccess.h>
28#include <linux/io.h>
29#include <linux/reboot.h>
30
31#include <asm/setup.h>
32#include <asm/page.h>
33#include <asm/e820.h>
34#include <asm/pgtable.h>
35#include <asm/tlbflush.h>
36#include <asm/proto.h>
37#include <asm/efi.h>
38
39static pgd_t save_pgd __initdata;
40static unsigned long efi_flags __initdata;
41
42static void __init early_mapping_set_exec(unsigned long start,
43 unsigned long end,
44 int executable)
45{
46 pte_t *kpte;
47 unsigned int level;
48
49 while (start < end) {
50 kpte = lookup_address((unsigned long)__va(start), &level);
51 BUG_ON(!kpte);
52 if (executable)
53 set_pte(kpte, pte_mkexec(*kpte));
54 else
55 set_pte(kpte, __pte((pte_val(*kpte) | _PAGE_NX) & \
56 __supported_pte_mask));
57 if (level == PG_LEVEL_4K)
58 start = (start + PAGE_SIZE) & PAGE_MASK;
59 else
60 start = (start + PMD_SIZE) & PMD_MASK;
61 }
62}
63
64static void __init early_runtime_code_mapping_set_exec(int executable)
65{
66 efi_memory_desc_t *md;
67 void *p;
68
69 if (!(__supported_pte_mask & _PAGE_NX))
70 return;
71
72 /* Make EFI runtime service code area executable */
73 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
74 md = p;
75 if (md->type == EFI_RUNTIME_SERVICES_CODE) {
76 unsigned long end;
77 end = md->phys_addr + (md->num_pages << PAGE_SHIFT);
78 early_mapping_set_exec(md->phys_addr, end, executable);
79 }
80 }
81}
82
83void __init efi_call_phys_prelog(void)
84{
85 unsigned long vaddress;
86
87 local_irq_save(efi_flags);
88 early_runtime_code_mapping_set_exec(1);
89 vaddress = (unsigned long)__va(0x0UL);
90 save_pgd = *pgd_offset_k(0x0UL);
91 set_pgd(pgd_offset_k(0x0UL), *pgd_offset_k(vaddress));
92 __flush_tlb_all();
93}
94
95void __init efi_call_phys_epilog(void)
96{
97 /*
98 * After the lock is released, the original page table is restored.
99 */
100 set_pgd(pgd_offset_k(0x0UL), save_pgd);
101 early_runtime_code_mapping_set_exec(0);
102 __flush_tlb_all();
103 local_irq_restore(efi_flags);
104}
105
106void __init efi_reserve_bootmem(void)
107{
108 reserve_bootmem_generic((unsigned long)memmap.phys_map,
109 memmap.nr_map * memmap.desc_size);
110}
111
112void __iomem * __init efi_ioremap(unsigned long phys_addr, unsigned long size)
113{
114 static unsigned pages_mapped;
115 unsigned i, pages;
116
117 /* phys_addr and size must be page aligned */
118 if ((phys_addr & ~PAGE_MASK) || (size & ~PAGE_MASK))
119 return NULL;
120
121 pages = size >> PAGE_SHIFT;
122 if (pages_mapped + pages > MAX_EFI_IO_PAGES)
123 return NULL;
124
125 for (i = 0; i < pages; i++) {
126 __set_fixmap(FIX_EFI_IO_MAP_FIRST_PAGE - pages_mapped,
127 phys_addr, PAGE_KERNEL);
128 phys_addr += PAGE_SIZE;
129 pages_mapped++;
130 }
131
132 return (void __iomem *)__fix_to_virt(FIX_EFI_IO_MAP_FIRST_PAGE - \
133 (pages_mapped - pages));
134}
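
Editor's note: efi_ioremap() above hands runtime regions out of a fixed fixmap window one page at a time, using a simple bump counter. A standalone sketch (not kernel code) of just that bookkeeping; the 64-page budget and PAGE_SIZE value are illustrative stand-ins for the kernel's MAX_EFI_IO_PAGES and PAGE_MASK:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))
#define IO_PAGES  64U

static unsigned pages_mapped;

/* Returns the window-relative index of the first new page, or -1 on failure. */
static long window_alloc(uint64_t phys_addr, uint64_t size)
{
	unsigned pages;

	if ((phys_addr & ~PAGE_MASK) || (size & ~PAGE_MASK))
		return -1;			/* must be page aligned */

	pages = size / PAGE_SIZE;
	if (pages_mapped + pages > IO_PAGES)
		return -1;			/* window exhausted */

	pages_mapped += pages;
	return pages_mapped - pages;
}

int main(void)
{
	printf("%ld\n", window_alloc(0xfe000000UL, 0x2000)); /* 0: two pages */
	printf("%ld\n", window_alloc(0xfe002000UL, 0x1000)); /* 2: next free slot */
	printf("%ld\n", window_alloc(0xfe003001UL, 0x1000)); /* -1: not aligned */
	return 0;
}
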
diff --git a/arch/x86/kernel/efi_stub_64.S b/arch/x86/kernel/efi_stub_64.S
new file mode 100644
index 000000000000..99b47d48c9f4
--- /dev/null
+++ b/arch/x86/kernel/efi_stub_64.S
@@ -0,0 +1,109 @@
1/*
2 * Function calling ABI conversion from Linux to EFI for x86_64
3 *
4 * Copyright (C) 2007 Intel Corp
5 * Bibo Mao <bibo.mao@intel.com>
6 * Huang Ying <ying.huang@intel.com>
7 */
8
9#include <linux/linkage.h>
10
11#define SAVE_XMM \
12 mov %rsp, %rax; \
13 subq $0x70, %rsp; \
14 and $~0xf, %rsp; \
15 mov %rax, (%rsp); \
16 mov %cr0, %rax; \
17 clts; \
18 mov %rax, 0x8(%rsp); \
19 movaps %xmm0, 0x60(%rsp); \
20 movaps %xmm1, 0x50(%rsp); \
21 movaps %xmm2, 0x40(%rsp); \
22 movaps %xmm3, 0x30(%rsp); \
23 movaps %xmm4, 0x20(%rsp); \
24 movaps %xmm5, 0x10(%rsp)
25
26#define RESTORE_XMM \
27 movaps 0x60(%rsp), %xmm0; \
28 movaps 0x50(%rsp), %xmm1; \
29 movaps 0x40(%rsp), %xmm2; \
30 movaps 0x30(%rsp), %xmm3; \
31 movaps 0x20(%rsp), %xmm4; \
32 movaps 0x10(%rsp), %xmm5; \
33 mov 0x8(%rsp), %rsi; \
34 mov %rsi, %cr0; \
35 mov (%rsp), %rsp
36
37ENTRY(efi_call0)
38 SAVE_XMM
39 subq $32, %rsp
40 call *%rdi
41 addq $32, %rsp
42 RESTORE_XMM
43 ret
44
45ENTRY(efi_call1)
46 SAVE_XMM
47 subq $32, %rsp
48 mov %rsi, %rcx
49 call *%rdi
50 addq $32, %rsp
51 RESTORE_XMM
52 ret
53
54ENTRY(efi_call2)
55 SAVE_XMM
56 subq $32, %rsp
57 mov %rsi, %rcx
58 call *%rdi
59 addq $32, %rsp
60 RESTORE_XMM
61 ret
62
63ENTRY(efi_call3)
64 SAVE_XMM
65 subq $32, %rsp
66 mov %rcx, %r8
67 mov %rsi, %rcx
68 call *%rdi
69 addq $32, %rsp
70 RESTORE_XMM
71 ret
72
73ENTRY(efi_call4)
74 SAVE_XMM
75 subq $32, %rsp
76 mov %r8, %r9
77 mov %rcx, %r8
78 mov %rsi, %rcx
79 call *%rdi
80 addq $32, %rsp
81 RESTORE_XMM
82 ret
83
84ENTRY(efi_call5)
85 SAVE_XMM
86 subq $48, %rsp
87 mov %r9, 32(%rsp)
88 mov %r8, %r9
89 mov %rcx, %r8
90 mov %rsi, %rcx
91 call *%rdi
92 addq $48, %rsp
93 RESTORE_XMM
94 ret
95
96ENTRY(efi_call6)
97 SAVE_XMM
98 mov (%rsp), %rax
99 mov 8(%rax), %rax
100 subq $48, %rsp
101 mov %r9, 32(%rsp)
102 mov %rax, 40(%rsp)
103 mov %r8, %r9
104 mov %rcx, %r8
105 mov %rsi, %rcx
106 call *%rdi
107 addq $48, %rsp
108 RESTORE_XMM
109 ret
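
Editor's note: the efi_callN stubs above translate from the Linux SysV AMD64 calling convention to the Microsoft x64 convention that EFI firmware expects, pre-allocate the 32-byte shadow area with "subq $32, %rsp" (48 bytes once stack arguments are needed), and preserve %xmm0-%xmm5 around the call. A small C sketch (not kernel code) that just prints the register shuffle the stubs perform:

#include <stdio.h>

int main(void)
{
	static const struct { const char *sysv, *efi; } arg[] = {
		{ "rsi", "rcx" },	/* 1st EFI argument */
		{ "rdx", "rdx" },	/* 2nd */
		{ "rcx", "r8"  },	/* 3rd */
		{ "r8",  "r9"  },	/* 4th */
		{ "r9",  "stack, just past the 32-byte shadow space" }, /* 5th */
	};
	unsigned i;

	/* %rdi is not listed: it carries the EFI function pointer itself. */
	for (i = 0; i < sizeof(arg) / sizeof(arg[0]); i++)
		printf("arg%u: %s -> %s\n", i + 1, arg[i].sysv, arg[i].efi);
	return 0;
}
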
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index dc7f938e5015..be5c31d04884 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -58,7 +58,7 @@
58 * for paravirtualization. The following will never clobber any registers: 58 * for paravirtualization. The following will never clobber any registers:
59 * INTERRUPT_RETURN (aka. "iret") 59 * INTERRUPT_RETURN (aka. "iret")
60 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") 60 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
61 * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). 61 * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit").
62 * 62 *
63 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must 63 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
64 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). 64 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
@@ -283,12 +283,12 @@ END(resume_kernel)
283 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ 283 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
284 284
285 # sysenter call handler stub 285 # sysenter call handler stub
286ENTRY(sysenter_entry) 286ENTRY(ia32_sysenter_target)
287 CFI_STARTPROC simple 287 CFI_STARTPROC simple
288 CFI_SIGNAL_FRAME 288 CFI_SIGNAL_FRAME
289 CFI_DEF_CFA esp, 0 289 CFI_DEF_CFA esp, 0
290 CFI_REGISTER esp, ebp 290 CFI_REGISTER esp, ebp
291 movl TSS_sysenter_esp0(%esp),%esp 291 movl TSS_sysenter_sp0(%esp),%esp
292sysenter_past_esp: 292sysenter_past_esp:
293 /* 293 /*
294 * No need to follow this irqs on/off section: the syscall 294 * No need to follow this irqs on/off section: the syscall
@@ -351,7 +351,7 @@ sysenter_past_esp:
351 xorl %ebp,%ebp 351 xorl %ebp,%ebp
352 TRACE_IRQS_ON 352 TRACE_IRQS_ON
3531: mov PT_FS(%esp), %fs 3531: mov PT_FS(%esp), %fs
354 ENABLE_INTERRUPTS_SYSEXIT 354 ENABLE_INTERRUPTS_SYSCALL_RET
355 CFI_ENDPROC 355 CFI_ENDPROC
356.pushsection .fixup,"ax" 356.pushsection .fixup,"ax"
3572: movl $0,PT_FS(%esp) 3572: movl $0,PT_FS(%esp)
@@ -360,7 +360,7 @@ sysenter_past_esp:
360 .align 4 360 .align 4
361 .long 1b,2b 361 .long 1b,2b
362.popsection 362.popsection
363ENDPROC(sysenter_entry) 363ENDPROC(ia32_sysenter_target)
364 364
365 # system call handler stub 365 # system call handler stub
366ENTRY(system_call) 366ENTRY(system_call)
@@ -583,7 +583,7 @@ END(syscall_badsys)
583 * Build the entry stubs and pointer table with 583 * Build the entry stubs and pointer table with
584 * some assembler magic. 584 * some assembler magic.
585 */ 585 */
586.data 586.section .rodata,"a"
587ENTRY(interrupt) 587ENTRY(interrupt)
588.text 588.text
589 589
@@ -743,7 +743,7 @@ END(device_not_available)
743 * that sets up the real kernel stack. Check here, since we can't 743 * that sets up the real kernel stack. Check here, since we can't
744 * allow the wrong stack to be used. 744 * allow the wrong stack to be used.
745 * 745 *
746 * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have 746 * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
747 * already pushed 3 words if it hits on the sysenter instruction: 747 * already pushed 3 words if it hits on the sysenter instruction:
748 * eflags, cs and eip. 748 * eflags, cs and eip.
749 * 749 *
@@ -755,7 +755,7 @@ END(device_not_available)
755 cmpw $__KERNEL_CS,4(%esp); \ 755 cmpw $__KERNEL_CS,4(%esp); \
756 jne ok; \ 756 jne ok; \
757label: \ 757label: \
758 movl TSS_sysenter_esp0+offset(%esp),%esp; \ 758 movl TSS_sysenter_sp0+offset(%esp),%esp; \
759 CFI_DEF_CFA esp, 0; \ 759 CFI_DEF_CFA esp, 0; \
760 CFI_UNDEFINED eip; \ 760 CFI_UNDEFINED eip; \
761 pushfl; \ 761 pushfl; \
@@ -768,7 +768,7 @@ label: \
768 768
769KPROBE_ENTRY(debug) 769KPROBE_ENTRY(debug)
770 RING0_INT_FRAME 770 RING0_INT_FRAME
771 cmpl $sysenter_entry,(%esp) 771 cmpl $ia32_sysenter_target,(%esp)
772 jne debug_stack_correct 772 jne debug_stack_correct
773 FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) 773 FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
774debug_stack_correct: 774debug_stack_correct:
@@ -799,7 +799,7 @@ KPROBE_ENTRY(nmi)
799 popl %eax 799 popl %eax
800 CFI_ADJUST_CFA_OFFSET -4 800 CFI_ADJUST_CFA_OFFSET -4
801 je nmi_espfix_stack 801 je nmi_espfix_stack
802 cmpl $sysenter_entry,(%esp) 802 cmpl $ia32_sysenter_target,(%esp)
803 je nmi_stack_fixup 803 je nmi_stack_fixup
804 pushl %eax 804 pushl %eax
805 CFI_ADJUST_CFA_OFFSET 4 805 CFI_ADJUST_CFA_OFFSET 4
@@ -812,7 +812,7 @@ KPROBE_ENTRY(nmi)
812 popl %eax 812 popl %eax
813 CFI_ADJUST_CFA_OFFSET -4 813 CFI_ADJUST_CFA_OFFSET -4
814 jae nmi_stack_correct 814 jae nmi_stack_correct
815 cmpl $sysenter_entry,12(%esp) 815 cmpl $ia32_sysenter_target,12(%esp)
816 je nmi_debug_stack_check 816 je nmi_debug_stack_check
817nmi_stack_correct: 817nmi_stack_correct:
818 /* We have a RING0_INT_FRAME here */ 818 /* We have a RING0_INT_FRAME here */
@@ -882,10 +882,10 @@ ENTRY(native_iret)
882.previous 882.previous
883END(native_iret) 883END(native_iret)
884 884
885ENTRY(native_irq_enable_sysexit) 885ENTRY(native_irq_enable_syscall_ret)
886 sti 886 sti
887 sysexit 887 sysexit
888END(native_irq_enable_sysexit) 888END(native_irq_enable_syscall_ret)
889#endif 889#endif
890 890
891KPROBE_ENTRY(int3) 891KPROBE_ENTRY(int3)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 3a058bb16409..c7341e81941c 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -50,6 +50,7 @@
50#include <asm/hw_irq.h> 50#include <asm/hw_irq.h>
51#include <asm/page.h> 51#include <asm/page.h>
52#include <asm/irqflags.h> 52#include <asm/irqflags.h>
53#include <asm/paravirt.h>
53 54
54 .code64 55 .code64
55 56
@@ -57,6 +58,13 @@
57#define retint_kernel retint_restore_args 58#define retint_kernel retint_restore_args
58#endif 59#endif
59 60
61#ifdef CONFIG_PARAVIRT
62ENTRY(native_irq_enable_syscall_ret)
63 movq %gs:pda_oldrsp,%rsp
64 swapgs
65 sysretq
66#endif /* CONFIG_PARAVIRT */
67
60 68
61.macro TRACE_IRQS_IRETQ offset=ARGOFFSET 69.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
62#ifdef CONFIG_TRACE_IRQFLAGS 70#ifdef CONFIG_TRACE_IRQFLAGS
@@ -216,14 +224,21 @@ ENTRY(system_call)
216 CFI_DEF_CFA rsp,PDA_STACKOFFSET 224 CFI_DEF_CFA rsp,PDA_STACKOFFSET
217 CFI_REGISTER rip,rcx 225 CFI_REGISTER rip,rcx
218 /*CFI_REGISTER rflags,r11*/ 226 /*CFI_REGISTER rflags,r11*/
219 swapgs 227 SWAPGS_UNSAFE_STACK
228 /*
229 * A hypervisor implementation might want to use a label
230 * after the swapgs, so that it can do the swapgs
231 * for the guest and jump here on syscall.
232 */
233ENTRY(system_call_after_swapgs)
234
220 movq %rsp,%gs:pda_oldrsp 235 movq %rsp,%gs:pda_oldrsp
221 movq %gs:pda_kernelstack,%rsp 236 movq %gs:pda_kernelstack,%rsp
222 /* 237 /*
223 * No need to follow this irqs off/on section - it's straight 238 * No need to follow this irqs off/on section - it's straight
224 * and short: 239 * and short:
225 */ 240 */
226 sti 241 ENABLE_INTERRUPTS(CLBR_NONE)
227 SAVE_ARGS 8,1 242 SAVE_ARGS 8,1
228 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) 243 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
229 movq %rcx,RIP-ARGOFFSET(%rsp) 244 movq %rcx,RIP-ARGOFFSET(%rsp)
@@ -246,7 +261,7 @@ ret_from_sys_call:
246sysret_check: 261sysret_check:
247 LOCKDEP_SYS_EXIT 262 LOCKDEP_SYS_EXIT
248 GET_THREAD_INFO(%rcx) 263 GET_THREAD_INFO(%rcx)
249 cli 264 DISABLE_INTERRUPTS(CLBR_NONE)
250 TRACE_IRQS_OFF 265 TRACE_IRQS_OFF
251 movl threadinfo_flags(%rcx),%edx 266 movl threadinfo_flags(%rcx),%edx
252 andl %edi,%edx 267 andl %edi,%edx
@@ -260,9 +275,7 @@ sysret_check:
260 CFI_REGISTER rip,rcx 275 CFI_REGISTER rip,rcx
261 RESTORE_ARGS 0,-ARG_SKIP,1 276 RESTORE_ARGS 0,-ARG_SKIP,1
262 /*CFI_REGISTER rflags,r11*/ 277 /*CFI_REGISTER rflags,r11*/
263 movq %gs:pda_oldrsp,%rsp 278 ENABLE_INTERRUPTS_SYSCALL_RET
264 swapgs
265 sysretq
266 279
267 CFI_RESTORE_STATE 280 CFI_RESTORE_STATE
268 /* Handle reschedules */ 281 /* Handle reschedules */
@@ -271,7 +284,7 @@ sysret_careful:
271 bt $TIF_NEED_RESCHED,%edx 284 bt $TIF_NEED_RESCHED,%edx
272 jnc sysret_signal 285 jnc sysret_signal
273 TRACE_IRQS_ON 286 TRACE_IRQS_ON
274 sti 287 ENABLE_INTERRUPTS(CLBR_NONE)
275 pushq %rdi 288 pushq %rdi
276 CFI_ADJUST_CFA_OFFSET 8 289 CFI_ADJUST_CFA_OFFSET 8
277 call schedule 290 call schedule
@@ -282,8 +295,8 @@ sysret_careful:
282 /* Handle a signal */ 295 /* Handle a signal */
283sysret_signal: 296sysret_signal:
284 TRACE_IRQS_ON 297 TRACE_IRQS_ON
285 sti 298 ENABLE_INTERRUPTS(CLBR_NONE)
286 testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx 299 testl $_TIF_DO_NOTIFY_MASK,%edx
287 jz 1f 300 jz 1f
288 301
289 /* Really a signal */ 302 /* Really a signal */
@@ -295,7 +308,7 @@ sysret_signal:
2951: movl $_TIF_NEED_RESCHED,%edi 3081: movl $_TIF_NEED_RESCHED,%edi
296 /* Use IRET because user could have changed frame. This 309 /* Use IRET because user could have changed frame. This
297 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ 310 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
298 cli 311 DISABLE_INTERRUPTS(CLBR_NONE)
299 TRACE_IRQS_OFF 312 TRACE_IRQS_OFF
300 jmp int_with_check 313 jmp int_with_check
301 314
@@ -327,7 +340,7 @@ tracesys:
327 */ 340 */
328 .globl int_ret_from_sys_call 341 .globl int_ret_from_sys_call
329int_ret_from_sys_call: 342int_ret_from_sys_call:
330 cli 343 DISABLE_INTERRUPTS(CLBR_NONE)
331 TRACE_IRQS_OFF 344 TRACE_IRQS_OFF
332 testl $3,CS-ARGOFFSET(%rsp) 345 testl $3,CS-ARGOFFSET(%rsp)
333 je retint_restore_args 346 je retint_restore_args
@@ -349,20 +362,20 @@ int_careful:
349 bt $TIF_NEED_RESCHED,%edx 362 bt $TIF_NEED_RESCHED,%edx
350 jnc int_very_careful 363 jnc int_very_careful
351 TRACE_IRQS_ON 364 TRACE_IRQS_ON
352 sti 365 ENABLE_INTERRUPTS(CLBR_NONE)
353 pushq %rdi 366 pushq %rdi
354 CFI_ADJUST_CFA_OFFSET 8 367 CFI_ADJUST_CFA_OFFSET 8
355 call schedule 368 call schedule
356 popq %rdi 369 popq %rdi
357 CFI_ADJUST_CFA_OFFSET -8 370 CFI_ADJUST_CFA_OFFSET -8
358 cli 371 DISABLE_INTERRUPTS(CLBR_NONE)
359 TRACE_IRQS_OFF 372 TRACE_IRQS_OFF
360 jmp int_with_check 373 jmp int_with_check
361 374
362 /* handle signals and tracing -- both require a full stack frame */ 375 /* handle signals and tracing -- both require a full stack frame */
363int_very_careful: 376int_very_careful:
364 TRACE_IRQS_ON 377 TRACE_IRQS_ON
365 sti 378 ENABLE_INTERRUPTS(CLBR_NONE)
366 SAVE_REST 379 SAVE_REST
367 /* Check for syscall exit trace */ 380 /* Check for syscall exit trace */
368 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx 381 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
@@ -377,7 +390,7 @@ int_very_careful:
377 jmp int_restore_rest 390 jmp int_restore_rest
378 391
379int_signal: 392int_signal:
380 testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx 393 testl $_TIF_DO_NOTIFY_MASK,%edx
381 jz 1f 394 jz 1f
382 movq %rsp,%rdi # &ptregs -> arg1 395 movq %rsp,%rdi # &ptregs -> arg1
383 xorl %esi,%esi # oldset -> arg2 396 xorl %esi,%esi # oldset -> arg2
@@ -385,7 +398,7 @@ int_signal:
3851: movl $_TIF_NEED_RESCHED,%edi 3981: movl $_TIF_NEED_RESCHED,%edi
386int_restore_rest: 399int_restore_rest:
387 RESTORE_REST 400 RESTORE_REST
388 cli 401 DISABLE_INTERRUPTS(CLBR_NONE)
389 TRACE_IRQS_OFF 402 TRACE_IRQS_OFF
390 jmp int_with_check 403 jmp int_with_check
391 CFI_ENDPROC 404 CFI_ENDPROC
@@ -506,7 +519,7 @@ END(stub_rt_sigreturn)
506 CFI_DEF_CFA_REGISTER rbp 519 CFI_DEF_CFA_REGISTER rbp
507 testl $3,CS(%rdi) 520 testl $3,CS(%rdi)
508 je 1f 521 je 1f
509 swapgs 522 SWAPGS
510 /* irqcount is used to check if a CPU is already on an interrupt 523 /* irqcount is used to check if a CPU is already on an interrupt
511 stack or not. While this is essentially redundant with preempt_count 524 stack or not. While this is essentially redundant with preempt_count
512 it is a little cheaper to use a separate counter in the PDA 525 it is a little cheaper to use a separate counter in the PDA
@@ -527,7 +540,7 @@ ENTRY(common_interrupt)
527 interrupt do_IRQ 540 interrupt do_IRQ
528 /* 0(%rsp): oldrsp-ARGOFFSET */ 541 /* 0(%rsp): oldrsp-ARGOFFSET */
529ret_from_intr: 542ret_from_intr:
530 cli 543 DISABLE_INTERRUPTS(CLBR_NONE)
531 TRACE_IRQS_OFF 544 TRACE_IRQS_OFF
532 decl %gs:pda_irqcount 545 decl %gs:pda_irqcount
533 leaveq 546 leaveq
@@ -556,64 +569,76 @@ retint_swapgs: /* return to user-space */
556 /* 569 /*
557 * The iretq could re-enable interrupts: 570 * The iretq could re-enable interrupts:
558 */ 571 */
559 cli 572 DISABLE_INTERRUPTS(CLBR_ANY)
560 TRACE_IRQS_IRETQ 573 TRACE_IRQS_IRETQ
561 swapgs 574 SWAPGS
562 jmp restore_args 575 jmp restore_args
563 576
564retint_restore_args: /* return to kernel space */ 577retint_restore_args: /* return to kernel space */
565 cli 578 DISABLE_INTERRUPTS(CLBR_ANY)
566 /* 579 /*
567 * The iretq could re-enable interrupts: 580 * The iretq could re-enable interrupts:
568 */ 581 */
569 TRACE_IRQS_IRETQ 582 TRACE_IRQS_IRETQ
570restore_args: 583restore_args:
571 RESTORE_ARGS 0,8,0 584 RESTORE_ARGS 0,8,0
572iret_label: 585#ifdef CONFIG_PARAVIRT
586 INTERRUPT_RETURN
587#endif
588ENTRY(native_iret)
573 iretq 589 iretq
574 590
575 .section __ex_table,"a" 591 .section __ex_table,"a"
576 .quad iret_label,bad_iret 592 .quad native_iret, bad_iret
577 .previous 593 .previous
578 .section .fixup,"ax" 594 .section .fixup,"ax"
579 /* force a signal here? this matches i386 behaviour */
580 /* running with kernel gs */
581bad_iret: 595bad_iret:
582 movq $11,%rdi /* SIGSEGV */ 596 /*
583 TRACE_IRQS_ON 597 * The iret traps when the %cs or %ss being restored is bogus.
584 sti 598 * We've lost the original trap vector and error code.
585 jmp do_exit 599 * #GPF is the most likely one to get for an invalid selector.
586 .previous 600 * So pretend we completed the iret and took the #GPF in user mode.
587 601 *
602 * We are now running with the kernel GS after exception recovery.
603 * But error_entry expects us to have user GS to match the user %cs,
604 * so swap back.
605 */
606 pushq $0
607
608 SWAPGS
609 jmp general_protection
610
611 .previous
612
588 /* edi: workmask, edx: work */ 613 /* edi: workmask, edx: work */
589retint_careful: 614retint_careful:
590 CFI_RESTORE_STATE 615 CFI_RESTORE_STATE
591 bt $TIF_NEED_RESCHED,%edx 616 bt $TIF_NEED_RESCHED,%edx
592 jnc retint_signal 617 jnc retint_signal
593 TRACE_IRQS_ON 618 TRACE_IRQS_ON
594 sti 619 ENABLE_INTERRUPTS(CLBR_NONE)
595 pushq %rdi 620 pushq %rdi
596 CFI_ADJUST_CFA_OFFSET 8 621 CFI_ADJUST_CFA_OFFSET 8
597 call schedule 622 call schedule
598 popq %rdi 623 popq %rdi
599 CFI_ADJUST_CFA_OFFSET -8 624 CFI_ADJUST_CFA_OFFSET -8
600 GET_THREAD_INFO(%rcx) 625 GET_THREAD_INFO(%rcx)
601 cli 626 DISABLE_INTERRUPTS(CLBR_NONE)
602 TRACE_IRQS_OFF 627 TRACE_IRQS_OFF
603 jmp retint_check 628 jmp retint_check
604 629
605retint_signal: 630retint_signal:
606 testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx 631 testl $_TIF_DO_NOTIFY_MASK,%edx
607 jz retint_swapgs 632 jz retint_swapgs
608 TRACE_IRQS_ON 633 TRACE_IRQS_ON
609 sti 634 ENABLE_INTERRUPTS(CLBR_NONE)
610 SAVE_REST 635 SAVE_REST
611 movq $-1,ORIG_RAX(%rsp) 636 movq $-1,ORIG_RAX(%rsp)
612 xorl %esi,%esi # oldset 637 xorl %esi,%esi # oldset
613 movq %rsp,%rdi # &pt_regs 638 movq %rsp,%rdi # &pt_regs
614 call do_notify_resume 639 call do_notify_resume
615 RESTORE_REST 640 RESTORE_REST
616 cli 641 DISABLE_INTERRUPTS(CLBR_NONE)
617 TRACE_IRQS_OFF 642 TRACE_IRQS_OFF
618 movl $_TIF_NEED_RESCHED,%edi 643 movl $_TIF_NEED_RESCHED,%edi
619 GET_THREAD_INFO(%rcx) 644 GET_THREAD_INFO(%rcx)
@@ -731,7 +756,7 @@ END(spurious_interrupt)
731 rdmsr 756 rdmsr
732 testl %edx,%edx 757 testl %edx,%edx
733 js 1f 758 js 1f
734 swapgs 759 SWAPGS
735 xorl %ebx,%ebx 760 xorl %ebx,%ebx
7361: 7611:
737 .if \ist 762 .if \ist
@@ -747,7 +772,7 @@ END(spurious_interrupt)
747 .if \ist 772 .if \ist
748 addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) 773 addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
749 .endif 774 .endif
750 cli 775 DISABLE_INTERRUPTS(CLBR_NONE)
751 .if \irqtrace 776 .if \irqtrace
752 TRACE_IRQS_OFF 777 TRACE_IRQS_OFF
753 .endif 778 .endif
@@ -776,10 +801,10 @@ paranoid_swapgs\trace:
776 .if \trace 801 .if \trace
777 TRACE_IRQS_IRETQ 0 802 TRACE_IRQS_IRETQ 0
778 .endif 803 .endif
779 swapgs 804 SWAPGS_UNSAFE_STACK
780paranoid_restore\trace: 805paranoid_restore\trace:
781 RESTORE_ALL 8 806 RESTORE_ALL 8
782 iretq 807 INTERRUPT_RETURN
783paranoid_userspace\trace: 808paranoid_userspace\trace:
784 GET_THREAD_INFO(%rcx) 809 GET_THREAD_INFO(%rcx)
785 movl threadinfo_flags(%rcx),%ebx 810 movl threadinfo_flags(%rcx),%ebx
@@ -794,11 +819,11 @@ paranoid_userspace\trace:
794 .if \trace 819 .if \trace
795 TRACE_IRQS_ON 820 TRACE_IRQS_ON
796 .endif 821 .endif
797 sti 822 ENABLE_INTERRUPTS(CLBR_NONE)
798 xorl %esi,%esi /* arg2: oldset */ 823 xorl %esi,%esi /* arg2: oldset */
799 movq %rsp,%rdi /* arg1: &pt_regs */ 824 movq %rsp,%rdi /* arg1: &pt_regs */
800 call do_notify_resume 825 call do_notify_resume
801 cli 826 DISABLE_INTERRUPTS(CLBR_NONE)
802 .if \trace 827 .if \trace
803 TRACE_IRQS_OFF 828 TRACE_IRQS_OFF
804 .endif 829 .endif
@@ -807,9 +832,9 @@ paranoid_schedule\trace:
807 .if \trace 832 .if \trace
808 TRACE_IRQS_ON 833 TRACE_IRQS_ON
809 .endif 834 .endif
810 sti 835 ENABLE_INTERRUPTS(CLBR_ANY)
811 call schedule 836 call schedule
812 cli 837 DISABLE_INTERRUPTS(CLBR_ANY)
813 .if \trace 838 .if \trace
814 TRACE_IRQS_OFF 839 TRACE_IRQS_OFF
815 .endif 840 .endif
@@ -862,7 +887,7 @@ KPROBE_ENTRY(error_entry)
862 testl $3,CS(%rsp) 887 testl $3,CS(%rsp)
863 je error_kernelspace 888 je error_kernelspace
864error_swapgs: 889error_swapgs:
865 swapgs 890 SWAPGS
866error_sti: 891error_sti:
867 movq %rdi,RDI(%rsp) 892 movq %rdi,RDI(%rsp)
868 CFI_REL_OFFSET rdi,RDI 893 CFI_REL_OFFSET rdi,RDI
@@ -874,7 +899,7 @@ error_sti:
874error_exit: 899error_exit:
875 movl %ebx,%eax 900 movl %ebx,%eax
876 RESTORE_REST 901 RESTORE_REST
877 cli 902 DISABLE_INTERRUPTS(CLBR_NONE)
878 TRACE_IRQS_OFF 903 TRACE_IRQS_OFF
879 GET_THREAD_INFO(%rcx) 904 GET_THREAD_INFO(%rcx)
880 testl %eax,%eax 905 testl %eax,%eax
@@ -894,7 +919,7 @@ error_kernelspace:
894 iret run with kernel gs again, so don't set the user space flag. 919 iret run with kernel gs again, so don't set the user space flag.
895 B stepping K8s sometimes report an truncated RIP for IRET 920 B stepping K8s sometimes report an truncated RIP for IRET
896 exceptions returning to compat mode. Check for these here too. */ 921 exceptions returning to compat mode. Check for these here too. */
897 leaq iret_label(%rip),%rbp 922 leaq native_iret(%rip),%rbp
898 cmpq %rbp,RIP(%rsp) 923 cmpq %rbp,RIP(%rsp)
899 je error_swapgs 924 je error_swapgs
900 movl %ebp,%ebp /* zero extend */ 925 movl %ebp,%ebp /* zero extend */
@@ -911,12 +936,12 @@ ENTRY(load_gs_index)
911 CFI_STARTPROC 936 CFI_STARTPROC
912 pushf 937 pushf
913 CFI_ADJUST_CFA_OFFSET 8 938 CFI_ADJUST_CFA_OFFSET 8
914 cli 939 DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI))
915 swapgs 940 SWAPGS
916gs_change: 941gs_change:
917 movl %edi,%gs 942 movl %edi,%gs
9182: mfence /* workaround */ 9432: mfence /* workaround */
919 swapgs 944 SWAPGS
920 popf 945 popf
921 CFI_ADJUST_CFA_OFFSET -8 946 CFI_ADJUST_CFA_OFFSET -8
922 ret 947 ret
@@ -930,7 +955,7 @@ ENDPROC(load_gs_index)
930 .section .fixup,"ax" 955 .section .fixup,"ax"
931 /* running with kernelgs */ 956 /* running with kernelgs */
932bad_gs: 957bad_gs:
933 swapgs /* switch back to user gs */ 958 SWAPGS /* switch back to user gs */
934 xorl %eax,%eax 959 xorl %eax,%eax
935 movl %eax,%gs 960 movl %eax,%gs
936 jmp 2b 961 jmp 2b
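
Note: the cli/sti/swapgs/iretq sites in entry_64.S become DISABLE_INTERRUPTS()/ENABLE_INTERRUPTS()/SWAPGS/INTERRUPT_RETURN so a CONFIG_PARAVIRT build can route them through a patchable ops table while a native build still emits the bare instructions; the CLBR_* argument records which registers the call site allows the backend to clobber. A minimal user-space sketch of that indirection, using invented names (irq_ops, native_*, demo_hv_*) rather than the kernel's real paravirt structures:

/*
 * Illustrative only: a tiny ops table showing the indirection that the
 * DISABLE_INTERRUPTS()/ENABLE_INTERRUPTS() macros buy under CONFIG_PARAVIRT.
 * Every identifier here is invented for this sketch.
 */
#include <stdio.h>

struct irq_ops {
	void (*irq_disable)(void);
	void (*irq_enable)(void);
};

static void native_irq_disable(void) { puts("cli"); }	/* native backend: the plain instruction */
static void native_irq_enable(void)  { puts("sti"); }

static void demo_hv_irq_disable(void) { puts("hypercall: mask vcpu interrupts"); }
static void demo_hv_irq_enable(void)  { puts("hypercall: unmask vcpu interrupts"); }

static struct irq_ops ops = { native_irq_disable, native_irq_enable };

int main(void)
{
	ops.irq_disable();	/* what a DISABLE_INTERRUPTS() site resolves to natively */

	/* a hypervisor boot path would swap the backend once, very early */
	ops = (struct irq_ops){ demo_hv_irq_disable, demo_hv_irq_enable };
	ops.irq_enable();	/* same call site, now hypervisor-backed */
	return 0;
}

On bare hardware the kernel can patch such call sites back into the single instruction at boot, so the native case pays essentially nothing for the flexibility.
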
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index ce703e21c912..4ae7b6440260 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -24,18 +24,11 @@
24#include <acpi/acpi_bus.h> 24#include <acpi/acpi_bus.h>
25#endif 25#endif
26 26
27/* 27/* which logical CPU number maps to which CPU (physical APIC ID) */
28 * which logical CPU number maps to which CPU (physical APIC ID) 28u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata
29 *
30 * The following static array is used during kernel startup
31 * and the x86_cpu_to_apicid_ptr contains the address of the
32 * array during this time. Is it zeroed when the per_cpu
33 * data area is removed.
34 */
35u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata
36 = { [0 ... NR_CPUS-1] = BAD_APICID }; 29 = { [0 ... NR_CPUS-1] = BAD_APICID };
37void *x86_cpu_to_apicid_ptr; 30void *x86_cpu_to_apicid_early_ptr;
38DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID; 31DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
39EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid); 32EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
40 33
41struct genapic __read_mostly *genapic = &apic_flat; 34struct genapic __read_mostly *genapic = &apic_flat;
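
Note: x86_cpu_to_apicid_early_ptr follows the usual early/per-CPU split: the __initdata array is meant to be consulted only through the pointer during early boot, and once the per-CPU copy takes over the pointer is cleared. A rough sketch of that pattern (apart from the two symbols named in the hunk, every identifier below is invented for the example):

/*
 * Sketch of the early-pointer pattern behind x86_cpu_to_apicid_early_ptr.
 */
#include <stdio.h>

#define NR_CPUS		4
#define BAD_APICID	0xFFFFu

static unsigned short early_map[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
static void *early_ptr = early_map;		/* non-NULL only during early boot   */
static unsigned short percpu_map[NR_CPUS];	/* stand-in for the per-CPU variable */

static unsigned short cpu_to_apicid(int cpu)
{
	if (early_ptr)				/* before per-CPU areas are set up */
		return ((unsigned short *)early_ptr)[cpu];
	return percpu_map[cpu];			/* afterwards */
}

int main(void)
{
	int i;

	early_map[0] = 0;			/* BSP's APIC id discovered during boot */
	printf("early: cpu0 -> %u\n", (unsigned)cpu_to_apicid(0));

	for (i = 0; i < NR_CPUS; i++)		/* per-CPU init copies the table ...   */
		percpu_map[i] = early_map[i];
	early_ptr = NULL;			/* ... and retires the early pointer   */
	printf("late:  cpu0 -> %u\n", (unsigned)cpu_to_apicid(0));
	return 0;
}
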
diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c
index f12d8c5d9809..9c7f7d395968 100644
--- a/arch/x86/kernel/geode_32.c
+++ b/arch/x86/kernel/geode_32.c
@@ -1,6 +1,7 @@
1/* 1/*
2 * AMD Geode southbridge support code 2 * AMD Geode southbridge support code
3 * Copyright (C) 2006, Advanced Micro Devices, Inc. 3 * Copyright (C) 2006, Advanced Micro Devices, Inc.
4 * Copyright (C) 2007, Andres Salomon <dilinger@debian.org>
4 * 5 *
5 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of version 2 of the GNU General Public License 7 * modify it under the terms of version 2 of the GNU General Public License
@@ -51,45 +52,62 @@ EXPORT_SYMBOL_GPL(geode_get_dev_base);
51 52
52/* === GPIO API === */ 53/* === GPIO API === */
53 54
54void geode_gpio_set(unsigned int gpio, unsigned int reg) 55void geode_gpio_set(u32 gpio, unsigned int reg)
55{ 56{
56 u32 base = geode_get_dev_base(GEODE_DEV_GPIO); 57 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
57 58
58 if (!base) 59 if (!base)
59 return; 60 return;
60 61
61 if (gpio < 16) 62 /* low bank register */
62 outl(1 << gpio, base + reg); 63 if (gpio & 0xFFFF)
63 else 64 outl(gpio & 0xFFFF, base + reg);
64 outl(1 << (gpio - 16), base + 0x80 + reg); 65 /* high bank register */
66 gpio >>= 16;
67 if (gpio)
68 outl(gpio, base + 0x80 + reg);
65} 69}
66EXPORT_SYMBOL_GPL(geode_gpio_set); 70EXPORT_SYMBOL_GPL(geode_gpio_set);
67 71
68void geode_gpio_clear(unsigned int gpio, unsigned int reg) 72void geode_gpio_clear(u32 gpio, unsigned int reg)
69{ 73{
70 u32 base = geode_get_dev_base(GEODE_DEV_GPIO); 74 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
71 75
72 if (!base) 76 if (!base)
73 return; 77 return;
74 78
75 if (gpio < 16) 79 /* low bank register */
76 outl(1 << (gpio + 16), base + reg); 80 if (gpio & 0xFFFF)
77 else 81 outl((gpio & 0xFFFF) << 16, base + reg);
78 outl(1 << gpio, base + 0x80 + reg); 82 /* high bank register */
83 gpio &= (0xFFFF << 16);
84 if (gpio)
85 outl(gpio, base + 0x80 + reg);
79} 86}
80EXPORT_SYMBOL_GPL(geode_gpio_clear); 87EXPORT_SYMBOL_GPL(geode_gpio_clear);
81 88
82int geode_gpio_isset(unsigned int gpio, unsigned int reg) 89int geode_gpio_isset(u32 gpio, unsigned int reg)
83{ 90{
84 u32 base = geode_get_dev_base(GEODE_DEV_GPIO); 91 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
92 u32 val;
85 93
86 if (!base) 94 if (!base)
87 return 0; 95 return 0;
88 96
89 if (gpio < 16) 97 /* low bank register */
90 return (inl(base + reg) & (1 << gpio)) ? 1 : 0; 98 if (gpio & 0xFFFF) {
91 else 99 val = inl(base + reg) & (gpio & 0xFFFF);
92 return (inl(base + 0x80 + reg) & (1 << (gpio - 16))) ? 1 : 0; 100 if ((gpio & 0xFFFF) == val)
101 return 1;
102 }
103 /* high bank register */
104 gpio >>= 16;
105 if (gpio) {
106 val = inl(base + 0x80 + reg) & gpio;
107 if (gpio == val)
108 return 1;
109 }
110 return 0;
93} 111}
94EXPORT_SYMBOL_GPL(geode_gpio_isset); 112EXPORT_SYMBOL_GPL(geode_gpio_isset);
95 113
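Note: the GPIO helpers now take a u32 bitmask instead of a pin number: bits 0-15 select pins in the low bank (register at base + reg) and bits 16-31 pins in the high bank (base + 0x80 + reg), so a single call can operate on pins from both banks. A small sketch of how such a mask splits (pin_bit() and the pin numbers are illustrative, not part of the kernel API):

/*
 * Sketch of the new bitmask convention used by geode_gpio_set/clear/isset.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t pin_bit(unsigned int pin)	/* pin 0..31 -> its mask bit */
{
	return (uint32_t)1 << pin;
}

static void show_split(uint32_t gpio)
{
	uint32_t low  = gpio & 0xFFFF;		/* handled via the low-bank register  */
	uint32_t high = gpio >> 16;		/* handled via the high-bank register */

	printf("low bank %#06x, high bank %#06x\n", (unsigned)low, (unsigned)high);
}

int main(void)
{
	show_split(pin_bit(3) | pin_bit(20));	/* one pin in each bank, one call */
	return 0;
}
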
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 6b3469311e42..24dbf56928d7 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -10,6 +10,7 @@
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/percpu.h> 12#include <linux/percpu.h>
13#include <linux/start_kernel.h>
13 14
14#include <asm/processor.h> 15#include <asm/processor.h>
15#include <asm/proto.h> 16#include <asm/proto.h>
@@ -19,12 +20,14 @@
19#include <asm/pgtable.h> 20#include <asm/pgtable.h>
20#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
21#include <asm/sections.h> 22#include <asm/sections.h>
23#include <asm/kdebug.h>
24#include <asm/e820.h>
22 25
23static void __init zap_identity_mappings(void) 26static void __init zap_identity_mappings(void)
24{ 27{
25 pgd_t *pgd = pgd_offset_k(0UL); 28 pgd_t *pgd = pgd_offset_k(0UL);
26 pgd_clear(pgd); 29 pgd_clear(pgd);
27 __flush_tlb(); 30 __flush_tlb_all();
28} 31}
29 32
30/* Don't add a printk in there. printk relies on the PDA which is not initialized 33/* Don't add a printk in there. printk relies on the PDA which is not initialized
@@ -46,6 +49,35 @@ static void __init copy_bootdata(char *real_mode_data)
46 } 49 }
47} 50}
48 51
52#define EBDA_ADDR_POINTER 0x40E
53
54static __init void reserve_ebda(void)
55{
56 unsigned ebda_addr, ebda_size;
57
58 /*
59 * there is a real-mode segmented pointer pointing to the
60 * 4K EBDA area at 0x40E
61 */
62 ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
63 ebda_addr <<= 4;
64
65 if (!ebda_addr)
66 return;
67
68 ebda_size = *(unsigned short *)__va(ebda_addr);
69
70 /* Round EBDA up to pages */
71 if (ebda_size == 0)
72 ebda_size = 1;
73 ebda_size <<= 10;
74 ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
75 if (ebda_size > 64*1024)
76 ebda_size = 64*1024;
77
78 reserve_early(ebda_addr, ebda_addr + ebda_size, "EBDA");
79}
80
49void __init x86_64_start_kernel(char * real_mode_data) 81void __init x86_64_start_kernel(char * real_mode_data)
50{ 82{
51 int i; 83 int i;
@@ -56,8 +88,13 @@ void __init x86_64_start_kernel(char * real_mode_data)
56 /* Make NULL pointers segfault */ 88 /* Make NULL pointers segfault */
57 zap_identity_mappings(); 89 zap_identity_mappings();
58 90
59 for (i = 0; i < IDT_ENTRIES; i++) 91 for (i = 0; i < IDT_ENTRIES; i++) {
92#ifdef CONFIG_EARLY_PRINTK
93 set_intr_gate(i, &early_idt_handlers[i]);
94#else
60 set_intr_gate(i, early_idt_handler); 95 set_intr_gate(i, early_idt_handler);
96#endif
97 }
61 load_idt((const struct desc_ptr *)&idt_descr); 98 load_idt((const struct desc_ptr *)&idt_descr);
62 99
63 early_printk("Kernel alive\n"); 100 early_printk("Kernel alive\n");
@@ -67,8 +104,24 @@ void __init x86_64_start_kernel(char * real_mode_data)
67 104
68 pda_init(0); 105 pda_init(0);
69 copy_bootdata(__va(real_mode_data)); 106 copy_bootdata(__va(real_mode_data));
70#ifdef CONFIG_SMP 107
71 cpu_set(0, cpu_online_map); 108 reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
72#endif 109
110 /* Reserve INITRD */
111 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
112 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
113 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
114 unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
115 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
116 }
117
118 reserve_ebda();
119
120 /*
121 * At this point everything still needed from the boot loader
122 * or BIOS or kernel text should be early reserved or marked not
123 * RAM in e820. All other memory is free game.
124 */
125
73 start_kernel(); 126 start_kernel();
74} 127}
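
Note: reserve_ebda() recovers the EBDA range from the BIOS data area: the word at 0x40E is a real-mode segment, so shifting it left by 4 gives the physical base; the first word stored there is the size in KiB, which is rounded up to whole pages and capped at 64 KiB before reserve_early() records it. The same arithmetic as a stand-alone sketch, with example values rather than data read from a real BIOS data area:

/*
 * Stand-alone sketch of the range computed by reserve_ebda().
 */
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define round_up(x, y)	((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	unsigned long seg       = 0x9fc0;	/* example word read from phys 0x40E    */
	unsigned long ebda_addr = seg << 4;	/* real-mode segment -> 0x9fc00 physical */
	unsigned long ebda_kib  = 1;		/* first word at the EBDA: size in KiB   */
	unsigned long ebda_size = ebda_kib << 10;

	/* round to whole pages from the (possibly unaligned) base, cap at 64K */
	ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
	if (ebda_size > 64 * 1024)
		ebda_size = 64 * 1024;

	printf("reserve_early(%#lx, %#lx, \"EBDA\")\n",
	       ebda_addr, ebda_addr + ebda_size);
	return 0;
}
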
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index fbad51fce672..5d8c5730686b 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -9,6 +9,7 @@
9 9
10.text 10.text
11#include <linux/threads.h> 11#include <linux/threads.h>
12#include <linux/init.h>
12#include <linux/linkage.h> 13#include <linux/linkage.h>
13#include <asm/segment.h> 14#include <asm/segment.h>
14#include <asm/page.h> 15#include <asm/page.h>
@@ -151,7 +152,9 @@ WEAK(xen_entry)
151 /* Unknown implementation; there's really 152 /* Unknown implementation; there's really
152 nothing we can do at this point. */ 153 nothing we can do at this point. */
153 ud2a 154 ud2a
154.data 155
156 __INITDATA
157
155subarch_entries: 158subarch_entries:
156 .long default_entry /* normal x86/PC */ 159 .long default_entry /* normal x86/PC */
157 .long lguest_entry /* lguest hypervisor */ 160 .long lguest_entry /* lguest hypervisor */
@@ -199,7 +202,6 @@ default_entry:
199 addl $0x67, %eax /* 0x67 == _PAGE_TABLE */ 202 addl $0x67, %eax /* 0x67 == _PAGE_TABLE */
200 movl %eax, 4092(%edx) 203 movl %eax, 4092(%edx)
201 204
202 xorl %ebx,%ebx /* This is the boot CPU (BSP) */
203 jmp 3f 205 jmp 3f
204/* 206/*
205 * Non-boot CPU entry point; entered from trampoline.S 207 * Non-boot CPU entry point; entered from trampoline.S
@@ -222,6 +224,8 @@ ENTRY(startup_32_smp)
222 movl %eax,%es 224 movl %eax,%es
223 movl %eax,%fs 225 movl %eax,%fs
224 movl %eax,%gs 226 movl %eax,%gs
227#endif /* CONFIG_SMP */
2283:
225 229
226/* 230/*
227 * New page tables may be in 4Mbyte page mode and may 231 * New page tables may be in 4Mbyte page mode and may
@@ -268,12 +272,6 @@ ENTRY(startup_32_smp)
268 wrmsr 272 wrmsr
269 273
2706: 2746:
271 /* This is a secondary processor (AP) */
272 xorl %ebx,%ebx
273 incl %ebx
274
275#endif /* CONFIG_SMP */
2763:
277 275
278/* 276/*
279 * Enable paging 277 * Enable paging
@@ -297,7 +295,7 @@ ENTRY(startup_32_smp)
297 popfl 295 popfl
298 296
299#ifdef CONFIG_SMP 297#ifdef CONFIG_SMP
300 andl %ebx,%ebx 298 cmpb $0, ready
301 jz 1f /* Initial CPU cleans BSS */ 299 jz 1f /* Initial CPU cleans BSS */
302 jmp checkCPUtype 300 jmp checkCPUtype
3031: 3011:
@@ -502,6 +500,7 @@ early_fault:
502 call printk 500 call printk
503#endif 501#endif
504#endif 502#endif
503 call dump_stack
505hlt_loop: 504hlt_loop:
506 hlt 505 hlt
507 jmp hlt_loop 506 jmp hlt_loop
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b6167fe3330e..09b38d539b09 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -19,6 +19,13 @@
19#include <asm/msr.h> 19#include <asm/msr.h>
20#include <asm/cache.h> 20#include <asm/cache.h>
21 21
22#ifdef CONFIG_PARAVIRT
23#include <asm/asm-offsets.h>
24#include <asm/paravirt.h>
25#else
26#define GET_CR2_INTO_RCX movq %cr2, %rcx
27#endif
28
22/* we are not able to switch in one step to the final KERNEL ADRESS SPACE 29/* we are not able to switch in one step to the final KERNEL ADRESS SPACE
23 * because we need identity-mapped pages. 30 * because we need identity-mapped pages.
24 * 31 *
@@ -56,7 +63,7 @@ startup_64:
56 63
57 /* Is the address not 2M aligned? */ 64 /* Is the address not 2M aligned? */
58 movq %rbp, %rax 65 movq %rbp, %rax
59 andl $~LARGE_PAGE_MASK, %eax 66 andl $~PMD_PAGE_MASK, %eax
60 testl %eax, %eax 67 testl %eax, %eax
61 jnz bad_address 68 jnz bad_address
62 69
@@ -81,7 +88,7 @@ startup_64:
81 88
82 /* Add an Identity mapping if I am above 1G */ 89 /* Add an Identity mapping if I am above 1G */
83 leaq _text(%rip), %rdi 90 leaq _text(%rip), %rdi
84 andq $LARGE_PAGE_MASK, %rdi 91 andq $PMD_PAGE_MASK, %rdi
85 92
86 movq %rdi, %rax 93 movq %rdi, %rax
87 shrq $PUD_SHIFT, %rax 94 shrq $PUD_SHIFT, %rax
@@ -243,31 +250,55 @@ ENTRY(secondary_startup_64)
243 lretq 250 lretq
244 251
245 /* SMP bootup changes these two */ 252 /* SMP bootup changes these two */
246#ifndef CONFIG_HOTPLUG_CPU 253 __CPUINITDATA
247 .pushsection .init.data
248#endif
249 .align 8 254 .align 8
250 .globl initial_code 255 ENTRY(initial_code)
251initial_code:
252 .quad x86_64_start_kernel 256 .quad x86_64_start_kernel
253#ifndef CONFIG_HOTPLUG_CPU 257 __FINITDATA
254 .popsection 258
255#endif 259 ENTRY(init_rsp)
256 .globl init_rsp
257init_rsp:
258 .quad init_thread_union+THREAD_SIZE-8 260 .quad init_thread_union+THREAD_SIZE-8
259 261
260bad_address: 262bad_address:
261 jmp bad_address 263 jmp bad_address
262 264
265#ifdef CONFIG_EARLY_PRINTK
266.macro early_idt_tramp first, last
267 .ifgt \last-\first
268 early_idt_tramp \first, \last-1
269 .endif
270 movl $\last,%esi
271 jmp early_idt_handler
272.endm
273
274 .globl early_idt_handlers
275early_idt_handlers:
276 early_idt_tramp 0, 63
277 early_idt_tramp 64, 127
278 early_idt_tramp 128, 191
279 early_idt_tramp 192, 255
280#endif
281
263ENTRY(early_idt_handler) 282ENTRY(early_idt_handler)
283#ifdef CONFIG_EARLY_PRINTK
264 cmpl $2,early_recursion_flag(%rip) 284 cmpl $2,early_recursion_flag(%rip)
265 jz 1f 285 jz 1f
266 incl early_recursion_flag(%rip) 286 incl early_recursion_flag(%rip)
287 GET_CR2_INTO_RCX
288 movq %rcx,%r9
289 xorl %r8d,%r8d # zero for error code
290 movl %esi,%ecx # get vector number
291 # Test %ecx against mask of vectors that push error code.
292 cmpl $31,%ecx
293 ja 0f
294 movl $1,%eax
295 salq %cl,%rax
296 testl $0x27d00,%eax
297 je 0f
298 popq %r8 # get error code
2990: movq 0(%rsp),%rcx # get ip
300 movq 8(%rsp),%rdx # get cs
267 xorl %eax,%eax 301 xorl %eax,%eax
268 movq 8(%rsp),%rsi # get rip
269 movq (%rsp),%rdx
270 movq %cr2,%rcx
271 leaq early_idt_msg(%rip),%rdi 302 leaq early_idt_msg(%rip),%rdi
272 call early_printk 303 call early_printk
273 cmpl $2,early_recursion_flag(%rip) 304 cmpl $2,early_recursion_flag(%rip)
@@ -278,15 +309,19 @@ ENTRY(early_idt_handler)
278 movq 8(%rsp),%rsi # get rip again 309 movq 8(%rsp),%rsi # get rip again
279 call __print_symbol 310 call __print_symbol
280#endif 311#endif
312#endif /* EARLY_PRINTK */
2811: hlt 3131: hlt
282 jmp 1b 314 jmp 1b
315
316#ifdef CONFIG_EARLY_PRINTK
283early_recursion_flag: 317early_recursion_flag:
284 .long 0 318 .long 0
285 319
286early_idt_msg: 320early_idt_msg:
287 .asciz "PANIC: early exception rip %lx error %lx cr2 %lx\n" 321 .asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n"
288early_idt_ripmsg: 322early_idt_ripmsg:
289 .asciz "RIP %s\n" 323 .asciz "RIP %s\n"
324#endif /* CONFIG_EARLY_PRINTK */
290 325
291.balign PAGE_SIZE 326.balign PAGE_SIZE
292 327
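Note: the rewritten early_idt_handler has to know whether the faulting vector pushed a hardware error code before it can find the saved RIP; the testl against 0x27d00 encodes exactly that set - vectors 8, 10-14 and 17 (#DF, #TS, #NP, #SS, #GP, #PF, #AC). A stand-alone check that rebuilds the constant:

/*
 * Rebuilds the 0x27d00 mask tested in early_idt_handler: the exception
 * vectors below 32 that push an error code on the stack.
 */
#include <stdio.h>

int main(void)
{
	static const int with_error_code[] = { 8, 10, 11, 12, 13, 14, 17 };
	unsigned long mask = 0;
	unsigned int i;

	for (i = 0; i < sizeof(with_error_code) / sizeof(with_error_code[0]); i++)
		mask |= 1UL << with_error_code[i];

	printf("%#lx\n", mask);		/* prints 0x27d00 */
	return 0;
}
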
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 4a86ffd67ec5..429d084e014d 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -6,7 +6,6 @@
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/sysdev.h> 7#include <linux/sysdev.h>
8#include <linux/pm.h> 8#include <linux/pm.h>
9#include <linux/delay.h>
10 9
11#include <asm/fixmap.h> 10#include <asm/fixmap.h>
12#include <asm/hpet.h> 11#include <asm/hpet.h>
@@ -16,7 +15,8 @@
16#define HPET_MASK CLOCKSOURCE_MASK(32) 15#define HPET_MASK CLOCKSOURCE_MASK(32)
17#define HPET_SHIFT 22 16#define HPET_SHIFT 22
18 17
19/* FSEC = 10^-15 NSEC = 10^-9 */ 18/* FSEC = 10^-15
19 NSEC = 10^-9 */
20#define FSEC_PER_NSEC 1000000 20#define FSEC_PER_NSEC 1000000
21 21
22/* 22/*
@@ -107,6 +107,7 @@ int is_hpet_enabled(void)
107{ 107{
108 return is_hpet_capable() && hpet_legacy_int_enabled; 108 return is_hpet_capable() && hpet_legacy_int_enabled;
109} 109}
110EXPORT_SYMBOL_GPL(is_hpet_enabled);
110 111
111/* 112/*
112 * When the hpet driver (/dev/hpet) is enabled, we need to reserve 113 * When the hpet driver (/dev/hpet) is enabled, we need to reserve
@@ -132,16 +133,13 @@ static void hpet_reserve_platform_timers(unsigned long id)
132#ifdef CONFIG_HPET_EMULATE_RTC 133#ifdef CONFIG_HPET_EMULATE_RTC
133 hpet_reserve_timer(&hd, 1); 134 hpet_reserve_timer(&hd, 1);
134#endif 135#endif
135
136 hd.hd_irq[0] = HPET_LEGACY_8254; 136 hd.hd_irq[0] = HPET_LEGACY_8254;
137 hd.hd_irq[1] = HPET_LEGACY_RTC; 137 hd.hd_irq[1] = HPET_LEGACY_RTC;
138 138
139 for (i = 2; i < nrtimers; timer++, i++) 139 for (i = 2; i < nrtimers; timer++, i++)
140 hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >> 140 hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >>
141 Tn_INT_ROUTE_CNF_SHIFT; 141 Tn_INT_ROUTE_CNF_SHIFT;
142
143 hpet_alloc(&hd); 142 hpet_alloc(&hd);
144
145} 143}
146#else 144#else
147static void hpet_reserve_platform_timers(unsigned long id) { } 145static void hpet_reserve_platform_timers(unsigned long id) { }
@@ -478,6 +476,7 @@ void hpet_disable(void)
478 */ 476 */
479#include <linux/mc146818rtc.h> 477#include <linux/mc146818rtc.h>
480#include <linux/rtc.h> 478#include <linux/rtc.h>
479#include <asm/rtc.h>
481 480
482#define DEFAULT_RTC_INT_FREQ 64 481#define DEFAULT_RTC_INT_FREQ 64
483#define DEFAULT_RTC_SHIFT 6 482#define DEFAULT_RTC_SHIFT 6
@@ -492,6 +491,38 @@ static unsigned long hpet_default_delta;
492static unsigned long hpet_pie_delta; 491static unsigned long hpet_pie_delta;
493static unsigned long hpet_pie_limit; 492static unsigned long hpet_pie_limit;
494 493
494static rtc_irq_handler irq_handler;
495
496/*
497 * Registers a IRQ handler.
498 */
499int hpet_register_irq_handler(rtc_irq_handler handler)
500{
501 if (!is_hpet_enabled())
502 return -ENODEV;
503 if (irq_handler)
504 return -EBUSY;
505
506 irq_handler = handler;
507
508 return 0;
509}
510EXPORT_SYMBOL_GPL(hpet_register_irq_handler);
511
512/*
513 * Deregisters the IRQ handler registered with hpet_register_irq_handler()
514 * and does cleanup.
515 */
516void hpet_unregister_irq_handler(rtc_irq_handler handler)
517{
518 if (!is_hpet_enabled())
519 return;
520
521 irq_handler = NULL;
522 hpet_rtc_flags = 0;
523}
524EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler);
525
495/* 526/*
496 * Timer 1 for RTC emulation. We use one shot mode, as periodic mode 527 * Timer 1 for RTC emulation. We use one shot mode, as periodic mode
497 * is not supported by all HPET implementations for timer 1. 528 * is not supported by all HPET implementations for timer 1.
@@ -533,6 +564,7 @@ int hpet_rtc_timer_init(void)
533 564
534 return 1; 565 return 1;
535} 566}
567EXPORT_SYMBOL_GPL(hpet_rtc_timer_init);
536 568
537/* 569/*
538 * The functions below are called from rtc driver. 570 * The functions below are called from rtc driver.
@@ -547,6 +579,7 @@ int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
547 hpet_rtc_flags &= ~bit_mask; 579 hpet_rtc_flags &= ~bit_mask;
548 return 1; 580 return 1;
549} 581}
582EXPORT_SYMBOL_GPL(hpet_mask_rtc_irq_bit);
550 583
551int hpet_set_rtc_irq_bit(unsigned long bit_mask) 584int hpet_set_rtc_irq_bit(unsigned long bit_mask)
552{ 585{
@@ -562,6 +595,7 @@ int hpet_set_rtc_irq_bit(unsigned long bit_mask)
562 595
563 return 1; 596 return 1;
564} 597}
598EXPORT_SYMBOL_GPL(hpet_set_rtc_irq_bit);
565 599
566int hpet_set_alarm_time(unsigned char hrs, unsigned char min, 600int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
567 unsigned char sec) 601 unsigned char sec)
@@ -575,6 +609,7 @@ int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
575 609
576 return 1; 610 return 1;
577} 611}
612EXPORT_SYMBOL_GPL(hpet_set_alarm_time);
578 613
579int hpet_set_periodic_freq(unsigned long freq) 614int hpet_set_periodic_freq(unsigned long freq)
580{ 615{
@@ -593,11 +628,13 @@ int hpet_set_periodic_freq(unsigned long freq)
593 } 628 }
594 return 1; 629 return 1;
595} 630}
631EXPORT_SYMBOL_GPL(hpet_set_periodic_freq);
596 632
597int hpet_rtc_dropped_irq(void) 633int hpet_rtc_dropped_irq(void)
598{ 634{
599 return is_hpet_enabled(); 635 return is_hpet_enabled();
600} 636}
637EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq);
601 638
602static void hpet_rtc_timer_reinit(void) 639static void hpet_rtc_timer_reinit(void)
603{ 640{
@@ -641,9 +678,10 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
641 unsigned long rtc_int_flag = 0; 678 unsigned long rtc_int_flag = 0;
642 679
643 hpet_rtc_timer_reinit(); 680 hpet_rtc_timer_reinit();
681 memset(&curr_time, 0, sizeof(struct rtc_time));
644 682
645 if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) 683 if (hpet_rtc_flags & (RTC_UIE | RTC_AIE))
646 rtc_get_rtc_time(&curr_time); 684 get_rtc_time(&curr_time);
647 685
648 if (hpet_rtc_flags & RTC_UIE && 686 if (hpet_rtc_flags & RTC_UIE &&
649 curr_time.tm_sec != hpet_prev_update_sec) { 687 curr_time.tm_sec != hpet_prev_update_sec) {
@@ -657,7 +695,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
657 hpet_pie_count = 0; 695 hpet_pie_count = 0;
658 } 696 }
659 697
660 if (hpet_rtc_flags & RTC_PIE && 698 if (hpet_rtc_flags & RTC_AIE &&
661 (curr_time.tm_sec == hpet_alarm_time.tm_sec) && 699 (curr_time.tm_sec == hpet_alarm_time.tm_sec) &&
662 (curr_time.tm_min == hpet_alarm_time.tm_min) && 700 (curr_time.tm_min == hpet_alarm_time.tm_min) &&
663 (curr_time.tm_hour == hpet_alarm_time.tm_hour)) 701 (curr_time.tm_hour == hpet_alarm_time.tm_hour))
@@ -665,8 +703,10 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
665 703
666 if (rtc_int_flag) { 704 if (rtc_int_flag) {
667 rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); 705 rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8));
668 rtc_interrupt(rtc_int_flag, dev_id); 706 if (irq_handler)
707 irq_handler(rtc_int_flag, dev_id);
669 } 708 }
670 return IRQ_HANDLED; 709 return IRQ_HANDLED;
671} 710}
711EXPORT_SYMBOL_GPL(hpet_rtc_interrupt);
672#endif 712#endif
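
Note: hpet_rtc_interrupt() no longer calls rtc_interrupt() directly; the RTC driver passes its handler to hpet_register_irq_handler() and the emulation path invokes it through the irq_handler pointer when the emulated RTC interrupt fires. A hedged sketch of the driver side - only the hpet_* calls come from the hunk above, the my_rtc_* wrappers are invented here for illustration:

/*
 * Sketch of a driver hooking the HPET RTC emulation path.
 */
#include <linux/interrupt.h>
#include <asm/hpet.h>

static irqreturn_t my_rtc_interrupt(int rtc_int_flag, void *dev_id)
{
	/* rtc_int_flag carries RTC_IRQF plus the cause bits (UF/AF/PF) */
	return IRQ_HANDLED;
}

static int my_rtc_hook_hpet(void)
{
	int err = hpet_register_irq_handler(my_rtc_interrupt);

	if (err)			/* -ENODEV: no HPET; -EBUSY: already claimed */
		return err;
	hpet_rtc_timer_init();		/* arm timer 1 for the emulated RTC interrupt */
	return 0;
}

static void my_rtc_unhook_hpet(void)
{
	hpet_unregister_irq_handler(my_rtc_interrupt);
}
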
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index 02112fcc0de7..061627806a2d 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -22,12 +22,5 @@ EXPORT_SYMBOL(__put_user_8);
22 22
23EXPORT_SYMBOL(strstr); 23EXPORT_SYMBOL(strstr);
24 24
25#ifdef CONFIG_SMP
26extern void FASTCALL( __write_lock_failed(rwlock_t *rw));
27extern void FASTCALL( __read_lock_failed(rwlock_t *rw));
28EXPORT_SYMBOL(__write_lock_failed);
29EXPORT_SYMBOL(__read_lock_failed);
30#endif
31
32EXPORT_SYMBOL(csum_partial); 25EXPORT_SYMBOL(csum_partial);
33EXPORT_SYMBOL(empty_zero_page); 26EXPORT_SYMBOL(empty_zero_page);
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
new file mode 100644
index 000000000000..26719bd2c77c
--- /dev/null
+++ b/arch/x86/kernel/i387.c
@@ -0,0 +1,479 @@
1/*
2 * Copyright (C) 1994 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * General FPU state handling cleanups
6 * Gareth Hughes <gareth@valinux.com>, May 2000
7 */
8
9#include <linux/sched.h>
10#include <linux/module.h>
11#include <linux/regset.h>
12#include <asm/processor.h>
13#include <asm/i387.h>
14#include <asm/math_emu.h>
15#include <asm/sigcontext.h>
16#include <asm/user.h>
17#include <asm/ptrace.h>
18#include <asm/uaccess.h>
19
20#ifdef CONFIG_X86_64
21
22#include <asm/sigcontext32.h>
23#include <asm/user32.h>
24
25#else
26
27#define save_i387_ia32 save_i387
28#define restore_i387_ia32 restore_i387
29
30#define _fpstate_ia32 _fpstate
31#define user_i387_ia32_struct user_i387_struct
32#define user32_fxsr_struct user_fxsr_struct
33
34#endif
35
36#ifdef CONFIG_MATH_EMULATION
37#define HAVE_HWFP (boot_cpu_data.hard_math)
38#else
39#define HAVE_HWFP 1
40#endif
41
42unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
43
44void mxcsr_feature_mask_init(void)
45{
46 unsigned long mask = 0;
47 clts();
48 if (cpu_has_fxsr) {
49 memset(&current->thread.i387.fxsave, 0,
50 sizeof(struct i387_fxsave_struct));
51 asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave));
52 mask = current->thread.i387.fxsave.mxcsr_mask;
53 if (mask == 0)
54 mask = 0x0000ffbf;
55 }
56 mxcsr_feature_mask &= mask;
57 stts();
58}
59
60#ifdef CONFIG_X86_64
61/*
62 * Called at bootup to set up the initial FPU state that is later cloned
63 * into all processes.
64 */
65void __cpuinit fpu_init(void)
66{
67 unsigned long oldcr0 = read_cr0();
68 extern void __bad_fxsave_alignment(void);
69
70 if (offsetof(struct task_struct, thread.i387.fxsave) & 15)
71 __bad_fxsave_alignment();
72 set_in_cr4(X86_CR4_OSFXSR);
73 set_in_cr4(X86_CR4_OSXMMEXCPT);
74
75 write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */
76
77 mxcsr_feature_mask_init();
78 /* clean state in init */
79 current_thread_info()->status = 0;
80 clear_used_math();
81}
82#endif /* CONFIG_X86_64 */
83
84/*
85 * The _current_ task is using the FPU for the first time
86 * so initialize it and set the mxcsr to its default
87 * value at reset if we support XMM instructions and then
88 * remeber the current task has used the FPU.
89 */
90void init_fpu(struct task_struct *tsk)
91{
92 if (tsk_used_math(tsk)) {
93 if (tsk == current)
94 unlazy_fpu(tsk);
95 return;
96 }
97
98 if (cpu_has_fxsr) {
99 memset(&tsk->thread.i387.fxsave, 0,
100 sizeof(struct i387_fxsave_struct));
101 tsk->thread.i387.fxsave.cwd = 0x37f;
102 if (cpu_has_xmm)
103 tsk->thread.i387.fxsave.mxcsr = MXCSR_DEFAULT;
104 } else {
105 memset(&tsk->thread.i387.fsave, 0,
106 sizeof(struct i387_fsave_struct));
107 tsk->thread.i387.fsave.cwd = 0xffff037fu;
108 tsk->thread.i387.fsave.swd = 0xffff0000u;
109 tsk->thread.i387.fsave.twd = 0xffffffffu;
110 tsk->thread.i387.fsave.fos = 0xffff0000u;
111 }
112 /*
113 * Only the device not available exception or ptrace can call init_fpu.
114 */
115 set_stopped_child_used_math(tsk);
116}
117
118int fpregs_active(struct task_struct *target, const struct user_regset *regset)
119{
120 return tsk_used_math(target) ? regset->n : 0;
121}
122
123int xfpregs_active(struct task_struct *target, const struct user_regset *regset)
124{
125 return (cpu_has_fxsr && tsk_used_math(target)) ? regset->n : 0;
126}
127
128int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
129 unsigned int pos, unsigned int count,
130 void *kbuf, void __user *ubuf)
131{
132 if (!cpu_has_fxsr)
133 return -ENODEV;
134
135 unlazy_fpu(target);
136
137 return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
138 &target->thread.i387.fxsave, 0, -1);
139}
140
141int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
142 unsigned int pos, unsigned int count,
143 const void *kbuf, const void __user *ubuf)
144{
145 int ret;
146
147 if (!cpu_has_fxsr)
148 return -ENODEV;
149
150 unlazy_fpu(target);
151 set_stopped_child_used_math(target);
152
153 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
154 &target->thread.i387.fxsave, 0, -1);
155
156 /*
157 * mxcsr reserved bits must be masked to zero for security reasons.
158 */
159 target->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
160
161 return ret;
162}
163
164#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
165
166/*
167 * FPU tag word conversions.
168 */
169
170static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
171{
172 unsigned int tmp; /* to avoid 16 bit prefixes in the code */
173
174 /* Transform each pair of bits into 01 (valid) or 00 (empty) */
175 tmp = ~twd;
176 tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
177 /* and move the valid bits to the lower byte. */
178 tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
179 tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
180 tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
181 return tmp;
182}
183
184#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16);
185#define FP_EXP_TAG_VALID 0
186#define FP_EXP_TAG_ZERO 1
187#define FP_EXP_TAG_SPECIAL 2
188#define FP_EXP_TAG_EMPTY 3
189
190static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
191{
192 struct _fpxreg *st;
193 u32 tos = (fxsave->swd >> 11) & 7;
194 u32 twd = (unsigned long) fxsave->twd;
195 u32 tag;
196 u32 ret = 0xffff0000u;
197 int i;
198
199 for (i = 0; i < 8; i++, twd >>= 1) {
200 if (twd & 0x1) {
201 st = FPREG_ADDR(fxsave, (i - tos) & 7);
202
203 switch (st->exponent & 0x7fff) {
204 case 0x7fff:
205 tag = FP_EXP_TAG_SPECIAL;
206 break;
207 case 0x0000:
208 if (!st->significand[0] &&
209 !st->significand[1] &&
210 !st->significand[2] &&
211 !st->significand[3])
212 tag = FP_EXP_TAG_ZERO;
213 else
214 tag = FP_EXP_TAG_SPECIAL;
215 break;
216 default:
217 if (st->significand[3] & 0x8000)
218 tag = FP_EXP_TAG_VALID;
219 else
220 tag = FP_EXP_TAG_SPECIAL;
221 break;
222 }
223 } else {
224 tag = FP_EXP_TAG_EMPTY;
225 }
226 ret |= tag << (2 * i);
227 }
228 return ret;
229}
230
231/*
232 * FXSR floating point environment conversions.
233 */
234
235static void convert_from_fxsr(struct user_i387_ia32_struct *env,
236 struct task_struct *tsk)
237{
238 struct i387_fxsave_struct *fxsave = &tsk->thread.i387.fxsave;
239 struct _fpreg *to = (struct _fpreg *) &env->st_space[0];
240 struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0];
241 int i;
242
243 env->cwd = fxsave->cwd | 0xffff0000u;
244 env->swd = fxsave->swd | 0xffff0000u;
245 env->twd = twd_fxsr_to_i387(fxsave);
246
247#ifdef CONFIG_X86_64
248 env->fip = fxsave->rip;
249 env->foo = fxsave->rdp;
250 if (tsk == current) {
251 /*
252 * should be actually ds/cs at fpu exception time, but
253 * that information is not available in 64bit mode.
254 */
255 asm("mov %%ds,%0" : "=r" (env->fos));
256 asm("mov %%cs,%0" : "=r" (env->fcs));
257 } else {
258 struct pt_regs *regs = task_pt_regs(tsk);
259 env->fos = 0xffff0000 | tsk->thread.ds;
260 env->fcs = regs->cs;
261 }
262#else
263 env->fip = fxsave->fip;
264 env->fcs = fxsave->fcs;
265 env->foo = fxsave->foo;
266 env->fos = fxsave->fos;
267#endif
268
269 for (i = 0; i < 8; ++i)
270 memcpy(&to[i], &from[i], sizeof(to[0]));
271}
272
273static void convert_to_fxsr(struct task_struct *tsk,
274 const struct user_i387_ia32_struct *env)
275
276{
277 struct i387_fxsave_struct *fxsave = &tsk->thread.i387.fxsave;
278 struct _fpreg *from = (struct _fpreg *) &env->st_space[0];
279 struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0];
280 int i;
281
282 fxsave->cwd = env->cwd;
283 fxsave->swd = env->swd;
284 fxsave->twd = twd_i387_to_fxsr(env->twd);
285 fxsave->fop = (u16) ((u32) env->fcs >> 16);
286#ifdef CONFIG_X86_64
287 fxsave->rip = env->fip;
288 fxsave->rdp = env->foo;
289 /* cs and ds ignored */
290#else
291 fxsave->fip = env->fip;
292 fxsave->fcs = (env->fcs & 0xffff);
293 fxsave->foo = env->foo;
294 fxsave->fos = env->fos;
295#endif
296
297 for (i = 0; i < 8; ++i)
298 memcpy(&to[i], &from[i], sizeof(from[0]));
299}
300
301int fpregs_get(struct task_struct *target, const struct user_regset *regset,
302 unsigned int pos, unsigned int count,
303 void *kbuf, void __user *ubuf)
304{
305 struct user_i387_ia32_struct env;
306
307 if (!HAVE_HWFP)
308 return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf);
309
310 unlazy_fpu(target);
311
312 if (!cpu_has_fxsr)
313 return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
314 &target->thread.i387.fsave, 0, -1);
315
316 if (kbuf && pos == 0 && count == sizeof(env)) {
317 convert_from_fxsr(kbuf, target);
318 return 0;
319 }
320
321 convert_from_fxsr(&env, target);
322 return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
323}
324
325int fpregs_set(struct task_struct *target, const struct user_regset *regset,
326 unsigned int pos, unsigned int count,
327 const void *kbuf, const void __user *ubuf)
328{
329 struct user_i387_ia32_struct env;
330 int ret;
331
332 if (!HAVE_HWFP)
333 return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
334
335 unlazy_fpu(target);
336 set_stopped_child_used_math(target);
337
338 if (!cpu_has_fxsr)
339 return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
340 &target->thread.i387.fsave, 0, -1);
341
342 if (pos > 0 || count < sizeof(env))
343 convert_from_fxsr(&env, target);
344
345 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
346 if (!ret)
347 convert_to_fxsr(target, &env);
348
349 return ret;
350}
351
352/*
353 * Signal frame handlers.
354 */
355
356static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
357{
358 struct task_struct *tsk = current;
359
360 unlazy_fpu(tsk);
361 tsk->thread.i387.fsave.status = tsk->thread.i387.fsave.swd;
362 if (__copy_to_user(buf, &tsk->thread.i387.fsave,
363 sizeof(struct i387_fsave_struct)))
364 return -1;
365 return 1;
366}
367
368static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
369{
370 struct task_struct *tsk = current;
371 struct user_i387_ia32_struct env;
372 int err = 0;
373
374 unlazy_fpu(tsk);
375
376 convert_from_fxsr(&env, tsk);
377 if (__copy_to_user(buf, &env, sizeof(env)))
378 return -1;
379
380 err |= __put_user(tsk->thread.i387.fxsave.swd, &buf->status);
381 err |= __put_user(X86_FXSR_MAGIC, &buf->magic);
382 if (err)
383 return -1;
384
385 if (__copy_to_user(&buf->_fxsr_env[0], &tsk->thread.i387.fxsave,
386 sizeof(struct i387_fxsave_struct)))
387 return -1;
388 return 1;
389}
390
391int save_i387_ia32(struct _fpstate_ia32 __user *buf)
392{
393 if (!used_math())
394 return 0;
395
396 /* This will cause a "finit" to be triggered by the next
397 * attempted FPU operation by the 'current' process.
398 */
399 clear_used_math();
400
401 if (HAVE_HWFP) {
402 if (cpu_has_fxsr) {
403 return save_i387_fxsave(buf);
404 } else {
405 return save_i387_fsave(buf);
406 }
407 } else {
408 return fpregs_soft_get(current, NULL,
409 0, sizeof(struct user_i387_ia32_struct),
410 NULL, buf) ? -1 : 1;
411 }
412}
413
414static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
415{
416 struct task_struct *tsk = current;
417 clear_fpu(tsk);
418 return __copy_from_user(&tsk->thread.i387.fsave, buf,
419 sizeof(struct i387_fsave_struct));
420}
421
422static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf)
423{
424 int err;
425 struct task_struct *tsk = current;
426 struct user_i387_ia32_struct env;
427 clear_fpu(tsk);
428 err = __copy_from_user(&tsk->thread.i387.fxsave, &buf->_fxsr_env[0],
429 sizeof(struct i387_fxsave_struct));
430 /* mxcsr reserved bits must be masked to zero for security reasons */
431 tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
432 if (err || __copy_from_user(&env, buf, sizeof(env)))
433 return 1;
434 convert_to_fxsr(tsk, &env);
435 return 0;
436}
437
438int restore_i387_ia32(struct _fpstate_ia32 __user *buf)
439{
440 int err;
441
442 if (HAVE_HWFP) {
443 if (cpu_has_fxsr) {
444 err = restore_i387_fxsave(buf);
445 } else {
446 err = restore_i387_fsave(buf);
447 }
448 } else {
449 err = fpregs_soft_set(current, NULL,
450 0, sizeof(struct user_i387_ia32_struct),
451 NULL, buf) != 0;
452 }
453 set_used_math();
454 return err;
455}
456
457/*
458 * FPU state for core dumps.
459 * This is only used for a.out dumps now.
460 * It is declared generically using elf_fpregset_t (which is
461 * struct user_i387_struct) but is in fact only used for 32-bit
462 * dumps, so on 64-bit it is really struct user_i387_ia32_struct.
463 */
464int dump_fpu(struct pt_regs *regs, struct user_i387_struct *fpu)
465{
466 int fpvalid;
467 struct task_struct *tsk = current;
468
469 fpvalid = !!used_math();
470 if (fpvalid)
471 fpvalid = !fpregs_get(tsk, NULL,
472 0, sizeof(struct user_i387_ia32_struct),
473 fpu, NULL);
474
475 return fpvalid;
476}
477EXPORT_SYMBOL(dump_fpu);
478
479#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */
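
Note: the new i387.c keeps both tag-word conversions: twd_i387_to_fxsr() collapses the legacy 16-bit tag word (two bits per register: 00 valid, 01 zero, 10 special, 11 empty) into the FXSR 8-bit form (one bit per register, set when the register is occupied), and twd_fxsr_to_i387() re-derives the two-bit classes from each register's exponent and significand. A stand-alone copy of the compression half with one worked value:

/*
 * Stand-alone copy of the compression done by twd_i387_to_fxsr().
 */
#include <stdio.h>

static unsigned short twd_i387_to_fxsr(unsigned short twd)
{
	unsigned int tmp = ~twd;		/* 11 (empty) becomes 00            */

	tmp = (tmp | (tmp >> 1)) & 0x5555;	/* each pair -> 01 if occupied      */
	tmp = (tmp | (tmp >> 1)) & 0x3333;	/* then squeeze the valid bits ...  */
	tmp = (tmp | (tmp >> 2)) & 0x0f0f;
	tmp = (tmp | (tmp >> 4)) & 0x00ff;	/* ... down to one bit per register */
	return tmp;
}

int main(void)
{
	/* st0 valid (00), st1 zero (01), st2 special (10), st3..st7 empty (11) */
	unsigned short twd = 0xffe4;

	printf("%#x -> %#x\n", twd, twd_i387_to_fxsr(twd));	/* 0xffe4 -> 0x7 */
	return 0;
}
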
diff --git a/arch/x86/kernel/i387_32.c b/arch/x86/kernel/i387_32.c
deleted file mode 100644
index 7d2e12f6c78b..000000000000
--- a/arch/x86/kernel/i387_32.c
+++ /dev/null
@@ -1,544 +0,0 @@
1/*
2 * Copyright (C) 1994 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * General FPU state handling cleanups
6 * Gareth Hughes <gareth@valinux.com>, May 2000
7 */
8
9#include <linux/sched.h>
10#include <linux/module.h>
11#include <asm/processor.h>
12#include <asm/i387.h>
13#include <asm/math_emu.h>
14#include <asm/sigcontext.h>
15#include <asm/user.h>
16#include <asm/ptrace.h>
17#include <asm/uaccess.h>
18
19#ifdef CONFIG_MATH_EMULATION
20#define HAVE_HWFP (boot_cpu_data.hard_math)
21#else
22#define HAVE_HWFP 1
23#endif
24
25static unsigned long mxcsr_feature_mask __read_mostly = 0xffffffff;
26
27void mxcsr_feature_mask_init(void)
28{
29 unsigned long mask = 0;
30 clts();
31 if (cpu_has_fxsr) {
32 memset(&current->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
33 asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave));
34 mask = current->thread.i387.fxsave.mxcsr_mask;
35 if (mask == 0) mask = 0x0000ffbf;
36 }
37 mxcsr_feature_mask &= mask;
38 stts();
39}
40
41/*
42 * The _current_ task is using the FPU for the first time
43 * so initialize it and set the mxcsr to its default
44 * value at reset if we support XMM instructions and then
45 * remeber the current task has used the FPU.
46 */
47void init_fpu(struct task_struct *tsk)
48{
49 if (cpu_has_fxsr) {
50 memset(&tsk->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
51 tsk->thread.i387.fxsave.cwd = 0x37f;
52 if (cpu_has_xmm)
53 tsk->thread.i387.fxsave.mxcsr = 0x1f80;
54 } else {
55 memset(&tsk->thread.i387.fsave, 0, sizeof(struct i387_fsave_struct));
56 tsk->thread.i387.fsave.cwd = 0xffff037fu;
57 tsk->thread.i387.fsave.swd = 0xffff0000u;
58 tsk->thread.i387.fsave.twd = 0xffffffffu;
59 tsk->thread.i387.fsave.fos = 0xffff0000u;
60 }
61 /* only the device not available exception or ptrace can call init_fpu */
62 set_stopped_child_used_math(tsk);
63}
64
65/*
66 * FPU lazy state save handling.
67 */
68
69void kernel_fpu_begin(void)
70{
71 struct thread_info *thread = current_thread_info();
72
73 preempt_disable();
74 if (thread->status & TS_USEDFPU) {
75 __save_init_fpu(thread->task);
76 return;
77 }
78 clts();
79}
80EXPORT_SYMBOL_GPL(kernel_fpu_begin);
81
82/*
83 * FPU tag word conversions.
84 */
85
86static inline unsigned short twd_i387_to_fxsr( unsigned short twd )
87{
88 unsigned int tmp; /* to avoid 16 bit prefixes in the code */
89
90 /* Transform each pair of bits into 01 (valid) or 00 (empty) */
91 tmp = ~twd;
92 tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
93 /* and move the valid bits to the lower byte. */
94 tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
95 tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
96 tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
97 return tmp;
98}
99
100static inline unsigned long twd_fxsr_to_i387( struct i387_fxsave_struct *fxsave )
101{
102 struct _fpxreg *st = NULL;
103 unsigned long tos = (fxsave->swd >> 11) & 7;
104 unsigned long twd = (unsigned long) fxsave->twd;
105 unsigned long tag;
106 unsigned long ret = 0xffff0000u;
107 int i;
108
109#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16);
110
111 for ( i = 0 ; i < 8 ; i++ ) {
112 if ( twd & 0x1 ) {
113 st = FPREG_ADDR( fxsave, (i - tos) & 7 );
114
115 switch ( st->exponent & 0x7fff ) {
116 case 0x7fff:
117 tag = 2; /* Special */
118 break;
119 case 0x0000:
120 if ( !st->significand[0] &&
121 !st->significand[1] &&
122 !st->significand[2] &&
123 !st->significand[3] ) {
124 tag = 1; /* Zero */
125 } else {
126 tag = 2; /* Special */
127 }
128 break;
129 default:
130 if ( st->significand[3] & 0x8000 ) {
131 tag = 0; /* Valid */
132 } else {
133 tag = 2; /* Special */
134 }
135 break;
136 }
137 } else {
138 tag = 3; /* Empty */
139 }
140 ret |= (tag << (2 * i));
141 twd = twd >> 1;
142 }
143 return ret;
144}
145
146/*
147 * FPU state interaction.
148 */
149
150unsigned short get_fpu_cwd( struct task_struct *tsk )
151{
152 if ( cpu_has_fxsr ) {
153 return tsk->thread.i387.fxsave.cwd;
154 } else {
155 return (unsigned short)tsk->thread.i387.fsave.cwd;
156 }
157}
158
159unsigned short get_fpu_swd( struct task_struct *tsk )
160{
161 if ( cpu_has_fxsr ) {
162 return tsk->thread.i387.fxsave.swd;
163 } else {
164 return (unsigned short)tsk->thread.i387.fsave.swd;
165 }
166}
167
168#if 0
169unsigned short get_fpu_twd( struct task_struct *tsk )
170{
171 if ( cpu_has_fxsr ) {
172 return tsk->thread.i387.fxsave.twd;
173 } else {
174 return (unsigned short)tsk->thread.i387.fsave.twd;
175 }
176}
177#endif /* 0 */
178
179unsigned short get_fpu_mxcsr( struct task_struct *tsk )
180{
181 if ( cpu_has_xmm ) {
182 return tsk->thread.i387.fxsave.mxcsr;
183 } else {
184 return 0x1f80;
185 }
186}
187
188#if 0
189
190void set_fpu_cwd( struct task_struct *tsk, unsigned short cwd )
191{
192 if ( cpu_has_fxsr ) {
193 tsk->thread.i387.fxsave.cwd = cwd;
194 } else {
195 tsk->thread.i387.fsave.cwd = ((long)cwd | 0xffff0000u);
196 }
197}
198
199void set_fpu_swd( struct task_struct *tsk, unsigned short swd )
200{
201 if ( cpu_has_fxsr ) {
202 tsk->thread.i387.fxsave.swd = swd;
203 } else {
204 tsk->thread.i387.fsave.swd = ((long)swd | 0xffff0000u);
205 }
206}
207
208void set_fpu_twd( struct task_struct *tsk, unsigned short twd )
209{
210 if ( cpu_has_fxsr ) {
211 tsk->thread.i387.fxsave.twd = twd_i387_to_fxsr(twd);
212 } else {
213 tsk->thread.i387.fsave.twd = ((long)twd | 0xffff0000u);
214 }
215}
216
217#endif /* 0 */
218
219/*
220 * FXSR floating point environment conversions.
221 */
222
223static int convert_fxsr_to_user( struct _fpstate __user *buf,
224 struct i387_fxsave_struct *fxsave )
225{
226 unsigned long env[7];
227 struct _fpreg __user *to;
228 struct _fpxreg *from;
229 int i;
230
231 env[0] = (unsigned long)fxsave->cwd | 0xffff0000ul;
232 env[1] = (unsigned long)fxsave->swd | 0xffff0000ul;
233 env[2] = twd_fxsr_to_i387(fxsave);
234 env[3] = fxsave->fip;
235 env[4] = fxsave->fcs | ((unsigned long)fxsave->fop << 16);
236 env[5] = fxsave->foo;
237 env[6] = fxsave->fos;
238
239 if ( __copy_to_user( buf, env, 7 * sizeof(unsigned long) ) )
240 return 1;
241
242 to = &buf->_st[0];
243 from = (struct _fpxreg *) &fxsave->st_space[0];
244 for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
245 unsigned long __user *t = (unsigned long __user *)to;
246 unsigned long *f = (unsigned long *)from;
247
248 if (__put_user(*f, t) ||
249 __put_user(*(f + 1), t + 1) ||
250 __put_user(from->exponent, &to->exponent))
251 return 1;
252 }
253 return 0;
254}
255
256static int convert_fxsr_from_user( struct i387_fxsave_struct *fxsave,
257 struct _fpstate __user *buf )
258{
259 unsigned long env[7];
260 struct _fpxreg *to;
261 struct _fpreg __user *from;
262 int i;
263
264 if ( __copy_from_user( env, buf, 7 * sizeof(long) ) )
265 return 1;
266
267 fxsave->cwd = (unsigned short)(env[0] & 0xffff);
268 fxsave->swd = (unsigned short)(env[1] & 0xffff);
269 fxsave->twd = twd_i387_to_fxsr((unsigned short)(env[2] & 0xffff));
270 fxsave->fip = env[3];
271 fxsave->fop = (unsigned short)((env[4] & 0xffff0000ul) >> 16);
272 fxsave->fcs = (env[4] & 0xffff);
273 fxsave->foo = env[5];
274 fxsave->fos = env[6];
275
276 to = (struct _fpxreg *) &fxsave->st_space[0];
277 from = &buf->_st[0];
278 for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
279 unsigned long *t = (unsigned long *)to;
280 unsigned long __user *f = (unsigned long __user *)from;
281
282 if (__get_user(*t, f) ||
283 __get_user(*(t + 1), f + 1) ||
284 __get_user(to->exponent, &from->exponent))
285 return 1;
286 }
287 return 0;
288}
289
290/*
291 * Signal frame handlers.
292 */
293
294static inline int save_i387_fsave( struct _fpstate __user *buf )
295{
296 struct task_struct *tsk = current;
297
298 unlazy_fpu( tsk );
299 tsk->thread.i387.fsave.status = tsk->thread.i387.fsave.swd;
300 if ( __copy_to_user( buf, &tsk->thread.i387.fsave,
301 sizeof(struct i387_fsave_struct) ) )
302 return -1;
303 return 1;
304}
305
306static int save_i387_fxsave( struct _fpstate __user *buf )
307{
308 struct task_struct *tsk = current;
309 int err = 0;
310
311 unlazy_fpu( tsk );
312
313 if ( convert_fxsr_to_user( buf, &tsk->thread.i387.fxsave ) )
314 return -1;
315
316 err |= __put_user( tsk->thread.i387.fxsave.swd, &buf->status );
317 err |= __put_user( X86_FXSR_MAGIC, &buf->magic );
318 if ( err )
319 return -1;
320
321 if ( __copy_to_user( &buf->_fxsr_env[0], &tsk->thread.i387.fxsave,
322 sizeof(struct i387_fxsave_struct) ) )
323 return -1;
324 return 1;
325}
326
327int save_i387( struct _fpstate __user *buf )
328{
329 if ( !used_math() )
330 return 0;
331
332 /* This will cause a "finit" to be triggered by the next
333 * attempted FPU operation by the 'current' process.
334 */
335 clear_used_math();
336
337 if ( HAVE_HWFP ) {
338 if ( cpu_has_fxsr ) {
339 return save_i387_fxsave( buf );
340 } else {
341 return save_i387_fsave( buf );
342 }
343 } else {
344 return save_i387_soft( &current->thread.i387.soft, buf );
345 }
346}
347
348static inline int restore_i387_fsave( struct _fpstate __user *buf )
349{
350 struct task_struct *tsk = current;
351 clear_fpu( tsk );
352 return __copy_from_user( &tsk->thread.i387.fsave, buf,
353 sizeof(struct i387_fsave_struct) );
354}
355
356static int restore_i387_fxsave( struct _fpstate __user *buf )
357{
358 int err;
359 struct task_struct *tsk = current;
360 clear_fpu( tsk );
361 err = __copy_from_user( &tsk->thread.i387.fxsave, &buf->_fxsr_env[0],
362 sizeof(struct i387_fxsave_struct) );
363 /* mxcsr reserved bits must be masked to zero for security reasons */
364 tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
365 return err ? 1 : convert_fxsr_from_user( &tsk->thread.i387.fxsave, buf );
366}
367
368int restore_i387( struct _fpstate __user *buf )
369{
370 int err;
371
372 if ( HAVE_HWFP ) {
373 if ( cpu_has_fxsr ) {
374 err = restore_i387_fxsave( buf );
375 } else {
376 err = restore_i387_fsave( buf );
377 }
378 } else {
379 err = restore_i387_soft( &current->thread.i387.soft, buf );
380 }
381 set_used_math();
382 return err;
383}
384
385/*
386 * ptrace request handlers.
387 */
388
389static inline int get_fpregs_fsave( struct user_i387_struct __user *buf,
390 struct task_struct *tsk )
391{
392 return __copy_to_user( buf, &tsk->thread.i387.fsave,
393 sizeof(struct user_i387_struct) );
394}
395
396static inline int get_fpregs_fxsave( struct user_i387_struct __user *buf,
397 struct task_struct *tsk )
398{
399 return convert_fxsr_to_user( (struct _fpstate __user *)buf,
400 &tsk->thread.i387.fxsave );
401}
402
403int get_fpregs( struct user_i387_struct __user *buf, struct task_struct *tsk )
404{
405 if ( HAVE_HWFP ) {
406 if ( cpu_has_fxsr ) {
407 return get_fpregs_fxsave( buf, tsk );
408 } else {
409 return get_fpregs_fsave( buf, tsk );
410 }
411 } else {
412 return save_i387_soft( &tsk->thread.i387.soft,
413 (struct _fpstate __user *)buf );
414 }
415}
416
417static inline int set_fpregs_fsave( struct task_struct *tsk,
418 struct user_i387_struct __user *buf )
419{
420 return __copy_from_user( &tsk->thread.i387.fsave, buf,
421 sizeof(struct user_i387_struct) );
422}
423
424static inline int set_fpregs_fxsave( struct task_struct *tsk,
425 struct user_i387_struct __user *buf )
426{
427 return convert_fxsr_from_user( &tsk->thread.i387.fxsave,
428 (struct _fpstate __user *)buf );
429}
430
431int set_fpregs( struct task_struct *tsk, struct user_i387_struct __user *buf )
432{
433 if ( HAVE_HWFP ) {
434 if ( cpu_has_fxsr ) {
435 return set_fpregs_fxsave( tsk, buf );
436 } else {
437 return set_fpregs_fsave( tsk, buf );
438 }
439 } else {
440 return restore_i387_soft( &tsk->thread.i387.soft,
441 (struct _fpstate __user *)buf );
442 }
443}
444
445int get_fpxregs( struct user_fxsr_struct __user *buf, struct task_struct *tsk )
446{
447 if ( cpu_has_fxsr ) {
448 if (__copy_to_user( buf, &tsk->thread.i387.fxsave,
449 sizeof(struct user_fxsr_struct) ))
450 return -EFAULT;
451 return 0;
452 } else {
453 return -EIO;
454 }
455}
456
457int set_fpxregs( struct task_struct *tsk, struct user_fxsr_struct __user *buf )
458{
459 int ret = 0;
460
461 if ( cpu_has_fxsr ) {
462 if (__copy_from_user( &tsk->thread.i387.fxsave, buf,
463 sizeof(struct user_fxsr_struct) ))
464 ret = -EFAULT;
465 /* mxcsr reserved bits must be masked to zero for security reasons */
466 tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
467 } else {
468 ret = -EIO;
469 }
470 return ret;
471}
472
473/*
474 * FPU state for core dumps.
475 */
476
477static inline void copy_fpu_fsave( struct task_struct *tsk,
478 struct user_i387_struct *fpu )
479{
480 memcpy( fpu, &tsk->thread.i387.fsave,
481 sizeof(struct user_i387_struct) );
482}
483
484static inline void copy_fpu_fxsave( struct task_struct *tsk,
485 struct user_i387_struct *fpu )
486{
487 unsigned short *to;
488 unsigned short *from;
489 int i;
490
491 memcpy( fpu, &tsk->thread.i387.fxsave, 7 * sizeof(long) );
492
493 to = (unsigned short *)&fpu->st_space[0];
494 from = (unsigned short *)&tsk->thread.i387.fxsave.st_space[0];
495 for ( i = 0 ; i < 8 ; i++, to += 5, from += 8 ) {
496 memcpy( to, from, 5 * sizeof(unsigned short) );
497 }
498}
499
500int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu )
501{
502 int fpvalid;
503 struct task_struct *tsk = current;
504
505 fpvalid = !!used_math();
506 if ( fpvalid ) {
507 unlazy_fpu( tsk );
508 if ( cpu_has_fxsr ) {
509 copy_fpu_fxsave( tsk, fpu );
510 } else {
511 copy_fpu_fsave( tsk, fpu );
512 }
513 }
514
515 return fpvalid;
516}
517EXPORT_SYMBOL(dump_fpu);
518
519int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu)
520{
521 int fpvalid = !!tsk_used_math(tsk);
522
523 if (fpvalid) {
524 if (tsk == current)
525 unlazy_fpu(tsk);
526 if (cpu_has_fxsr)
527 copy_fpu_fxsave(tsk, fpu);
528 else
529 copy_fpu_fsave(tsk, fpu);
530 }
531 return fpvalid;
532}
533
534int dump_task_extended_fpu(struct task_struct *tsk, struct user_fxsr_struct *fpu)
535{
536 int fpvalid = tsk_used_math(tsk) && cpu_has_fxsr;
537
538 if (fpvalid) {
539 if (tsk == current)
540 unlazy_fpu(tsk);
541 memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(*fpu));
542 }
543 return fpvalid;
544}
diff --git a/arch/x86/kernel/i387_64.c b/arch/x86/kernel/i387_64.c
deleted file mode 100644
index bfaff28fb134..000000000000
--- a/arch/x86/kernel/i387_64.c
+++ /dev/null
@@ -1,150 +0,0 @@
1/*
2 * Copyright (C) 1994 Linus Torvalds
3 * Copyright (C) 2002 Andi Kleen, SuSE Labs
4 *
5 * Pentium III FXSR, SSE support
6 * General FPU state handling cleanups
7 * Gareth Hughes <gareth@valinux.com>, May 2000
8 *
9 * x86-64 rework 2002 Andi Kleen.
10 * Does direct fxsave in and out of user space now for signal handlers.
11 * All the FSAVE<->FXSAVE conversion code has been moved to the 32bit emulation,
12 * the 64bit user space sees a FXSAVE frame directly.
13 */
14
15#include <linux/sched.h>
16#include <linux/init.h>
17#include <asm/processor.h>
18#include <asm/i387.h>
19#include <asm/sigcontext.h>
20#include <asm/user.h>
21#include <asm/ptrace.h>
22#include <asm/uaccess.h>
23
24unsigned int mxcsr_feature_mask __read_mostly = 0xffffffff;
25
26void mxcsr_feature_mask_init(void)
27{
28 unsigned int mask;
29 clts();
30 memset(&current->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
31 asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave));
32 mask = current->thread.i387.fxsave.mxcsr_mask;
33 if (mask == 0) mask = 0x0000ffbf;
34 mxcsr_feature_mask &= mask;
35 stts();
36}
37
38/*
39 * Called at bootup to set up the initial FPU state that is later cloned
40 * into all processes.
41 */
42void __cpuinit fpu_init(void)
43{
44 unsigned long oldcr0 = read_cr0();
45 extern void __bad_fxsave_alignment(void);
46
47 if (offsetof(struct task_struct, thread.i387.fxsave) & 15)
48 __bad_fxsave_alignment();
49 set_in_cr4(X86_CR4_OSFXSR);
50 set_in_cr4(X86_CR4_OSXMMEXCPT);
51
52 write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */
53
54 mxcsr_feature_mask_init();
55 /* clean state in init */
56 current_thread_info()->status = 0;
57 clear_used_math();
58}
59
60void init_fpu(struct task_struct *child)
61{
62 if (tsk_used_math(child)) {
63 if (child == current)
64 unlazy_fpu(child);
65 return;
66 }
67 memset(&child->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
68 child->thread.i387.fxsave.cwd = 0x37f;
69 child->thread.i387.fxsave.mxcsr = 0x1f80;
70 /* only the device not available exception or ptrace can call init_fpu */
71 set_stopped_child_used_math(child);
72}
73
74/*
75 * Signal frame handlers.
76 */
77
78int save_i387(struct _fpstate __user *buf)
79{
80 struct task_struct *tsk = current;
81 int err = 0;
82
83 BUILD_BUG_ON(sizeof(struct user_i387_struct) !=
84 sizeof(tsk->thread.i387.fxsave));
85
86 if ((unsigned long)buf % 16)
87 printk("save_i387: bad fpstate %p\n",buf);
88
89 if (!used_math())
90 return 0;
91 clear_used_math(); /* trigger finit */
92 if (task_thread_info(tsk)->status & TS_USEDFPU) {
93 err = save_i387_checking((struct i387_fxsave_struct __user *)buf);
94 if (err) return err;
95 task_thread_info(tsk)->status &= ~TS_USEDFPU;
96 stts();
97 } else {
98 if (__copy_to_user(buf, &tsk->thread.i387.fxsave,
99 sizeof(struct i387_fxsave_struct)))
100 return -1;
101 }
102 return 1;
103}
104
105/*
106 * ptrace request handlers.
107 */
108
109int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *tsk)
110{
111 init_fpu(tsk);
112 return __copy_to_user(buf, &tsk->thread.i387.fxsave,
113 sizeof(struct user_i387_struct)) ? -EFAULT : 0;
114}
115
116int set_fpregs(struct task_struct *tsk, struct user_i387_struct __user *buf)
117{
118 if (__copy_from_user(&tsk->thread.i387.fxsave, buf,
119 sizeof(struct user_i387_struct)))
120 return -EFAULT;
121 return 0;
122}
123
124/*
125 * FPU state for core dumps.
126 */
127
128int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu )
129{
130 struct task_struct *tsk = current;
131
132 if (!used_math())
133 return 0;
134
135 unlazy_fpu(tsk);
136 memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct));
137 return 1;
138}
139
140int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu)
141{
142 int fpvalid = !!tsk_used_math(tsk);
143
144 if (fpvalid) {
145 if (tsk == current)
146 unlazy_fpu(tsk);
147 memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct));
148}
149 return fpvalid;
150}
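
fpu_init() in the deleted file above enforces the 16-byte alignment that the fxsave instruction requires with a link-time trick: __bad_fxsave_alignment() is declared but never defined, so unless the offsetof() test folds away to false at compile time, the build fails at link time. A hedged sketch of the idiom (the helper name below is illustrative, not a kernel symbol):

/* Never defined anywhere; referencing it breaks the link. */
extern void __fxsave_buffer_misaligned(void);

#define BUILD_CHECK_FXSAVE_ALIGNED(offset)			\
	do {							\
		if ((offset) & 15)				\
			__fxsave_buffer_misaligned();		\
	} while (0)

/*
 * When 'offset' is a compile-time constant multiple of 16, the compiler
 * (building with optimization, as the kernel does) drops the branch and
 * the undefined symbol is never referenced; otherwise linking fails,
 * catching the misalignment at build time.
 */
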
diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c
index 29313832df0c..dbd6c1d1b638 100644
--- a/arch/x86/kernel/i8237.c
+++ b/arch/x86/kernel/i8237.c
@@ -51,7 +51,7 @@ static int i8237A_suspend(struct sys_device *dev, pm_message_t state)
51} 51}
52 52
53static struct sysdev_class i8237_sysdev_class = { 53static struct sysdev_class i8237_sysdev_class = {
54 set_kset_name("i8237"), 54 .name = "i8237",
55 .suspend = i8237A_suspend, 55 .suspend = i8237A_suspend,
56 .resume = i8237A_resume, 56 .resume = i8237A_resume,
57}; 57};
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index a42c80745325..ef62b07b2b48 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -13,10 +13,17 @@
13#include <asm/delay.h> 13#include <asm/delay.h>
14#include <asm/i8253.h> 14#include <asm/i8253.h>
15#include <asm/io.h> 15#include <asm/io.h>
16#include <asm/hpet.h>
16 17
17DEFINE_SPINLOCK(i8253_lock); 18DEFINE_SPINLOCK(i8253_lock);
18EXPORT_SYMBOL(i8253_lock); 19EXPORT_SYMBOL(i8253_lock);
19 20
21#ifdef CONFIG_X86_32
22static void pit_disable_clocksource(void);
23#else
24static inline void pit_disable_clocksource(void) { }
25#endif
26
20/* 27/*
21 * HPET replaces the PIT, when enabled. So we need to know, which of 28 * HPET replaces the PIT, when enabled. So we need to know, which of
22 * the two timers is used 29 * the two timers is used
@@ -31,38 +38,38 @@ struct clock_event_device *global_clock_event;
31static void init_pit_timer(enum clock_event_mode mode, 38static void init_pit_timer(enum clock_event_mode mode,
32 struct clock_event_device *evt) 39 struct clock_event_device *evt)
33{ 40{
34 unsigned long flags; 41 spin_lock(&i8253_lock);
35
36 spin_lock_irqsave(&i8253_lock, flags);
37 42
38 switch(mode) { 43 switch(mode) {
39 case CLOCK_EVT_MODE_PERIODIC: 44 case CLOCK_EVT_MODE_PERIODIC:
40 /* binary, mode 2, LSB/MSB, ch 0 */ 45 /* binary, mode 2, LSB/MSB, ch 0 */
41 outb_p(0x34, PIT_MODE); 46 outb_pit(0x34, PIT_MODE);
42 outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ 47 outb_pit(LATCH & 0xff , PIT_CH0); /* LSB */
43 outb(LATCH >> 8 , PIT_CH0); /* MSB */ 48 outb_pit(LATCH >> 8 , PIT_CH0); /* MSB */
44 break; 49 break;
45 50
46 case CLOCK_EVT_MODE_SHUTDOWN: 51 case CLOCK_EVT_MODE_SHUTDOWN:
47 case CLOCK_EVT_MODE_UNUSED: 52 case CLOCK_EVT_MODE_UNUSED:
48 if (evt->mode == CLOCK_EVT_MODE_PERIODIC || 53 if (evt->mode == CLOCK_EVT_MODE_PERIODIC ||
49 evt->mode == CLOCK_EVT_MODE_ONESHOT) { 54 evt->mode == CLOCK_EVT_MODE_ONESHOT) {
50 outb_p(0x30, PIT_MODE); 55 outb_pit(0x30, PIT_MODE);
51 outb_p(0, PIT_CH0); 56 outb_pit(0, PIT_CH0);
52 outb_p(0, PIT_CH0); 57 outb_pit(0, PIT_CH0);
53 } 58 }
59 pit_disable_clocksource();
54 break; 60 break;
55 61
56 case CLOCK_EVT_MODE_ONESHOT: 62 case CLOCK_EVT_MODE_ONESHOT:
57 /* One shot setup */ 63 /* One shot setup */
58 outb_p(0x38, PIT_MODE); 64 pit_disable_clocksource();
65 outb_pit(0x38, PIT_MODE);
59 break; 66 break;
60 67
61 case CLOCK_EVT_MODE_RESUME: 68 case CLOCK_EVT_MODE_RESUME:
62 /* Nothing to do here */ 69 /* Nothing to do here */
63 break; 70 break;
64 } 71 }
65 spin_unlock_irqrestore(&i8253_lock, flags); 72 spin_unlock(&i8253_lock);
66} 73}
67 74
68/* 75/*
@@ -72,12 +79,10 @@ static void init_pit_timer(enum clock_event_mode mode,
72 */ 79 */
73static int pit_next_event(unsigned long delta, struct clock_event_device *evt) 80static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
74{ 81{
75 unsigned long flags; 82 spin_lock(&i8253_lock);
76 83 outb_pit(delta & 0xff , PIT_CH0); /* LSB */
77 spin_lock_irqsave(&i8253_lock, flags); 84 outb_pit(delta >> 8 , PIT_CH0); /* MSB */
78 outb_p(delta & 0xff , PIT_CH0); /* LSB */ 85 spin_unlock(&i8253_lock);
79 outb(delta >> 8 , PIT_CH0); /* MSB */
80 spin_unlock_irqrestore(&i8253_lock, flags);
81 86
82 return 0; 87 return 0;
83} 88}
@@ -148,15 +153,15 @@ static cycle_t pit_read(void)
148 * count), it cannot be newer. 153 * count), it cannot be newer.
149 */ 154 */
150 jifs = jiffies; 155 jifs = jiffies;
151 outb_p(0x00, PIT_MODE); /* latch the count ASAP */ 156 outb_pit(0x00, PIT_MODE); /* latch the count ASAP */
152 count = inb_p(PIT_CH0); /* read the latched count */ 157 count = inb_pit(PIT_CH0); /* read the latched count */
153 count |= inb_p(PIT_CH0) << 8; 158 count |= inb_pit(PIT_CH0) << 8;
154 159
155 /* VIA686a test code... reset the latch if count > max + 1 */ 160 /* VIA686a test code... reset the latch if count > max + 1 */
156 if (count > LATCH) { 161 if (count > LATCH) {
157 outb_p(0x34, PIT_MODE); 162 outb_pit(0x34, PIT_MODE);
158 outb_p(LATCH & 0xff, PIT_CH0); 163 outb_pit(LATCH & 0xff, PIT_CH0);
159 outb(LATCH >> 8, PIT_CH0); 164 outb_pit(LATCH >> 8, PIT_CH0);
160 count = LATCH - 1; 165 count = LATCH - 1;
161 } 166 }
162 167
@@ -195,9 +200,28 @@ static struct clocksource clocksource_pit = {
195 .shift = 20, 200 .shift = 20,
196}; 201};
197 202
203static void pit_disable_clocksource(void)
204{
205 /*
206 * Use mult to check whether it is registered or not
207 */
208 if (clocksource_pit.mult) {
209 clocksource_unregister(&clocksource_pit);
210 clocksource_pit.mult = 0;
211 }
212}
213
198static int __init init_pit_clocksource(void) 214static int __init init_pit_clocksource(void)
199{ 215{
200 if (num_possible_cpus() > 1) /* PIT does not scale! */ 216 /*
217 * Several reasons not to register PIT as a clocksource:
218 *
219 * - On SMP PIT does not scale due to i8253_lock
220 * - when HPET is enabled
221 * - when local APIC timer is active (PIT is switched off)
222 */
223 if (num_possible_cpus() > 1 || is_hpet_enabled() ||
224 pit_clockevent.mode != CLOCK_EVT_MODE_PERIODIC)
201 return 0; 225 return 0;
202 226
203 clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20); 227 clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20);
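
For reference, the LATCH value programmed into channel 0 by init_pit_timer() above is the PIT reload count for one timer tick. A small sketch of the arithmetic and the LSB/MSB split; the rounding mirrors the kernel's LATCH definition, and the constants here are examples rather than quotes of the headers:

#define PIT_TICK_RATE	1193182UL	/* PIT input clock in Hz */
#define HZ_EXAMPLE	1000UL		/* example tick rate */

/* Reload value for one tick, rounded to nearest. */
#define LATCH_EXAMPLE	((PIT_TICK_RATE + HZ_EXAMPLE / 2) / HZ_EXAMPLE)

/* Channel 0 is programmed LSB first, then MSB, as in the hunk above. */
static unsigned char latch_lsb(void) { return LATCH_EXAMPLE & 0xff; }
static unsigned char latch_msb(void) { return LATCH_EXAMPLE >> 8; }
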
diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c
index f634fc715c99..2d25b77102fe 100644
--- a/arch/x86/kernel/i8259_32.c
+++ b/arch/x86/kernel/i8259_32.c
@@ -21,8 +21,6 @@
21#include <asm/arch_hooks.h> 21#include <asm/arch_hooks.h>
22#include <asm/i8259.h> 22#include <asm/i8259.h>
23 23
24#include <io_ports.h>
25
26/* 24/*
27 * This is the 'legacy' 8259A Programmable Interrupt Controller, 25 * This is the 'legacy' 8259A Programmable Interrupt Controller,
28 * present in the majority of PC/AT boxes. 26 * present in the majority of PC/AT boxes.
@@ -258,7 +256,7 @@ static int i8259A_shutdown(struct sys_device *dev)
258} 256}
259 257
260static struct sysdev_class i8259_sysdev_class = { 258static struct sysdev_class i8259_sysdev_class = {
261 set_kset_name("i8259"), 259 .name = "i8259",
262 .suspend = i8259A_suspend, 260 .suspend = i8259A_suspend,
263 .resume = i8259A_resume, 261 .resume = i8259A_resume,
264 .shutdown = i8259A_shutdown, 262 .shutdown = i8259A_shutdown,
@@ -291,20 +289,20 @@ void init_8259A(int auto_eoi)
291 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ 289 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
292 290
293 /* 291 /*
294 * outb_p - this has to work on a wide range of PC hardware. 292 * outb_pic - this has to work on a wide range of PC hardware.
295 */ 293 */
296 outb_p(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ 294 outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
297 outb_p(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ 295 outb_pic(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */
298 outb_p(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ 296 outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */
299 if (auto_eoi) /* master does Auto EOI */ 297 if (auto_eoi) /* master does Auto EOI */
300 outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); 298 outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
301 else /* master expects normal EOI */ 299 else /* master expects normal EOI */
302 outb_p(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); 300 outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
303 301
304 outb_p(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ 302 outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */
305 outb_p(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ 303 outb_pic(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */
306 outb_p(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ 304 outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */
307 outb_p(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ 305 outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */
308 if (auto_eoi) 306 if (auto_eoi)
309 /* 307 /*
310 * In AEOI mode we just have to mask the interrupt 308 * In AEOI mode we just have to mask the interrupt
@@ -341,7 +339,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id)
341 outb(0,0xF0); 339 outb(0,0xF0);
342 if (ignore_fpu_irq || !boot_cpu_data.hard_math) 340 if (ignore_fpu_irq || !boot_cpu_data.hard_math)
343 return IRQ_NONE; 341 return IRQ_NONE;
344 math_error((void __user *)get_irq_regs()->eip); 342 math_error((void __user *)get_irq_regs()->ip);
345 return IRQ_HANDLED; 343 return IRQ_HANDLED;
346} 344}
347 345
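
The eip to ip change in math_error_irq() above is part of a rename that recurs throughout this commit (also in io_apic_64.c, ioport.c, irq_32.c and irq_64.c below): struct pt_regs now uses the same member names on 32-bit and 64-bit. A sketch of the mapping; the member selection and order here are illustrative only, not the real layout:

/* Old 32-bit name    Old 64-bit name     Unified name */
struct pt_regs_naming_sketch {
	unsigned long ax;	/* eax      / rax      -> ax      */
	unsigned long orig_ax;	/* orig_eax / orig_rax -> orig_ax */
	unsigned long ip;	/* eip      / rip      -> ip      */
	unsigned long flags;	/* eflags   / rflags   -> flags   */
	unsigned long sp;	/* esp      / rsp      -> sp      */
};
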
diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c
index 3f27ea0b9816..fa57a1568508 100644
--- a/arch/x86/kernel/i8259_64.c
+++ b/arch/x86/kernel/i8259_64.c
@@ -21,6 +21,7 @@
21#include <asm/delay.h> 21#include <asm/delay.h>
22#include <asm/desc.h> 22#include <asm/desc.h>
23#include <asm/apic.h> 23#include <asm/apic.h>
24#include <asm/i8259.h>
24 25
25/* 26/*
26 * Common place to define all x86 IRQ vectors 27 * Common place to define all x86 IRQ vectors
@@ -48,7 +49,7 @@
48 */ 49 */
49 50
50/* 51/*
51 * The IO-APIC gives us many more interrupt sources. Most of these 52 * The IO-APIC gives us many more interrupt sources. Most of these
52 * are unused but an SMP system is supposed to have enough memory ... 53 * are unused but an SMP system is supposed to have enough memory ...
53 * sometimes (mostly wrt. hw bugs) we get corrupted vectors all 54 * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
54 * across the spectrum, so we really want to be prepared to get all 55 * across the spectrum, so we really want to be prepared to get all
@@ -76,7 +77,7 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
76 IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) 77 IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
77 78
78/* for the irq vectors */ 79/* for the irq vectors */
79static void (*interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { 80static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
80 IRQLIST_16(0x2), IRQLIST_16(0x3), 81 IRQLIST_16(0x2), IRQLIST_16(0x3),
81 IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), 82 IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
82 IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), 83 IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
@@ -114,11 +115,7 @@ static struct irq_chip i8259A_chip = {
114/* 115/*
115 * This contains the irq mask for both 8259A irq controllers, 116 * This contains the irq mask for both 8259A irq controllers,
116 */ 117 */
117static unsigned int cached_irq_mask = 0xffff; 118unsigned int cached_irq_mask = 0xffff;
118
119#define __byte(x,y) (((unsigned char *)&(y))[x])
120#define cached_21 (__byte(0,cached_irq_mask))
121#define cached_A1 (__byte(1,cached_irq_mask))
122 119
123/* 120/*
124 * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) 121 * Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
@@ -139,9 +136,9 @@ void disable_8259A_irq(unsigned int irq)
139 spin_lock_irqsave(&i8259A_lock, flags); 136 spin_lock_irqsave(&i8259A_lock, flags);
140 cached_irq_mask |= mask; 137 cached_irq_mask |= mask;
141 if (irq & 8) 138 if (irq & 8)
142 outb(cached_A1,0xA1); 139 outb(cached_slave_mask, PIC_SLAVE_IMR);
143 else 140 else
144 outb(cached_21,0x21); 141 outb(cached_master_mask, PIC_MASTER_IMR);
145 spin_unlock_irqrestore(&i8259A_lock, flags); 142 spin_unlock_irqrestore(&i8259A_lock, flags);
146} 143}
147 144
@@ -153,9 +150,9 @@ void enable_8259A_irq(unsigned int irq)
153 spin_lock_irqsave(&i8259A_lock, flags); 150 spin_lock_irqsave(&i8259A_lock, flags);
154 cached_irq_mask &= mask; 151 cached_irq_mask &= mask;
155 if (irq & 8) 152 if (irq & 8)
156 outb(cached_A1,0xA1); 153 outb(cached_slave_mask, PIC_SLAVE_IMR);
157 else 154 else
158 outb(cached_21,0x21); 155 outb(cached_master_mask, PIC_MASTER_IMR);
159 spin_unlock_irqrestore(&i8259A_lock, flags); 156 spin_unlock_irqrestore(&i8259A_lock, flags);
160} 157}
161 158
@@ -167,9 +164,9 @@ int i8259A_irq_pending(unsigned int irq)
167 164
168 spin_lock_irqsave(&i8259A_lock, flags); 165 spin_lock_irqsave(&i8259A_lock, flags);
169 if (irq < 8) 166 if (irq < 8)
170 ret = inb(0x20) & mask; 167 ret = inb(PIC_MASTER_CMD) & mask;
171 else 168 else
172 ret = inb(0xA0) & (mask >> 8); 169 ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
173 spin_unlock_irqrestore(&i8259A_lock, flags); 170 spin_unlock_irqrestore(&i8259A_lock, flags);
174 171
175 return ret; 172 return ret;
@@ -196,14 +193,14 @@ static inline int i8259A_irq_real(unsigned int irq)
196 int irqmask = 1<<irq; 193 int irqmask = 1<<irq;
197 194
198 if (irq < 8) { 195 if (irq < 8) {
199 outb(0x0B,0x20); /* ISR register */ 196 outb(0x0B,PIC_MASTER_CMD); /* ISR register */
200 value = inb(0x20) & irqmask; 197 value = inb(PIC_MASTER_CMD) & irqmask;
201 outb(0x0A,0x20); /* back to the IRR register */ 198 outb(0x0A,PIC_MASTER_CMD); /* back to the IRR register */
202 return value; 199 return value;
203 } 200 }
204 outb(0x0B,0xA0); /* ISR register */ 201 outb(0x0B,PIC_SLAVE_CMD); /* ISR register */
205 value = inb(0xA0) & (irqmask >> 8); 202 value = inb(PIC_SLAVE_CMD) & (irqmask >> 8);
206 outb(0x0A,0xA0); /* back to the IRR register */ 203 outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */
207 return value; 204 return value;
208} 205}
209 206
@@ -240,14 +237,17 @@ static void mask_and_ack_8259A(unsigned int irq)
240 237
241handle_real_irq: 238handle_real_irq:
242 if (irq & 8) { 239 if (irq & 8) {
243 inb(0xA1); /* DUMMY - (do we need this?) */ 240 inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */
244 outb(cached_A1,0xA1); 241 outb(cached_slave_mask, PIC_SLAVE_IMR);
245 outb(0x60+(irq&7),0xA0);/* 'Specific EOI' to slave */ 242 /* 'Specific EOI' to slave */
246 outb(0x62,0x20); /* 'Specific EOI' to master-IRQ2 */ 243 outb(0x60+(irq&7),PIC_SLAVE_CMD);
244 /* 'Specific EOI' to master-IRQ2 */
245 outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD);
247 } else { 246 } else {
248 inb(0x21); /* DUMMY - (do we need this?) */ 247 inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */
249 outb(cached_21,0x21); 248 outb(cached_master_mask, PIC_MASTER_IMR);
250 outb(0x60+irq,0x20); /* 'Specific EOI' to master */ 249 /* 'Specific EOI' to master */
250 outb(0x60+irq,PIC_MASTER_CMD);
251 } 251 }
252 spin_unlock_irqrestore(&i8259A_lock, flags); 252 spin_unlock_irqrestore(&i8259A_lock, flags);
253 return; 253 return;
@@ -270,7 +270,8 @@ spurious_8259A_irq:
270 * lets ACK and report it. [once per IRQ] 270 * lets ACK and report it. [once per IRQ]
271 */ 271 */
272 if (!(spurious_irq_mask & irqmask)) { 272 if (!(spurious_irq_mask & irqmask)) {
273 printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); 273 printk(KERN_DEBUG
274 "spurious 8259A interrupt: IRQ%d.\n", irq);
274 spurious_irq_mask |= irqmask; 275 spurious_irq_mask |= irqmask;
275 } 276 }
276 atomic_inc(&irq_err_count); 277 atomic_inc(&irq_err_count);
@@ -283,51 +284,6 @@ spurious_8259A_irq:
283 } 284 }
284} 285}
285 286
286void init_8259A(int auto_eoi)
287{
288 unsigned long flags;
289
290 i8259A_auto_eoi = auto_eoi;
291
292 spin_lock_irqsave(&i8259A_lock, flags);
293
294 outb(0xff, 0x21); /* mask all of 8259A-1 */
295 outb(0xff, 0xA1); /* mask all of 8259A-2 */
296
297 /*
298 * outb_p - this has to work on a wide range of PC hardware.
299 */
300 outb_p(0x11, 0x20); /* ICW1: select 8259A-1 init */
301 outb_p(IRQ0_VECTOR, 0x21); /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
302 outb_p(0x04, 0x21); /* 8259A-1 (the master) has a slave on IR2 */
303 if (auto_eoi)
304 outb_p(0x03, 0x21); /* master does Auto EOI */
305 else
306 outb_p(0x01, 0x21); /* master expects normal EOI */
307
308 outb_p(0x11, 0xA0); /* ICW1: select 8259A-2 init */
309 outb_p(IRQ8_VECTOR, 0xA1); /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */
310 outb_p(0x02, 0xA1); /* 8259A-2 is a slave on master's IR2 */
311 outb_p(0x01, 0xA1); /* (slave's support for AEOI in flat mode
312 is to be investigated) */
313
314 if (auto_eoi)
315 /*
316 * in AEOI mode we just have to mask the interrupt
317 * when acking.
318 */
319 i8259A_chip.mask_ack = disable_8259A_irq;
320 else
321 i8259A_chip.mask_ack = mask_and_ack_8259A;
322
323 udelay(100); /* wait for 8259A to initialize */
324
325 outb(cached_21, 0x21); /* restore master IRQ mask */
326 outb(cached_A1, 0xA1); /* restore slave IRQ mask */
327
328 spin_unlock_irqrestore(&i8259A_lock, flags);
329}
330
331static char irq_trigger[2]; 287static char irq_trigger[2];
332/** 288/**
333 * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ 289 * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
@@ -364,13 +320,13 @@ static int i8259A_shutdown(struct sys_device *dev)
364 * the kernel initialization code can get it 320 * the kernel initialization code can get it
365 * out of. 321 * out of.
366 */ 322 */
367 outb(0xff, 0x21); /* mask all of 8259A-1 */ 323 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
368 outb(0xff, 0xA1); /* mask all of 8259A-2 */ 324 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
369 return 0; 325 return 0;
370} 326}
371 327
372static struct sysdev_class i8259_sysdev_class = { 328static struct sysdev_class i8259_sysdev_class = {
373 set_kset_name("i8259"), 329 .name = "i8259",
374 .suspend = i8259A_suspend, 330 .suspend = i8259A_suspend,
375 .resume = i8259A_resume, 331 .resume = i8259A_resume,
376 .shutdown = i8259A_shutdown, 332 .shutdown = i8259A_shutdown,
@@ -391,6 +347,58 @@ static int __init i8259A_init_sysfs(void)
391 347
392device_initcall(i8259A_init_sysfs); 348device_initcall(i8259A_init_sysfs);
393 349
350void init_8259A(int auto_eoi)
351{
352 unsigned long flags;
353
354 i8259A_auto_eoi = auto_eoi;
355
356 spin_lock_irqsave(&i8259A_lock, flags);
357
358 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
359 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
360
361 /*
362 * outb_pic - this has to work on a wide range of PC hardware.
363 */
364 outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
365 /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
366 outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);
367 /* 8259A-1 (the master) has a slave on IR2 */
368 outb_pic(0x04, PIC_MASTER_IMR);
369 if (auto_eoi) /* master does Auto EOI */
370 outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
371 else /* master expects normal EOI */
372 outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
373
374 outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */
375 /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */
376 outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR);
377 /* 8259A-2 is a slave on master's IR2 */
378 outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);
379 /* (slave's support for AEOI in flat mode is to be investigated) */
380 outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR);
381
382 if (auto_eoi)
383 /*
384 * In AEOI mode we just have to mask the interrupt
385 * when acking.
386 */
387 i8259A_chip.mask_ack = disable_8259A_irq;
388 else
389 i8259A_chip.mask_ack = mask_and_ack_8259A;
390
391 udelay(100); /* wait for 8259A to initialize */
392
393 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
394 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
395
396 spin_unlock_irqrestore(&i8259A_lock, flags);
397}
398
399
400
401
394/* 402/*
395 * IRQ2 is cascade interrupt to second interrupt controller 403 * IRQ2 is cascade interrupt to second interrupt controller
396 */ 404 */
@@ -448,7 +456,9 @@ void __init init_ISA_irqs (void)
448 } 456 }
449} 457}
450 458
451void __init init_IRQ(void) 459void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
460
461void __init native_init_IRQ(void)
452{ 462{
453 int i; 463 int i;
454 464
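
The cached_21/cached_A1 macros deleted above are superseded by cached_master_mask and cached_slave_mask, provided by the newly included <asm/i8259.h>. They still alias the low and high byte of the now-exported cached_irq_mask; a sketch of that aliasing (the exact definitions in the header are assumed here, not quoted):

unsigned int cached_irq_mask = 0xffff;	/* all 16 legacy IRQs masked */

#define __byte(x, y)		(((unsigned char *)&(y))[x])
#define cached_master_mask	(__byte(0, cached_irq_mask))	/* 8259A-1 IMR */
#define cached_slave_mask	(__byte(1, cached_irq_mask))	/* 8259A-2 IMR */
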
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index 468c9c437842..5b3ce7934363 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -15,7 +15,6 @@ static struct files_struct init_files = INIT_FILES;
15static struct signal_struct init_signals = INIT_SIGNALS(init_signals); 15static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
16static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); 16static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
17struct mm_struct init_mm = INIT_MM(init_mm); 17struct mm_struct init_mm = INIT_MM(init_mm);
18EXPORT_SYMBOL(init_mm);
19 18
20/* 19/*
21 * Initial thread structure. 20 * Initial thread structure.
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index c3a565bba106..4ca548632c8d 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -35,6 +35,7 @@
35#include <linux/htirq.h> 35#include <linux/htirq.h>
36#include <linux/freezer.h> 36#include <linux/freezer.h>
37#include <linux/kthread.h> 37#include <linux/kthread.h>
38#include <linux/jiffies.h> /* time_after() */
38 39
39#include <asm/io.h> 40#include <asm/io.h>
40#include <asm/smp.h> 41#include <asm/smp.h>
@@ -48,8 +49,6 @@
48#include <mach_apic.h> 49#include <mach_apic.h>
49#include <mach_apicdef.h> 50#include <mach_apicdef.h>
50 51
51#include "io_ports.h"
52
53int (*ioapic_renumber_irq)(int ioapic, int irq); 52int (*ioapic_renumber_irq)(int ioapic, int irq);
54atomic_t irq_mis_count; 53atomic_t irq_mis_count;
55 54
@@ -351,7 +350,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
351# include <asm/processor.h> /* kernel_thread() */ 350# include <asm/processor.h> /* kernel_thread() */
352# include <linux/kernel_stat.h> /* kstat */ 351# include <linux/kernel_stat.h> /* kstat */
353# include <linux/slab.h> /* kmalloc() */ 352# include <linux/slab.h> /* kmalloc() */
354# include <linux/timer.h> /* time_after() */ 353# include <linux/timer.h>
355 354
356#define IRQBALANCE_CHECK_ARCH -999 355#define IRQBALANCE_CHECK_ARCH -999
357#define MAX_BALANCED_IRQ_INTERVAL (5*HZ) 356#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
@@ -727,7 +726,7 @@ late_initcall(balanced_irq_init);
727#endif /* CONFIG_SMP */ 726#endif /* CONFIG_SMP */
728 727
729#ifndef CONFIG_SMP 728#ifndef CONFIG_SMP
730void fastcall send_IPI_self(int vector) 729void send_IPI_self(int vector)
731{ 730{
732 unsigned int cfg; 731 unsigned int cfg;
733 732
@@ -1900,7 +1899,7 @@ static int __init timer_irq_works(void)
1900 * might have cached one ExtINT interrupt. Finally, at 1899 * might have cached one ExtINT interrupt. Finally, at
1901 * least one tick may be lost due to delays. 1900 * least one tick may be lost due to delays.
1902 */ 1901 */
1903 if (jiffies - t1 > 4) 1902 if (time_after(jiffies, t1 + 4))
1904 return 1; 1903 return 1;
1905 1904
1906 return 0; 1905 return 0;
@@ -2080,7 +2079,7 @@ static struct irq_chip lapic_chip __read_mostly = {
2080 .eoi = ack_apic, 2079 .eoi = ack_apic,
2081}; 2080};
2082 2081
2083static void setup_nmi (void) 2082static void __init setup_nmi(void)
2084{ 2083{
2085 /* 2084 /*
2086 * Dirty trick to enable the NMI watchdog ... 2085 * Dirty trick to enable the NMI watchdog ...
@@ -2093,7 +2092,7 @@ static void setup_nmi (void)
2093 */ 2092 */
2094 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); 2093 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
2095 2094
2096 on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1); 2095 enable_NMI_through_LVT0();
2097 2096
2098 apic_printk(APIC_VERBOSE, " done.\n"); 2097 apic_printk(APIC_VERBOSE, " done.\n");
2099} 2098}
@@ -2169,14 +2168,10 @@ static inline void __init check_timer(void)
2169{ 2168{
2170 int apic1, pin1, apic2, pin2; 2169 int apic1, pin1, apic2, pin2;
2171 int vector; 2170 int vector;
2172 unsigned int ver;
2173 unsigned long flags; 2171 unsigned long flags;
2174 2172
2175 local_irq_save(flags); 2173 local_irq_save(flags);
2176 2174
2177 ver = apic_read(APIC_LVR);
2178 ver = GET_APIC_VERSION(ver);
2179
2180 /* 2175 /*
2181 * get/set the timer IRQ vector: 2176 * get/set the timer IRQ vector:
2182 */ 2177 */
@@ -2189,15 +2184,11 @@ static inline void __init check_timer(void)
2189 * mode for the 8259A whenever interrupts are routed 2184 * mode for the 8259A whenever interrupts are routed
2190 * through I/O APICs. Also IRQ0 has to be enabled in 2185 * through I/O APICs. Also IRQ0 has to be enabled in
2191 * the 8259A which implies the virtual wire has to be 2186 * the 8259A which implies the virtual wire has to be
2192 * disabled in the local APIC. Finally timer interrupts 2187 * disabled in the local APIC.
2193 * need to be acknowledged manually in the 8259A for
2194 * timer_interrupt() and for the i82489DX when using
2195 * the NMI watchdog.
2196 */ 2188 */
2197 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); 2189 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2198 init_8259A(1); 2190 init_8259A(1);
2199 timer_ack = !cpu_has_tsc; 2191 timer_ack = 1;
2200 timer_ack |= (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
2201 if (timer_over_8254 > 0) 2192 if (timer_over_8254 > 0)
2202 enable_8259A_irq(0); 2193 enable_8259A_irq(0);
2203 2194
@@ -2409,7 +2400,7 @@ static int ioapic_resume(struct sys_device *dev)
2409} 2400}
2410 2401
2411static struct sysdev_class ioapic_sysdev_class = { 2402static struct sysdev_class ioapic_sysdev_class = {
2412 set_kset_name("ioapic"), 2403 .name = "ioapic",
2413 .suspend = ioapic_suspend, 2404 .suspend = ioapic_suspend,
2414 .resume = ioapic_resume, 2405 .resume = ioapic_resume,
2415}; 2406};
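
timer_irq_works() above now compares jiffies with time_after() instead of a plain subtraction, which stays correct when the jiffies counter wraps. The core of the macro is roughly the following (simplified sketch of <linux/jiffies.h>, type checking omitted):

/* True if time a is after time b, even across counter wraparound:
 * the difference is evaluated as a signed quantity. */
#define time_after_sketch(a, b)	((long)((b) - (a)) < 0)

So time_after(jiffies, t1 + 4) reads as "more than four ticks have passed since t1", with wraparound handled.
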
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index cbac1670c7c3..1627c0d53e0b 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -32,9 +32,11 @@
32#include <linux/msi.h> 32#include <linux/msi.h>
33#include <linux/htirq.h> 33#include <linux/htirq.h>
34#include <linux/dmar.h> 34#include <linux/dmar.h>
35#include <linux/jiffies.h>
35#ifdef CONFIG_ACPI 36#ifdef CONFIG_ACPI
36#include <acpi/acpi_bus.h> 37#include <acpi/acpi_bus.h>
37#endif 38#endif
39#include <linux/bootmem.h>
38 40
39#include <asm/idle.h> 41#include <asm/idle.h>
40#include <asm/io.h> 42#include <asm/io.h>
@@ -1069,7 +1071,7 @@ void __apicdebuginit print_local_APIC(void * dummy)
1069 v = apic_read(APIC_LVR); 1071 v = apic_read(APIC_LVR);
1070 printk(KERN_INFO "... APIC VERSION: %08x\n", v); 1072 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
1071 ver = GET_APIC_VERSION(v); 1073 ver = GET_APIC_VERSION(v);
1072 maxlvt = get_maxlvt(); 1074 maxlvt = lapic_get_maxlvt();
1073 1075
1074 v = apic_read(APIC_TASKPRI); 1076 v = apic_read(APIC_TASKPRI);
1075 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); 1077 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
@@ -1171,7 +1173,7 @@ void __apicdebuginit print_PIC(void)
1171 1173
1172#endif /* 0 */ 1174#endif /* 0 */
1173 1175
1174static void __init enable_IO_APIC(void) 1176void __init enable_IO_APIC(void)
1175{ 1177{
1176 union IO_APIC_reg_01 reg_01; 1178 union IO_APIC_reg_01 reg_01;
1177 int i8259_apic, i8259_pin; 1179 int i8259_apic, i8259_pin;
@@ -1298,7 +1300,7 @@ static int __init timer_irq_works(void)
1298 */ 1300 */
1299 1301
1300 /* jiffies wrap? */ 1302 /* jiffies wrap? */
1301 if (jiffies - t1 > 4) 1303 if (time_after(jiffies, t1 + 4))
1302 return 1; 1304 return 1;
1303 return 0; 1305 return 0;
1304} 1306}
@@ -1411,7 +1413,7 @@ static void irq_complete_move(unsigned int irq)
1411 if (likely(!cfg->move_in_progress)) 1413 if (likely(!cfg->move_in_progress))
1412 return; 1414 return;
1413 1415
1414 vector = ~get_irq_regs()->orig_rax; 1416 vector = ~get_irq_regs()->orig_ax;
1415 me = smp_processor_id(); 1417 me = smp_processor_id();
1416 if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { 1418 if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
1417 cpumask_t cleanup_mask; 1419 cpumask_t cleanup_mask;
@@ -1438,7 +1440,7 @@ static void ack_apic_level(unsigned int irq)
1438 int do_unmask_irq = 0; 1440 int do_unmask_irq = 0;
1439 1441
1440 irq_complete_move(irq); 1442 irq_complete_move(irq);
1441#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE) 1443#ifdef CONFIG_GENERIC_PENDING_IRQ
1442 /* If we are moving the irq we need to mask it */ 1444 /* If we are moving the irq we need to mask it */
1443 if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { 1445 if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
1444 do_unmask_irq = 1; 1446 do_unmask_irq = 1;
@@ -1565,7 +1567,7 @@ static struct hw_interrupt_type lapic_irq_type __read_mostly = {
1565 .end = end_lapic_irq, 1567 .end = end_lapic_irq,
1566}; 1568};
1567 1569
1568static void setup_nmi (void) 1570static void __init setup_nmi(void)
1569{ 1571{
1570 /* 1572 /*
1571 * Dirty trick to enable the NMI watchdog ... 1573 * Dirty trick to enable the NMI watchdog ...
@@ -1578,7 +1580,7 @@ static void setup_nmi (void)
1578 */ 1580 */
1579 printk(KERN_INFO "activating NMI Watchdog ..."); 1581 printk(KERN_INFO "activating NMI Watchdog ...");
1580 1582
1581 enable_NMI_through_LVT0(NULL); 1583 enable_NMI_through_LVT0();
1582 1584
1583 printk(" done.\n"); 1585 printk(" done.\n");
1584} 1586}
@@ -1654,7 +1656,7 @@ static inline void unlock_ExtINT_logic(void)
1654 * 1656 *
1655 * FIXME: really need to revamp this for modern platforms only. 1657 * FIXME: really need to revamp this for modern platforms only.
1656 */ 1658 */
1657static inline void check_timer(void) 1659static inline void __init check_timer(void)
1658{ 1660{
1659 struct irq_cfg *cfg = irq_cfg + 0; 1661 struct irq_cfg *cfg = irq_cfg + 0;
1660 int apic1, pin1, apic2, pin2; 1662 int apic1, pin1, apic2, pin2;
@@ -1788,7 +1790,10 @@ __setup("no_timer_check", notimercheck);
1788 1790
1789void __init setup_IO_APIC(void) 1791void __init setup_IO_APIC(void)
1790{ 1792{
1791 enable_IO_APIC(); 1793
1794 /*
1795 * calling enable_IO_APIC() is moved to setup_local_APIC for BP
1796 */
1792 1797
1793 if (acpi_ioapic) 1798 if (acpi_ioapic)
1794 io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ 1799 io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
@@ -1850,7 +1855,7 @@ static int ioapic_resume(struct sys_device *dev)
1850} 1855}
1851 1856
1852static struct sysdev_class ioapic_sysdev_class = { 1857static struct sysdev_class ioapic_sysdev_class = {
1853 set_kset_name("ioapic"), 1858 .name = "ioapic",
1854 .suspend = ioapic_suspend, 1859 .suspend = ioapic_suspend,
1855 .resume = ioapic_resume, 1860 .resume = ioapic_resume,
1856}; 1861};
@@ -2288,3 +2293,92 @@ void __init setup_ioapic_dest(void)
2288} 2293}
2289#endif 2294#endif
2290 2295
2296#define IOAPIC_RESOURCE_NAME_SIZE 11
2297
2298static struct resource *ioapic_resources;
2299
2300static struct resource * __init ioapic_setup_resources(void)
2301{
2302 unsigned long n;
2303 struct resource *res;
2304 char *mem;
2305 int i;
2306
2307 if (nr_ioapics <= 0)
2308 return NULL;
2309
2310 n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
2311 n *= nr_ioapics;
2312
2313 mem = alloc_bootmem(n);
2314 res = (void *)mem;
2315
2316 if (mem != NULL) {
2317 memset(mem, 0, n);
2318 mem += sizeof(struct resource) * nr_ioapics;
2319
2320 for (i = 0; i < nr_ioapics; i++) {
2321 res[i].name = mem;
2322 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
2323 sprintf(mem, "IOAPIC %u", i);
2324 mem += IOAPIC_RESOURCE_NAME_SIZE;
2325 }
2326 }
2327
2328 ioapic_resources = res;
2329
2330 return res;
2331}
2332
2333void __init ioapic_init_mappings(void)
2334{
2335 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
2336 struct resource *ioapic_res;
2337 int i;
2338
2339 ioapic_res = ioapic_setup_resources();
2340 for (i = 0; i < nr_ioapics; i++) {
2341 if (smp_found_config) {
2342 ioapic_phys = mp_ioapics[i].mpc_apicaddr;
2343 } else {
2344 ioapic_phys = (unsigned long)
2345 alloc_bootmem_pages(PAGE_SIZE);
2346 ioapic_phys = __pa(ioapic_phys);
2347 }
2348 set_fixmap_nocache(idx, ioapic_phys);
2349 apic_printk(APIC_VERBOSE,
2350 "mapped IOAPIC to %016lx (%016lx)\n",
2351 __fix_to_virt(idx), ioapic_phys);
2352 idx++;
2353
2354 if (ioapic_res != NULL) {
2355 ioapic_res->start = ioapic_phys;
2356 ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
2357 ioapic_res++;
2358 }
2359 }
2360}
2361
2362static int __init ioapic_insert_resources(void)
2363{
2364 int i;
2365 struct resource *r = ioapic_resources;
2366
2367 if (!r) {
2368 printk(KERN_ERR
2369 "IO APIC resources could not be allocated.\n");
2370 return -1;
2371 }
2372
2373 for (i = 0; i < nr_ioapics; i++) {
2374 insert_resource(&iomem_resource, r);
2375 r++;
2376 }
2377
2378 return 0;
2379}
2380
2381/* Insert the IO APIC resources after PCI initialization has occurred to handle
2382 * IO APICs that are mapped in on a BAR in PCI space. */
2383late_initcall(ioapic_insert_resources);
2384
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
new file mode 100644
index 000000000000..bd49321034db
--- /dev/null
+++ b/arch/x86/kernel/io_delay.c
@@ -0,0 +1,114 @@
1/*
2 * I/O delay strategies for inb_p/outb_p
3 *
4 * Allow for a DMI based override of port 0x80, needed for certain HP laptops
5 * and possibly other systems. Also allow for the gradual elimination of
6 * outb_p/inb_p API uses.
7 */
8#include <linux/kernel.h>
9#include <linux/module.h>
10#include <linux/init.h>
11#include <linux/delay.h>
12#include <linux/dmi.h>
13#include <asm/io.h>
14
15int io_delay_type __read_mostly = CONFIG_DEFAULT_IO_DELAY_TYPE;
16EXPORT_SYMBOL_GPL(io_delay_type);
17
18static int __initdata io_delay_override;
19
20/*
21 * Paravirt wants native_io_delay to be a constant.
22 */
23void native_io_delay(void)
24{
25 switch (io_delay_type) {
26 default:
27 case CONFIG_IO_DELAY_TYPE_0X80:
28 asm volatile ("outb %al, $0x80");
29 break;
30 case CONFIG_IO_DELAY_TYPE_0XED:
31 asm volatile ("outb %al, $0xed");
32 break;
33 case CONFIG_IO_DELAY_TYPE_UDELAY:
34 /*
35 * 2 usecs is an upper-bound for the outb delay but
36 * note that udelay doesn't have the bus-level
37 * side-effects that outb does, nor does udelay() have
38 * precise timings during very early bootup (the delays
39 * are shorter until calibrated):
40 */
41 udelay(2);
42 case CONFIG_IO_DELAY_TYPE_NONE:
43 break;
44 }
45}
46EXPORT_SYMBOL(native_io_delay);
47
48static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id)
49{
50 if (io_delay_type == CONFIG_IO_DELAY_TYPE_0X80) {
51 printk(KERN_NOTICE "%s: using 0xed I/O delay port\n",
52 id->ident);
53 io_delay_type = CONFIG_IO_DELAY_TYPE_0XED;
54 }
55
56 return 0;
57}
58
59/*
60 * Quirk table for systems that misbehave (lock up, etc.) if port
61 * 0x80 is used:
62 */
63static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = {
64 {
65 .callback = dmi_io_delay_0xed_port,
66 .ident = "Compaq Presario V6000",
67 .matches = {
68 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
69 DMI_MATCH(DMI_BOARD_NAME, "30B7")
70 }
71 },
72 {
73 .callback = dmi_io_delay_0xed_port,
74 .ident = "HP Pavilion dv9000z",
75 .matches = {
76 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
77 DMI_MATCH(DMI_BOARD_NAME, "30B9")
78 }
79 },
80 {
81 .callback = dmi_io_delay_0xed_port,
82 .ident = "HP Pavilion tx1000",
83 .matches = {
84 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
85 DMI_MATCH(DMI_BOARD_NAME, "30BF")
86 }
87 },
88 { }
89};
90
91void __init io_delay_init(void)
92{
93 if (!io_delay_override)
94 dmi_check_system(io_delay_0xed_port_dmi_table);
95}
96
97static int __init io_delay_param(char *s)
98{
99 if (!strcmp(s, "0x80"))
100 io_delay_type = CONFIG_IO_DELAY_TYPE_0X80;
101 else if (!strcmp(s, "0xed"))
102 io_delay_type = CONFIG_IO_DELAY_TYPE_0XED;
103 else if (!strcmp(s, "udelay"))
104 io_delay_type = CONFIG_IO_DELAY_TYPE_UDELAY;
105 else if (!strcmp(s, "none"))
106 io_delay_type = CONFIG_IO_DELAY_TYPE_NONE;
107 else
108 return -EINVAL;
109
110 io_delay_override = 1;
111 return 0;
112}
113
114early_param("io_delay", io_delay_param);
diff --git a/arch/x86/kernel/ioport_32.c b/arch/x86/kernel/ioport.c
index 4ed48dc8df1e..50e5e4a31c85 100644
--- a/arch/x86/kernel/ioport_32.c
+++ b/arch/x86/kernel/ioport.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * This contains the io-permission bitmap code - written by obz, with changes 2 * This contains the io-permission bitmap code - written by obz, with changes
3 * by Linus. 3 * by Linus. 32/64 bits code unification by Miguel Botón.
4 */ 4 */
5 5
6#include <linux/sched.h> 6#include <linux/sched.h>
@@ -16,49 +16,27 @@
16#include <linux/syscalls.h> 16#include <linux/syscalls.h>
17 17
18/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ 18/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
19static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) 19static void set_bitmap(unsigned long *bitmap, unsigned int base,
20 unsigned int extent, int new_value)
20{ 21{
21 unsigned long mask; 22 unsigned int i;
22 unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
23 unsigned int low_index = base & (BITS_PER_LONG-1);
24 int length = low_index + extent;
25
26 if (low_index != 0) {
27 mask = (~0UL << low_index);
28 if (length < BITS_PER_LONG)
29 mask &= ~(~0UL << length);
30 if (new_value)
31 *bitmap_base++ |= mask;
32 else
33 *bitmap_base++ &= ~mask;
34 length -= BITS_PER_LONG;
35 }
36
37 mask = (new_value ? ~0UL : 0UL);
38 while (length >= BITS_PER_LONG) {
39 *bitmap_base++ = mask;
40 length -= BITS_PER_LONG;
41 }
42 23
43 if (length > 0) { 24 for (i = base; i < base + extent; i++) {
44 mask = ~(~0UL << length);
45 if (new_value) 25 if (new_value)
46 *bitmap_base++ |= mask; 26 __set_bit(i, bitmap);
47 else 27 else
48 *bitmap_base++ &= ~mask; 28 __clear_bit(i, bitmap);
49 } 29 }
50} 30}
51 31
52
53/* 32/*
54 * this changes the io permissions bitmap in the current task. 33 * this changes the io permissions bitmap in the current task.
55 */ 34 */
56asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) 35asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
57{ 36{
58 unsigned long i, max_long, bytes, bytes_updated;
59 struct thread_struct * t = &current->thread; 37 struct thread_struct * t = &current->thread;
60 struct tss_struct * tss; 38 struct tss_struct * tss;
61 unsigned long *bitmap; 39 unsigned int i, max_long, bytes, bytes_updated;
62 40
63 if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) 41 if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
64 return -EINVAL; 42 return -EINVAL;
@@ -71,7 +49,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
71 * this is why we delay this operation until now: 49 * this is why we delay this operation until now:
72 */ 50 */
73 if (!t->io_bitmap_ptr) { 51 if (!t->io_bitmap_ptr) {
74 bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); 52 unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
53
75 if (!bitmap) 54 if (!bitmap)
76 return -ENOMEM; 55 return -ENOMEM;
77 56
@@ -100,11 +79,12 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
100 if (t->io_bitmap_ptr[i] != ~0UL) 79 if (t->io_bitmap_ptr[i] != ~0UL)
101 max_long = i; 80 max_long = i;
102 81
103 bytes = (max_long + 1) * sizeof(long); 82 bytes = (max_long + 1) * sizeof(unsigned long);
104 bytes_updated = max(bytes, t->io_bitmap_max); 83 bytes_updated = max(bytes, t->io_bitmap_max);
105 84
106 t->io_bitmap_max = bytes; 85 t->io_bitmap_max = bytes;
107 86
87#ifdef CONFIG_X86_32
108 /* 88 /*
109 * Sets the lazy trigger so that the next I/O operation will 89 * Sets the lazy trigger so that the next I/O operation will
110 * reload the correct bitmap. 90 * reload the correct bitmap.
@@ -113,6 +93,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
113 */ 93 */
114 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; 94 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
115 tss->io_bitmap_owner = NULL; 95 tss->io_bitmap_owner = NULL;
96#else
97 /* Update the TSS: */
98 memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
99#endif
116 100
117 put_cpu(); 101 put_cpu();
118 102
@@ -124,18 +108,14 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
124 * beyond the 0x3ff range: to get the full 65536 ports bitmapped 108 * beyond the 0x3ff range: to get the full 65536 ports bitmapped
125 * you'd need 8kB of bitmaps/process, which is a bit excessive. 109 * you'd need 8kB of bitmaps/process, which is a bit excessive.
126 * 110 *
127 * Here we just change the eflags value on the stack: we allow 111 * Here we just change the flags value on the stack: we allow
128 * only the super-user to do it. This depends on the stack-layout 112 * only the super-user to do it. This depends on the stack-layout
129 * on system-call entry - see also fork() and the signal handling 113 * on system-call entry - see also fork() and the signal handling
130 * code. 114 * code.
131 */ 115 */
132 116static int do_iopl(unsigned int level, struct pt_regs *regs)
133asmlinkage long sys_iopl(unsigned long unused)
134{ 117{
135 volatile struct pt_regs * regs = (struct pt_regs *) &unused; 118 unsigned int old = (regs->flags >> 12) & 3;
136 unsigned int level = regs->ebx;
137 unsigned int old = (regs->eflags >> 12) & 3;
138 struct thread_struct *t = &current->thread;
139 119
140 if (level > 3) 120 if (level > 3)
141 return -EINVAL; 121 return -EINVAL;
@@ -144,8 +124,31 @@ asmlinkage long sys_iopl(unsigned long unused)
144 if (!capable(CAP_SYS_RAWIO)) 124 if (!capable(CAP_SYS_RAWIO))
145 return -EPERM; 125 return -EPERM;
146 } 126 }
127 regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12);
128
129 return 0;
130}
131
132#ifdef CONFIG_X86_32
133asmlinkage long sys_iopl(unsigned long regsp)
134{
135 struct pt_regs *regs = (struct pt_regs *)&regsp;
136 unsigned int level = regs->bx;
137 struct thread_struct *t = &current->thread;
138 int rc;
139
140 rc = do_iopl(level, regs);
141 if (rc < 0)
142 goto out;
143
147 t->iopl = level << 12; 144 t->iopl = level << 12;
148 regs->eflags = (regs->eflags & ~X86_EFLAGS_IOPL) | t->iopl;
149 set_iopl_mask(t->iopl); 145 set_iopl_mask(t->iopl);
150 return 0; 146out:
147 return rc;
148}
149#else
150asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
151{
152 return do_iopl(level, regs);
151} 153}
154#endif
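
do_iopl() in the unified ioport.c manipulates the two-bit IOPL field, bits 12 and 13 of the flags register, which is why both the read and the write shift by 12. A minimal sketch of that bit arithmetic outside the kernel context:

#define X86_EFLAGS_IOPL	0x00003000UL	/* I/O privilege level, bits 13:12 */

static unsigned int iopl_get(unsigned long flags)
{
	return (flags >> 12) & 3;
}

static unsigned long iopl_set(unsigned long flags, unsigned int level)
{
	return (flags & ~X86_EFLAGS_IOPL) | ((unsigned long)level << 12);
}
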
diff --git a/arch/x86/kernel/ioport_64.c b/arch/x86/kernel/ioport_64.c
deleted file mode 100644
index 5f62fad64dab..000000000000
--- a/arch/x86/kernel/ioport_64.c
+++ /dev/null
@@ -1,117 +0,0 @@
1/*
2 * This contains the io-permission bitmap code - written by obz, with changes
3 * by Linus.
4 */
5
6#include <linux/sched.h>
7#include <linux/kernel.h>
8#include <linux/capability.h>
9#include <linux/errno.h>
10#include <linux/types.h>
11#include <linux/ioport.h>
12#include <linux/smp.h>
13#include <linux/stddef.h>
14#include <linux/slab.h>
15#include <linux/thread_info.h>
16#include <linux/syscalls.h>
17
18/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
19static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
20{
21 int i;
22 if (new_value)
23 for (i = base; i < base + extent; i++)
24 __set_bit(i, bitmap);
25 else
26 for (i = base; i < base + extent; i++)
27 clear_bit(i, bitmap);
28}
29
30/*
31 * this changes the io permissions bitmap in the current task.
32 */
33asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
34{
35 unsigned int i, max_long, bytes, bytes_updated;
36 struct thread_struct * t = &current->thread;
37 struct tss_struct * tss;
38 unsigned long *bitmap;
39
40 if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
41 return -EINVAL;
42 if (turn_on && !capable(CAP_SYS_RAWIO))
43 return -EPERM;
44
45 /*
46 * If it's the first ioperm() call in this thread's lifetime, set the
47 * IO bitmap up. ioperm() is much less timing critical than clone(),
48 * this is why we delay this operation until now:
49 */
50 if (!t->io_bitmap_ptr) {
51 bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
52 if (!bitmap)
53 return -ENOMEM;
54
55 memset(bitmap, 0xff, IO_BITMAP_BYTES);
56 t->io_bitmap_ptr = bitmap;
57 set_thread_flag(TIF_IO_BITMAP);
58 }
59
60 /*
61 * do it in the per-thread copy and in the TSS ...
62 *
63 * Disable preemption via get_cpu() - we must not switch away
64 * because the ->io_bitmap_max value must match the bitmap
65 * contents:
66 */
67 tss = &per_cpu(init_tss, get_cpu());
68
69 set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
70
71 /*
72 * Search for a (possibly new) maximum. This is simple and stupid,
73 * to keep it obviously correct:
74 */
75 max_long = 0;
76 for (i = 0; i < IO_BITMAP_LONGS; i++)
77 if (t->io_bitmap_ptr[i] != ~0UL)
78 max_long = i;
79
80 bytes = (max_long + 1) * sizeof(long);
81 bytes_updated = max(bytes, t->io_bitmap_max);
82
83 t->io_bitmap_max = bytes;
84
85 /* Update the TSS: */
86 memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
87
88 put_cpu();
89
90 return 0;
91}
92
93/*
94 * sys_iopl has to be used when you want to access the IO ports
95 * beyond the 0x3ff range: to get the full 65536 ports bitmapped
96 * you'd need 8kB of bitmaps/process, which is a bit excessive.
97 *
98 * Here we just change the eflags value on the stack: we allow
99 * only the super-user to do it. This depends on the stack-layout
100 * on system-call entry - see also fork() and the signal handling
101 * code.
102 */
103
104asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
105{
106 unsigned int old = (regs->eflags >> 12) & 3;
107
108 if (level > 3)
109 return -EINVAL;
110 /* Trying to gain more privileges? */
111 if (level > old) {
112 if (!capable(CAP_SYS_RAWIO))
113 return -EPERM;
114 }
115 regs->eflags = (regs->eflags &~ X86_EFLAGS_IOPL) | (level << 12);
116 return 0;
117}
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index d3fde94f7345..cef054b09d27 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -66,11 +66,11 @@ static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
66 * SMP cross-CPU interrupts have their own specific 66 * SMP cross-CPU interrupts have their own specific
67 * handlers). 67 * handlers).
68 */ 68 */
69fastcall unsigned int do_IRQ(struct pt_regs *regs) 69unsigned int do_IRQ(struct pt_regs *regs)
70{ 70{
71 struct pt_regs *old_regs; 71 struct pt_regs *old_regs;
72 /* high bit used in ret_from_ code */ 72 /* high bit used in ret_from_ code */
73 int irq = ~regs->orig_eax; 73 int irq = ~regs->orig_ax;
74 struct irq_desc *desc = irq_desc + irq; 74 struct irq_desc *desc = irq_desc + irq;
75#ifdef CONFIG_4KSTACKS 75#ifdef CONFIG_4KSTACKS
76 union irq_ctx *curctx, *irqctx; 76 union irq_ctx *curctx, *irqctx;
@@ -88,13 +88,13 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs)
88#ifdef CONFIG_DEBUG_STACKOVERFLOW 88#ifdef CONFIG_DEBUG_STACKOVERFLOW
89 /* Debugging check for stack overflow: is there less than 1KB free? */ 89 /* Debugging check for stack overflow: is there less than 1KB free? */
90 { 90 {
91 long esp; 91 long sp;
92 92
93 __asm__ __volatile__("andl %%esp,%0" : 93 __asm__ __volatile__("andl %%esp,%0" :
94 "=r" (esp) : "0" (THREAD_SIZE - 1)); 94 "=r" (sp) : "0" (THREAD_SIZE - 1));
95 if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { 95 if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) {
96 printk("do_IRQ: stack overflow: %ld\n", 96 printk("do_IRQ: stack overflow: %ld\n",
97 esp - sizeof(struct thread_info)); 97 sp - sizeof(struct thread_info));
98 dump_stack(); 98 dump_stack();
99 } 99 }
100 } 100 }
@@ -112,7 +112,7 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs)
112 * current stack (which is the irq stack already after all) 112 * current stack (which is the irq stack already after all)
113 */ 113 */
114 if (curctx != irqctx) { 114 if (curctx != irqctx) {
115 int arg1, arg2, ebx; 115 int arg1, arg2, bx;
116 116
117 /* build the stack frame on the IRQ stack */ 117 /* build the stack frame on the IRQ stack */
118 isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); 118 isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
@@ -128,10 +128,10 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs)
128 (curctx->tinfo.preempt_count & SOFTIRQ_MASK); 128 (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
129 129
130 asm volatile( 130 asm volatile(
131 " xchgl %%ebx,%%esp \n" 131 " xchgl %%ebx,%%esp \n"
132 " call *%%edi \n" 132 " call *%%edi \n"
133 " movl %%ebx,%%esp \n" 133 " movl %%ebx,%%esp \n"
134 : "=a" (arg1), "=d" (arg2), "=b" (ebx) 134 : "=a" (arg1), "=d" (arg2), "=b" (bx)
135 : "0" (irq), "1" (desc), "2" (isp), 135 : "0" (irq), "1" (desc), "2" (isp),
136 "D" (desc->handle_irq) 136 "D" (desc->handle_irq)
137 : "memory", "cc" 137 : "memory", "cc"
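
The DEBUG_STACKOVERFLOW check above masks the live stack pointer with THREAD_SIZE - 1; because kernel stacks are THREAD_SIZE bytes and THREAD_SIZE aligned, the result is the offset above the bottom of the current stack, where struct thread_info lives. A hedged sketch of the same test without the inline assembly:

/* Returns nonzero when fewer than 'reserve' bytes (thread_info plus a
 * safety margin) remain between the stack pointer and the bottom of a
 * THREAD_SIZE-aligned kernel stack. */
static int stack_nearly_exhausted(unsigned long sp, unsigned long thread_size,
				  unsigned long reserve)
{
	return (sp & (thread_size - 1)) < reserve;
}
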
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 6b5c730d67b9..3aac15466a91 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -20,6 +20,26 @@
20 20
21atomic_t irq_err_count; 21atomic_t irq_err_count;
22 22
23/*
24 * 'what should we do if we get a hw irq event on an illegal vector'.
25 * each architecture has to answer this themselves.
26 */
27void ack_bad_irq(unsigned int irq)
28{
29 printk(KERN_WARNING "unexpected IRQ trap at vector %02x\n", irq);
30 /*
31 * Currently unexpected vectors happen only on SMP and APIC.
32 * We _must_ ack these because every local APIC has only N
33 * irq slots per priority level, and a 'hanging, unacked' IRQ
34 * holds up an irq slot - in excessive cases (when multiple
35 * unexpected vectors occur) that might lock up the APIC
36 * completely.
37 * But don't ack when the APIC is disabled. -AK
38 */
39 if (!disable_apic)
40 ack_APIC_irq();
41}
42
23#ifdef CONFIG_DEBUG_STACKOVERFLOW 43#ifdef CONFIG_DEBUG_STACKOVERFLOW
24/* 44/*
25 * Probabilistic stack overflow check: 45 * Probabilistic stack overflow check:
@@ -33,11 +53,11 @@ static inline void stack_overflow_check(struct pt_regs *regs)
33 u64 curbase = (u64)task_stack_page(current); 53 u64 curbase = (u64)task_stack_page(current);
34 static unsigned long warned = -60*HZ; 54 static unsigned long warned = -60*HZ;
35 55
36 if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE && 56 if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE &&
37 regs->rsp < curbase + sizeof(struct thread_info) + 128 && 57 regs->sp < curbase + sizeof(struct thread_info) + 128 &&
38 time_after(jiffies, warned + 60*HZ)) { 58 time_after(jiffies, warned + 60*HZ)) {
39 printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n", 59 printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
40 current->comm, curbase, regs->rsp); 60 current->comm, curbase, regs->sp);
41 show_stack(NULL,NULL); 61 show_stack(NULL,NULL);
42 warned = jiffies; 62 warned = jiffies;
43 } 63 }
@@ -142,7 +162,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
142 struct pt_regs *old_regs = set_irq_regs(regs); 162 struct pt_regs *old_regs = set_irq_regs(regs);
143 163
144 /* high bit used in ret_from_ code */ 164 /* high bit used in ret_from_ code */
145 unsigned vector = ~regs->orig_rax; 165 unsigned vector = ~regs->orig_ax;
146 unsigned irq; 166 unsigned irq;
147 167
148 exit_idle(); 168 exit_idle();
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
new file mode 100644
index 000000000000..73354302fda7
--- /dev/null
+++ b/arch/x86/kernel/kdebugfs.c
@@ -0,0 +1,65 @@
1/*
2 * Architecture specific debugfs files
3 *
4 * Copyright (C) 2007, Intel Corp.
5 * Huang Ying <ying.huang@intel.com>
6 *
7 * This file is released under the GPLv2.
8 */
9
10#include <linux/debugfs.h>
11#include <linux/stat.h>
12#include <linux/init.h>
13
14#include <asm/setup.h>
15
16#ifdef CONFIG_DEBUG_BOOT_PARAMS
17static struct debugfs_blob_wrapper boot_params_blob = {
18 .data = &boot_params,
19 .size = sizeof(boot_params),
20};
21
22static int __init boot_params_kdebugfs_init(void)
23{
24 int error;
25 struct dentry *dbp, *version, *data;
26
27 dbp = debugfs_create_dir("boot_params", NULL);
28 if (!dbp) {
29 error = -ENOMEM;
30 goto err_return;
31 }
32 version = debugfs_create_x16("version", S_IRUGO, dbp,
33 &boot_params.hdr.version);
34 if (!version) {
35 error = -ENOMEM;
36 goto err_dir;
37 }
38 data = debugfs_create_blob("data", S_IRUGO, dbp,
39 &boot_params_blob);
40 if (!data) {
41 error = -ENOMEM;
42 goto err_version;
43 }
44 return 0;
45err_version:
46 debugfs_remove(version);
47err_dir:
48 debugfs_remove(dbp);
49err_return:
50 return error;
51}
52#endif
53
54static int __init arch_kdebugfs_init(void)
55{
56 int error = 0;
57
58#ifdef CONFIG_DEBUG_BOOT_PARAMS
59 error = boot_params_kdebugfs_init();
60#endif
61
62 return error;
63}
64
65arch_initcall(arch_kdebugfs_init);
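For context, the two files created above appear under the debugfs mount point, normally /sys/kernel/debug/boot_params/, when the kernel is built with CONFIG_DEBUG_BOOT_PARAMS. A minimal userspace sketch (not part of this patch; the mount path is an assumption) that dumps the exported boot protocol version could look like this:

/* Hypothetical userspace sketch: read the "version" file created by
 * boot_params_kdebugfs_init() above. Assumes debugfs is mounted at
 * /sys/kernel/debug and CONFIG_DEBUG_BOOT_PARAMS is enabled. */
#include <stdio.h>

int main(void)
{
	char buf[16];
	FILE *f = fopen("/sys/kernel/debug/boot_params/version", "r");

	if (!f) {
		perror("boot_params/version");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("boot protocol version: %s", buf);	/* hex, e.g. 0x0208 */
	fclose(f);
	return 0;
}

The neighbouring "data" blob exposes the full boot_params structure and can be read the same way for offline inspection.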
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
new file mode 100644
index 000000000000..a99e764fd66a
--- /dev/null
+++ b/arch/x86/kernel/kprobes.c
@@ -0,0 +1,1066 @@
1/*
2 * Kernel Probes (KProbes)
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2002, 2004
19 *
20 * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
21 * Probes initial implementation ( includes contributions from
22 * Rusty Russell).
23 * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
24 * interface to access function arguments.
25 * 2004-Oct Jim Keniston <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
26 * <prasanna@in.ibm.com> adapted for x86_64 from i386.
27 * 2005-Mar Roland McGrath <roland@redhat.com>
28 * Fixed to handle %rip-relative addressing mode correctly.
29 * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston
30 * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
31 * <prasanna@in.ibm.com> added function-return probes.
32 * 2005-May Rusty Lynch <rusty.lynch@intel.com>
33 * Added function return probes functionality
34 * 2006-Feb Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added
35 * kprobe-booster and kretprobe-booster for i386.
36 * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster
37 * and kretprobe-booster for x86-64
38 * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven
39 * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com>
40 * unified x86 kprobes code.
41 */
42
43#include <linux/kprobes.h>
44#include <linux/ptrace.h>
45#include <linux/string.h>
46#include <linux/slab.h>
47#include <linux/hardirq.h>
48#include <linux/preempt.h>
49#include <linux/module.h>
50#include <linux/kdebug.h>
51
52#include <asm/cacheflush.h>
53#include <asm/desc.h>
54#include <asm/pgtable.h>
55#include <asm/uaccess.h>
56#include <asm/alternative.h>
57
58void jprobe_return_end(void);
59
60DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
61DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
62
63#ifdef CONFIG_X86_64
64#define stack_addr(regs) ((unsigned long *)regs->sp)
65#else
66/*
67 * "&regs->sp" looks wrong, but it's correct for x86_32. x86_32 CPUs
68 * don't save the ss and esp registers if the CPU is already in kernel
69 * mode when it traps. So for kprobes, regs->sp and regs->ss are not
70 * the [nonexistent] saved stack pointer and ss register, but rather
71 * the top 8 bytes of the pre-int3 stack. So &regs->sp happens to
72 * point to the top of the pre-int3 stack.
73 */
74#define stack_addr(regs) ((unsigned long *)&regs->sp)
75#endif
76
77#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
78 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
79 (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
80 (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
81 (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
82 << (row % 32))
83 /*
84 * Undefined/reserved opcodes, conditional jump, Opcode Extension
 85 * Groups, and some special opcodes can not be boosted.
86 */
87static const u32 twobyte_is_boostable[256 / 32] = {
88 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
89 /* ---------------------------------------------- */
90 W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */
91 W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 10 */
92 W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 20 */
93 W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
94 W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
95 W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
96 W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1) | /* 60 */
97 W(0x70, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
98 W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 80 */
99 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
100 W(0xa0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* a0 */
101 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) , /* b0 */
102 W(0xc0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
103 W(0xd0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) , /* d0 */
104 W(0xe0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* e0 */
105 W(0xf0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0) /* f0 */
106 /* ----------------------------------------------- */
107 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
108};
109static const u32 onebyte_has_modrm[256 / 32] = {
110 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
111 /* ----------------------------------------------- */
112 W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */
113 W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */
114 W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */
115 W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */
116 W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
117 W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
118 W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */
119 W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */
120 W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
121 W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */
122 W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */
123 W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */
124 W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */
125 W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
126 W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */
127 W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) /* f0 */
128 /* ----------------------------------------------- */
129 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
130};
131static const u32 twobyte_has_modrm[256 / 32] = {
132 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
133 /* ----------------------------------------------- */
134 W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */
135 W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */
136 W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */
137 W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */
138 W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */
139 W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */
140 W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */
141 W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */
142 W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */
143 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */
144 W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */
145 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */
146 W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */
147 W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */
148 W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */
149 W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* ff */
150 /* ----------------------------------------------- */
151 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
152};
153#undef W
154
155struct kretprobe_blackpoint kretprobe_blacklist[] = {
156 {"__switch_to", }, /* This function switches only current task, but
157 doesn't switch kernel stack.*/
158 {NULL, NULL} /* Terminator */
159};
160const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
161
162/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
163static void __kprobes set_jmp_op(void *from, void *to)
164{
165 struct __arch_jmp_op {
166 char op;
167 s32 raddr;
168 } __attribute__((packed)) * jop;
169 jop = (struct __arch_jmp_op *)from;
170 jop->raddr = (s32)((long)(to) - ((long)(from) + 5));
171 jop->op = RELATIVEJUMP_INSTRUCTION;
172}
173
174/*
 175 * Check for the REX prefix, which can only exist on X86_64;
 176 * on X86_32 this always returns 0.
177 */
178static int __kprobes is_REX_prefix(kprobe_opcode_t *insn)
179{
180#ifdef CONFIG_X86_64
181 if ((*insn & 0xf0) == 0x40)
182 return 1;
183#endif
184 return 0;
185}
186
187/*
188 * Returns non-zero if opcode is boostable.
 189 * RIP-relative instructions are adjusted at copying time in 64-bit mode
190 */
191static int __kprobes can_boost(kprobe_opcode_t *opcodes)
192{
193 kprobe_opcode_t opcode;
194 kprobe_opcode_t *orig_opcodes = opcodes;
195
196retry:
197 if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
198 return 0;
199 opcode = *(opcodes++);
200
201 /* 2nd-byte opcode */
202 if (opcode == 0x0f) {
203 if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
204 return 0;
205 return test_bit(*opcodes,
206 (unsigned long *)twobyte_is_boostable);
207 }
208
209 switch (opcode & 0xf0) {
210#ifdef CONFIG_X86_64
211 case 0x40:
212 goto retry; /* REX prefix is boostable */
213#endif
214 case 0x60:
215 if (0x63 < opcode && opcode < 0x67)
216 goto retry; /* prefixes */
217 /* can't boost Address-size override and bound */
218 return (opcode != 0x62 && opcode != 0x67);
219 case 0x70:
220 return 0; /* can't boost conditional jump */
221 case 0xc0:
222 /* can't boost software-interruptions */
223 return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;
224 case 0xd0:
225 /* can boost AA* and XLAT */
226 return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7);
227 case 0xe0:
228 /* can boost in/out and absolute jmps */
229 return ((opcode & 0x04) || opcode == 0xea);
230 case 0xf0:
231 if ((opcode & 0x0c) == 0 && opcode != 0xf1)
232 goto retry; /* lock/rep(ne) prefix */
233 /* clear and set flags are boostable */
234 return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
235 default:
236 /* segment override prefixes are boostable */
237 if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e)
238 goto retry; /* prefixes */
239 /* CS override prefix and call are not boostable */
240 return (opcode != 0x2e && opcode != 0x9a);
241 }
242}
243
244/*
245 * Returns non-zero if opcode modifies the interrupt flag.
246 */
247static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
248{
249 switch (*insn) {
250 case 0xfa: /* cli */
251 case 0xfb: /* sti */
252 case 0xcf: /* iret/iretd */
253 case 0x9d: /* popf/popfd */
254 return 1;
255 }
256
257 /*
 258 * On X86_64, 0x40-0x4f are REX prefixes, so we need to look
 259 * at the next byte instead -- but of course without recursing infinitely.
260 */
261 if (is_REX_prefix(insn))
262 return is_IF_modifier(++insn);
263
264 return 0;
265}
266
267/*
268 * Adjust the displacement if the instruction uses the %rip-relative
269 * addressing mode.
 270 * If it does, return the address of the 32-bit displacement word.
271 * If not, return null.
272 * Only applicable to 64-bit x86.
273 */
274static void __kprobes fix_riprel(struct kprobe *p)
275{
276#ifdef CONFIG_X86_64
277 u8 *insn = p->ainsn.insn;
278 s64 disp;
279 int need_modrm;
280
281 /* Skip legacy instruction prefixes. */
282 while (1) {
283 switch (*insn) {
284 case 0x66:
285 case 0x67:
286 case 0x2e:
287 case 0x3e:
288 case 0x26:
289 case 0x64:
290 case 0x65:
291 case 0x36:
292 case 0xf0:
293 case 0xf3:
294 case 0xf2:
295 ++insn;
296 continue;
297 }
298 break;
299 }
300
301 /* Skip REX instruction prefix. */
302 if (is_REX_prefix(insn))
303 ++insn;
304
305 if (*insn == 0x0f) {
306 /* Two-byte opcode. */
307 ++insn;
308 need_modrm = test_bit(*insn,
309 (unsigned long *)twobyte_has_modrm);
310 } else
311 /* One-byte opcode. */
312 need_modrm = test_bit(*insn,
313 (unsigned long *)onebyte_has_modrm);
314
315 if (need_modrm) {
316 u8 modrm = *++insn;
317 if ((modrm & 0xc7) == 0x05) {
318 /* %rip+disp32 addressing mode */
319 /* Displacement follows ModRM byte. */
320 ++insn;
321 /*
322 * The copied instruction uses the %rip-relative
323 * addressing mode. Adjust the displacement for the
324 * difference between the original location of this
325 * instruction and the location of the copy that will
326 * actually be run. The tricky bit here is making sure
327 * that the sign extension happens correctly in this
328 * calculation, since we need a signed 32-bit result to
329 * be sign-extended to 64 bits when it's added to the
330 * %rip value and yield the same 64-bit result that the
331 * sign-extension of the original signed 32-bit
332 * displacement would have given.
333 */
334 disp = (u8 *) p->addr + *((s32 *) insn) -
335 (u8 *) p->ainsn.insn;
336 BUG_ON((s64) (s32) disp != disp); /* Sanity check. */
337 *(s32 *)insn = (s32) disp;
338 }
339 }
340#endif
341}
342
343static void __kprobes arch_copy_kprobe(struct kprobe *p)
344{
345 memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
346
347 fix_riprel(p);
348
349 if (can_boost(p->addr))
350 p->ainsn.boostable = 0;
351 else
352 p->ainsn.boostable = -1;
353
354 p->opcode = *p->addr;
355}
356
357int __kprobes arch_prepare_kprobe(struct kprobe *p)
358{
359 /* insn: must be on special executable page on x86. */
360 p->ainsn.insn = get_insn_slot();
361 if (!p->ainsn.insn)
362 return -ENOMEM;
363 arch_copy_kprobe(p);
364 return 0;
365}
366
367void __kprobes arch_arm_kprobe(struct kprobe *p)
368{
369 text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
370}
371
372void __kprobes arch_disarm_kprobe(struct kprobe *p)
373{
374 text_poke(p->addr, &p->opcode, 1);
375}
376
377void __kprobes arch_remove_kprobe(struct kprobe *p)
378{
379 mutex_lock(&kprobe_mutex);
380 free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
381 mutex_unlock(&kprobe_mutex);
382}
383
384static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
385{
386 kcb->prev_kprobe.kp = kprobe_running();
387 kcb->prev_kprobe.status = kcb->kprobe_status;
388 kcb->prev_kprobe.old_flags = kcb->kprobe_old_flags;
389 kcb->prev_kprobe.saved_flags = kcb->kprobe_saved_flags;
390}
391
392static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
393{
394 __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
395 kcb->kprobe_status = kcb->prev_kprobe.status;
396 kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags;
397 kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags;
398}
399
400static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
401 struct kprobe_ctlblk *kcb)
402{
403 __get_cpu_var(current_kprobe) = p;
404 kcb->kprobe_saved_flags = kcb->kprobe_old_flags
405 = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
406 if (is_IF_modifier(p->ainsn.insn))
407 kcb->kprobe_saved_flags &= ~X86_EFLAGS_IF;
408}
409
410static void __kprobes clear_btf(void)
411{
412 if (test_thread_flag(TIF_DEBUGCTLMSR))
413 wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
414}
415
416static void __kprobes restore_btf(void)
417{
418 if (test_thread_flag(TIF_DEBUGCTLMSR))
419 wrmsrl(MSR_IA32_DEBUGCTLMSR, current->thread.debugctlmsr);
420}
421
422static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
423{
424 clear_btf();
425 regs->flags |= X86_EFLAGS_TF;
426 regs->flags &= ~X86_EFLAGS_IF;
427 /* single step inline if the instruction is an int3 */
428 if (p->opcode == BREAKPOINT_INSTRUCTION)
429 regs->ip = (unsigned long)p->addr;
430 else
431 regs->ip = (unsigned long)p->ainsn.insn;
432}
433
434/* Called with kretprobe_lock held */
435void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
436 struct pt_regs *regs)
437{
438 unsigned long *sara = stack_addr(regs);
439
440 ri->ret_addr = (kprobe_opcode_t *) *sara;
441
442 /* Replace the return addr with trampoline addr */
443 *sara = (unsigned long) &kretprobe_trampoline;
444}
445
446static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
447 struct kprobe_ctlblk *kcb)
448{
449#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM)
450 if (p->ainsn.boostable == 1 && !p->post_handler) {
451 /* Boost up -- we can execute copied instructions directly */
452 reset_current_kprobe();
453 regs->ip = (unsigned long)p->ainsn.insn;
454 preempt_enable_no_resched();
455 return;
456 }
457#endif
458 prepare_singlestep(p, regs);
459 kcb->kprobe_status = KPROBE_HIT_SS;
460}
461
462/*
463 * We have reentered the kprobe_handler(), since another probe was hit while
464 * within the handler. We save the original kprobes variables and just single
465 * step on the instruction of the new probe without calling any user handlers.
466 */
467static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
468 struct kprobe_ctlblk *kcb)
469{
470 switch (kcb->kprobe_status) {
471 case KPROBE_HIT_SSDONE:
472#ifdef CONFIG_X86_64
473 /* TODO: Provide re-entrancy from post_kprobes_handler() and
474 * avoid exception stack corruption while single-stepping on
475 * the instruction of the new probe.
476 */
477 arch_disarm_kprobe(p);
478 regs->ip = (unsigned long)p->addr;
479 reset_current_kprobe();
480 preempt_enable_no_resched();
481 break;
482#endif
483 case KPROBE_HIT_ACTIVE:
484 save_previous_kprobe(kcb);
485 set_current_kprobe(p, regs, kcb);
486 kprobes_inc_nmissed_count(p);
487 prepare_singlestep(p, regs);
488 kcb->kprobe_status = KPROBE_REENTER;
489 break;
490 case KPROBE_HIT_SS:
491 if (p == kprobe_running()) {
492 regs->flags &= ~TF_MASK;
493 regs->flags |= kcb->kprobe_saved_flags;
494 return 0;
495 } else {
496 /* A probe has been hit in the codepath leading up
497 * to, or just after, single-stepping of a probed
498 * instruction. This entire codepath should strictly
499 * reside in .kprobes.text section. Raise a warning
500 * to highlight this peculiar case.
501 */
502 }
503 default:
504 /* impossible cases */
505 WARN_ON(1);
506 return 0;
507 }
508
509 return 1;
510}
511
512/*
513 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
 514 * remain disabled throughout this function.
515 */
516static int __kprobes kprobe_handler(struct pt_regs *regs)
517{
518 kprobe_opcode_t *addr;
519 struct kprobe *p;
520 struct kprobe_ctlblk *kcb;
521
522 addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
523 if (*addr != BREAKPOINT_INSTRUCTION) {
524 /*
525 * The breakpoint instruction was removed right
526 * after we hit it. Another cpu has removed
527 * either a probepoint or a debugger breakpoint
528 * at this address. In either case, no further
529 * handling of this interrupt is appropriate.
530 * Back up over the (now missing) int3 and run
531 * the original instruction.
532 */
533 regs->ip = (unsigned long)addr;
534 return 1;
535 }
536
537 /*
538 * We don't want to be preempted for the entire
539 * duration of kprobe processing. We conditionally
540 * re-enable preemption at the end of this function,
541 * and also in reenter_kprobe() and setup_singlestep().
542 */
543 preempt_disable();
544
545 kcb = get_kprobe_ctlblk();
546 p = get_kprobe(addr);
547
548 if (p) {
549 if (kprobe_running()) {
550 if (reenter_kprobe(p, regs, kcb))
551 return 1;
552 } else {
553 set_current_kprobe(p, regs, kcb);
554 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
555
556 /*
557 * If we have no pre-handler or it returned 0, we
558 * continue with normal processing. If we have a
559 * pre-handler and it returned non-zero, it prepped
560 * for calling the break_handler below on re-entry
561 * for jprobe processing, so get out doing nothing
562 * more here.
563 */
564 if (!p->pre_handler || !p->pre_handler(p, regs))
565 setup_singlestep(p, regs, kcb);
566 return 1;
567 }
568 } else if (kprobe_running()) {
569 p = __get_cpu_var(current_kprobe);
570 if (p->break_handler && p->break_handler(p, regs)) {
571 setup_singlestep(p, regs, kcb);
572 return 1;
573 }
574 } /* else: not a kprobe fault; let the kernel handle it */
575
576 preempt_enable_no_resched();
577 return 0;
578}
579
580/*
581 * When a retprobed function returns, this code saves registers and
 582 * calls trampoline_handler(), which calls the kretprobe's handler.
583 */
584void __kprobes kretprobe_trampoline_holder(void)
585{
586 asm volatile (
587 ".global kretprobe_trampoline\n"
588 "kretprobe_trampoline: \n"
589#ifdef CONFIG_X86_64
590 /* We don't bother saving the ss register */
591 " pushq %rsp\n"
592 " pushfq\n"
593 /*
594 * Skip cs, ip, orig_ax.
595 * trampoline_handler() will plug in these values
596 */
597 " subq $24, %rsp\n"
598 " pushq %rdi\n"
599 " pushq %rsi\n"
600 " pushq %rdx\n"
601 " pushq %rcx\n"
602 " pushq %rax\n"
603 " pushq %r8\n"
604 " pushq %r9\n"
605 " pushq %r10\n"
606 " pushq %r11\n"
607 " pushq %rbx\n"
608 " pushq %rbp\n"
609 " pushq %r12\n"
610 " pushq %r13\n"
611 " pushq %r14\n"
612 " pushq %r15\n"
613 " movq %rsp, %rdi\n"
614 " call trampoline_handler\n"
615 /* Replace saved sp with true return address. */
616 " movq %rax, 152(%rsp)\n"
617 " popq %r15\n"
618 " popq %r14\n"
619 " popq %r13\n"
620 " popq %r12\n"
621 " popq %rbp\n"
622 " popq %rbx\n"
623 " popq %r11\n"
624 " popq %r10\n"
625 " popq %r9\n"
626 " popq %r8\n"
627 " popq %rax\n"
628 " popq %rcx\n"
629 " popq %rdx\n"
630 " popq %rsi\n"
631 " popq %rdi\n"
632 /* Skip orig_ax, ip, cs */
633 " addq $24, %rsp\n"
634 " popfq\n"
635#else
636 " pushf\n"
637 /*
638 * Skip cs, ip, orig_ax.
639 * trampoline_handler() will plug in these values
640 */
641 " subl $12, %esp\n"
642 " pushl %fs\n"
643 " pushl %ds\n"
644 " pushl %es\n"
645 " pushl %eax\n"
646 " pushl %ebp\n"
647 " pushl %edi\n"
648 " pushl %esi\n"
649 " pushl %edx\n"
650 " pushl %ecx\n"
651 " pushl %ebx\n"
652 " movl %esp, %eax\n"
653 " call trampoline_handler\n"
654 /* Move flags to cs */
655 " movl 52(%esp), %edx\n"
656 " movl %edx, 48(%esp)\n"
657 /* Replace saved flags with true return address. */
658 " movl %eax, 52(%esp)\n"
659 " popl %ebx\n"
660 " popl %ecx\n"
661 " popl %edx\n"
662 " popl %esi\n"
663 " popl %edi\n"
664 " popl %ebp\n"
665 " popl %eax\n"
666 /* Skip ip, orig_ax, es, ds, fs */
667 " addl $20, %esp\n"
668 " popf\n"
669#endif
670 " ret\n");
671}
672
673/*
674 * Called from kretprobe_trampoline
675 */
676void * __kprobes trampoline_handler(struct pt_regs *regs)
677{
678 struct kretprobe_instance *ri = NULL;
679 struct hlist_head *head, empty_rp;
680 struct hlist_node *node, *tmp;
681 unsigned long flags, orig_ret_address = 0;
682 unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
683
684 INIT_HLIST_HEAD(&empty_rp);
685 spin_lock_irqsave(&kretprobe_lock, flags);
686 head = kretprobe_inst_table_head(current);
687 /* fixup registers */
688#ifdef CONFIG_X86_64
689 regs->cs = __KERNEL_CS;
690#else
691 regs->cs = __KERNEL_CS | get_kernel_rpl();
692#endif
693 regs->ip = trampoline_address;
694 regs->orig_ax = ~0UL;
695
696 /*
697 * It is possible to have multiple instances associated with a given
698 * task either because multiple functions in the call path have
 699 * return probes installed on them, and/or more than one
700 * return probe was registered for a target function.
701 *
702 * We can handle this because:
703 * - instances are always pushed into the head of the list
704 * - when multiple return probes are registered for the same
705 * function, the (chronologically) first instance's ret_addr
706 * will be the real return address, and all the rest will
707 * point to kretprobe_trampoline.
708 */
709 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
710 if (ri->task != current)
711 /* another task is sharing our hash bucket */
712 continue;
713
714 if (ri->rp && ri->rp->handler) {
715 __get_cpu_var(current_kprobe) = &ri->rp->kp;
716 get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
717 ri->rp->handler(ri, regs);
718 __get_cpu_var(current_kprobe) = NULL;
719 }
720
721 orig_ret_address = (unsigned long)ri->ret_addr;
722 recycle_rp_inst(ri, &empty_rp);
723
724 if (orig_ret_address != trampoline_address)
725 /*
726 * This is the real return address. Any other
727 * instances associated with this task are for
728 * other calls deeper on the call stack
729 */
730 break;
731 }
732
733 kretprobe_assert(ri, orig_ret_address, trampoline_address);
734
735 spin_unlock_irqrestore(&kretprobe_lock, flags);
736
737 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
738 hlist_del(&ri->hlist);
739 kfree(ri);
740 }
741 return (void *)orig_ret_address;
742}
743
744/*
745 * Called after single-stepping. p->addr is the address of the
746 * instruction whose first byte has been replaced by the "int 3"
747 * instruction. To avoid the SMP problems that can occur when we
748 * temporarily put back the original opcode to single-step, we
749 * single-stepped a copy of the instruction. The address of this
750 * copy is p->ainsn.insn.
751 *
752 * This function prepares to return from the post-single-step
753 * interrupt. We have to fix up the stack as follows:
754 *
755 * 0) Except in the case of absolute or indirect jump or call instructions,
756 * the new ip is relative to the copied instruction. We need to make
757 * it relative to the original instruction.
758 *
759 * 1) If the single-stepped instruction was pushfl, then the TF and IF
760 * flags are set in the just-pushed flags, and may need to be cleared.
761 *
762 * 2) If the single-stepped instruction was a call, the return address
763 * that is atop the stack is the address following the copied instruction.
764 * We need to make it the address following the original instruction.
765 *
766 * If this is the first time we've single-stepped the instruction at
767 * this probepoint, and the instruction is boostable, boost it: add a
768 * jump instruction after the copied instruction, that jumps to the next
769 * instruction after the probepoint.
770 */
771static void __kprobes resume_execution(struct kprobe *p,
772 struct pt_regs *regs, struct kprobe_ctlblk *kcb)
773{
774 unsigned long *tos = stack_addr(regs);
775 unsigned long copy_ip = (unsigned long)p->ainsn.insn;
776 unsigned long orig_ip = (unsigned long)p->addr;
777 kprobe_opcode_t *insn = p->ainsn.insn;
778
779 /*skip the REX prefix*/
780 if (is_REX_prefix(insn))
781 insn++;
782
783 regs->flags &= ~X86_EFLAGS_TF;
784 switch (*insn) {
785 case 0x9c: /* pushfl */
786 *tos &= ~(X86_EFLAGS_TF | X86_EFLAGS_IF);
787 *tos |= kcb->kprobe_old_flags;
788 break;
789 case 0xc2: /* iret/ret/lret */
790 case 0xc3:
791 case 0xca:
792 case 0xcb:
793 case 0xcf:
794 case 0xea: /* jmp absolute -- ip is correct */
795 /* ip is already adjusted, no more changes required */
796 p->ainsn.boostable = 1;
797 goto no_change;
798 case 0xe8: /* call relative - Fix return addr */
799 *tos = orig_ip + (*tos - copy_ip);
800 break;
801#ifdef CONFIG_X86_32
802 case 0x9a: /* call absolute -- same as call absolute, indirect */
803 *tos = orig_ip + (*tos - copy_ip);
804 goto no_change;
805#endif
806 case 0xff:
807 if ((insn[1] & 0x30) == 0x10) {
808 /*
809 * call absolute, indirect
810 * Fix return addr; ip is correct.
811 * But this is not boostable
812 */
813 *tos = orig_ip + (*tos - copy_ip);
814 goto no_change;
815 } else if (((insn[1] & 0x31) == 0x20) ||
816 ((insn[1] & 0x31) == 0x21)) {
817 /*
818 * jmp near and far, absolute indirect
819 * ip is correct. And this is boostable
820 */
821 p->ainsn.boostable = 1;
822 goto no_change;
823 }
824 default:
825 break;
826 }
827
828 if (p->ainsn.boostable == 0) {
829 if ((regs->ip > copy_ip) &&
830 (regs->ip - copy_ip) + 5 < MAX_INSN_SIZE) {
831 /*
 832 * This instruction can be executed directly if it
 833 * jumps back to the correct address.
834 */
835 set_jmp_op((void *)regs->ip,
836 (void *)orig_ip + (regs->ip - copy_ip));
837 p->ainsn.boostable = 1;
838 } else {
839 p->ainsn.boostable = -1;
840 }
841 }
842
843 regs->ip += orig_ip - copy_ip;
844
845no_change:
846 restore_btf();
847}
848
849/*
850 * Interrupts are disabled on entry as trap1 is an interrupt gate and they
 851 * remain disabled throughout this function.
852 */
853static int __kprobes post_kprobe_handler(struct pt_regs *regs)
854{
855 struct kprobe *cur = kprobe_running();
856 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
857
858 if (!cur)
859 return 0;
860
861 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
862 kcb->kprobe_status = KPROBE_HIT_SSDONE;
863 cur->post_handler(cur, regs, 0);
864 }
865
866 resume_execution(cur, regs, kcb);
867 regs->flags |= kcb->kprobe_saved_flags;
868 trace_hardirqs_fixup_flags(regs->flags);
869
870 /* Restore back the original saved kprobes variables and continue. */
871 if (kcb->kprobe_status == KPROBE_REENTER) {
872 restore_previous_kprobe(kcb);
873 goto out;
874 }
875 reset_current_kprobe();
876out:
877 preempt_enable_no_resched();
878
879 /*
880 * if somebody else is singlestepping across a probe point, flags
881 * will have TF set, in which case, continue the remaining processing
882 * of do_debug, as if this is not a probe hit.
883 */
884 if (regs->flags & X86_EFLAGS_TF)
885 return 0;
886
887 return 1;
888}
889
890int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
891{
892 struct kprobe *cur = kprobe_running();
893 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
894
895 switch (kcb->kprobe_status) {
896 case KPROBE_HIT_SS:
897 case KPROBE_REENTER:
898 /*
899 * We are here because the instruction being single
900 * stepped caused a page fault. We reset the current
 901 * kprobe, point the ip back to the probe address,
902 * and allow the page fault handler to continue as a
903 * normal page fault.
904 */
905 regs->ip = (unsigned long)cur->addr;
906 regs->flags |= kcb->kprobe_old_flags;
907 if (kcb->kprobe_status == KPROBE_REENTER)
908 restore_previous_kprobe(kcb);
909 else
910 reset_current_kprobe();
911 preempt_enable_no_resched();
912 break;
913 case KPROBE_HIT_ACTIVE:
914 case KPROBE_HIT_SSDONE:
915 /*
916 * We increment the nmissed count for accounting,
917 * we can also use npre/npostfault count for accounting
918 * these specific fault cases.
919 */
920 kprobes_inc_nmissed_count(cur);
921
922 /*
923 * We come here because instructions in the pre/post
 924 * handler caused the page fault; this could happen
 925 * if the handler tries to access user space by
926 * copy_from_user(), get_user() etc. Let the
927 * user-specified handler try to fix it first.
928 */
929 if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
930 return 1;
931
932 /*
933 * In case the user-specified fault handler returned
934 * zero, try to fix up.
935 */
936 if (fixup_exception(regs))
937 return 1;
938
939 /*
 940 * The fixup routine could not handle it;
 941 * let do_page_fault() fix it.
942 */
943 break;
944 default:
945 break;
946 }
947 return 0;
948}
949
950/*
951 * Wrapper routine for handling exceptions.
952 */
953int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
954 unsigned long val, void *data)
955{
956 struct die_args *args = data;
957 int ret = NOTIFY_DONE;
958
959 if (args->regs && user_mode_vm(args->regs))
960 return ret;
961
962 switch (val) {
963 case DIE_INT3:
964 if (kprobe_handler(args->regs))
965 ret = NOTIFY_STOP;
966 break;
967 case DIE_DEBUG:
968 if (post_kprobe_handler(args->regs))
969 ret = NOTIFY_STOP;
970 break;
971 case DIE_GPF:
972 /*
973 * To be potentially processing a kprobe fault and to
974 * trust the result from kprobe_running(), we have
 975 * to be non-preemptible.
976 */
977 if (!preemptible() && kprobe_running() &&
978 kprobe_fault_handler(args->regs, args->trapnr))
979 ret = NOTIFY_STOP;
980 break;
981 default:
982 break;
983 }
984 return ret;
985}
986
987int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
988{
989 struct jprobe *jp = container_of(p, struct jprobe, kp);
990 unsigned long addr;
991 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
992
993 kcb->jprobe_saved_regs = *regs;
994 kcb->jprobe_saved_sp = stack_addr(regs);
995 addr = (unsigned long)(kcb->jprobe_saved_sp);
996
997 /*
998 * As Linus pointed out, gcc assumes that the callee
999 * owns the argument space and could overwrite it, e.g.
1000 * tailcall optimization. So, to be absolutely safe
1001 * we also save and restore enough stack bytes to cover
1002 * the argument area.
1003 */
1004 memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
1005 MIN_STACK_SIZE(addr));
1006 regs->flags &= ~X86_EFLAGS_IF;
1007 trace_hardirqs_off();
1008 regs->ip = (unsigned long)(jp->entry);
1009 return 1;
1010}
1011
1012void __kprobes jprobe_return(void)
1013{
1014 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1015
1016 asm volatile (
1017#ifdef CONFIG_X86_64
1018 " xchg %%rbx,%%rsp \n"
1019#else
1020 " xchgl %%ebx,%%esp \n"
1021#endif
1022 " int3 \n"
1023 " .globl jprobe_return_end\n"
1024 " jprobe_return_end: \n"
1025 " nop \n"::"b"
1026 (kcb->jprobe_saved_sp):"memory");
1027}
1028
1029int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
1030{
1031 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1032 u8 *addr = (u8 *) (regs->ip - 1);
1033 struct jprobe *jp = container_of(p, struct jprobe, kp);
1034
1035 if ((addr > (u8 *) jprobe_return) &&
1036 (addr < (u8 *) jprobe_return_end)) {
1037 if (stack_addr(regs) != kcb->jprobe_saved_sp) {
1038 struct pt_regs *saved_regs = &kcb->jprobe_saved_regs;
1039 printk(KERN_ERR
1040 "current sp %p does not match saved sp %p\n",
1041 stack_addr(regs), kcb->jprobe_saved_sp);
1042 printk(KERN_ERR "Saved registers for jprobe %p\n", jp);
1043 show_registers(saved_regs);
1044 printk(KERN_ERR "Current registers\n");
1045 show_registers(regs);
1046 BUG();
1047 }
1048 *regs = kcb->jprobe_saved_regs;
1049 memcpy((kprobe_opcode_t *)(kcb->jprobe_saved_sp),
1050 kcb->jprobes_stack,
1051 MIN_STACK_SIZE(kcb->jprobe_saved_sp));
1052 preempt_enable_no_resched();
1053 return 1;
1054 }
1055 return 0;
1056}
1057
1058int __init arch_init_kprobes(void)
1059{
1060 return 0;
1061}
1062
1063int __kprobes arch_trampoline_kprobe(struct kprobe *p)
1064{
1065 return 0;
1066}
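The unified file above keeps the existing kprobes entry points (arch_prepare_kprobe(), kprobe_handler(), post_kprobe_handler(), the kretprobe trampoline), so registered probes behave the same on 32-bit and 64-bit kernels. As an illustrative sketch only -- the probed symbol and the message are assumptions, not part of this patch -- a minimal module that exercises this arch code might look like:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kprobes.h>

/* Probe an example symbol; any kernel-text address that is not in the
 * kretprobe blacklist is handled the same way by the code above. */
static struct kprobe kp = {
	.symbol_name = "do_fork",	/* assumption: present on this kernel */
};

/* Runs from kprobe_handler() before the probed instruction is
 * single-stepped (or boosted) by the arch code in kprobes.c above. */
static int sketch_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	printk(KERN_INFO "kprobe hit at %p, ip=%lx\n", p->addr, regs->ip);
	return 0;	/* 0: continue with normal single-step processing */
}

static int __init kprobe_sketch_init(void)
{
	kp.pre_handler = sketch_pre_handler;
	return register_kprobe(&kp);
}

static void __exit kprobe_sketch_exit(void)
{
	unregister_kprobe(&kp);
}

module_init(kprobe_sketch_init);
module_exit(kprobe_sketch_exit);
MODULE_LICENSE("GPL");

With no post_handler set, setup_singlestep() above can take the boosted path and jump straight to the copied instruction when can_boost() allowed it.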
diff --git a/arch/x86/kernel/kprobes_32.c b/arch/x86/kernel/kprobes_32.c
deleted file mode 100644
index 3a020f79f82b..000000000000
--- a/arch/x86/kernel/kprobes_32.c
+++ /dev/null
@@ -1,756 +0,0 @@
1/*
2 * Kernel Probes (KProbes)
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2002, 2004
19 *
20 * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
21 * Probes initial implementation ( includes contributions from
22 * Rusty Russell).
23 * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
24 * interface to access function arguments.
25 * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston
26 * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
27 * <prasanna@in.ibm.com> added function-return probes.
28 */
29
30#include <linux/kprobes.h>
31#include <linux/ptrace.h>
32#include <linux/preempt.h>
33#include <linux/kdebug.h>
34#include <asm/cacheflush.h>
35#include <asm/desc.h>
36#include <asm/uaccess.h>
37#include <asm/alternative.h>
38
39void jprobe_return_end(void);
40
41DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
42DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
43
44struct kretprobe_blackpoint kretprobe_blacklist[] = {
45 {"__switch_to", }, /* This function switches only current task, but
46 doesn't switch kernel stack.*/
47 {NULL, NULL} /* Terminator */
48};
49const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
50
51/* insert a jmp code */
52static __always_inline void set_jmp_op(void *from, void *to)
53{
54 struct __arch_jmp_op {
55 char op;
56 long raddr;
57 } __attribute__((packed)) *jop;
58 jop = (struct __arch_jmp_op *)from;
59 jop->raddr = (long)(to) - ((long)(from) + 5);
60 jop->op = RELATIVEJUMP_INSTRUCTION;
61}
62
63/*
64 * returns non-zero if opcodes can be boosted.
65 */
66static __always_inline int can_boost(kprobe_opcode_t *opcodes)
67{
68#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \
69 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
70 (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
71 (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
72 (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
73 << (row % 32))
74 /*
75 * Undefined/reserved opcodes, conditional jump, Opcode Extension
 76 * Groups, and some special opcodes can not be boosted.
77 */
78 static const unsigned long twobyte_is_boostable[256 / 32] = {
79 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
80 /* ------------------------------- */
81 W(0x00, 0,0,1,1,0,0,1,0,1,1,0,0,0,0,0,0)| /* 00 */
82 W(0x10, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 10 */
83 W(0x20, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0)| /* 20 */
84 W(0x30, 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 30 */
85 W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 40 */
86 W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 50 */
87 W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1)| /* 60 */
88 W(0x70, 0,0,0,0,1,1,1,1,0,0,0,0,0,0,1,1), /* 70 */
89 W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 80 */
90 W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* 90 */
91 W(0xa0, 1,1,0,1,1,1,0,0,1,1,0,1,1,1,0,1)| /* a0 */
92 W(0xb0, 1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1), /* b0 */
93 W(0xc0, 1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1)| /* c0 */
94 W(0xd0, 0,1,1,1,0,1,0,0,1,1,0,1,1,1,0,1), /* d0 */
95 W(0xe0, 0,1,1,0,0,1,0,0,1,1,0,1,1,1,0,1)| /* e0 */
96 W(0xf0, 0,1,1,1,0,1,0,0,1,1,1,0,1,1,1,0) /* f0 */
97 /* ------------------------------- */
98 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
99 };
100#undef W
101 kprobe_opcode_t opcode;
102 kprobe_opcode_t *orig_opcodes = opcodes;
103retry:
104 if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
105 return 0;
106 opcode = *(opcodes++);
107
108 /* 2nd-byte opcode */
109 if (opcode == 0x0f) {
110 if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
111 return 0;
112 return test_bit(*opcodes, twobyte_is_boostable);
113 }
114
115 switch (opcode & 0xf0) {
116 case 0x60:
117 if (0x63 < opcode && opcode < 0x67)
118 goto retry; /* prefixes */
119 /* can't boost Address-size override and bound */
120 return (opcode != 0x62 && opcode != 0x67);
121 case 0x70:
122 return 0; /* can't boost conditional jump */
123 case 0xc0:
124 /* can't boost software-interruptions */
125 return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;
126 case 0xd0:
127 /* can boost AA* and XLAT */
128 return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7);
129 case 0xe0:
130 /* can boost in/out and absolute jmps */
131 return ((opcode & 0x04) || opcode == 0xea);
132 case 0xf0:
133 if ((opcode & 0x0c) == 0 && opcode != 0xf1)
134 goto retry; /* lock/rep(ne) prefix */
 135 /* clear and set flags are boostable */
136 return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
137 default:
138 if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e)
139 goto retry; /* prefixes */
140 /* can't boost CS override and call */
141 return (opcode != 0x2e && opcode != 0x9a);
142 }
143}
144
145/*
146 * returns non-zero if opcode modifies the interrupt flag.
147 */
148static int __kprobes is_IF_modifier(kprobe_opcode_t opcode)
149{
150 switch (opcode) {
151 case 0xfa: /* cli */
152 case 0xfb: /* sti */
153 case 0xcf: /* iret/iretd */
154 case 0x9d: /* popf/popfd */
155 return 1;
156 }
157 return 0;
158}
159
160int __kprobes arch_prepare_kprobe(struct kprobe *p)
161{
162 /* insn: must be on special executable page on i386. */
163 p->ainsn.insn = get_insn_slot();
164 if (!p->ainsn.insn)
165 return -ENOMEM;
166
167 memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
168 p->opcode = *p->addr;
169 if (can_boost(p->addr)) {
170 p->ainsn.boostable = 0;
171 } else {
172 p->ainsn.boostable = -1;
173 }
174 return 0;
175}
176
177void __kprobes arch_arm_kprobe(struct kprobe *p)
178{
179 text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
180}
181
182void __kprobes arch_disarm_kprobe(struct kprobe *p)
183{
184 text_poke(p->addr, &p->opcode, 1);
185}
186
187void __kprobes arch_remove_kprobe(struct kprobe *p)
188{
189 mutex_lock(&kprobe_mutex);
190 free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
191 mutex_unlock(&kprobe_mutex);
192}
193
194static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
195{
196 kcb->prev_kprobe.kp = kprobe_running();
197 kcb->prev_kprobe.status = kcb->kprobe_status;
198 kcb->prev_kprobe.old_eflags = kcb->kprobe_old_eflags;
199 kcb->prev_kprobe.saved_eflags = kcb->kprobe_saved_eflags;
200}
201
202static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
203{
204 __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
205 kcb->kprobe_status = kcb->prev_kprobe.status;
206 kcb->kprobe_old_eflags = kcb->prev_kprobe.old_eflags;
207 kcb->kprobe_saved_eflags = kcb->prev_kprobe.saved_eflags;
208}
209
210static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
211 struct kprobe_ctlblk *kcb)
212{
213 __get_cpu_var(current_kprobe) = p;
214 kcb->kprobe_saved_eflags = kcb->kprobe_old_eflags
215 = (regs->eflags & (TF_MASK | IF_MASK));
216 if (is_IF_modifier(p->opcode))
217 kcb->kprobe_saved_eflags &= ~IF_MASK;
218}
219
220static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
221{
222 regs->eflags |= TF_MASK;
223 regs->eflags &= ~IF_MASK;
224 /*single step inline if the instruction is an int3*/
225 if (p->opcode == BREAKPOINT_INSTRUCTION)
226 regs->eip = (unsigned long)p->addr;
227 else
228 regs->eip = (unsigned long)p->ainsn.insn;
229}
230
231/* Called with kretprobe_lock held */
232void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
233 struct pt_regs *regs)
234{
235 unsigned long *sara = (unsigned long *)&regs->esp;
236
237 ri->ret_addr = (kprobe_opcode_t *) *sara;
238
239 /* Replace the return addr with trampoline addr */
240 *sara = (unsigned long) &kretprobe_trampoline;
241}
242
243/*
244 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
 245 * remain disabled throughout this function.
246 */
247static int __kprobes kprobe_handler(struct pt_regs *regs)
248{
249 struct kprobe *p;
250 int ret = 0;
251 kprobe_opcode_t *addr;
252 struct kprobe_ctlblk *kcb;
253
254 addr = (kprobe_opcode_t *)(regs->eip - sizeof(kprobe_opcode_t));
255
256 /*
257 * We don't want to be preempted for the entire
258 * duration of kprobe processing
259 */
260 preempt_disable();
261 kcb = get_kprobe_ctlblk();
262
263 /* Check we're not actually recursing */
264 if (kprobe_running()) {
265 p = get_kprobe(addr);
266 if (p) {
267 if (kcb->kprobe_status == KPROBE_HIT_SS &&
268 *p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
269 regs->eflags &= ~TF_MASK;
270 regs->eflags |= kcb->kprobe_saved_eflags;
271 goto no_kprobe;
272 }
273 /* We have reentered the kprobe_handler(), since
274 * another probe was hit while within the handler.
275 * We here save the original kprobes variables and
276 * just single step on the instruction of the new probe
277 * without calling any user handlers.
278 */
279 save_previous_kprobe(kcb);
280 set_current_kprobe(p, regs, kcb);
281 kprobes_inc_nmissed_count(p);
282 prepare_singlestep(p, regs);
283 kcb->kprobe_status = KPROBE_REENTER;
284 return 1;
285 } else {
286 if (*addr != BREAKPOINT_INSTRUCTION) {
287 /* The breakpoint instruction was removed by
288 * another cpu right after we hit, no further
289 * handling of this interrupt is appropriate
290 */
291 regs->eip -= sizeof(kprobe_opcode_t);
292 ret = 1;
293 goto no_kprobe;
294 }
295 p = __get_cpu_var(current_kprobe);
296 if (p->break_handler && p->break_handler(p, regs)) {
297 goto ss_probe;
298 }
299 }
300 goto no_kprobe;
301 }
302
303 p = get_kprobe(addr);
304 if (!p) {
305 if (*addr != BREAKPOINT_INSTRUCTION) {
306 /*
307 * The breakpoint instruction was removed right
308 * after we hit it. Another cpu has removed
309 * either a probepoint or a debugger breakpoint
310 * at this address. In either case, no further
311 * handling of this interrupt is appropriate.
312 * Back up over the (now missing) int3 and run
313 * the original instruction.
314 */
315 regs->eip -= sizeof(kprobe_opcode_t);
316 ret = 1;
317 }
318 /* Not one of ours: let kernel handle it */
319 goto no_kprobe;
320 }
321
322 set_current_kprobe(p, regs, kcb);
323 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
324
325 if (p->pre_handler && p->pre_handler(p, regs))
326 /* handler has already set things up, so skip ss setup */
327 return 1;
328
329ss_probe:
330#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM)
331 if (p->ainsn.boostable == 1 && !p->post_handler){
332 /* Boost up -- we can execute copied instructions directly */
333 reset_current_kprobe();
334 regs->eip = (unsigned long)p->ainsn.insn;
335 preempt_enable_no_resched();
336 return 1;
337 }
338#endif
339 prepare_singlestep(p, regs);
340 kcb->kprobe_status = KPROBE_HIT_SS;
341 return 1;
342
343no_kprobe:
344 preempt_enable_no_resched();
345 return ret;
346}
347
348/*
349 * For function-return probes, init_kprobes() establishes a probepoint
350 * here. When a retprobed function returns, this probe is hit and
351 * trampoline_probe_handler() runs, calling the kretprobe's handler.
352 */
353 void __kprobes kretprobe_trampoline_holder(void)
354 {
355 asm volatile ( ".global kretprobe_trampoline\n"
356 "kretprobe_trampoline: \n"
357 " pushf\n"
358 /* skip cs, eip, orig_eax */
359 " subl $12, %esp\n"
360 " pushl %fs\n"
361 " pushl %ds\n"
362 " pushl %es\n"
363 " pushl %eax\n"
364 " pushl %ebp\n"
365 " pushl %edi\n"
366 " pushl %esi\n"
367 " pushl %edx\n"
368 " pushl %ecx\n"
369 " pushl %ebx\n"
370 " movl %esp, %eax\n"
371 " call trampoline_handler\n"
372 /* move eflags to cs */
373 " movl 52(%esp), %edx\n"
374 " movl %edx, 48(%esp)\n"
375 /* save true return address on eflags */
376 " movl %eax, 52(%esp)\n"
377 " popl %ebx\n"
378 " popl %ecx\n"
379 " popl %edx\n"
380 " popl %esi\n"
381 " popl %edi\n"
382 " popl %ebp\n"
383 " popl %eax\n"
384 /* skip eip, orig_eax, es, ds, fs */
385 " addl $20, %esp\n"
386 " popf\n"
387 " ret\n");
388}
389
390/*
391 * Called from kretprobe_trampoline
392 */
393fastcall void *__kprobes trampoline_handler(struct pt_regs *regs)
394{
395 struct kretprobe_instance *ri = NULL;
396 struct hlist_head *head, empty_rp;
397 struct hlist_node *node, *tmp;
398 unsigned long flags, orig_ret_address = 0;
399 unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline;
400
401 INIT_HLIST_HEAD(&empty_rp);
402 spin_lock_irqsave(&kretprobe_lock, flags);
403 head = kretprobe_inst_table_head(current);
404 /* fixup registers */
405 regs->xcs = __KERNEL_CS | get_kernel_rpl();
406 regs->eip = trampoline_address;
407 regs->orig_eax = 0xffffffff;
408
409 /*
410 * It is possible to have multiple instances associated with a given
 411 * task either because multiple functions in the call path
 412 * have a return probe installed on them, and/or more than one
 413 * return probe was registered for a target function.
414 *
415 * We can handle this because:
416 * - instances are always inserted at the head of the list
417 * - when multiple return probes are registered for the same
418 * function, the first instance's ret_addr will point to the
419 * real return address, and all the rest will point to
420 * kretprobe_trampoline
421 */
422 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
423 if (ri->task != current)
424 /* another task is sharing our hash bucket */
425 continue;
426
427 if (ri->rp && ri->rp->handler){
428 __get_cpu_var(current_kprobe) = &ri->rp->kp;
429 get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
430 ri->rp->handler(ri, regs);
431 __get_cpu_var(current_kprobe) = NULL;
432 }
433
434 orig_ret_address = (unsigned long)ri->ret_addr;
435 recycle_rp_inst(ri, &empty_rp);
436
437 if (orig_ret_address != trampoline_address)
438 /*
439 * This is the real return address. Any other
440 * instances associated with this task are for
441 * other calls deeper on the call stack
442 */
443 break;
444 }
445
446 kretprobe_assert(ri, orig_ret_address, trampoline_address);
447 spin_unlock_irqrestore(&kretprobe_lock, flags);
448
449 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
450 hlist_del(&ri->hlist);
451 kfree(ri);
452 }
453 return (void*)orig_ret_address;
454}
455
456/*
457 * Called after single-stepping. p->addr is the address of the
458 * instruction whose first byte has been replaced by the "int 3"
459 * instruction. To avoid the SMP problems that can occur when we
460 * temporarily put back the original opcode to single-step, we
461 * single-stepped a copy of the instruction. The address of this
462 * copy is p->ainsn.insn.
463 *
464 * This function prepares to return from the post-single-step
465 * interrupt. We have to fix up the stack as follows:
466 *
467 * 0) Except in the case of absolute or indirect jump or call instructions,
468 * the new eip is relative to the copied instruction. We need to make
469 * it relative to the original instruction.
470 *
471 * 1) If the single-stepped instruction was pushfl, then the TF and IF
472 * flags are set in the just-pushed eflags, and may need to be cleared.
473 *
474 * 2) If the single-stepped instruction was a call, the return address
475 * that is atop the stack is the address following the copied instruction.
476 * We need to make it the address following the original instruction.
477 *
478 * This function also checks instruction size for preparing direct execution.
479 */
480static void __kprobes resume_execution(struct kprobe *p,
481 struct pt_regs *regs, struct kprobe_ctlblk *kcb)
482{
483 unsigned long *tos = (unsigned long *)&regs->esp;
484 unsigned long copy_eip = (unsigned long)p->ainsn.insn;
485 unsigned long orig_eip = (unsigned long)p->addr;
486
487 regs->eflags &= ~TF_MASK;
488 switch (p->ainsn.insn[0]) {
489 case 0x9c: /* pushfl */
490 *tos &= ~(TF_MASK | IF_MASK);
491 *tos |= kcb->kprobe_old_eflags;
492 break;
493 case 0xc2: /* iret/ret/lret */
494 case 0xc3:
495 case 0xca:
496 case 0xcb:
497 case 0xcf:
498 case 0xea: /* jmp absolute -- eip is correct */
499 /* eip is already adjusted, no more changes required */
500 p->ainsn.boostable = 1;
501 goto no_change;
502 case 0xe8: /* call relative - Fix return addr */
503 *tos = orig_eip + (*tos - copy_eip);
504 break;
505 case 0x9a: /* call absolute -- same as call absolute, indirect */
506 *tos = orig_eip + (*tos - copy_eip);
507 goto no_change;
508 case 0xff:
509 if ((p->ainsn.insn[1] & 0x30) == 0x10) {
510 /*
511 * call absolute, indirect
512 * Fix return addr; eip is correct.
513 * But this is not boostable
514 */
515 *tos = orig_eip + (*tos - copy_eip);
516 goto no_change;
517 } else if (((p->ainsn.insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */
518 ((p->ainsn.insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */
519 /* eip is correct. And this is boostable */
520 p->ainsn.boostable = 1;
521 goto no_change;
522 }
523 default:
524 break;
525 }
526
527 if (p->ainsn.boostable == 0) {
528 if ((regs->eip > copy_eip) &&
529 (regs->eip - copy_eip) + 5 < MAX_INSN_SIZE) {
530 /*
531 * These instructions can be executed directly if it
532 * jumps back to correct address.
533 */
534 set_jmp_op((void *)regs->eip,
535 (void *)orig_eip + (regs->eip - copy_eip));
536 p->ainsn.boostable = 1;
537 } else {
538 p->ainsn.boostable = -1;
539 }
540 }
541
542 regs->eip = orig_eip + (regs->eip - copy_eip);
543
544no_change:
545 return;
546}
547
548/*
549 * Interrupts are disabled on entry as trap1 is an interrupt gate and they
 550 * remain disabled throughout this function.
551 */
552static int __kprobes post_kprobe_handler(struct pt_regs *regs)
553{
554 struct kprobe *cur = kprobe_running();
555 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
556
557 if (!cur)
558 return 0;
559
560 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
561 kcb->kprobe_status = KPROBE_HIT_SSDONE;
562 cur->post_handler(cur, regs, 0);
563 }
564
565 resume_execution(cur, regs, kcb);
566 regs->eflags |= kcb->kprobe_saved_eflags;
567 trace_hardirqs_fixup_flags(regs->eflags);
568
569 /*Restore back the original saved kprobes variables and continue. */
570 if (kcb->kprobe_status == KPROBE_REENTER) {
571 restore_previous_kprobe(kcb);
572 goto out;
573 }
574 reset_current_kprobe();
575out:
576 preempt_enable_no_resched();
577
578 /*
579 * if somebody else is singlestepping across a probe point, eflags
580 * will have TF set, in which case, continue the remaining processing
581 * of do_debug, as if this is not a probe hit.
582 */
583 if (regs->eflags & TF_MASK)
584 return 0;
585
586 return 1;
587}
588
589int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
590{
591 struct kprobe *cur = kprobe_running();
592 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
593
594 switch(kcb->kprobe_status) {
595 case KPROBE_HIT_SS:
596 case KPROBE_REENTER:
597 /*
598 * We are here because the instruction being single
599 * stepped caused a page fault. We reset the current
600 * kprobe and the eip points back to the probe address
601 * and allow the page fault handler to continue as a
602 * normal page fault.
603 */
604 regs->eip = (unsigned long)cur->addr;
605 regs->eflags |= kcb->kprobe_old_eflags;
606 if (kcb->kprobe_status == KPROBE_REENTER)
607 restore_previous_kprobe(kcb);
608 else
609 reset_current_kprobe();
610 preempt_enable_no_resched();
611 break;
612 case KPROBE_HIT_ACTIVE:
613 case KPROBE_HIT_SSDONE:
614 /*
615 * We increment the nmissed count for accounting,
 616 * we can also use npre/npostfault count for accounting
617 * these specific fault cases.
618 */
619 kprobes_inc_nmissed_count(cur);
620
621 /*
622 * We come here because instructions in the pre/post
623 * handler caused the page_fault, this could happen
624 * if handler tries to access user space by
625 * copy_from_user(), get_user() etc. Let the
626 * user-specified handler try to fix it first.
627 */
628 if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
629 return 1;
630
631 /*
632 * In case the user-specified fault handler returned
633 * zero, try to fix up.
634 */
635 if (fixup_exception(regs))
636 return 1;
637
638 /*
639 * fixup_exception() could not handle it,
640 * Let do_page_fault() fix it.
641 */
642 break;
643 default:
644 break;
645 }
646 return 0;
647}
648
649/*
 650 * Wrapper routine for handling exceptions.
651 */
652int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
653 unsigned long val, void *data)
654{
655 struct die_args *args = (struct die_args *)data;
656 int ret = NOTIFY_DONE;
657
658 if (args->regs && user_mode_vm(args->regs))
659 return ret;
660
661 switch (val) {
662 case DIE_INT3:
663 if (kprobe_handler(args->regs))
664 ret = NOTIFY_STOP;
665 break;
666 case DIE_DEBUG:
667 if (post_kprobe_handler(args->regs))
668 ret = NOTIFY_STOP;
669 break;
670 case DIE_GPF:
671 /* kprobe_running() needs smp_processor_id() */
672 preempt_disable();
673 if (kprobe_running() &&
674 kprobe_fault_handler(args->regs, args->trapnr))
675 ret = NOTIFY_STOP;
676 preempt_enable();
677 break;
678 default:
679 break;
680 }
681 return ret;
682}
683
684int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
685{
686 struct jprobe *jp = container_of(p, struct jprobe, kp);
687 unsigned long addr;
688 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
689
690 kcb->jprobe_saved_regs = *regs;
691 kcb->jprobe_saved_esp = &regs->esp;
692 addr = (unsigned long)(kcb->jprobe_saved_esp);
693
694 /*
695 * TBD: As Linus pointed out, gcc assumes that the callee
696 * owns the argument space and could overwrite it, e.g.
697 * tailcall optimization. So, to be absolutely safe
698 * we also save and restore enough stack bytes to cover
699 * the argument area.
700 */
701 memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
702 MIN_STACK_SIZE(addr));
703 regs->eflags &= ~IF_MASK;
704 trace_hardirqs_off();
705 regs->eip = (unsigned long)(jp->entry);
706 return 1;
707}
708
709void __kprobes jprobe_return(void)
710{
711 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
712
713 asm volatile (" xchgl %%ebx,%%esp \n"
714 " int3 \n"
715 " .globl jprobe_return_end \n"
716 " jprobe_return_end: \n"
717 " nop \n"::"b"
718 (kcb->jprobe_saved_esp):"memory");
719}
720
721int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
722{
723 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
724 u8 *addr = (u8 *) (regs->eip - 1);
725 unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_esp);
726 struct jprobe *jp = container_of(p, struct jprobe, kp);
727
728 if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) {
729 if (&regs->esp != kcb->jprobe_saved_esp) {
730 struct pt_regs *saved_regs = &kcb->jprobe_saved_regs;
731 printk("current esp %p does not match saved esp %p\n",
732 &regs->esp, kcb->jprobe_saved_esp);
733 printk("Saved registers for jprobe %p\n", jp);
734 show_registers(saved_regs);
735 printk("Current registers\n");
736 show_registers(regs);
737 BUG();
738 }
739 *regs = kcb->jprobe_saved_regs;
740 memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack,
741 MIN_STACK_SIZE(stack_addr));
742 preempt_enable_no_resched();
743 return 1;
744 }
745 return 0;
746}
747
748int __kprobes arch_trampoline_kprobe(struct kprobe *p)
749{
750 return 0;
751}
752
753int __init arch_init_kprobes(void)
754{
755 return 0;
756}
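
For reference, the consumer side of the machinery above is a plain register_kprobe() call. A minimal module sketch, not part of this patch, assuming the kprobes API of this era ("do_fork" is only an example symbol):

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/kprobes.h>

static int demo_pre(struct kprobe *p, struct pt_regs *regs)
{
	printk(KERN_INFO "kprobe hit at %p\n", p->addr);
	return 0;	/* 0: continue with the normal single-step path */
}

static struct kprobe demo_kp = {
	.symbol_name	= "do_fork",	/* example symbol only */
	.pre_handler	= demo_pre,
};

static int __init demo_init(void)
{
	return register_kprobe(&demo_kp);
}

static void __exit demo_exit(void)
{
	unregister_kprobe(&demo_kp);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");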
diff --git a/arch/x86/kernel/kprobes_64.c b/arch/x86/kernel/kprobes_64.c
deleted file mode 100644
index 5df19a9f9239..000000000000
--- a/arch/x86/kernel/kprobes_64.c
+++ /dev/null
@@ -1,749 +0,0 @@
1/*
2 * Kernel Probes (KProbes)
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2002, 2004
19 *
20 * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
21 * Probes initial implementation ( includes contributions from
22 * Rusty Russell).
23 * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
24 * interface to access function arguments.
25 * 2004-Oct Jim Keniston <kenistoj@us.ibm.com> and Prasanna S Panchamukhi
26 * <prasanna@in.ibm.com> adapted for x86_64
27 * 2005-Mar Roland McGrath <roland@redhat.com>
28 * Fixed to handle %rip-relative addressing mode correctly.
29 * 2005-May Rusty Lynch <rusty.lynch@intel.com>
30 * Added function return probes functionality
31 */
32
33#include <linux/kprobes.h>
34#include <linux/ptrace.h>
35#include <linux/string.h>
36#include <linux/slab.h>
37#include <linux/preempt.h>
38#include <linux/module.h>
39#include <linux/kdebug.h>
40
41#include <asm/pgtable.h>
42#include <asm/uaccess.h>
43#include <asm/alternative.h>
44
45void jprobe_return_end(void);
46static void __kprobes arch_copy_kprobe(struct kprobe *p);
47
48DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
49DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
50
51struct kretprobe_blackpoint kretprobe_blacklist[] = {
52 {"__switch_to", }, /* This function switches only current task, but
53 doesn't switch kernel stack.*/
54 {NULL, NULL} /* Terminator */
55};
56const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
57
58/*
59 * returns non-zero if opcode modifies the interrupt flag.
60 */
61static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
62{
63 switch (*insn) {
64 case 0xfa: /* cli */
65 case 0xfb: /* sti */
66 case 0xcf: /* iret/iretd */
67 case 0x9d: /* popf/popfd */
68 return 1;
69 }
70
71 if (*insn >= 0x40 && *insn <= 0x4f && *++insn == 0xcf)
72 return 1;
73 return 0;
74}
75
76int __kprobes arch_prepare_kprobe(struct kprobe *p)
77{
78 /* insn: must be on special executable page on x86_64. */
79 p->ainsn.insn = get_insn_slot();
80 if (!p->ainsn.insn) {
81 return -ENOMEM;
82 }
83 arch_copy_kprobe(p);
84 return 0;
85}
86
87/*
88 * Determine if the instruction uses the %rip-relative addressing mode.
89 * If it does, return the address of the 32-bit displacement word.
90 * If not, return null.
91 */
92static s32 __kprobes *is_riprel(u8 *insn)
93{
94#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \
95 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
96 (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
97 (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
98 (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
99 << (row % 64))
100 static const u64 onebyte_has_modrm[256 / 64] = {
101 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
102 /* ------------------------------- */
103 W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */
104 W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */
105 W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */
106 W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */
107 W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */
108 W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */
109 W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */
110 W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */
111 W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */
112 W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */
113 W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */
114 W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */
115 W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */
116 W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */
117 W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */
118 W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1) /* f0 */
119 /* ------------------------------- */
120 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
121 };
122 static const u64 twobyte_has_modrm[256 / 64] = {
123 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
124 /* ------------------------------- */
125 W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */
126 W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */
127 W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */
128 W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */
129 W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */
130 W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */
131 W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 6f */
132 W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */
133 W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */
134 W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */
135 W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */
136 W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */
137 W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */
138 W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */
139 W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */
140 W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0) /* ff */
141 /* ------------------------------- */
142 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
143 };
144#undef W
145 int need_modrm;
146
147 /* Skip legacy instruction prefixes. */
148 while (1) {
149 switch (*insn) {
150 case 0x66:
151 case 0x67:
152 case 0x2e:
153 case 0x3e:
154 case 0x26:
155 case 0x64:
156 case 0x65:
157 case 0x36:
158 case 0xf0:
159 case 0xf3:
160 case 0xf2:
161 ++insn;
162 continue;
163 }
164 break;
165 }
166
167 /* Skip REX instruction prefix. */
168 if ((*insn & 0xf0) == 0x40)
169 ++insn;
170
171 if (*insn == 0x0f) { /* Two-byte opcode. */
172 ++insn;
173 need_modrm = test_bit(*insn, twobyte_has_modrm);
174 } else { /* One-byte opcode. */
175 need_modrm = test_bit(*insn, onebyte_has_modrm);
176 }
177
178 if (need_modrm) {
179 u8 modrm = *++insn;
180 if ((modrm & 0xc7) == 0x05) { /* %rip+disp32 addressing mode */
181 /* Displacement follows ModRM byte. */
182 return (s32 *) ++insn;
183 }
184 }
185
186 /* No %rip-relative addressing mode here. */
187 return NULL;
188}
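
The final test above boils down to one mask on the ModRM byte: mod == 00 with r/m == 101 selects %rip+disp32 addressing in 64-bit mode. A small standalone sketch of that check (the encoded instruction is just an illustrative example):

#include <stdio.h>
#include <stdint.h>

/* mod == 00 and r/m == 101 means %rip+disp32 in 64-bit mode. */
static int is_rip_relative_modrm(uint8_t modrm)
{
	return (modrm & 0xc7) == 0x05;
}

int main(void)
{
	/* lea 0x0(%rip),%rax  encodes as  48 8d 05 00 00 00 00 : ModRM = 0x05 */
	uint8_t insn[] = { 0x48, 0x8d, 0x05, 0x00, 0x00, 0x00, 0x00 };

	printf("ModRM 0x%02x rip-relative? %d\n", insn[2],
	       is_rip_relative_modrm(insn[2]));
	return 0;
}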
189
190static void __kprobes arch_copy_kprobe(struct kprobe *p)
191{
192 s32 *ripdisp;
193 memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE);
194 ripdisp = is_riprel(p->ainsn.insn);
195 if (ripdisp) {
196 /*
197 * The copied instruction uses the %rip-relative
198 * addressing mode. Adjust the displacement for the
199 * difference between the original location of this
200 * instruction and the location of the copy that will
201 * actually be run. The tricky bit here is making sure
202 * that the sign extension happens correctly in this
203 * calculation, since we need a signed 32-bit result to
204 * be sign-extended to 64 bits when it's added to the
205 * %rip value and yield the same 64-bit result that the
206 * sign-extension of the original signed 32-bit
207 * displacement would have given.
208 */
209 s64 disp = (u8 *) p->addr + *ripdisp - (u8 *) p->ainsn.insn;
210 BUG_ON((s64) (s32) disp != disp); /* Sanity check. */
211 *ripdisp = disp;
212 }
213 p->opcode = *p->addr;
214}
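
The displacement adjustment above is a straightforward re-basing of a signed 32-bit offset from the probed instruction to the out-of-line copy, guarded by a range check. A userspace sketch of the same arithmetic, with made-up addresses, for illustration:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical addresses, purely for illustration. */
	uint64_t orig_addr = 0xffffffff81000000ULL;	/* probed instruction */
	uint64_t copy_addr = 0xffffffff82000000ULL;	/* out-of-line copy   */
	int32_t  orig_disp = 0x1234;			/* disp32 in the insn */

	/* Same target, measured from the copy instead of the original. */
	int64_t disp = (int64_t)(orig_addr + orig_disp - copy_addr);

	/* Mirror of the BUG_ON(): the re-based value must still fit in an s32. */
	assert((int64_t)(int32_t)disp == disp);

	printf("adjusted disp32 = %d\n", (int32_t)disp);
	return 0;
}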
215
216void __kprobes arch_arm_kprobe(struct kprobe *p)
217{
218 text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
219}
220
221void __kprobes arch_disarm_kprobe(struct kprobe *p)
222{
223 text_poke(p->addr, &p->opcode, 1);
224}
225
226void __kprobes arch_remove_kprobe(struct kprobe *p)
227{
228 mutex_lock(&kprobe_mutex);
229 free_insn_slot(p->ainsn.insn, 0);
230 mutex_unlock(&kprobe_mutex);
231}
232
233static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
234{
235 kcb->prev_kprobe.kp = kprobe_running();
236 kcb->prev_kprobe.status = kcb->kprobe_status;
237 kcb->prev_kprobe.old_rflags = kcb->kprobe_old_rflags;
238 kcb->prev_kprobe.saved_rflags = kcb->kprobe_saved_rflags;
239}
240
241static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
242{
243 __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
244 kcb->kprobe_status = kcb->prev_kprobe.status;
245 kcb->kprobe_old_rflags = kcb->prev_kprobe.old_rflags;
246 kcb->kprobe_saved_rflags = kcb->prev_kprobe.saved_rflags;
247}
248
249static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
250 struct kprobe_ctlblk *kcb)
251{
252 __get_cpu_var(current_kprobe) = p;
253 kcb->kprobe_saved_rflags = kcb->kprobe_old_rflags
254 = (regs->eflags & (TF_MASK | IF_MASK));
255 if (is_IF_modifier(p->ainsn.insn))
256 kcb->kprobe_saved_rflags &= ~IF_MASK;
257}
258
259static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
260{
261 regs->eflags |= TF_MASK;
262 regs->eflags &= ~IF_MASK;
263 /*single step inline if the instruction is an int3*/
264 if (p->opcode == BREAKPOINT_INSTRUCTION)
265 regs->rip = (unsigned long)p->addr;
266 else
267 regs->rip = (unsigned long)p->ainsn.insn;
268}
269
270/* Called with kretprobe_lock held */
271void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
272 struct pt_regs *regs)
273{
274 unsigned long *sara = (unsigned long *)regs->rsp;
275
276 ri->ret_addr = (kprobe_opcode_t *) *sara;
277 /* Replace the return addr with trampoline addr */
278 *sara = (unsigned long) &kretprobe_trampoline;
279}
280
281int __kprobes kprobe_handler(struct pt_regs *regs)
282{
283 struct kprobe *p;
284 int ret = 0;
285 kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t));
286 struct kprobe_ctlblk *kcb;
287
288 /*
289 * We don't want to be preempted for the entire
290 * duration of kprobe processing
291 */
292 preempt_disable();
293 kcb = get_kprobe_ctlblk();
294
295 /* Check we're not actually recursing */
296 if (kprobe_running()) {
297 p = get_kprobe(addr);
298 if (p) {
299 if (kcb->kprobe_status == KPROBE_HIT_SS &&
300 *p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
301 regs->eflags &= ~TF_MASK;
302 regs->eflags |= kcb->kprobe_saved_rflags;
303 goto no_kprobe;
304 } else if (kcb->kprobe_status == KPROBE_HIT_SSDONE) {
305 /* TODO: Provide re-entrancy from
306 * post_kprobes_handler() and avoid exception
307 * stack corruption while single-stepping on
308 * the instruction of the new probe.
309 */
310 arch_disarm_kprobe(p);
311 regs->rip = (unsigned long)p->addr;
312 reset_current_kprobe();
313 ret = 1;
314 } else {
315 /* We have reentered the kprobe_handler(), since
316 * another probe was hit while within the
317	 * handler. Here we save the original kprobe
318	 * variables and just single-step on the instruction
319	 * of the new probe without calling any user
320 * handlers.
321 */
322 save_previous_kprobe(kcb);
323 set_current_kprobe(p, regs, kcb);
324 kprobes_inc_nmissed_count(p);
325 prepare_singlestep(p, regs);
326 kcb->kprobe_status = KPROBE_REENTER;
327 return 1;
328 }
329 } else {
330 if (*addr != BREAKPOINT_INSTRUCTION) {
331 /* The breakpoint instruction was removed by
332	 * another cpu right after we hit it; no further
333 * handling of this interrupt is appropriate
334 */
335 regs->rip = (unsigned long)addr;
336 ret = 1;
337 goto no_kprobe;
338 }
339 p = __get_cpu_var(current_kprobe);
340 if (p->break_handler && p->break_handler(p, regs)) {
341 goto ss_probe;
342 }
343 }
344 goto no_kprobe;
345 }
346
347 p = get_kprobe(addr);
348 if (!p) {
349 if (*addr != BREAKPOINT_INSTRUCTION) {
350 /*
351 * The breakpoint instruction was removed right
352 * after we hit it. Another cpu has removed
353 * either a probepoint or a debugger breakpoint
354 * at this address. In either case, no further
355 * handling of this interrupt is appropriate.
356 * Back up over the (now missing) int3 and run
357 * the original instruction.
358 */
359 regs->rip = (unsigned long)addr;
360 ret = 1;
361 }
362 /* Not one of ours: let kernel handle it */
363 goto no_kprobe;
364 }
365
366 set_current_kprobe(p, regs, kcb);
367 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
368
369 if (p->pre_handler && p->pre_handler(p, regs))
370 /* handler has already set things up, so skip ss setup */
371 return 1;
372
373ss_probe:
374 prepare_singlestep(p, regs);
375 kcb->kprobe_status = KPROBE_HIT_SS;
376 return 1;
377
378no_kprobe:
379 preempt_enable_no_resched();
380 return ret;
381}
382
383/*
384 * For function-return probes, init_kprobes() establishes a probepoint
385 * here. When a retprobed function returns, this probe is hit and
386 * trampoline_probe_handler() runs, calling the kretprobe's handler.
387 */
388 void kretprobe_trampoline_holder(void)
389 {
390 asm volatile ( ".global kretprobe_trampoline\n"
391 "kretprobe_trampoline: \n"
392 "nop\n");
393 }
394
395/*
396 * Called when we hit the probe point at kretprobe_trampoline
397 */
398int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
399{
400 struct kretprobe_instance *ri = NULL;
401 struct hlist_head *head, empty_rp;
402 struct hlist_node *node, *tmp;
403 unsigned long flags, orig_ret_address = 0;
404 unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline;
405
406 INIT_HLIST_HEAD(&empty_rp);
407 spin_lock_irqsave(&kretprobe_lock, flags);
408 head = kretprobe_inst_table_head(current);
409
410 /*
411 * It is possible to have multiple instances associated with a given
412	 * task, either because multiple functions in the call path
413	 * have a return probe installed on them, and/or more than one
414	 * return probe was registered for a target function.
415 *
416 * We can handle this because:
417 * - instances are always inserted at the head of the list
418 * - when multiple return probes are registered for the same
419 * function, the first instance's ret_addr will point to the
420 * real return address, and all the rest will point to
421 * kretprobe_trampoline
422 */
423 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
424 if (ri->task != current)
425 /* another task is sharing our hash bucket */
426 continue;
427
428 if (ri->rp && ri->rp->handler)
429 ri->rp->handler(ri, regs);
430
431 orig_ret_address = (unsigned long)ri->ret_addr;
432 recycle_rp_inst(ri, &empty_rp);
433
434 if (orig_ret_address != trampoline_address)
435 /*
436 * This is the real return address. Any other
437 * instances associated with this task are for
438 * other calls deeper on the call stack
439 */
440 break;
441 }
442
443 kretprobe_assert(ri, orig_ret_address, trampoline_address);
444 regs->rip = orig_ret_address;
445
446 reset_current_kprobe();
447 spin_unlock_irqrestore(&kretprobe_lock, flags);
448 preempt_enable_no_resched();
449
450 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
451 hlist_del(&ri->hlist);
452 kfree(ri);
453 }
454 /*
455 * By returning a non-zero value, we are telling
456 * kprobe_handler() that we don't want the post_handler
457 * to run (and have re-enabled preemption)
458 */
459 return 1;
460}
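
The invariant described in the comment above — instances are pushed at the head, and when several return probes are registered for one function only the first-created instance carries the real return address — can be illustrated with a tiny userspace list (names and addresses below are made up):

#include <stdio.h>
#include <stdlib.h>

struct ret_inst {			/* toy stand-in for kretprobe_instance */
	unsigned long ret_addr;
	struct ret_inst *next;
};

#define TRAMPOLINE 0xdeadbeefUL		/* pretend &kretprobe_trampoline */

static struct ret_inst *push(struct ret_inst *head, unsigned long ret_addr)
{
	struct ret_inst *ri = malloc(sizeof(*ri));
	ri->ret_addr = ret_addr;
	ri->next = head;		/* always inserted at the head */
	return ri;
}

int main(void)
{
	struct ret_inst *head = NULL, *ri;
	unsigned long orig_ret = 0;

	/* Two return probes on the same function: the first records the real
	 * return address, the second already sees the trampoline on the stack. */
	head = push(head, 0x1111UL);
	head = push(head, TRAMPOLINE);

	/* On return, walk from the head and stop at the first real address. */
	for (ri = head; ri; ri = ri->next) {
		orig_ret = ri->ret_addr;
		if (orig_ret != TRAMPOLINE)
			break;
	}
	printf("resume execution at %#lx\n", orig_ret);
	return 0;
}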
461
462/*
463 * Called after single-stepping. p->addr is the address of the
464 * instruction whose first byte has been replaced by the "int 3"
465 * instruction. To avoid the SMP problems that can occur when we
466 * temporarily put back the original opcode to single-step, we
467 * single-stepped a copy of the instruction. The address of this
468 * copy is p->ainsn.insn.
469 *
470 * This function prepares to return from the post-single-step
471 * interrupt. We have to fix up the stack as follows:
472 *
473 * 0) Except in the case of absolute or indirect jump or call instructions,
474 * the new rip is relative to the copied instruction. We need to make
475 * it relative to the original instruction.
476 *
477 * 1) If the single-stepped instruction was pushfl, then the TF and IF
478 * flags are set in the just-pushed eflags, and may need to be cleared.
479 *
480 * 2) If the single-stepped instruction was a call, the return address
481 * that is atop the stack is the address following the copied instruction.
482 * We need to make it the address following the original instruction.
483 */
484static void __kprobes resume_execution(struct kprobe *p,
485 struct pt_regs *regs, struct kprobe_ctlblk *kcb)
486{
487 unsigned long *tos = (unsigned long *)regs->rsp;
488 unsigned long copy_rip = (unsigned long)p->ainsn.insn;
489 unsigned long orig_rip = (unsigned long)p->addr;
490 kprobe_opcode_t *insn = p->ainsn.insn;
491
492 /*skip the REX prefix*/
493 if (*insn >= 0x40 && *insn <= 0x4f)
494 insn++;
495
496 regs->eflags &= ~TF_MASK;
497 switch (*insn) {
498 case 0x9c: /* pushfl */
499 *tos &= ~(TF_MASK | IF_MASK);
500 *tos |= kcb->kprobe_old_rflags;
501 break;
502 case 0xc2: /* iret/ret/lret */
503 case 0xc3:
504 case 0xca:
505 case 0xcb:
506 case 0xcf:
507 case 0xea: /* jmp absolute -- ip is correct */
508 /* ip is already adjusted, no more changes required */
509 goto no_change;
510 case 0xe8: /* call relative - Fix return addr */
511 *tos = orig_rip + (*tos - copy_rip);
512 break;
513 case 0xff:
514 if ((insn[1] & 0x30) == 0x10) {
515 /* call absolute, indirect */
516 /* Fix return addr; ip is correct. */
517 *tos = orig_rip + (*tos - copy_rip);
518 goto no_change;
519 } else if (((insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */
520 ((insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */
521 /* ip is correct. */
522 goto no_change;
523 }
524 default:
525 break;
526 }
527
528 regs->rip = orig_rip + (regs->rip - copy_rip);
529no_change:
530
531 return;
532}
533
534int __kprobes post_kprobe_handler(struct pt_regs *regs)
535{
536 struct kprobe *cur = kprobe_running();
537 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
538
539 if (!cur)
540 return 0;
541
542 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
543 kcb->kprobe_status = KPROBE_HIT_SSDONE;
544 cur->post_handler(cur, regs, 0);
545 }
546
547 resume_execution(cur, regs, kcb);
548 regs->eflags |= kcb->kprobe_saved_rflags;
549 trace_hardirqs_fixup_flags(regs->eflags);
550
551 /* Restore the original saved kprobes variables and continue. */
552 if (kcb->kprobe_status == KPROBE_REENTER) {
553 restore_previous_kprobe(kcb);
554 goto out;
555 }
556 reset_current_kprobe();
557out:
558 preempt_enable_no_resched();
559
560 /*
561 * if somebody else is singlestepping across a probe point, eflags
562 * will have TF set, in which case, continue the remaining processing
563 * of do_debug, as if this is not a probe hit.
564 */
565 if (regs->eflags & TF_MASK)
566 return 0;
567
568 return 1;
569}
570
571int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
572{
573 struct kprobe *cur = kprobe_running();
574 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
575 const struct exception_table_entry *fixup;
576
577 switch(kcb->kprobe_status) {
578 case KPROBE_HIT_SS:
579 case KPROBE_REENTER:
580 /*
581 * We are here because the instruction being single
582 * stepped caused a page fault. We reset the current
583 * kprobe and the rip points back to the probe address
584 * and allow the page fault handler to continue as a
585 * normal page fault.
586 */
587 regs->rip = (unsigned long)cur->addr;
588 regs->eflags |= kcb->kprobe_old_rflags;
589 if (kcb->kprobe_status == KPROBE_REENTER)
590 restore_previous_kprobe(kcb);
591 else
592 reset_current_kprobe();
593 preempt_enable_no_resched();
594 break;
595 case KPROBE_HIT_ACTIVE:
596 case KPROBE_HIT_SSDONE:
597 /*
598	 * We increment the nmissed count for accounting;
599	 * the npre/npostfault counts can also be used to account for
600	 * these specific fault cases.
601 */
602 kprobes_inc_nmissed_count(cur);
603
604 /*
605 * We come here because instructions in the pre/post
606	 * handler caused the page fault; this could happen
607	 * if the handler tries to access user space by
608 * copy_from_user(), get_user() etc. Let the
609 * user-specified handler try to fix it first.
610 */
611 if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
612 return 1;
613
614 /*
615 * In case the user-specified fault handler returned
616 * zero, try to fix up.
617 */
618 fixup = search_exception_tables(regs->rip);
619 if (fixup) {
620 regs->rip = fixup->fixup;
621 return 1;
622 }
623
624 /*
625	 * fixup() could not handle it;
626	 * let do_page_fault() fix it.
627 */
628 break;
629 default:
630 break;
631 }
632 return 0;
633}
634
635/*
636 * Wrapper routine for handling exceptions.
637 */
638int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
639 unsigned long val, void *data)
640{
641 struct die_args *args = (struct die_args *)data;
642 int ret = NOTIFY_DONE;
643
644 if (args->regs && user_mode(args->regs))
645 return ret;
646
647 switch (val) {
648 case DIE_INT3:
649 if (kprobe_handler(args->regs))
650 ret = NOTIFY_STOP;
651 break;
652 case DIE_DEBUG:
653 if (post_kprobe_handler(args->regs))
654 ret = NOTIFY_STOP;
655 break;
656 case DIE_GPF:
657 /* kprobe_running() needs smp_processor_id() */
658 preempt_disable();
659 if (kprobe_running() &&
660 kprobe_fault_handler(args->regs, args->trapnr))
661 ret = NOTIFY_STOP;
662 preempt_enable();
663 break;
664 default:
665 break;
666 }
667 return ret;
668}
669
670int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
671{
672 struct jprobe *jp = container_of(p, struct jprobe, kp);
673 unsigned long addr;
674 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
675
676 kcb->jprobe_saved_regs = *regs;
677 kcb->jprobe_saved_rsp = (long *) regs->rsp;
678 addr = (unsigned long)(kcb->jprobe_saved_rsp);
679 /*
680 * As Linus pointed out, gcc assumes that the callee
681 * owns the argument space and could overwrite it, e.g.
682 * tailcall optimization. So, to be absolutely safe
683 * we also save and restore enough stack bytes to cover
684 * the argument area.
685 */
686 memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
687 MIN_STACK_SIZE(addr));
688 regs->eflags &= ~IF_MASK;
689 trace_hardirqs_off();
690 regs->rip = (unsigned long)(jp->entry);
691 return 1;
692}
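
The MIN_STACK_SIZE() copy above protects the argument area against a tail-calling jprobe handler. A standalone sketch of the save/restore pattern, with a made-up cap and a fake stack (the real macro also clamps the window at the end of the kernel stack):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MAX_SAVE 64			/* hypothetical cap on the saved window */

/* Save at most MAX_SAVE bytes, but never read past the end of the stack area. */
static size_t stack_window(uintptr_t sp, uintptr_t stack_end)
{
	size_t avail = stack_end - sp;
	return avail < MAX_SAVE ? avail : MAX_SAVE;
}

int main(void)
{
	uint8_t fake_stack[256], saved[MAX_SAVE];
	uintptr_t sp  = (uintptr_t)&fake_stack[200];	/* pretend %rsp */
	uintptr_t end = (uintptr_t)(fake_stack + sizeof(fake_stack));
	size_t n = stack_window(sp, end);

	memcpy(saved, (void *)sp, n);	/* setjmp_pre_handler(): stash it       */
	/* ... the jprobe handler runs and may scribble on [sp, sp + n) ...     */
	memcpy((void *)sp, saved, n);	/* longjmp_break_handler(): restore it  */

	printf("saved and restored %zu bytes above the saved stack pointer\n", n);
	return 0;
}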
693
694void __kprobes jprobe_return(void)
695{
696 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
697
698 asm volatile (" xchg %%rbx,%%rsp \n"
699 " int3 \n"
700 " .globl jprobe_return_end \n"
701 " jprobe_return_end: \n"
702 " nop \n"::"b"
703 (kcb->jprobe_saved_rsp):"memory");
704}
705
706int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
707{
708 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
709 u8 *addr = (u8 *) (regs->rip - 1);
710 unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_rsp);
711 struct jprobe *jp = container_of(p, struct jprobe, kp);
712
713 if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) {
714 if ((unsigned long *)regs->rsp != kcb->jprobe_saved_rsp) {
715 struct pt_regs *saved_regs = &kcb->jprobe_saved_regs;
716 printk("current rsp %p does not match saved rsp %p\n",
717 (long *)regs->rsp, kcb->jprobe_saved_rsp);
718 printk("Saved registers for jprobe %p\n", jp);
719 show_registers(saved_regs);
720 printk("Current registers\n");
721 show_registers(regs);
722 BUG();
723 }
724 *regs = kcb->jprobe_saved_regs;
725 memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack,
726 MIN_STACK_SIZE(stack_addr));
727 preempt_enable_no_resched();
728 return 1;
729 }
730 return 0;
731}
732
733static struct kprobe trampoline_p = {
734 .addr = (kprobe_opcode_t *) &kretprobe_trampoline,
735 .pre_handler = trampoline_probe_handler
736};
737
738int __init arch_init_kprobes(void)
739{
740 return register_kprobe(&trampoline_p);
741}
742
743int __kprobes arch_trampoline_kprobe(struct kprobe *p)
744{
745 if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline)
746 return 1;
747
748 return 0;
749}
diff --git a/arch/x86/kernel/ldt_32.c b/arch/x86/kernel/ldt.c
index 9ff90a27c45f..0224c3637c73 100644
--- a/arch/x86/kernel/ldt_32.c
+++ b/arch/x86/kernel/ldt.c
@@ -1,6 +1,9 @@
1/* 1/*
2 * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds 2 * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
3 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> 3 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
4 * Copyright (C) 2002 Andi Kleen
5 *
6 * This handles calls from both 32bit and 64bit mode.
4 */ 7 */
5 8
6#include <linux/errno.h> 9#include <linux/errno.h>
@@ -9,7 +12,6 @@
9#include <linux/mm.h> 12#include <linux/mm.h>
10#include <linux/smp.h> 13#include <linux/smp.h>
11#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
12#include <linux/slab.h>
13 15
14#include <asm/uaccess.h> 16#include <asm/uaccess.h>
15#include <asm/system.h> 17#include <asm/system.h>
@@ -17,7 +19,7 @@
17#include <asm/desc.h> 19#include <asm/desc.h>
18#include <asm/mmu_context.h> 20#include <asm/mmu_context.h>
19 21
20#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ 22#ifdef CONFIG_SMP
21static void flush_ldt(void *null) 23static void flush_ldt(void *null)
22{ 24{
23 if (current->active_mm) 25 if (current->active_mm)
@@ -27,26 +29,32 @@ static void flush_ldt(void *null)
27 29
28static int alloc_ldt(mm_context_t *pc, int mincount, int reload) 30static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
29{ 31{
30 void *oldldt; 32 void *oldldt, *newldt;
31 void *newldt;
32 int oldsize; 33 int oldsize;
33 34
34 if (mincount <= pc->size) 35 if (mincount <= pc->size)
35 return 0; 36 return 0;
36 oldsize = pc->size; 37 oldsize = pc->size;
37 mincount = (mincount+511)&(~511); 38 mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
38 if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) 39 (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
39 newldt = vmalloc(mincount*LDT_ENTRY_SIZE); 40 if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
41 newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
40 else 42 else
41 newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); 43 newldt = (void *)__get_free_page(GFP_KERNEL);
42 44
43 if (!newldt) 45 if (!newldt)
44 return -ENOMEM; 46 return -ENOMEM;
45 47
46 if (oldsize) 48 if (oldsize)
47 memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); 49 memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE);
48 oldldt = pc->ldt; 50 oldldt = pc->ldt;
49 memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); 51 memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
52 (mincount - oldsize) * LDT_ENTRY_SIZE);
53
54#ifdef CONFIG_X86_64
55 /* CHECKME: Do we really need this ? */
56 wmb();
57#endif
50 pc->ldt = newldt; 58 pc->ldt = newldt;
51 wmb(); 59 wmb();
52 pc->size = mincount; 60 pc->size = mincount;
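
The rounding expression introduced above simply bumps the requested entry count up to a whole page's worth of 8-byte descriptors (512 per 4 KiB page). A quick standalone check of that arithmetic:

#include <stdio.h>

#define PAGE_SIZE       4096
#define LDT_ENTRY_SIZE  8		/* one segment descriptor */

/* Round an entry count up to a whole number of pages' worth of entries,
 * mirroring the expression in alloc_ldt() above. */
static int round_entries(int mincount)
{
	int per_page = PAGE_SIZE / LDT_ENTRY_SIZE;	/* 512 */
	return (mincount + per_page - 1) & ~(per_page - 1);
}

int main(void)
{
	printf("%d -> %d entries\n", 1, round_entries(1));	/* 512  */
	printf("%d -> %d entries\n", 513, round_entries(513));	/* 1024 */
	return 0;
}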
@@ -55,6 +63,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
55 if (reload) { 63 if (reload) {
56#ifdef CONFIG_SMP 64#ifdef CONFIG_SMP
57 cpumask_t mask; 65 cpumask_t mask;
66
58 preempt_disable(); 67 preempt_disable();
59 load_LDT(pc); 68 load_LDT(pc);
60 mask = cpumask_of_cpu(smp_processor_id()); 69 mask = cpumask_of_cpu(smp_processor_id());
@@ -66,10 +75,10 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
66#endif 75#endif
67 } 76 }
68 if (oldsize) { 77 if (oldsize) {
69 if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) 78 if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
70 vfree(oldldt); 79 vfree(oldldt);
71 else 80 else
72 kfree(oldldt); 81 put_page(virt_to_page(oldldt));
73 } 82 }
74 return 0; 83 return 0;
75} 84}
@@ -77,9 +86,10 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
77static inline int copy_ldt(mm_context_t *new, mm_context_t *old) 86static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
78{ 87{
79 int err = alloc_ldt(new, old->size, 0); 88 int err = alloc_ldt(new, old->size, 0);
89
80 if (err < 0) 90 if (err < 0)
81 return err; 91 return err;
82 memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); 92 memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE);
83 return 0; 93 return 0;
84} 94}
85 95
@@ -89,7 +99,7 @@ static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
89 */ 99 */
90int init_new_context(struct task_struct *tsk, struct mm_struct *mm) 100int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
91{ 101{
92 struct mm_struct * old_mm; 102 struct mm_struct *old_mm;
93 int retval = 0; 103 int retval = 0;
94 104
95 mutex_init(&mm->context.lock); 105 mutex_init(&mm->context.lock);
@@ -105,33 +115,38 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
105 115
106/* 116/*
107 * No need to lock the MM as we are the last user 117 * No need to lock the MM as we are the last user
118 *
119 * 64bit: Don't touch the LDT register - we're already in the next thread.
108 */ 120 */
109void destroy_context(struct mm_struct *mm) 121void destroy_context(struct mm_struct *mm)
110{ 122{
111 if (mm->context.size) { 123 if (mm->context.size) {
124#ifdef CONFIG_X86_32
125 /* CHECKME: Can this ever happen ? */
112 if (mm == current->active_mm) 126 if (mm == current->active_mm)
113 clear_LDT(); 127 clear_LDT();
114 if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) 128#endif
129 if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
115 vfree(mm->context.ldt); 130 vfree(mm->context.ldt);
116 else 131 else
117 kfree(mm->context.ldt); 132 put_page(virt_to_page(mm->context.ldt));
118 mm->context.size = 0; 133 mm->context.size = 0;
119 } 134 }
120} 135}
121 136
122static int read_ldt(void __user * ptr, unsigned long bytecount) 137static int read_ldt(void __user *ptr, unsigned long bytecount)
123{ 138{
124 int err; 139 int err;
125 unsigned long size; 140 unsigned long size;
126 struct mm_struct * mm = current->mm; 141 struct mm_struct *mm = current->mm;
127 142
128 if (!mm->context.size) 143 if (!mm->context.size)
129 return 0; 144 return 0;
130 if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) 145 if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
131 bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; 146 bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
132 147
133 mutex_lock(&mm->context.lock); 148 mutex_lock(&mm->context.lock);
134 size = mm->context.size*LDT_ENTRY_SIZE; 149 size = mm->context.size * LDT_ENTRY_SIZE;
135 if (size > bytecount) 150 if (size > bytecount)
136 size = bytecount; 151 size = bytecount;
137 152
@@ -143,7 +158,7 @@ static int read_ldt(void __user * ptr, unsigned long bytecount)
143 goto error_return; 158 goto error_return;
144 if (size != bytecount) { 159 if (size != bytecount) {
145 /* zero-fill the rest */ 160 /* zero-fill the rest */
146 if (clear_user(ptr+size, bytecount-size) != 0) { 161 if (clear_user(ptr + size, bytecount - size) != 0) {
147 err = -EFAULT; 162 err = -EFAULT;
148 goto error_return; 163 goto error_return;
149 } 164 }
@@ -153,34 +168,32 @@ error_return:
153 return err; 168 return err;
154} 169}
155 170
156static int read_default_ldt(void __user * ptr, unsigned long bytecount) 171static int read_default_ldt(void __user *ptr, unsigned long bytecount)
157{ 172{
158 int err; 173 /* CHECKME: Can we use _one_ random number ? */
159 unsigned long size; 174#ifdef CONFIG_X86_32
160 175 unsigned long size = 5 * sizeof(struct desc_struct);
161 err = 0; 176#else
162 size = 5*sizeof(struct desc_struct); 177 unsigned long size = 128;
163 if (size > bytecount) 178#endif
164 size = bytecount; 179 if (bytecount > size)
165 180 bytecount = size;
166 err = size; 181 if (clear_user(ptr, bytecount))
167 if (clear_user(ptr, size)) 182 return -EFAULT;
168 err = -EFAULT; 183 return bytecount;
169
170 return err;
171} 184}
172 185
173static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) 186static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
174{ 187{
175 struct mm_struct * mm = current->mm; 188 struct mm_struct *mm = current->mm;
176 __u32 entry_1, entry_2; 189 struct desc_struct ldt;
177 int error; 190 int error;
178 struct user_desc ldt_info; 191 struct user_desc ldt_info;
179 192
180 error = -EINVAL; 193 error = -EINVAL;
181 if (bytecount != sizeof(ldt_info)) 194 if (bytecount != sizeof(ldt_info))
182 goto out; 195 goto out;
183 error = -EFAULT; 196 error = -EFAULT;
184 if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) 197 if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
185 goto out; 198 goto out;
186 199
@@ -196,28 +209,27 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
196 209
197 mutex_lock(&mm->context.lock); 210 mutex_lock(&mm->context.lock);
198 if (ldt_info.entry_number >= mm->context.size) { 211 if (ldt_info.entry_number >= mm->context.size) {
199 error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1); 212 error = alloc_ldt(&current->mm->context,
213 ldt_info.entry_number + 1, 1);
200 if (error < 0) 214 if (error < 0)
201 goto out_unlock; 215 goto out_unlock;
202 } 216 }
203 217
204 /* Allow LDTs to be cleared by the user. */ 218 /* Allow LDTs to be cleared by the user. */
205 if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { 219 if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
206 if (oldmode || LDT_empty(&ldt_info)) { 220 if (oldmode || LDT_empty(&ldt_info)) {
207 entry_1 = 0; 221 memset(&ldt, 0, sizeof(ldt));
208 entry_2 = 0;
209 goto install; 222 goto install;
210 } 223 }
211 } 224 }
212 225
213 entry_1 = LDT_entry_a(&ldt_info); 226 fill_ldt(&ldt, &ldt_info);
214 entry_2 = LDT_entry_b(&ldt_info);
215 if (oldmode) 227 if (oldmode)
216 entry_2 &= ~(1 << 20); 228 ldt.avl = 0;
217 229
218 /* Install the new entry ... */ 230 /* Install the new entry ... */
219install: 231install:
220 write_ldt_entry(mm->context.ldt, ldt_info.entry_number, entry_1, entry_2); 232 write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt);
221 error = 0; 233 error = 0;
222 234
223out_unlock: 235out_unlock:
@@ -226,7 +238,8 @@ out:
226 return error; 238 return error;
227} 239}
228 240
229asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) 241asmlinkage int sys_modify_ldt(int func, void __user *ptr,
242 unsigned long bytecount)
230{ 243{
231 int ret = -ENOSYS; 244 int ret = -ENOSYS;
232 245
diff --git a/arch/x86/kernel/ldt_64.c b/arch/x86/kernel/ldt_64.c
deleted file mode 100644
index 60e57abb8e90..000000000000
--- a/arch/x86/kernel/ldt_64.c
+++ /dev/null
@@ -1,250 +0,0 @@
1/*
2 * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
3 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
4 * Copyright (C) 2002 Andi Kleen
5 *
6 * This handles calls from both 32bit and 64bit mode.
7 */
8
9#include <linux/errno.h>
10#include <linux/sched.h>
11#include <linux/string.h>
12#include <linux/mm.h>
13#include <linux/smp.h>
14#include <linux/vmalloc.h>
15#include <linux/slab.h>
16
17#include <asm/uaccess.h>
18#include <asm/system.h>
19#include <asm/ldt.h>
20#include <asm/desc.h>
21#include <asm/proto.h>
22
23#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
24static void flush_ldt(void *null)
25{
26 if (current->active_mm)
27 load_LDT(&current->active_mm->context);
28}
29#endif
30
31static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
32{
33 void *oldldt;
34 void *newldt;
35 unsigned oldsize;
36
37 if (mincount <= (unsigned)pc->size)
38 return 0;
39 oldsize = pc->size;
40 mincount = (mincount+511)&(~511);
41 if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
42 newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
43 else
44 newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
45
46 if (!newldt)
47 return -ENOMEM;
48
49 if (oldsize)
50 memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
51 oldldt = pc->ldt;
52 memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
53 wmb();
54 pc->ldt = newldt;
55 wmb();
56 pc->size = mincount;
57 wmb();
58 if (reload) {
59#ifdef CONFIG_SMP
60 cpumask_t mask;
61
62 preempt_disable();
63 mask = cpumask_of_cpu(smp_processor_id());
64 load_LDT(pc);
65 if (!cpus_equal(current->mm->cpu_vm_mask, mask))
66 smp_call_function(flush_ldt, NULL, 1, 1);
67 preempt_enable();
68#else
69 load_LDT(pc);
70#endif
71 }
72 if (oldsize) {
73 if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
74 vfree(oldldt);
75 else
76 kfree(oldldt);
77 }
78 return 0;
79}
80
81static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
82{
83 int err = alloc_ldt(new, old->size, 0);
84 if (err < 0)
85 return err;
86 memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
87 return 0;
88}
89
90/*
91 * we do not have to muck with descriptors here, that is
92 * done in switch_mm() as needed.
93 */
94int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
95{
96 struct mm_struct * old_mm;
97 int retval = 0;
98
99 mutex_init(&mm->context.lock);
100 mm->context.size = 0;
101 old_mm = current->mm;
102 if (old_mm && old_mm->context.size > 0) {
103 mutex_lock(&old_mm->context.lock);
104 retval = copy_ldt(&mm->context, &old_mm->context);
105 mutex_unlock(&old_mm->context.lock);
106 }
107 return retval;
108}
109
110/*
111 *
112 * Don't touch the LDT register - we're already in the next thread.
113 */
114void destroy_context(struct mm_struct *mm)
115{
116 if (mm->context.size) {
117 if ((unsigned)mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
118 vfree(mm->context.ldt);
119 else
120 kfree(mm->context.ldt);
121 mm->context.size = 0;
122 }
123}
124
125static int read_ldt(void __user * ptr, unsigned long bytecount)
126{
127 int err;
128 unsigned long size;
129 struct mm_struct * mm = current->mm;
130
131 if (!mm->context.size)
132 return 0;
133 if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
134 bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
135
136 mutex_lock(&mm->context.lock);
137 size = mm->context.size*LDT_ENTRY_SIZE;
138 if (size > bytecount)
139 size = bytecount;
140
141 err = 0;
142 if (copy_to_user(ptr, mm->context.ldt, size))
143 err = -EFAULT;
144 mutex_unlock(&mm->context.lock);
145 if (err < 0)
146 goto error_return;
147 if (size != bytecount) {
148 /* zero-fill the rest */
149 if (clear_user(ptr+size, bytecount-size) != 0) {
150 err = -EFAULT;
151 goto error_return;
152 }
153 }
154 return bytecount;
155error_return:
156 return err;
157}
158
159static int read_default_ldt(void __user * ptr, unsigned long bytecount)
160{
161 /* Arbitrary number */
162 /* x86-64 default LDT is all zeros */
163 if (bytecount > 128)
164 bytecount = 128;
165 if (clear_user(ptr, bytecount))
166 return -EFAULT;
167 return bytecount;
168}
169
170static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
171{
172 struct task_struct *me = current;
173 struct mm_struct * mm = me->mm;
174 __u32 entry_1, entry_2, *lp;
175 int error;
176 struct user_desc ldt_info;
177
178 error = -EINVAL;
179
180 if (bytecount != sizeof(ldt_info))
181 goto out;
182 error = -EFAULT;
183 if (copy_from_user(&ldt_info, ptr, bytecount))
184 goto out;
185
186 error = -EINVAL;
187 if (ldt_info.entry_number >= LDT_ENTRIES)
188 goto out;
189 if (ldt_info.contents == 3) {
190 if (oldmode)
191 goto out;
192 if (ldt_info.seg_not_present == 0)
193 goto out;
194 }
195
196 mutex_lock(&mm->context.lock);
197 if (ldt_info.entry_number >= (unsigned)mm->context.size) {
198 error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
199 if (error < 0)
200 goto out_unlock;
201 }
202
203 lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
204
205 /* Allow LDTs to be cleared by the user. */
206 if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
207 if (oldmode || LDT_empty(&ldt_info)) {
208 entry_1 = 0;
209 entry_2 = 0;
210 goto install;
211 }
212 }
213
214 entry_1 = LDT_entry_a(&ldt_info);
215 entry_2 = LDT_entry_b(&ldt_info);
216 if (oldmode)
217 entry_2 &= ~(1 << 20);
218
219 /* Install the new entry ... */
220install:
221 *lp = entry_1;
222 *(lp+1) = entry_2;
223 error = 0;
224
225out_unlock:
226 mutex_unlock(&mm->context.lock);
227out:
228 return error;
229}
230
231asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
232{
233 int ret = -ENOSYS;
234
235 switch (func) {
236 case 0:
237 ret = read_ldt(ptr, bytecount);
238 break;
239 case 1:
240 ret = write_ldt(ptr, bytecount, 1);
241 break;
242 case 2:
243 ret = read_default_ldt(ptr, bytecount);
244 break;
245 case 0x11:
246 ret = write_ldt(ptr, bytecount, 0);
247 break;
248 }
249 return ret;
250}
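
For completeness, the user-space entry point to all of the above is the modify_ldt(2) system call, reached through syscall() since glibc does not wrap it. A minimal sketch (x86 Linux only) that exercises the read_default_ldt path dispatched by func 2 above:

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	unsigned char buf[128];
	/* func 2 selects read_default_ldt(); the kernel zero-fills the buffer
	 * and returns the number of bytes written. */
	long n = syscall(SYS_modify_ldt, 2, buf, sizeof(buf));

	printf("read_default_ldt returned %ld\n", n);
	return n < 0 ? 1 : 0;
}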
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 11b935f4f886..c1cfd60639d4 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -32,7 +32,7 @@ static u32 kexec_pte1[1024] PAGE_ALIGNED;
32 32
33static void set_idt(void *newidt, __u16 limit) 33static void set_idt(void *newidt, __u16 limit)
34{ 34{
35 struct Xgt_desc_struct curidt; 35 struct desc_ptr curidt;
36 36
37 /* ia32 supports unaligned loads & stores */ 37
38 curidt.size = limit; 38 curidt.size = limit;
@@ -44,7 +44,7 @@ static void set_idt(void *newidt, __u16 limit)
44 44
45static void set_gdt(void *newgdt, __u16 limit) 45static void set_gdt(void *newgdt, __u16 limit)
46{ 46{
47 struct Xgt_desc_struct curgdt; 47 struct desc_ptr curgdt;
48 48
49 /* ia32 supports unaligned loads & stores */ 49 /* ia32 supports unaligned loads & stores */
50 curgdt.size = limit; 50 curgdt.size = limit;
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index aa3d2c8f7737..a1fef42f8cdb 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -234,10 +234,5 @@ NORET_TYPE void machine_kexec(struct kimage *image)
234void arch_crash_save_vmcoreinfo(void) 234void arch_crash_save_vmcoreinfo(void)
235{ 235{
236 VMCOREINFO_SYMBOL(init_level4_pgt); 236 VMCOREINFO_SYMBOL(init_level4_pgt);
237
238#ifdef CONFIG_ARCH_DISCONTIGMEM_ENABLE
239 VMCOREINFO_SYMBOL(node_data);
240 VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
241#endif
242} 237}
243 238
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 0ab680f2d9db..219f86eb6123 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -63,6 +63,21 @@ static int __init mfgpt_disable(char *s)
63} 63}
64__setup("nomfgpt", mfgpt_disable); 64__setup("nomfgpt", mfgpt_disable);
65 65
66/* Reset the MFGPT timers. This is required by some broken BIOSes which already
67 * do the same and leave the system in an unstable state. TinyBIOS 0.98 is
68 * affected at least (0.99 is OK with MFGPT workaround left to off).
69 */
70static int __init mfgpt_fix(char *s)
71{
72 u32 val, dummy;
73
74 /* The following undocumented bit resets the MFGPT timers */
75 val = 0xFF; dummy = 0;
76 wrmsr(0x5140002B, val, dummy);
77 return 1;
78}
79__setup("mfgptfix", mfgpt_fix);
80
66/* 81/*
67 * Check whether any MFGPTs are available for the kernel to use. In most 82 * Check whether any MFGPTs are available for the kernel to use. In most
68 * cases, firmware that uses AMD's VSA code will claim all timers during 83 * cases, firmware that uses AMD's VSA code will claim all timers during
@@ -278,12 +293,12 @@ static int mfgpt_next_event(unsigned long delta, struct clock_event_device *evt)
278 293
279static irqreturn_t mfgpt_tick(int irq, void *dev_id) 294static irqreturn_t mfgpt_tick(int irq, void *dev_id)
280{ 295{
296 /* Turn off the clock (and clear the event) */
297 mfgpt_disable_timer(mfgpt_event_clock);
298
281 if (mfgpt_tick_mode == CLOCK_EVT_MODE_SHUTDOWN) 299 if (mfgpt_tick_mode == CLOCK_EVT_MODE_SHUTDOWN)
282 return IRQ_HANDLED; 300 return IRQ_HANDLED;
283 301
284 /* Turn off the clock */
285 mfgpt_disable_timer(mfgpt_event_clock);
286
287 /* Clear the counter */ 302 /* Clear the counter */
288 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_COUNTER, 0); 303 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_COUNTER, 0);
289 304
@@ -319,10 +334,6 @@ static int __init mfgpt_timer_setup(void)
319 } 334 }
320 335
321 mfgpt_event_clock = timer; 336 mfgpt_event_clock = timer;
322 /* Set the clock scale and enable the event mode for CMP2 */
323 val = MFGPT_SCALE | (3 << 8);
324
325 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP, val);
326 337
327 /* Set up the IRQ on the MFGPT side */ 338 /* Set up the IRQ on the MFGPT side */
328 if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, irq)) { 339 if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, irq)) {
@@ -339,6 +350,11 @@ static int __init mfgpt_timer_setup(void)
339 goto err; 350 goto err;
340 } 351 }
341 352
353 /* Set the clock scale and enable the event mode for CMP2 */
354 val = MFGPT_SCALE | (3 << 8);
355
356 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP, val);
357
342 /* Set up the clock event */ 358 /* Set up the clock event */
343 mfgpt_clockevent.mult = div_sc(MFGPT_HZ, NSEC_PER_SEC, 32); 359 mfgpt_clockevent.mult = div_sc(MFGPT_HZ, NSEC_PER_SEC, 32);
344 mfgpt_clockevent.min_delta_ns = clockevent_delta2ns(0xF, 360 mfgpt_clockevent.min_delta_ns = clockevent_delta2ns(0xF,
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
index 09c315214a5e..f2702d01b8a8 100644
--- a/arch/x86/kernel/microcode.c
+++ b/arch/x86/kernel/microcode.c
@@ -244,8 +244,8 @@ static int microcode_sanity_check(void *mc)
244 return 0; 244 return 0;
245 /* check extended signature checksum */ 245 /* check extended signature checksum */
246 for (i = 0; i < ext_sigcount; i++) { 246 for (i = 0; i < ext_sigcount; i++) {
247 ext_sig = (struct extended_signature *)((void *)ext_header 247 ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
248 + EXT_HEADER_SIZE + EXT_SIGNATURE_SIZE * i); 248 EXT_SIGNATURE_SIZE * i;
249 sum = orig_sum 249 sum = orig_sum
250 - (mc_header->sig + mc_header->pf + mc_header->cksum) 250 - (mc_header->sig + mc_header->pf + mc_header->cksum)
251 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); 251 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
@@ -279,11 +279,9 @@ static int get_maching_microcode(void *mc, int cpu)
279 if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE) 279 if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
280 return 0; 280 return 0;
281 281
282 ext_header = (struct extended_sigtable *)(mc + 282 ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
283 get_datasize(mc_header) + MC_HEADER_SIZE);
284 ext_sigcount = ext_header->count; 283 ext_sigcount = ext_header->count;
285 ext_sig = (struct extended_signature *)((void *)ext_header 284 ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
286 + EXT_HEADER_SIZE);
287 for (i = 0; i < ext_sigcount; i++) { 285 for (i = 0; i < ext_sigcount; i++) {
288 if (microcode_update_match(cpu, mc_header, 286 if (microcode_update_match(cpu, mc_header,
289 ext_sig->sig, ext_sig->pf)) 287 ext_sig->sig, ext_sig->pf))
@@ -436,7 +434,7 @@ static ssize_t microcode_write (struct file *file, const char __user *buf, size_
436 return -EINVAL; 434 return -EINVAL;
437 } 435 }
438 436
439 lock_cpu_hotplug(); 437 get_online_cpus();
440 mutex_lock(&microcode_mutex); 438 mutex_lock(&microcode_mutex);
441 439
442 user_buffer = (void __user *) buf; 440 user_buffer = (void __user *) buf;
@@ -447,7 +445,7 @@ static ssize_t microcode_write (struct file *file, const char __user *buf, size_
447 ret = (ssize_t)len; 445 ret = (ssize_t)len;
448 446
449 mutex_unlock(&microcode_mutex); 447 mutex_unlock(&microcode_mutex);
450 unlock_cpu_hotplug(); 448 put_online_cpus();
451 449
452 return ret; 450 return ret;
453} 451}
@@ -539,7 +537,7 @@ static int cpu_request_microcode(int cpu)
539 pr_debug("ucode data file %s load failed\n", name); 537 pr_debug("ucode data file %s load failed\n", name);
540 return error; 538 return error;
541 } 539 }
542 buf = (void *)firmware->data; 540 buf = firmware->data;
543 size = firmware->size; 541 size = firmware->size;
544 while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset)) 542 while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset))
545 > 0) { 543 > 0) {
@@ -658,14 +656,14 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
658 656
659 old = current->cpus_allowed; 657 old = current->cpus_allowed;
660 658
661 lock_cpu_hotplug(); 659 get_online_cpus();
662 set_cpus_allowed(current, cpumask_of_cpu(cpu)); 660 set_cpus_allowed(current, cpumask_of_cpu(cpu));
663 661
664 mutex_lock(&microcode_mutex); 662 mutex_lock(&microcode_mutex);
665 if (uci->valid) 663 if (uci->valid)
666 err = cpu_request_microcode(cpu); 664 err = cpu_request_microcode(cpu);
667 mutex_unlock(&microcode_mutex); 665 mutex_unlock(&microcode_mutex);
668 unlock_cpu_hotplug(); 666 put_online_cpus();
669 set_cpus_allowed(current, old); 667 set_cpus_allowed(current, old);
670 } 668 }
671 if (err) 669 if (err)
@@ -799,7 +797,7 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
799 return NOTIFY_OK; 797 return NOTIFY_OK;
800} 798}
801 799
802static struct notifier_block __cpuinitdata mc_cpu_notifier = { 800static struct notifier_block __refdata mc_cpu_notifier = {
803 .notifier_call = mc_cpu_callback, 801 .notifier_call = mc_cpu_callback,
804}; 802};
805 803
@@ -817,9 +815,9 @@ static int __init microcode_init (void)
817 return PTR_ERR(microcode_pdev); 815 return PTR_ERR(microcode_pdev);
818 } 816 }
819 817
820 lock_cpu_hotplug(); 818 get_online_cpus();
821 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); 819 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
822 unlock_cpu_hotplug(); 820 put_online_cpus();
823 if (error) { 821 if (error) {
824 microcode_dev_exit(); 822 microcode_dev_exit();
825 platform_device_unregister(microcode_pdev); 823 platform_device_unregister(microcode_pdev);
@@ -839,9 +837,9 @@ static void __exit microcode_exit (void)
839 837
840 unregister_hotcpu_notifier(&mc_cpu_notifier); 838 unregister_hotcpu_notifier(&mc_cpu_notifier);
841 839
842 lock_cpu_hotplug(); 840 get_online_cpus();
843 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); 841 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
844 unlock_cpu_hotplug(); 842 put_online_cpus();
845 843
846 platform_device_unregister(microcode_pdev); 844 platform_device_unregister(microcode_pdev);
847} 845}
diff --git a/arch/x86/kernel/mpparse_32.c b/arch/x86/kernel/mpparse_32.c
index 7a05a7f6099a..67009cdd5eca 100644
--- a/arch/x86/kernel/mpparse_32.c
+++ b/arch/x86/kernel/mpparse_32.c
@@ -68,7 +68,7 @@ unsigned int def_to_bigsmp = 0;
68/* Processor that is doing the boot up */ 68/* Processor that is doing the boot up */
69unsigned int boot_cpu_physical_apicid = -1U; 69unsigned int boot_cpu_physical_apicid = -1U;
70/* Internal processor count */ 70/* Internal processor count */
71unsigned int __cpuinitdata num_processors; 71unsigned int num_processors;
72 72
73/* Bitmask of physically existing CPUs */ 73/* Bitmask of physically existing CPUs */
74physid_mask_t phys_cpu_present_map; 74physid_mask_t phys_cpu_present_map;
@@ -258,7 +258,7 @@ static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
258 if (!(m->mpc_flags & MPC_APIC_USABLE)) 258 if (!(m->mpc_flags & MPC_APIC_USABLE))
259 return; 259 return;
260 260
261 printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n", 261 printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
262 m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); 262 m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
263 if (nr_ioapics >= MAX_IO_APICS) { 263 if (nr_ioapics >= MAX_IO_APICS) {
264 printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n", 264 printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
@@ -405,9 +405,9 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
405 405
406 mps_oem_check(mpc, oem, str); 406 mps_oem_check(mpc, oem, str);
407 407
408 printk("APIC at: 0x%lX\n",mpc->mpc_lapic); 408 printk("APIC at: 0x%X\n", mpc->mpc_lapic);
409 409
410 /* 410 /*
411 * Save the local APIC address (it might be non-default) -- but only 411 * Save the local APIC address (it might be non-default) -- but only
412 * if we're not using ACPI. 412 * if we're not using ACPI.
413 */ 413 */
@@ -721,7 +721,7 @@ static int __init smp_scan_config (unsigned long base, unsigned long length)
721 unsigned long *bp = phys_to_virt(base); 721 unsigned long *bp = phys_to_virt(base);
722 struct intel_mp_floating *mpf; 722 struct intel_mp_floating *mpf;
723 723
724 Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); 724 printk(KERN_INFO "Scan SMP from %p for %ld bytes.\n", bp,length);
725 if (sizeof(*mpf) != 16) 725 if (sizeof(*mpf) != 16)
726 printk("Error: MPF size\n"); 726 printk("Error: MPF size\n");
727 727
@@ -734,8 +734,8 @@ static int __init smp_scan_config (unsigned long base, unsigned long length)
734 || (mpf->mpf_specification == 4)) ) { 734 || (mpf->mpf_specification == 4)) ) {
735 735
736 smp_found_config = 1; 736 smp_found_config = 1;
737 printk(KERN_INFO "found SMP MP-table at %08lx\n", 737 printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
738 virt_to_phys(mpf)); 738 mpf, virt_to_phys(mpf));
739 reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE); 739 reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
740 if (mpf->mpf_physptr) { 740 if (mpf->mpf_physptr) {
741 /* 741 /*
@@ -918,14 +918,14 @@ void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
918 */ 918 */
919 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; 919 mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
920 mp_ioapic_routing[idx].gsi_base = gsi_base; 920 mp_ioapic_routing[idx].gsi_base = gsi_base;
921 mp_ioapic_routing[idx].gsi_end = gsi_base + 921 mp_ioapic_routing[idx].gsi_end = gsi_base +
922 io_apic_get_redir_entries(idx); 922 io_apic_get_redir_entries(idx);
923 923
924 printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, " 924 printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
925 "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, 925 "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
926 mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, 926 mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
927 mp_ioapic_routing[idx].gsi_base, 927 mp_ioapic_routing[idx].gsi_base,
928 mp_ioapic_routing[idx].gsi_end); 928 mp_ioapic_routing[idx].gsi_end);
929} 929}
930 930
931void __init 931void __init
@@ -1041,15 +1041,16 @@ void __init mp_config_acpi_legacy_irqs (void)
1041} 1041}
1042 1042
1043#define MAX_GSI_NUM 4096 1043#define MAX_GSI_NUM 4096
1044#define IRQ_COMPRESSION_START 64
1044 1045
1045int mp_register_gsi(u32 gsi, int triggering, int polarity) 1046int mp_register_gsi(u32 gsi, int triggering, int polarity)
1046{ 1047{
1047 int ioapic = -1; 1048 int ioapic = -1;
1048 int ioapic_pin = 0; 1049 int ioapic_pin = 0;
1049 int idx, bit = 0; 1050 int idx, bit = 0;
1050 static int pci_irq = 16; 1051 static int pci_irq = IRQ_COMPRESSION_START;
1051 /* 1052 /*
1052 * Mapping between Global System Interrups, which 1053 * Mapping between Global System Interrupts, which
1053 * represent all possible interrupts, and IRQs 1054 * represent all possible interrupts, and IRQs
1054 * assigned to actual devices. 1055 * assigned to actual devices.
1055 */ 1056 */
@@ -1086,12 +1087,16 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
1086 if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) { 1087 if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
1087 Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", 1088 Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
1088 mp_ioapic_routing[ioapic].apic_id, ioapic_pin); 1089 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
1089 return gsi_to_irq[gsi]; 1090 return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
1090 } 1091 }
1091 1092
1092 mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit); 1093 mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
1093 1094
1094 if (triggering == ACPI_LEVEL_SENSITIVE) { 1095 /*
1096 * For GSI >= 64, use IRQ compression
1097 */
1098 if ((gsi >= IRQ_COMPRESSION_START)
1099 && (triggering == ACPI_LEVEL_SENSITIVE)) {
1095 /* 1100 /*
1096 * For PCI devices assign IRQs in order, avoiding gaps 1101 * For PCI devices assign IRQs in order, avoiding gaps
1097 * due to unused I/O APIC pins. 1102 * due to unused I/O APIC pins.
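
The hunk above introduces IRQ compression for high GSIs. As a rough standalone illustration (not the kernel's code; register_gsi, next_pci_irq and the table below are invented for this sketch), the idea is that GSIs below IRQ_COMPRESSION_START keep their number, while higher, level-triggered GSIs are packed into consecutive IRQ numbers and the mapping is cached:

#include <stdio.h>

#define MAX_GSI_NUM             4096
#define IRQ_COMPRESSION_START   64

static int gsi_to_irq[MAX_GSI_NUM];                 /* cached GSI -> IRQ mapping */
static int next_pci_irq = IRQ_COMPRESSION_START;    /* next compressed IRQ to hand out */

static int register_gsi(unsigned int gsi, int level_triggered)
{
    if (gsi >= MAX_GSI_NUM)
        return -1;

    /* Low GSIs (ISA range and the first PCI pins) keep their number. */
    if (gsi < IRQ_COMPRESSION_START)
        return (int)gsi;

    /* Reuse an existing mapping if this GSI was registered before. */
    if (gsi_to_irq[gsi])
        return gsi_to_irq[gsi];

    /* Only level-triggered (PCI-style) GSIs are compressed. */
    if (level_triggered) {
        gsi_to_irq[gsi] = next_pci_irq++;
        return gsi_to_irq[gsi];
    }

    return (int)gsi;
}

int main(void)
{
    printf("GSI 9   -> IRQ %d\n", register_gsi(9, 1));    /* identity mapping   */
    printf("GSI 200 -> IRQ %d\n", register_gsi(200, 1));  /* compressed to 64   */
    printf("GSI 300 -> IRQ %d\n", register_gsi(300, 1));  /* compressed to 65   */
    printf("GSI 200 -> IRQ %d\n", register_gsi(200, 1));  /* cached mapping     */
    return 0;
}

This keeps the IRQ number space dense even when I/O APIC pins above 64 are sparsely populated, which is exactly what the pci_irq counter in the hunk is for.
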
diff --git a/arch/x86/kernel/mpparse_64.c b/arch/x86/kernel/mpparse_64.c
index ef4aab123581..72ab1403fed7 100644
--- a/arch/x86/kernel/mpparse_64.c
+++ b/arch/x86/kernel/mpparse_64.c
@@ -60,14 +60,18 @@ unsigned int boot_cpu_id = -1U;
60EXPORT_SYMBOL(boot_cpu_id); 60EXPORT_SYMBOL(boot_cpu_id);
61 61
62/* Internal processor count */ 62/* Internal processor count */
63unsigned int num_processors __cpuinitdata = 0; 63unsigned int num_processors;
64 64
65unsigned disabled_cpus __cpuinitdata; 65unsigned disabled_cpus __cpuinitdata;
66 66
67/* Bitmask of physically existing CPUs */ 67/* Bitmask of physically existing CPUs */
68physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; 68physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
69 69
70u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; 70u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata
71 = { [0 ... NR_CPUS-1] = BAD_APICID };
72void *x86_bios_cpu_apicid_early_ptr;
73DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
74EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
71 75
72 76
73/* 77/*
@@ -118,24 +122,22 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
118 physid_set(m->mpc_apicid, phys_cpu_present_map); 122 physid_set(m->mpc_apicid, phys_cpu_present_map);
119 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { 123 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
120 /* 124 /*
121 * bios_cpu_apicid is required to have processors listed 125 * x86_bios_cpu_apicid is required to have processors listed
122 * in same order as logical cpu numbers. Hence the first 126 * in same order as logical cpu numbers. Hence the first
123 * entry is BSP, and so on. 127 * entry is BSP, and so on.
124 */ 128 */
125 cpu = 0; 129 cpu = 0;
126 } 130 }
127 bios_cpu_apicid[cpu] = m->mpc_apicid; 131 /* are we being called early in kernel startup? */
128 /* 132 if (x86_cpu_to_apicid_early_ptr) {
129 * We get called early in the the start_kernel initialization 133 u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr;
130 * process when the per_cpu data area is not yet setup, so we 134 u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
131 * use a static array that is removed after the per_cpu data 135
132 * area is created. 136 cpu_to_apicid[cpu] = m->mpc_apicid;
133 */ 137 bios_cpu_apicid[cpu] = m->mpc_apicid;
134 if (x86_cpu_to_apicid_ptr) {
135 u8 *x86_cpu_to_apicid = (u8 *)x86_cpu_to_apicid_ptr;
136 x86_cpu_to_apicid[cpu] = m->mpc_apicid;
137 } else { 138 } else {
138 per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid; 139 per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid;
140 per_cpu(x86_bios_cpu_apicid, cpu) = m->mpc_apicid;
139 } 141 }
140 142
141 cpu_set(cpu, cpu_possible_map); 143 cpu_set(cpu, cpu_possible_map);
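
The mpparse_64.c change above routes early MP-table writes through the *_early_ptr staging arrays until the per-CPU area exists. A minimal sketch of that staging pattern, with invented names (early_apicid, percpu_apicid, early_ptr) standing in for the kernel's early-pointer/per-CPU pair:

#include <stdio.h>

#define NCPUS       4
#define BAD_APICID  0xFFFFu

static unsigned short early_apicid[NCPUS];        /* boot-time staging array       */
static unsigned short percpu_apicid[NCPUS];       /* stands in for the per-CPU area */
static unsigned short *early_ptr = early_apicid;  /* non-NULL only before per-CPU setup */

static void record_apicid(int cpu, unsigned short apicid)
{
    if (early_ptr)              /* called before the per-CPU area exists */
        early_ptr[cpu] = apicid;
    else                        /* normal path once per-CPU data is live */
        percpu_apicid[cpu] = apicid;
}

static void percpu_area_ready(void)
{
    /* Copy staged values into the final storage and retire the staging array. */
    for (int cpu = 0; cpu < NCPUS; cpu++)
        percpu_apicid[cpu] = early_apicid[cpu];
    early_ptr = NULL;
}

int main(void)
{
    for (int cpu = 0; cpu < NCPUS; cpu++)
        early_apicid[cpu] = BAD_APICID;

    record_apicid(0, 0x10);     /* BSP discovered during the early MP-table parse */
    percpu_area_ready();
    record_apicid(1, 0x11);     /* later CPUs go straight to per-CPU storage */

    printf("cpu0 apicid %#x, cpu1 apicid %#x\n",
           (unsigned)percpu_apicid[0], (unsigned)percpu_apicid[1]);
    return 0;
}
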
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index ee6eba4ecfea..af51ea8400b2 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -1,6 +1,6 @@
1/* ----------------------------------------------------------------------- * 1/* ----------------------------------------------------------------------- *
2 * 2 *
3 * Copyright 2000 H. Peter Anvin - All Rights Reserved 3 * Copyright 2000-2008 H. Peter Anvin - All Rights Reserved
4 * 4 *
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
@@ -45,9 +45,10 @@ static struct class *msr_class;
45 45
46static loff_t msr_seek(struct file *file, loff_t offset, int orig) 46static loff_t msr_seek(struct file *file, loff_t offset, int orig)
47{ 47{
48 loff_t ret = -EINVAL; 48 loff_t ret;
49 struct inode *inode = file->f_mapping->host;
49 50
50 lock_kernel(); 51 mutex_lock(&inode->i_mutex);
51 switch (orig) { 52 switch (orig) {
52 case 0: 53 case 0:
53 file->f_pos = offset; 54 file->f_pos = offset;
@@ -56,8 +57,11 @@ static loff_t msr_seek(struct file *file, loff_t offset, int orig)
56 case 1: 57 case 1:
57 file->f_pos += offset; 58 file->f_pos += offset;
58 ret = file->f_pos; 59 ret = file->f_pos;
60 break;
61 default:
62 ret = -EINVAL;
59 } 63 }
60 unlock_kernel(); 64 mutex_unlock(&inode->i_mutex);
61 return ret; 65 return ret;
62} 66}
63 67
@@ -155,20 +159,20 @@ static int __cpuinit msr_class_cpu_callback(struct notifier_block *nfb,
155 159
156 switch (action) { 160 switch (action) {
157 case CPU_UP_PREPARE: 161 case CPU_UP_PREPARE:
158 case CPU_UP_PREPARE_FROZEN:
159 err = msr_device_create(cpu); 162 err = msr_device_create(cpu);
160 break; 163 break;
161 case CPU_UP_CANCELED: 164 case CPU_UP_CANCELED:
162 case CPU_UP_CANCELED_FROZEN:
163 case CPU_DEAD: 165 case CPU_DEAD:
164 case CPU_DEAD_FROZEN:
165 msr_device_destroy(cpu); 166 msr_device_destroy(cpu);
166 break; 167 break;
168 case CPU_UP_CANCELED_FROZEN:
169 destroy_suspended_device(msr_class, MKDEV(MSR_MAJOR, cpu));
170 break;
167 } 171 }
168 return err ? NOTIFY_BAD : NOTIFY_OK; 172 return err ? NOTIFY_BAD : NOTIFY_OK;
169} 173}
170 174
171static struct notifier_block __cpuinitdata msr_class_cpu_notifier = { 175static struct notifier_block __refdata msr_class_cpu_notifier = {
172 .notifier_call = msr_class_cpu_callback, 176 .notifier_call = msr_class_cpu_callback,
173}; 177};
174 178
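
The msr_seek() change replaces the big kernel lock with the inode mutex and makes the unhandled-whence case return -EINVAL explicitly instead of relying on a pre-set return value. A small userspace sketch of the same shape, using a pthread mutex in place of inode->i_mutex (pseudo_file and pseudo_seek are illustrative names, not kernel API):

#include <pthread.h>
#include <stdio.h>

#define SEEK_SET_ 0
#define SEEK_CUR_ 1
#define EINVAL_   22

struct pseudo_file {
    long long pos;
    pthread_mutex_t lock;       /* per-file lock, standing in for inode->i_mutex */
};

static long long pseudo_seek(struct pseudo_file *f, long long offset, int whence)
{
    long long ret;

    pthread_mutex_lock(&f->lock);
    switch (whence) {
    case SEEK_SET_:
        f->pos = offset;
        ret = f->pos;
        break;
    case SEEK_CUR_:
        f->pos += offset;
        ret = f->pos;
        break;
    default:
        ret = -EINVAL_;         /* every other whence value is rejected explicitly */
    }
    pthread_mutex_unlock(&f->lock);
    return ret;
}

int main(void)
{
    struct pseudo_file f = { .pos = 0, .lock = PTHREAD_MUTEX_INITIALIZER };

    printf("%lld\n", pseudo_seek(&f, 16, SEEK_SET_));   /* 16          */
    printf("%lld\n", pseudo_seek(&f, 8, SEEK_CUR_));    /* 24          */
    printf("%lld\n", pseudo_seek(&f, 0, 99));           /* -22, EINVAL */
    return 0;
}
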
diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c
index 80ca72e5ac29..edd413650b3b 100644
--- a/arch/x86/kernel/nmi_32.c
+++ b/arch/x86/kernel/nmi_32.c
@@ -25,7 +25,6 @@
25 25
26#include <asm/smp.h> 26#include <asm/smp.h>
27#include <asm/nmi.h> 27#include <asm/nmi.h>
28#include <asm/timer.h>
29 28
30#include "mach_traps.h" 29#include "mach_traps.h"
31 30
@@ -52,13 +51,13 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
52 51
53static int endflag __initdata = 0; 52static int endflag __initdata = 0;
54 53
54#ifdef CONFIG_SMP
55/* The performance counters used by NMI_LOCAL_APIC don't trigger when 55/* The performance counters used by NMI_LOCAL_APIC don't trigger when
56 * the CPU is idle. To make sure the NMI watchdog really ticks on all 56 * the CPU is idle. To make sure the NMI watchdog really ticks on all
57 * CPUs during the test make them busy. 57 * CPUs during the test make them busy.
58 */ 58 */
59static __init void nmi_cpu_busy(void *data) 59static __init void nmi_cpu_busy(void *data)
60{ 60{
61#ifdef CONFIG_SMP
62 local_irq_enable_in_hardirq(); 61 local_irq_enable_in_hardirq();
63 /* Intentionally don't use cpu_relax here. This is 62 /* Intentionally don't use cpu_relax here. This is
64 to make sure that the performance counter really ticks, 63 to make sure that the performance counter really ticks,
@@ -68,8 +67,8 @@ static __init void nmi_cpu_busy(void *data)
68 care if they get somewhat less cycles. */ 67 care if they get somewhat less cycles. */
69 while (endflag == 0) 68 while (endflag == 0)
70 mb(); 69 mb();
71#endif
72} 70}
71#endif
73 72
74static int __init check_nmi_watchdog(void) 73static int __init check_nmi_watchdog(void)
75{ 74{
@@ -84,15 +83,17 @@ static int __init check_nmi_watchdog(void)
84 83
85 prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); 84 prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
86 if (!prev_nmi_count) 85 if (!prev_nmi_count)
87 goto error; 86 return -1;
88 87
89 printk(KERN_INFO "Testing NMI watchdog ... "); 88 printk(KERN_INFO "Testing NMI watchdog ... ");
90 89
90#ifdef CONFIG_SMP
91 if (nmi_watchdog == NMI_LOCAL_APIC) 91 if (nmi_watchdog == NMI_LOCAL_APIC)
92 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); 92 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
93#endif
93 94
94 for_each_possible_cpu(cpu) 95 for_each_possible_cpu(cpu)
95 prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count; 96 prev_nmi_count[cpu] = nmi_count(cpu);
96 local_irq_enable(); 97 local_irq_enable();
97 mdelay((20*1000)/nmi_hz); // wait 20 ticks 98 mdelay((20*1000)/nmi_hz); // wait 20 ticks
98 99
@@ -119,7 +120,7 @@ static int __init check_nmi_watchdog(void)
119 if (!atomic_read(&nmi_active)) { 120 if (!atomic_read(&nmi_active)) {
120 kfree(prev_nmi_count); 121 kfree(prev_nmi_count);
121 atomic_set(&nmi_active, -1); 122 atomic_set(&nmi_active, -1);
122 goto error; 123 return -1;
123 } 124 }
124 printk("OK.\n"); 125 printk("OK.\n");
125 126
@@ -130,10 +131,6 @@ static int __init check_nmi_watchdog(void)
130 131
131 kfree(prev_nmi_count); 132 kfree(prev_nmi_count);
132 return 0; 133 return 0;
133error:
134 timer_ack = !cpu_has_tsc;
135
136 return -1;
137} 134}
138/* This needs to happen later in boot so counters are working */ 135/* This needs to happen later in boot so counters are working */
139late_initcall(check_nmi_watchdog); 136late_initcall(check_nmi_watchdog);
@@ -181,7 +178,7 @@ static int lapic_nmi_resume(struct sys_device *dev)
181 178
182 179
183static struct sysdev_class nmi_sysclass = { 180static struct sysdev_class nmi_sysclass = {
184 set_kset_name("lapic_nmi"), 181 .name = "lapic_nmi",
185 .resume = lapic_nmi_resume, 182 .resume = lapic_nmi_resume,
186 .suspend = lapic_nmi_suspend, 183 .suspend = lapic_nmi_suspend,
187}; 184};
@@ -242,10 +239,10 @@ void acpi_nmi_disable(void)
242 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); 239 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
243} 240}
244 241
245void setup_apic_nmi_watchdog (void *unused) 242void setup_apic_nmi_watchdog(void *unused)
246{ 243{
247 if (__get_cpu_var(wd_enabled)) 244 if (__get_cpu_var(wd_enabled))
248 return; 245 return;
249 246
250 /* cheap hack to support suspend/resume */ 247 /* cheap hack to support suspend/resume */
251 /* if cpu0 is not active neither should the other cpus */ 248 /* if cpu0 is not active neither should the other cpus */
@@ -334,7 +331,7 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
334 unsigned int sum; 331 unsigned int sum;
335 int touched = 0; 332 int touched = 0;
336 int cpu = smp_processor_id(); 333 int cpu = smp_processor_id();
337 int rc=0; 334 int rc = 0;
338 335
339 /* check for other users first */ 336 /* check for other users first */
340 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) 337 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
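
check_nmi_watchdog() snapshots each CPU's NMI count, waits roughly 20 watchdog ticks, and disables the watchdog on any CPU whose counter barely advanced. A self-contained sketch of that detection loop (the per-CPU counters and the tick simulation are fabricated for illustration, not the kernel's data structures):

#include <stdio.h>

#define NCPUS 4

static unsigned int nmi_count[NCPUS];               /* simulated per-CPU NMI counters */
static int wd_enabled[NCPUS] = { 1, 1, 1, 1 };      /* per-CPU watchdog enable flags  */

static void simulate_ticks(void)
{
    /* CPU 2 is "stuck": its counter barely moves during the test window. */
    nmi_count[0] += 20;
    nmi_count[1] += 20;
    nmi_count[2] += 1;
    nmi_count[3] += 20;
}

static int check_watchdog(void)
{
    unsigned int prev[NCPUS];
    int cpu, active = 0;

    for (cpu = 0; cpu < NCPUS; cpu++)
        prev[cpu] = nmi_count[cpu];

    simulate_ticks();           /* stands in for mdelay() while NMIs fire */

    for (cpu = 0; cpu < NCPUS; cpu++) {
        if (!wd_enabled[cpu])
            continue;
        if (nmi_count[cpu] - prev[cpu] <= 5) {
            printf("CPU#%d: NMI appears to be stuck (%u->%u)!\n",
                   cpu, prev[cpu], nmi_count[cpu]);
            wd_enabled[cpu] = 0;    /* stop trusting this CPU's watchdog */
            continue;
        }
        active++;
    }
    return active ? 0 : -1;     /* fail only if no watchdog is left running */
}

int main(void)
{
    printf("check_watchdog() = %d\n", check_watchdog());
    return 0;
}
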
diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c
index 4253c4e8849c..fb99484d21cf 100644
--- a/arch/x86/kernel/nmi_64.c
+++ b/arch/x86/kernel/nmi_64.c
@@ -39,7 +39,7 @@ static cpumask_t backtrace_mask = CPU_MASK_NONE;
39 * 0: the lapic NMI watchdog is disabled, but can be enabled 39 * 0: the lapic NMI watchdog is disabled, but can be enabled
40 */ 40 */
41atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ 41atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
42int panic_on_timeout; 42static int panic_on_timeout;
43 43
44unsigned int nmi_watchdog = NMI_DEFAULT; 44unsigned int nmi_watchdog = NMI_DEFAULT;
45static unsigned int nmi_hz = HZ; 45static unsigned int nmi_hz = HZ;
@@ -78,22 +78,22 @@ static __init void nmi_cpu_busy(void *data)
78} 78}
79#endif 79#endif
80 80
81int __init check_nmi_watchdog (void) 81int __init check_nmi_watchdog(void)
82{ 82{
83 int *counts; 83 int *prev_nmi_count;
84 int cpu; 84 int cpu;
85 85
86 if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED)) 86 if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED))
87 return 0; 87 return 0;
88 88
89 if (!atomic_read(&nmi_active)) 89 if (!atomic_read(&nmi_active))
90 return 0; 90 return 0;
91 91
92 counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); 92 prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
93 if (!counts) 93 if (!prev_nmi_count)
94 return -1; 94 return -1;
95 95
96 printk(KERN_INFO "testing NMI watchdog ... "); 96 printk(KERN_INFO "Testing NMI watchdog ... ");
97 97
98#ifdef CONFIG_SMP 98#ifdef CONFIG_SMP
99 if (nmi_watchdog == NMI_LOCAL_APIC) 99 if (nmi_watchdog == NMI_LOCAL_APIC)
@@ -101,30 +101,29 @@ int __init check_nmi_watchdog (void)
101#endif 101#endif
102 102
103 for (cpu = 0; cpu < NR_CPUS; cpu++) 103 for (cpu = 0; cpu < NR_CPUS; cpu++)
104 counts[cpu] = cpu_pda(cpu)->__nmi_count; 104 prev_nmi_count[cpu] = cpu_pda(cpu)->__nmi_count;
105 local_irq_enable(); 105 local_irq_enable();
106 mdelay((20*1000)/nmi_hz); // wait 20 ticks 106 mdelay((20*1000)/nmi_hz); // wait 20 ticks
107 107
108 for_each_online_cpu(cpu) { 108 for_each_online_cpu(cpu) {
109 if (!per_cpu(wd_enabled, cpu)) 109 if (!per_cpu(wd_enabled, cpu))
110 continue; 110 continue;
111 if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) { 111 if (cpu_pda(cpu)->__nmi_count - prev_nmi_count[cpu] <= 5) {
112 printk(KERN_WARNING "WARNING: CPU#%d: NMI " 112 printk(KERN_WARNING "WARNING: CPU#%d: NMI "
113 "appears to be stuck (%d->%d)!\n", 113 "appears to be stuck (%d->%d)!\n",
114 cpu, 114 cpu,
115 counts[cpu], 115 prev_nmi_count[cpu],
116 cpu_pda(cpu)->__nmi_count); 116 cpu_pda(cpu)->__nmi_count);
117 per_cpu(wd_enabled, cpu) = 0; 117 per_cpu(wd_enabled, cpu) = 0;
118 atomic_dec(&nmi_active); 118 atomic_dec(&nmi_active);
119 } 119 }
120 } 120 }
121 endflag = 1;
121 if (!atomic_read(&nmi_active)) { 122 if (!atomic_read(&nmi_active)) {
122 kfree(counts); 123 kfree(prev_nmi_count);
123 atomic_set(&nmi_active, -1); 124 atomic_set(&nmi_active, -1);
124 endflag = 1;
125 return -1; 125 return -1;
126 } 126 }
127 endflag = 1;
128 printk("OK.\n"); 127 printk("OK.\n");
129 128
130 /* now that we know it works we can reduce NMI frequency to 129 /* now that we know it works we can reduce NMI frequency to
@@ -132,11 +131,11 @@ int __init check_nmi_watchdog (void)
132 if (nmi_watchdog == NMI_LOCAL_APIC) 131 if (nmi_watchdog == NMI_LOCAL_APIC)
133 nmi_hz = lapic_adjust_nmi_hz(1); 132 nmi_hz = lapic_adjust_nmi_hz(1);
134 133
135 kfree(counts); 134 kfree(prev_nmi_count);
136 return 0; 135 return 0;
137} 136}
138 137
139int __init setup_nmi_watchdog(char *str) 138static int __init setup_nmi_watchdog(char *str)
140{ 139{
141 int nmi; 140 int nmi;
142 141
@@ -159,34 +158,6 @@ int __init setup_nmi_watchdog(char *str)
159 158
160__setup("nmi_watchdog=", setup_nmi_watchdog); 159__setup("nmi_watchdog=", setup_nmi_watchdog);
161 160
162
163static void __acpi_nmi_disable(void *__unused)
164{
165 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
166}
167
168/*
169 * Disable timer based NMIs on all CPUs:
170 */
171void acpi_nmi_disable(void)
172{
173 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
174 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
175}
176
177static void __acpi_nmi_enable(void *__unused)
178{
179 apic_write(APIC_LVT0, APIC_DM_NMI);
180}
181
182/*
183 * Enable timer based NMIs on all CPUs:
184 */
185void acpi_nmi_enable(void)
186{
187 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
188 on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
189}
190#ifdef CONFIG_PM 161#ifdef CONFIG_PM
191 162
192static int nmi_pm_active; /* nmi_active before suspend */ 163static int nmi_pm_active; /* nmi_active before suspend */
@@ -211,13 +182,13 @@ static int lapic_nmi_resume(struct sys_device *dev)
211} 182}
212 183
213static struct sysdev_class nmi_sysclass = { 184static struct sysdev_class nmi_sysclass = {
214 set_kset_name("lapic_nmi"), 185 .name = "lapic_nmi",
215 .resume = lapic_nmi_resume, 186 .resume = lapic_nmi_resume,
216 .suspend = lapic_nmi_suspend, 187 .suspend = lapic_nmi_suspend,
217}; 188};
218 189
219static struct sys_device device_lapic_nmi = { 190static struct sys_device device_lapic_nmi = {
220 .id = 0, 191 .id = 0,
221 .cls = &nmi_sysclass, 192 .cls = &nmi_sysclass,
222}; 193};
223 194
@@ -231,7 +202,7 @@ static int __init init_lapic_nmi_sysfs(void)
231 if (nmi_watchdog != NMI_LOCAL_APIC) 202 if (nmi_watchdog != NMI_LOCAL_APIC)
232 return 0; 203 return 0;
233 204
234 if ( atomic_read(&nmi_active) < 0 ) 205 if (atomic_read(&nmi_active) < 0)
235 return 0; 206 return 0;
236 207
237 error = sysdev_class_register(&nmi_sysclass); 208 error = sysdev_class_register(&nmi_sysclass);
@@ -244,9 +215,37 @@ late_initcall(init_lapic_nmi_sysfs);
244 215
245#endif /* CONFIG_PM */ 216#endif /* CONFIG_PM */
246 217
218static void __acpi_nmi_enable(void *__unused)
219{
220 apic_write(APIC_LVT0, APIC_DM_NMI);
221}
222
223/*
224 * Enable timer based NMIs on all CPUs:
225 */
226void acpi_nmi_enable(void)
227{
228 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
229 on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
230}
231
232static void __acpi_nmi_disable(void *__unused)
233{
234 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
235}
236
237/*
238 * Disable timer based NMIs on all CPUs:
239 */
240void acpi_nmi_disable(void)
241{
242 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
243 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
244}
245
247void setup_apic_nmi_watchdog(void *unused) 246void setup_apic_nmi_watchdog(void *unused)
248{ 247{
249 if (__get_cpu_var(wd_enabled) == 1) 248 if (__get_cpu_var(wd_enabled))
250 return; 249 return;
251 250
252 /* cheap hack to support suspend/resume */ 251 /* cheap hack to support suspend/resume */
@@ -311,8 +310,9 @@ void touch_nmi_watchdog(void)
311 } 310 }
312 } 311 }
313 312
314 touch_softlockup_watchdog(); 313 touch_softlockup_watchdog();
315} 314}
315EXPORT_SYMBOL(touch_nmi_watchdog);
316 316
317int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) 317int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
318{ 318{
@@ -479,4 +479,3 @@ void __trigger_all_cpu_backtrace(void)
479 479
480EXPORT_SYMBOL(nmi_active); 480EXPORT_SYMBOL(nmi_active);
481EXPORT_SYMBOL(nmi_watchdog); 481EXPORT_SYMBOL(nmi_watchdog);
482EXPORT_SYMBOL(touch_nmi_watchdog);
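
One detail of the nmi_64.c rework above is that endflag, which the nmi_cpu_busy() helpers spin on, is now set before every return path so the busy CPUs always stop. A hedged userspace analogue using a pthread in place of a busy CPU (run_check and cpu_busy are invented names for this sketch):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static volatile int endflag;    /* spun on by the busy helper */

static void *cpu_busy(void *arg)
{
    (void)arg;
    while (endflag == 0)
        ;                       /* keep the "CPU" busy, as nmi_cpu_busy() does */
    return NULL;
}

static int run_check(int fail_early)
{
    pthread_t helper;

    pthread_create(&helper, NULL, cpu_busy, NULL);
    usleep(1000);               /* stands in for the 20-tick measurement window */

    if (fail_early) {
        endflag = 1;            /* must be set before *every* return ... */
        pthread_join(helper, NULL);
        return -1;
    }

    endflag = 1;                /* ... including the success path */
    pthread_join(helper, NULL);
    return 0;
}

int main(void)
{
    printf("failure path: %d\n", run_check(1));
    endflag = 0;
    printf("success path: %d\n", run_check(0));
    return 0;
}

If endflag were only set on the success path, an early failure return would leave the helper spinning forever; moving the assignment ahead of the early return is the point of the reordering in the hunk.
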
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index 9000d82c6dc0..e65281b1634b 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -82,7 +82,7 @@ static int __init numaq_tsc_disable(void)
82{ 82{
83 if (num_online_nodes() > 1) { 83 if (num_online_nodes() > 1) {
84 printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); 84 printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
85 tsc_disable = 1; 85 setup_clear_cpu_cap(X86_FEATURE_TSC);
86 } 86 }
87 return 0; 87 return 0;
88} 88}
diff --git a/arch/x86/kernel/paravirt_32.c b/arch/x86/kernel/paravirt.c
index f5000799f8ef..075962cc75ab 100644
--- a/arch/x86/kernel/paravirt_32.c
+++ b/arch/x86/kernel/paravirt.c
@@ -14,7 +14,10 @@
14 You should have received a copy of the GNU General Public License 14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software 15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17
18 2007 - x86_64 support added by Glauber de Oliveira Costa, Red Hat Inc
17*/ 19*/
20
18#include <linux/errno.h> 21#include <linux/errno.h>
19#include <linux/module.h> 22#include <linux/module.h>
20#include <linux/efi.h> 23#include <linux/efi.h>
@@ -55,59 +58,9 @@ char *memory_setup(void)
55 extern const char start_##ops##_##name[], end_##ops##_##name[]; \ 58 extern const char start_##ops##_##name[], end_##ops##_##name[]; \
56 asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") 59 asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
57 60
58DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
59DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
60DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
61DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
62DEF_NATIVE(pv_cpu_ops, iret, "iret");
63DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit");
64DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
65DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
66DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
67DEF_NATIVE(pv_cpu_ops, clts, "clts");
68DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
69
70/* Undefined instruction for dealing with missing ops pointers. */ 61/* Undefined instruction for dealing with missing ops pointers. */
71static const unsigned char ud2a[] = { 0x0f, 0x0b }; 62static const unsigned char ud2a[] = { 0x0f, 0x0b };
72 63
73static unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
74 unsigned long addr, unsigned len)
75{
76 const unsigned char *start, *end;
77 unsigned ret;
78
79 switch(type) {
80#define SITE(ops, x) \
81 case PARAVIRT_PATCH(ops.x): \
82 start = start_##ops##_##x; \
83 end = end_##ops##_##x; \
84 goto patch_site
85
86 SITE(pv_irq_ops, irq_disable);
87 SITE(pv_irq_ops, irq_enable);
88 SITE(pv_irq_ops, restore_fl);
89 SITE(pv_irq_ops, save_fl);
90 SITE(pv_cpu_ops, iret);
91 SITE(pv_cpu_ops, irq_enable_sysexit);
92 SITE(pv_mmu_ops, read_cr2);
93 SITE(pv_mmu_ops, read_cr3);
94 SITE(pv_mmu_ops, write_cr3);
95 SITE(pv_cpu_ops, clts);
96 SITE(pv_cpu_ops, read_tsc);
97#undef SITE
98
99 patch_site:
100 ret = paravirt_patch_insns(ibuf, len, start, end);
101 break;
102
103 default:
104 ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
105 break;
106 }
107
108 return ret;
109}
110
111unsigned paravirt_patch_nop(void) 64unsigned paravirt_patch_nop(void)
112{ 65{
113 return 0; 66 return 0;
@@ -186,7 +139,7 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
186 /* If the operation is a nop, then nop the callsite */ 139 /* If the operation is a nop, then nop the callsite */
187 ret = paravirt_patch_nop(); 140 ret = paravirt_patch_nop();
188 else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || 141 else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
189 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit)) 142 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret))
190 /* If operation requires a jmp, then jmp */ 143 /* If operation requires a jmp, then jmp */
191 ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len); 144 ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
192 else 145 else
@@ -237,7 +190,7 @@ static void native_flush_tlb_single(unsigned long addr)
237 190
238/* These are in entry.S */ 191/* These are in entry.S */
239extern void native_iret(void); 192extern void native_iret(void);
240extern void native_irq_enable_sysexit(void); 193extern void native_irq_enable_syscall_ret(void);
241 194
242static int __init print_banner(void) 195static int __init print_banner(void)
243{ 196{
@@ -285,18 +238,18 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA
285 238
286static inline void enter_lazy(enum paravirt_lazy_mode mode) 239static inline void enter_lazy(enum paravirt_lazy_mode mode)
287{ 240{
288 BUG_ON(x86_read_percpu(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); 241 BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
289 BUG_ON(preemptible()); 242 BUG_ON(preemptible());
290 243
291 x86_write_percpu(paravirt_lazy_mode, mode); 244 __get_cpu_var(paravirt_lazy_mode) = mode;
292} 245}
293 246
294void paravirt_leave_lazy(enum paravirt_lazy_mode mode) 247void paravirt_leave_lazy(enum paravirt_lazy_mode mode)
295{ 248{
296 BUG_ON(x86_read_percpu(paravirt_lazy_mode) != mode); 249 BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode);
297 BUG_ON(preemptible()); 250 BUG_ON(preemptible());
298 251
299 x86_write_percpu(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); 252 __get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
300} 253}
301 254
302void paravirt_enter_lazy_mmu(void) 255void paravirt_enter_lazy_mmu(void)
@@ -321,7 +274,7 @@ void paravirt_leave_lazy_cpu(void)
321 274
322enum paravirt_lazy_mode paravirt_get_lazy_mode(void) 275enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
323{ 276{
324 return x86_read_percpu(paravirt_lazy_mode); 277 return __get_cpu_var(paravirt_lazy_mode);
325} 278}
326 279
327struct pv_info pv_info = { 280struct pv_info pv_info = {
@@ -366,11 +319,16 @@ struct pv_cpu_ops pv_cpu_ops = {
366 .read_cr4 = native_read_cr4, 319 .read_cr4 = native_read_cr4,
367 .read_cr4_safe = native_read_cr4_safe, 320 .read_cr4_safe = native_read_cr4_safe,
368 .write_cr4 = native_write_cr4, 321 .write_cr4 = native_write_cr4,
322#ifdef CONFIG_X86_64
323 .read_cr8 = native_read_cr8,
324 .write_cr8 = native_write_cr8,
325#endif
369 .wbinvd = native_wbinvd, 326 .wbinvd = native_wbinvd,
370 .read_msr = native_read_msr_safe, 327 .read_msr = native_read_msr_safe,
371 .write_msr = native_write_msr_safe, 328 .write_msr = native_write_msr_safe,
372 .read_tsc = native_read_tsc, 329 .read_tsc = native_read_tsc,
373 .read_pmc = native_read_pmc, 330 .read_pmc = native_read_pmc,
331 .read_tscp = native_read_tscp,
374 .load_tr_desc = native_load_tr_desc, 332 .load_tr_desc = native_load_tr_desc,
375 .set_ldt = native_set_ldt, 333 .set_ldt = native_set_ldt,
376 .load_gdt = native_load_gdt, 334 .load_gdt = native_load_gdt,
@@ -379,13 +337,14 @@ struct pv_cpu_ops pv_cpu_ops = {
379 .store_idt = native_store_idt, 337 .store_idt = native_store_idt,
380 .store_tr = native_store_tr, 338 .store_tr = native_store_tr,
381 .load_tls = native_load_tls, 339 .load_tls = native_load_tls,
382 .write_ldt_entry = write_dt_entry, 340 .write_ldt_entry = native_write_ldt_entry,
383 .write_gdt_entry = write_dt_entry, 341 .write_gdt_entry = native_write_gdt_entry,
384 .write_idt_entry = write_dt_entry, 342 .write_idt_entry = native_write_idt_entry,
385 .load_esp0 = native_load_esp0, 343 .load_sp0 = native_load_sp0,
386 344
387 .irq_enable_sysexit = native_irq_enable_sysexit, 345 .irq_enable_syscall_ret = native_irq_enable_syscall_ret,
388 .iret = native_iret, 346 .iret = native_iret,
347 .swapgs = native_swapgs,
389 348
390 .set_iopl_mask = native_set_iopl_mask, 349 .set_iopl_mask = native_set_iopl_mask,
391 .io_delay = native_io_delay, 350 .io_delay = native_io_delay,
@@ -408,8 +367,10 @@ struct pv_apic_ops pv_apic_ops = {
408}; 367};
409 368
410struct pv_mmu_ops pv_mmu_ops = { 369struct pv_mmu_ops pv_mmu_ops = {
370#ifndef CONFIG_X86_64
411 .pagetable_setup_start = native_pagetable_setup_start, 371 .pagetable_setup_start = native_pagetable_setup_start,
412 .pagetable_setup_done = native_pagetable_setup_done, 372 .pagetable_setup_done = native_pagetable_setup_done,
373#endif
413 374
414 .read_cr2 = native_read_cr2, 375 .read_cr2 = native_read_cr2,
415 .write_cr2 = native_write_cr2, 376 .write_cr2 = native_write_cr2,
@@ -437,16 +398,23 @@ struct pv_mmu_ops pv_mmu_ops = {
437 .kmap_atomic_pte = kmap_atomic, 398 .kmap_atomic_pte = kmap_atomic,
438#endif 399#endif
439 400
401#if PAGETABLE_LEVELS >= 3
440#ifdef CONFIG_X86_PAE 402#ifdef CONFIG_X86_PAE
441 .set_pte_atomic = native_set_pte_atomic, 403 .set_pte_atomic = native_set_pte_atomic,
442 .set_pte_present = native_set_pte_present, 404 .set_pte_present = native_set_pte_present,
443 .set_pud = native_set_pud,
444 .pte_clear = native_pte_clear, 405 .pte_clear = native_pte_clear,
445 .pmd_clear = native_pmd_clear, 406 .pmd_clear = native_pmd_clear,
446 407#endif
408 .set_pud = native_set_pud,
447 .pmd_val = native_pmd_val, 409 .pmd_val = native_pmd_val,
448 .make_pmd = native_make_pmd, 410 .make_pmd = native_make_pmd,
411
412#if PAGETABLE_LEVELS == 4
413 .pud_val = native_pud_val,
414 .make_pud = native_make_pud,
415 .set_pgd = native_set_pgd,
449#endif 416#endif
417#endif /* PAGETABLE_LEVELS >= 3 */
450 418
451 .pte_val = native_pte_val, 419 .pte_val = native_pte_val,
452 .pgd_val = native_pgd_val, 420 .pgd_val = native_pgd_val,
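
The unified paravirt.c fills pv_cpu_ops/pv_mmu_ops with native_* defaults that a hypervisor backend may overwrite at boot. A toy sketch of that ops-table pattern (struct cpu_ops and the guest_*/native_* functions here are illustrative, not the kernel's types):

#include <stdio.h>

/* Each slot defaults to the "native" routine and can be replaced at boot. */
struct cpu_ops {
    void (*io_delay)(void);
    unsigned long (*read_cr3)(void);
};

static unsigned long fake_cr3 = 0x1000;

static void native_io_delay(void)          { /* e.g. a dummy port write */ }
static unsigned long native_read_cr3(void) { return fake_cr3; }

/* A guest environment substitutes its own handlers. */
static void guest_io_delay(void)           { printf("hypercall: io_delay\n"); }
static unsigned long guest_read_cr3(void)  { printf("hypercall: read_cr3\n"); return fake_cr3; }

static struct cpu_ops ops = {
    .io_delay = native_io_delay,
    .read_cr3 = native_read_cr3,
};

int main(void)
{
    ops.io_delay();
    printf("cr3 = %#lx\n", ops.read_cr3());

    /* Boot-time override, as a paravirt backend would do. */
    ops.io_delay = guest_io_delay;
    ops.read_cr3 = guest_read_cr3;

    ops.io_delay();
    printf("cr3 = %#lx\n", ops.read_cr3());
    return 0;
}

Call sites always go through the table, so native hardware pays only an indirect call (or, after patching, nothing at all), while virtualized boots get their replacements without recompiling.
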
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
new file mode 100644
index 000000000000..82fc5fcab4f4
--- /dev/null
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -0,0 +1,49 @@
1#include <asm/paravirt.h>
2
3DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
4DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
5DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
6DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
7DEF_NATIVE(pv_cpu_ops, iret, "iret");
8DEF_NATIVE(pv_cpu_ops, irq_enable_syscall_ret, "sti; sysexit");
9DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
10DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
11DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
12DEF_NATIVE(pv_cpu_ops, clts, "clts");
13DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
14
15unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
16 unsigned long addr, unsigned len)
17{
18 const unsigned char *start, *end;
19 unsigned ret;
20
21#define PATCH_SITE(ops, x) \
22 case PARAVIRT_PATCH(ops.x): \
23 start = start_##ops##_##x; \
24 end = end_##ops##_##x; \
25 goto patch_site
26 switch(type) {
27 PATCH_SITE(pv_irq_ops, irq_disable);
28 PATCH_SITE(pv_irq_ops, irq_enable);
29 PATCH_SITE(pv_irq_ops, restore_fl);
30 PATCH_SITE(pv_irq_ops, save_fl);
31 PATCH_SITE(pv_cpu_ops, iret);
32 PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret);
33 PATCH_SITE(pv_mmu_ops, read_cr2);
34 PATCH_SITE(pv_mmu_ops, read_cr3);
35 PATCH_SITE(pv_mmu_ops, write_cr3);
36 PATCH_SITE(pv_cpu_ops, clts);
37 PATCH_SITE(pv_cpu_ops, read_tsc);
38
39 patch_site:
40 ret = paravirt_patch_insns(ibuf, len, start, end);
41 break;
42
43 default:
44 ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
45 break;
46 }
47#undef PATCH_SITE
48 return ret;
49}
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
new file mode 100644
index 000000000000..7d904e138d7e
--- /dev/null
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -0,0 +1,57 @@
1#include <asm/paravirt.h>
2#include <asm/asm-offsets.h>
3#include <linux/stringify.h>
4
5DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
6DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
7DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq");
8DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
9DEF_NATIVE(pv_cpu_ops, iret, "iretq");
10DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
11DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
12DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
13DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
14DEF_NATIVE(pv_cpu_ops, clts, "clts");
15DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
16
17/* the three commands give us more control over how to return from a syscall */
18DEF_NATIVE(pv_cpu_ops, irq_enable_syscall_ret, "movq %gs:" __stringify(pda_oldrsp) ", %rsp; swapgs; sysretq;");
19DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
20
21unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
22 unsigned long addr, unsigned len)
23{
24 const unsigned char *start, *end;
25 unsigned ret;
26
27#define PATCH_SITE(ops, x) \
28 case PARAVIRT_PATCH(ops.x): \
29 start = start_##ops##_##x; \
30 end = end_##ops##_##x; \
31 goto patch_site
32 switch(type) {
33 PATCH_SITE(pv_irq_ops, restore_fl);
34 PATCH_SITE(pv_irq_ops, save_fl);
35 PATCH_SITE(pv_irq_ops, irq_enable);
36 PATCH_SITE(pv_irq_ops, irq_disable);
37 PATCH_SITE(pv_cpu_ops, iret);
38 PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret);
39 PATCH_SITE(pv_cpu_ops, swapgs);
40 PATCH_SITE(pv_mmu_ops, read_cr2);
41 PATCH_SITE(pv_mmu_ops, read_cr3);
42 PATCH_SITE(pv_mmu_ops, write_cr3);
43 PATCH_SITE(pv_cpu_ops, clts);
44 PATCH_SITE(pv_mmu_ops, flush_tlb_single);
45 PATCH_SITE(pv_cpu_ops, wbinvd);
46
47 patch_site:
48 ret = paravirt_patch_insns(ibuf, len, start, end);
49 break;
50
51 default:
52 ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
53 break;
54 }
55#undef PATCH_SITE
56 return ret;
57}
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 6bf1f716909d..1b5464c2434f 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -30,12 +30,12 @@
30#include <linux/spinlock.h> 30#include <linux/spinlock.h>
31#include <linux/string.h> 31#include <linux/string.h>
32#include <linux/dma-mapping.h> 32#include <linux/dma-mapping.h>
33#include <linux/init.h>
34#include <linux/bitops.h> 33#include <linux/bitops.h>
35#include <linux/pci_ids.h> 34#include <linux/pci_ids.h>
36#include <linux/pci.h> 35#include <linux/pci.h>
37#include <linux/delay.h> 36#include <linux/delay.h>
38#include <linux/scatterlist.h> 37#include <linux/scatterlist.h>
38#include <linux/iommu-helper.h>
39#include <asm/gart.h> 39#include <asm/gart.h>
40#include <asm/calgary.h> 40#include <asm/calgary.h>
41#include <asm/tce.h> 41#include <asm/tce.h>
@@ -183,7 +183,7 @@ static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
183 183
184/* enable this to stress test the chip's TCE cache */ 184/* enable this to stress test the chip's TCE cache */
185#ifdef CONFIG_IOMMU_DEBUG 185#ifdef CONFIG_IOMMU_DEBUG
186int debugging __read_mostly = 1; 186static int debugging = 1;
187 187
188static inline unsigned long verify_bit_range(unsigned long* bitmap, 188static inline unsigned long verify_bit_range(unsigned long* bitmap,
189 int expected, unsigned long start, unsigned long end) 189 int expected, unsigned long start, unsigned long end)
@@ -202,7 +202,7 @@ static inline unsigned long verify_bit_range(unsigned long* bitmap,
202 return ~0UL; 202 return ~0UL;
203} 203}
204#else /* debugging is disabled */ 204#else /* debugging is disabled */
205int debugging __read_mostly = 0; 205static int debugging;
206 206
207static inline unsigned long verify_bit_range(unsigned long* bitmap, 207static inline unsigned long verify_bit_range(unsigned long* bitmap,
208 int expected, unsigned long start, unsigned long end) 208 int expected, unsigned long start, unsigned long end)
@@ -261,22 +261,28 @@ static void iommu_range_reserve(struct iommu_table *tbl,
261 spin_unlock_irqrestore(&tbl->it_lock, flags); 261 spin_unlock_irqrestore(&tbl->it_lock, flags);
262} 262}
263 263
264static unsigned long iommu_range_alloc(struct iommu_table *tbl, 264static unsigned long iommu_range_alloc(struct device *dev,
265 unsigned int npages) 265 struct iommu_table *tbl,
266 unsigned int npages)
266{ 267{
267 unsigned long flags; 268 unsigned long flags;
268 unsigned long offset; 269 unsigned long offset;
270 unsigned long boundary_size;
271
272 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
273 PAGE_SIZE) >> PAGE_SHIFT;
269 274
270 BUG_ON(npages == 0); 275 BUG_ON(npages == 0);
271 276
272 spin_lock_irqsave(&tbl->it_lock, flags); 277 spin_lock_irqsave(&tbl->it_lock, flags);
273 278
274 offset = find_next_zero_string(tbl->it_map, tbl->it_hint, 279 offset = iommu_area_alloc(tbl->it_map, tbl->it_size, tbl->it_hint,
275 tbl->it_size, npages); 280 npages, 0, boundary_size, 0);
276 if (offset == ~0UL) { 281 if (offset == ~0UL) {
277 tbl->chip_ops->tce_cache_blast(tbl); 282 tbl->chip_ops->tce_cache_blast(tbl);
278 offset = find_next_zero_string(tbl->it_map, 0, 283
279 tbl->it_size, npages); 284 offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0,
285 npages, 0, boundary_size, 0);
280 if (offset == ~0UL) { 286 if (offset == ~0UL) {
281 printk(KERN_WARNING "Calgary: IOMMU full.\n"); 287 printk(KERN_WARNING "Calgary: IOMMU full.\n");
282 spin_unlock_irqrestore(&tbl->it_lock, flags); 288 spin_unlock_irqrestore(&tbl->it_lock, flags);
@@ -287,7 +293,6 @@ static unsigned long iommu_range_alloc(struct iommu_table *tbl,
287 } 293 }
288 } 294 }
289 295
290 set_bit_string(tbl->it_map, offset, npages);
291 tbl->it_hint = offset + npages; 296 tbl->it_hint = offset + npages;
292 BUG_ON(tbl->it_hint > tbl->it_size); 297 BUG_ON(tbl->it_hint > tbl->it_size);
293 298
@@ -296,13 +301,13 @@ static unsigned long iommu_range_alloc(struct iommu_table *tbl,
296 return offset; 301 return offset;
297} 302}
298 303
299static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *vaddr, 304static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
300 unsigned int npages, int direction) 305 void *vaddr, unsigned int npages, int direction)
301{ 306{
302 unsigned long entry; 307 unsigned long entry;
303 dma_addr_t ret = bad_dma_address; 308 dma_addr_t ret = bad_dma_address;
304 309
305 entry = iommu_range_alloc(tbl, npages); 310 entry = iommu_range_alloc(dev, tbl, npages);
306 311
307 if (unlikely(entry == bad_dma_address)) 312 if (unlikely(entry == bad_dma_address))
308 goto error; 313 goto error;
@@ -355,7 +360,7 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
355 badbit, tbl, dma_addr, entry, npages); 360 badbit, tbl, dma_addr, entry, npages);
356 } 361 }
357 362
358 __clear_bit_string(tbl->it_map, entry, npages); 363 iommu_area_free(tbl->it_map, entry, npages);
359 364
360 spin_unlock_irqrestore(&tbl->it_lock, flags); 365 spin_unlock_irqrestore(&tbl->it_lock, flags);
361} 366}
@@ -439,7 +444,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
439 vaddr = (unsigned long) sg_virt(s); 444 vaddr = (unsigned long) sg_virt(s);
440 npages = num_dma_pages(vaddr, s->length); 445 npages = num_dma_pages(vaddr, s->length);
441 446
442 entry = iommu_range_alloc(tbl, npages); 447 entry = iommu_range_alloc(dev, tbl, npages);
443 if (entry == bad_dma_address) { 448 if (entry == bad_dma_address) {
444 /* makes sure unmap knows to stop */ 449 /* makes sure unmap knows to stop */
445 s->dma_length = 0; 450 s->dma_length = 0;
@@ -477,7 +482,7 @@ static dma_addr_t calgary_map_single(struct device *dev, void *vaddr,
477 npages = num_dma_pages(uaddr, size); 482 npages = num_dma_pages(uaddr, size);
478 483
479 if (translation_enabled(tbl)) 484 if (translation_enabled(tbl))
480 dma_handle = iommu_alloc(tbl, vaddr, npages, direction); 485 dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction);
481 else 486 else
482 dma_handle = virt_to_bus(vaddr); 487 dma_handle = virt_to_bus(vaddr);
483 488
@@ -517,7 +522,7 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
517 522
518 if (translation_enabled(tbl)) { 523 if (translation_enabled(tbl)) {
519 /* set up tces to cover the allocated range */ 524 /* set up tces to cover the allocated range */
520 mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL); 525 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
521 if (mapping == bad_dma_address) 526 if (mapping == bad_dma_address)
522 goto free; 527 goto free;
523 528
@@ -1007,7 +1012,7 @@ static void __init calgary_set_split_completion_timeout(void __iomem *bbar,
1007 readq(target); /* flush */ 1012 readq(target); /* flush */
1008} 1013}
1009 1014
1010static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev) 1015static void __init calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev)
1011{ 1016{
1012 unsigned char busnum = dev->bus->number; 1017 unsigned char busnum = dev->bus->number;
1013 void __iomem *bbar = tbl->bbar; 1018 void __iomem *bbar = tbl->bbar;
@@ -1023,7 +1028,7 @@ static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev)
1023 writel(cpu_to_be32(val), target); 1028 writel(cpu_to_be32(val), target);
1024} 1029}
1025 1030
1026static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev) 1031static void __init calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev)
1027{ 1032{
1028 unsigned char busnum = dev->bus->number; 1033 unsigned char busnum = dev->bus->number;
1029 1034
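
The Calgary changes switch to iommu_area_alloc(), which honours a per-device segment boundary while carving pages out of the allocation bitmap. A simplified, assumption-laden model of such a boundary-aware allocator (range_alloc is not the real helper; it uses one byte per page and a linear scan purely for clarity):

#include <stdio.h>
#include <string.h>

#define TABLE_PAGES 64

static unsigned char bitmap[TABLE_PAGES];   /* one byte per page for simplicity */

/*
 * Find 'npages' consecutive free pages starting at or after 'hint' that do
 * not cross a multiple of 'boundary' pages; returns the start index or -1.
 */
static long range_alloc(unsigned long hint, unsigned long npages,
                        unsigned long boundary)
{
    for (unsigned long start = hint; start + npages <= TABLE_PAGES; start++) {
        /* Reject ranges that straddle a boundary multiple. */
        if (start / boundary != (start + npages - 1) / boundary)
            continue;

        unsigned long i;
        for (i = 0; i < npages; i++)
            if (bitmap[start + i])
                break;
        if (i == npages) {
            memset(&bitmap[start], 1, npages);  /* mark the range as used */
            return (long)start;
        }
    }
    return -1;
}

int main(void)
{
    /* With a 16-page boundary, a 6-page request hinted at 13 must skip to 16. */
    printf("alloc -> %ld\n", range_alloc(13, 6, 16));

    /* A caller retries from 0 when the hinted search fails, as the driver does. */
    long off = range_alloc(60, 8, 16);
    if (off == -1)
        off = range_alloc(0, 8, 16);
    printf("alloc -> %ld\n", off);
    return 0;
}

The two-pass call pattern (hinted search, then retry from zero after a cache blast) mirrors the structure of the updated iommu_range_alloc() above.
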
diff --git a/arch/x86/kernel/pci-dma_64.c b/arch/x86/kernel/pci-dma_64.c
index 5552d23d23c2..a82473d192a3 100644
--- a/arch/x86/kernel/pci-dma_64.c
+++ b/arch/x86/kernel/pci-dma_64.c
@@ -13,7 +13,6 @@
13#include <asm/calgary.h> 13#include <asm/calgary.h>
14 14
15int iommu_merge __read_mostly = 0; 15int iommu_merge __read_mostly = 0;
16EXPORT_SYMBOL(iommu_merge);
17 16
18dma_addr_t bad_dma_address __read_mostly; 17dma_addr_t bad_dma_address __read_mostly;
19EXPORT_SYMBOL(bad_dma_address); 18EXPORT_SYMBOL(bad_dma_address);
@@ -230,7 +229,7 @@ EXPORT_SYMBOL(dma_set_mask);
230 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter 229 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
231 * documentation. 230 * documentation.
232 */ 231 */
233__init int iommu_setup(char *p) 232static __init int iommu_setup(char *p)
234{ 233{
235 iommu_merge = 1; 234 iommu_merge = 1;
236 235
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 06bcba536045..65f6acb025c8 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -1,12 +1,12 @@
1/* 1/*
2 * Dynamic DMA mapping support for AMD Hammer. 2 * Dynamic DMA mapping support for AMD Hammer.
3 * 3 *
4 * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI. 4 * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI.
5 * This allows to use PCI devices that only support 32bit addresses on systems 5 * This allows to use PCI devices that only support 32bit addresses on systems
6 * with more than 4GB. 6 * with more than 4GB.
7 * 7 *
8 * See Documentation/DMA-mapping.txt for the interface specification. 8 * See Documentation/DMA-mapping.txt for the interface specification.
9 * 9 *
10 * Copyright 2002 Andi Kleen, SuSE Labs. 10 * Copyright 2002 Andi Kleen, SuSE Labs.
11 * Subject to the GNU General Public License v2 only. 11 * Subject to the GNU General Public License v2 only.
12 */ 12 */
@@ -25,6 +25,7 @@
25#include <linux/bitops.h> 25#include <linux/bitops.h>
26#include <linux/kdebug.h> 26#include <linux/kdebug.h>
27#include <linux/scatterlist.h> 27#include <linux/scatterlist.h>
28#include <linux/iommu-helper.h>
28#include <asm/atomic.h> 29#include <asm/atomic.h>
29#include <asm/io.h> 30#include <asm/io.h>
30#include <asm/mtrr.h> 31#include <asm/mtrr.h>
@@ -37,23 +38,26 @@
37#include <asm/k8.h> 38#include <asm/k8.h>
38 39
39static unsigned long iommu_bus_base; /* GART remapping area (physical) */ 40static unsigned long iommu_bus_base; /* GART remapping area (physical) */
40static unsigned long iommu_size; /* size of remapping area bytes */ 41static unsigned long iommu_size; /* size of remapping area bytes */
41static unsigned long iommu_pages; /* .. and in pages */ 42static unsigned long iommu_pages; /* .. and in pages */
42 43
43static u32 *iommu_gatt_base; /* Remapping table */ 44static u32 *iommu_gatt_base; /* Remapping table */
44 45
45/* If this is disabled the IOMMU will use an optimized flushing strategy 46/*
46 of only flushing when an mapping is reused. With it true the GART is flushed 47 * If this is disabled the IOMMU will use an optimized flushing strategy
47 for every mapping. Problem is that doing the lazy flush seems to trigger 48 * of only flushing when an mapping is reused. With it true the GART is
48 bugs with some popular PCI cards, in particular 3ware (but has been also 49 * flushed for every mapping. Problem is that doing the lazy flush seems
49 also seen with Qlogic at least). */ 50 * to trigger bugs with some popular PCI cards, in particular 3ware (but
51 * has been also also seen with Qlogic at least).
52 */
50int iommu_fullflush = 1; 53int iommu_fullflush = 1;
51 54
52/* Allocation bitmap for the remapping area */ 55/* Allocation bitmap for the remapping area: */
53static DEFINE_SPINLOCK(iommu_bitmap_lock); 56static DEFINE_SPINLOCK(iommu_bitmap_lock);
54static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */ 57/* Guarded by iommu_bitmap_lock: */
58static unsigned long *iommu_gart_bitmap;
55 59
56static u32 gart_unmapped_entry; 60static u32 gart_unmapped_entry;
57 61
58#define GPTE_VALID 1 62#define GPTE_VALID 1
59#define GPTE_COHERENT 2 63#define GPTE_COHERENT 2
@@ -61,10 +65,10 @@ static u32 gart_unmapped_entry;
61 (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) 65 (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
62#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) 66#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))
63 67
64#define to_pages(addr,size) \ 68#define to_pages(addr, size) \
65 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) 69 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
66 70
67#define EMERGENCY_PAGES 32 /* = 128KB */ 71#define EMERGENCY_PAGES 32 /* = 128KB */
68 72
69#ifdef CONFIG_AGP 73#ifdef CONFIG_AGP
70#define AGPEXTERN extern 74#define AGPEXTERN extern
@@ -77,130 +81,159 @@ AGPEXTERN int agp_memory_reserved;
77AGPEXTERN __u32 *agp_gatt_table; 81AGPEXTERN __u32 *agp_gatt_table;
78 82
79static unsigned long next_bit; /* protected by iommu_bitmap_lock */ 83static unsigned long next_bit; /* protected by iommu_bitmap_lock */
80static int need_flush; /* global flush state. set for each gart wrap */ 84static int need_flush; /* global flush state. set for each gart wrap */
81 85
82static unsigned long alloc_iommu(int size) 86static unsigned long alloc_iommu(struct device *dev, int size)
83{ 87{
84 unsigned long offset, flags; 88 unsigned long offset, flags;
89 unsigned long boundary_size;
90 unsigned long base_index;
91
92 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
93 PAGE_SIZE) >> PAGE_SHIFT;
94 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
95 PAGE_SIZE) >> PAGE_SHIFT;
85 96
86 spin_lock_irqsave(&iommu_bitmap_lock, flags); 97 spin_lock_irqsave(&iommu_bitmap_lock, flags);
87 offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size); 98 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit,
99 size, base_index, boundary_size, 0);
88 if (offset == -1) { 100 if (offset == -1) {
89 need_flush = 1; 101 need_flush = 1;
90 offset = find_next_zero_string(iommu_gart_bitmap,0,iommu_pages,size); 102 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0,
103 size, base_index, boundary_size, 0);
91 } 104 }
92 if (offset != -1) { 105 if (offset != -1) {
93 set_bit_string(iommu_gart_bitmap, offset, size); 106 set_bit_string(iommu_gart_bitmap, offset, size);
94 next_bit = offset+size; 107 next_bit = offset+size;
95 if (next_bit >= iommu_pages) { 108 if (next_bit >= iommu_pages) {
96 next_bit = 0; 109 next_bit = 0;
97 need_flush = 1; 110 need_flush = 1;
98 } 111 }
99 } 112 }
100 if (iommu_fullflush) 113 if (iommu_fullflush)
101 need_flush = 1; 114 need_flush = 1;
102 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 115 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
116
103 return offset; 117 return offset;
104} 118}
105 119
106static void free_iommu(unsigned long offset, int size) 120static void free_iommu(unsigned long offset, int size)
107{ 121{
108 unsigned long flags; 122 unsigned long flags;
123
109 spin_lock_irqsave(&iommu_bitmap_lock, flags); 124 spin_lock_irqsave(&iommu_bitmap_lock, flags);
110 __clear_bit_string(iommu_gart_bitmap, offset, size); 125 iommu_area_free(iommu_gart_bitmap, offset, size);
111 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 126 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
112} 127}
113 128
114/* 129/*
115 * Use global flush state to avoid races with multiple flushers. 130 * Use global flush state to avoid races with multiple flushers.
116 */ 131 */
117static void flush_gart(void) 132static void flush_gart(void)
118{ 133{
119 unsigned long flags; 134 unsigned long flags;
135
120 spin_lock_irqsave(&iommu_bitmap_lock, flags); 136 spin_lock_irqsave(&iommu_bitmap_lock, flags);
121 if (need_flush) { 137 if (need_flush) {
122 k8_flush_garts(); 138 k8_flush_garts();
123 need_flush = 0; 139 need_flush = 0;
124 } 140 }
125 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 141 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
126} 142}
127 143
128#ifdef CONFIG_IOMMU_LEAK 144#ifdef CONFIG_IOMMU_LEAK
129 145
130#define SET_LEAK(x) if (iommu_leak_tab) \ 146#define SET_LEAK(x) \
131 iommu_leak_tab[x] = __builtin_return_address(0); 147 do { \
132#define CLEAR_LEAK(x) if (iommu_leak_tab) \ 148 if (iommu_leak_tab) \
133 iommu_leak_tab[x] = NULL; 149 iommu_leak_tab[x] = __builtin_return_address(0);\
150 } while (0)
151
152#define CLEAR_LEAK(x) \
153 do { \
154 if (iommu_leak_tab) \
155 iommu_leak_tab[x] = NULL; \
156 } while (0)
134 157
135/* Debugging aid for drivers that don't free their IOMMU tables */ 158/* Debugging aid for drivers that don't free their IOMMU tables */
136static void **iommu_leak_tab; 159static void **iommu_leak_tab;
137static int leak_trace; 160static int leak_trace;
138static int iommu_leak_pages = 20; 161static int iommu_leak_pages = 20;
162
139static void dump_leak(void) 163static void dump_leak(void)
140{ 164{
141 int i; 165 int i;
142 static int dump; 166 static int dump;
143 if (dump || !iommu_leak_tab) return; 167
168 if (dump || !iommu_leak_tab)
169 return;
144 dump = 1; 170 dump = 1;
145 show_stack(NULL,NULL); 171 show_stack(NULL, NULL);
146 /* Very crude. dump some from the end of the table too */ 172
147 printk("Dumping %d pages from end of IOMMU:\n", iommu_leak_pages); 173 /* Very crude. dump some from the end of the table too */
148 for (i = 0; i < iommu_leak_pages; i+=2) { 174 printk(KERN_DEBUG "Dumping %d pages from end of IOMMU:\n",
149 printk("%lu: ", iommu_pages-i); 175 iommu_leak_pages);
150 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i]); 176 for (i = 0; i < iommu_leak_pages; i += 2) {
151 printk("%c", (i+1)%2 == 0 ? '\n' : ' '); 177 printk(KERN_DEBUG "%lu: ", iommu_pages-i);
152 } 178 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], 0);
153 printk("\n"); 179 printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' ');
180 }
181 printk(KERN_DEBUG "\n");
154} 182}
155#else 183#else
156#define SET_LEAK(x) 184# define SET_LEAK(x)
157#define CLEAR_LEAK(x) 185# define CLEAR_LEAK(x)
158#endif 186#endif
159 187
160static void iommu_full(struct device *dev, size_t size, int dir) 188static void iommu_full(struct device *dev, size_t size, int dir)
161{ 189{
162 /* 190 /*
163 * Ran out of IOMMU space for this operation. This is very bad. 191 * Ran out of IOMMU space for this operation. This is very bad.
164 * Unfortunately the drivers cannot handle this operation properly. 192 * Unfortunately the drivers cannot handle this operation properly.
165 * Return some non mapped prereserved space in the aperture and 193 * Return some non mapped prereserved space in the aperture and
166 * let the Northbridge deal with it. This will result in garbage 194 * let the Northbridge deal with it. This will result in garbage
167 * in the IO operation. When the size exceeds the prereserved space 195 * in the IO operation. When the size exceeds the prereserved space
168 * memory corruption will occur or random memory will be DMAed 196 * memory corruption will occur or random memory will be DMAed
169 * out. Hopefully no network devices use single mappings that big. 197 * out. Hopefully no network devices use single mappings that big.
170 */ 198 */
171 199
172 printk(KERN_ERR 200 printk(KERN_ERR
173 "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n", 201 "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n",
174 size, dev->bus_id); 202 size, dev->bus_id);
175 203
176 if (size > PAGE_SIZE*EMERGENCY_PAGES) { 204 if (size > PAGE_SIZE*EMERGENCY_PAGES) {
177 if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) 205 if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
178 panic("PCI-DMA: Memory would be corrupted\n"); 206 panic("PCI-DMA: Memory would be corrupted\n");
179 if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) 207 if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL)
180 panic(KERN_ERR "PCI-DMA: Random memory would be DMAed\n"); 208 panic(KERN_ERR
181 } 209 "PCI-DMA: Random memory would be DMAed\n");
182 210 }
183#ifdef CONFIG_IOMMU_LEAK 211#ifdef CONFIG_IOMMU_LEAK
184 dump_leak(); 212 dump_leak();
185#endif 213#endif
186} 214}
187 215
188static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) 216static inline int
189{ 217need_iommu(struct device *dev, unsigned long addr, size_t size)
218{
190 u64 mask = *dev->dma_mask; 219 u64 mask = *dev->dma_mask;
191 int high = addr + size > mask; 220 int high = addr + size > mask;
192 int mmu = high; 221 int mmu = high;
193 if (force_iommu) 222
194 mmu = 1; 223 if (force_iommu)
195 return mmu; 224 mmu = 1;
225
226 return mmu;
196} 227}
197 228
198static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) 229static inline int
199{ 230nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
231{
200 u64 mask = *dev->dma_mask; 232 u64 mask = *dev->dma_mask;
201 int high = addr + size > mask; 233 int high = addr + size > mask;
202 int mmu = high; 234 int mmu = high;
203 return mmu; 235
236 return mmu;
204} 237}
205 238
206/* Map a single continuous physical area into the IOMMU. 239/* Map a single continuous physical area into the IOMMU.
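
The GART code above keeps the lazy-flush strategy described in the reflowed comment: remember that a flush is needed when the allocator wraps (or always, with iommu_fullflush) and perform it once, later. A compact sketch of that bookkeeping, with invented names (alloc_slot, flush_if_needed) and a trivial hw_flush() stand-in:

#include <stdio.h>

static int need_flush;                  /* set when a mapping slot may be reused */
static int fullflush = 0;               /* 1: flush on every mapping, 0: lazy    */
static unsigned long next_slot;
static const unsigned long total_slots = 8;

static void hw_flush(void)
{
    printf("  GART/TLB flush\n");       /* stands in for k8_flush_garts() */
}

static unsigned long alloc_slot(void)
{
    unsigned long slot = next_slot++;

    if (next_slot >= total_slots) {
        next_slot = 0;
        need_flush = 1;                 /* wrapped: old translations may be reused */
    }
    if (fullflush)
        need_flush = 1;
    return slot;
}

static void flush_if_needed(void)
{
    if (need_flush) {
        hw_flush();
        need_flush = 0;
    }
}

int main(void)
{
    for (int i = 0; i < 10; i++) {
        printf("map -> slot %lu\n", alloc_slot());
        flush_if_needed();              /* with lazy flushing only the wrap triggers it */
    }
    return 0;
}

With fullflush set to 1 every mapping flushes, which is the safe mode the comment recommends for hardware that misbehaves under the lazy scheme.
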
@@ -208,13 +241,14 @@ static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t
208 */ 241 */
209static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, 242static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
210 size_t size, int dir) 243 size_t size, int dir)
211{ 244{
212 unsigned long npages = to_pages(phys_mem, size); 245 unsigned long npages = to_pages(phys_mem, size);
213 unsigned long iommu_page = alloc_iommu(npages); 246 unsigned long iommu_page = alloc_iommu(dev, npages);
214 int i; 247 int i;
248
215 if (iommu_page == -1) { 249 if (iommu_page == -1) {
216 if (!nonforced_iommu(dev, phys_mem, size)) 250 if (!nonforced_iommu(dev, phys_mem, size))
217 return phys_mem; 251 return phys_mem;
218 if (panic_on_overflow) 252 if (panic_on_overflow)
219 panic("dma_map_area overflow %lu bytes\n", size); 253 panic("dma_map_area overflow %lu bytes\n", size);
220 iommu_full(dev, size, dir); 254 iommu_full(dev, size, dir);
@@ -229,35 +263,39 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
229 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); 263 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
230} 264}
231 265
232static dma_addr_t gart_map_simple(struct device *dev, char *buf, 266static dma_addr_t
233 size_t size, int dir) 267gart_map_simple(struct device *dev, char *buf, size_t size, int dir)
234{ 268{
235 dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir); 269 dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir);
270
236 flush_gart(); 271 flush_gart();
272
237 return map; 273 return map;
238} 274}
239 275
240/* Map a single area into the IOMMU */ 276/* Map a single area into the IOMMU */
241static dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir) 277static dma_addr_t
278gart_map_single(struct device *dev, void *addr, size_t size, int dir)
242{ 279{
243 unsigned long phys_mem, bus; 280 unsigned long phys_mem, bus;
244 281
245 if (!dev) 282 if (!dev)
246 dev = &fallback_dev; 283 dev = &fallback_dev;
247 284
248 phys_mem = virt_to_phys(addr); 285 phys_mem = virt_to_phys(addr);
249 if (!need_iommu(dev, phys_mem, size)) 286 if (!need_iommu(dev, phys_mem, size))
250 return phys_mem; 287 return phys_mem;
251 288
252 bus = gart_map_simple(dev, addr, size, dir); 289 bus = gart_map_simple(dev, addr, size, dir);
253 return bus; 290
291 return bus;
254} 292}
255 293
256/* 294/*
257 * Free a DMA mapping. 295 * Free a DMA mapping.
258 */ 296 */
259static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, 297static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
260 size_t size, int direction) 298 size_t size, int direction)
261{ 299{
262 unsigned long iommu_page; 300 unsigned long iommu_page;
263 int npages; 301 int npages;
@@ -266,6 +304,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
266 if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || 304 if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE ||
267 dma_addr >= iommu_bus_base + iommu_size) 305 dma_addr >= iommu_bus_base + iommu_size)
268 return; 306 return;
307
269 iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; 308 iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
270 npages = to_pages(dma_addr, size); 309 npages = to_pages(dma_addr, size);
271 for (i = 0; i < npages; i++) { 310 for (i = 0; i < npages; i++) {
@@ -278,7 +317,8 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
278/* 317/*
279 * Wrapper for pci_unmap_single working with scatterlists. 318 * Wrapper for pci_unmap_single working with scatterlists.
280 */ 319 */
281static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) 320static void
321gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
282{ 322{
283 struct scatterlist *s; 323 struct scatterlist *s;
284 int i; 324 int i;
@@ -303,12 +343,13 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
303 343
304 for_each_sg(sg, s, nents, i) { 344 for_each_sg(sg, s, nents, i) {
305 unsigned long addr = sg_phys(s); 345 unsigned long addr = sg_phys(s);
306 if (nonforced_iommu(dev, addr, s->length)) { 346
347 if (nonforced_iommu(dev, addr, s->length)) {
307 addr = dma_map_area(dev, addr, s->length, dir); 348 addr = dma_map_area(dev, addr, s->length, dir);
308 if (addr == bad_dma_address) { 349 if (addr == bad_dma_address) {
309 if (i > 0) 350 if (i > 0)
310 gart_unmap_sg(dev, sg, i, dir); 351 gart_unmap_sg(dev, sg, i, dir);
311 nents = 0; 352 nents = 0;
312 sg[0].dma_length = 0; 353 sg[0].dma_length = 0;
313 break; 354 break;
314 } 355 }
@@ -317,15 +358,17 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
317 s->dma_length = s->length; 358 s->dma_length = s->length;
318 } 359 }
319 flush_gart(); 360 flush_gart();
361
320 return nents; 362 return nents;
321} 363}
322 364
323/* Map multiple scatterlist entries contiguously into the first. */ 365/* Map multiple scatterlist entries contiguously into the first. */
324static int __dma_map_cont(struct scatterlist *start, int nelems, 366static int __dma_map_cont(struct device *dev, struct scatterlist *start,
325 struct scatterlist *sout, unsigned long pages) 367 int nelems, struct scatterlist *sout,
368 unsigned long pages)
326{ 369{
327 unsigned long iommu_start = alloc_iommu(pages); 370 unsigned long iommu_start = alloc_iommu(dev, pages);
328 unsigned long iommu_page = iommu_start; 371 unsigned long iommu_page = iommu_start;
329 struct scatterlist *s; 372 struct scatterlist *s;
330 int i; 373 int i;
331 374
@@ -335,32 +378,33 @@ static int __dma_map_cont(struct scatterlist *start, int nelems,
335 for_each_sg(start, s, nelems, i) { 378 for_each_sg(start, s, nelems, i) {
336 unsigned long pages, addr; 379 unsigned long pages, addr;
337 unsigned long phys_addr = s->dma_address; 380 unsigned long phys_addr = s->dma_address;
338 381
339 BUG_ON(s != start && s->offset); 382 BUG_ON(s != start && s->offset);
340 if (s == start) { 383 if (s == start) {
341 sout->dma_address = iommu_bus_base; 384 sout->dma_address = iommu_bus_base;
342 sout->dma_address += iommu_page*PAGE_SIZE + s->offset; 385 sout->dma_address += iommu_page*PAGE_SIZE + s->offset;
343 sout->dma_length = s->length; 386 sout->dma_length = s->length;
344 } else { 387 } else {
345 sout->dma_length += s->length; 388 sout->dma_length += s->length;
346 } 389 }
347 390
348 addr = phys_addr; 391 addr = phys_addr;
349 pages = to_pages(s->offset, s->length); 392 pages = to_pages(s->offset, s->length);
350 while (pages--) { 393 while (pages--) {
351 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); 394 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
352 SET_LEAK(iommu_page); 395 SET_LEAK(iommu_page);
353 addr += PAGE_SIZE; 396 addr += PAGE_SIZE;
354 iommu_page++; 397 iommu_page++;
355 } 398 }
356 } 399 }
357 BUG_ON(iommu_page - iommu_start != pages); 400 BUG_ON(iommu_page - iommu_start != pages);
401
358 return 0; 402 return 0;
359} 403}
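
The bus address handed back for such a merged run is simply the aperture bus base plus the first allocated GART slot times the page size plus the first element's in-page offset. A worked example of that arithmetic, with made-up base and slot values:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
        uint64_t      iommu_bus_base = 0xd0000000;  /* assumed aperture bus address    */
        unsigned long iommu_page     = 37;          /* assumed slot from alloc_iommu() */
        unsigned int  offset         = 0x234;       /* offset of the first sg element  */

        uint64_t dma = iommu_bus_base + iommu_page * PAGE_SIZE + offset;
        printf("dma_address = %#llx\n", (unsigned long long)dma);   /* 0xd0025234 */
        return 0;
}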
360 404
361static inline int dma_map_cont(struct scatterlist *start, int nelems, 405static inline int
362 struct scatterlist *sout, 406dma_map_cont(struct device *dev, struct scatterlist *start, int nelems,
363 unsigned long pages, int need) 407 struct scatterlist *sout, unsigned long pages, int need)
364{ 408{
365 if (!need) { 409 if (!need) {
366 BUG_ON(nelems != 1); 410 BUG_ON(nelems != 1);
@@ -368,24 +412,23 @@ static inline int dma_map_cont(struct scatterlist *start, int nelems,
368 sout->dma_length = start->length; 412 sout->dma_length = start->length;
369 return 0; 413 return 0;
370 } 414 }
371 return __dma_map_cont(start, nelems, sout, pages); 415 return __dma_map_cont(dev, start, nelems, sout, pages);
372} 416}
373 417
374/* 418/*
375 * DMA map all entries in a scatterlist. 419 * DMA map all entries in a scatterlist.
376 * Merge chunks that have page-aligned sizes into a contiguous mapping. 420 * Merge chunks that have page-aligned sizes into a contiguous mapping.
377 */ 421 */
378static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, 422static int
379 int dir) 423gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
380{ 424{
381 int i;
382 int out;
383 int start;
384 unsigned long pages = 0;
385 int need = 0, nextneed;
386 struct scatterlist *s, *ps, *start_sg, *sgmap; 425 struct scatterlist *s, *ps, *start_sg, *sgmap;
426 int need = 0, nextneed, i, out, start;
427 unsigned long pages = 0;
428 unsigned int seg_size;
429 unsigned int max_seg_size;
387 430
388 if (nents == 0) 431 if (nents == 0)
389 return 0; 432 return 0;
390 433
391 if (!dev) 434 if (!dev)
@@ -394,24 +437,32 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
394 out = 0; 437 out = 0;
395 start = 0; 438 start = 0;
396 start_sg = sgmap = sg; 439 start_sg = sgmap = sg;
440 seg_size = 0;
441 max_seg_size = dma_get_max_seg_size(dev);
397 ps = NULL; /* shut up gcc */ 442 ps = NULL; /* shut up gcc */
398 for_each_sg(sg, s, nents, i) { 443 for_each_sg(sg, s, nents, i) {
399 dma_addr_t addr = sg_phys(s); 444 dma_addr_t addr = sg_phys(s);
445
400 s->dma_address = addr; 446 s->dma_address = addr;
401 BUG_ON(s->length == 0); 447 BUG_ON(s->length == 0);
402 448
403 nextneed = need_iommu(dev, addr, s->length); 449 nextneed = need_iommu(dev, addr, s->length);
404 450
405 /* Handle the previous not-yet-processed entries */ 451 /* Handle the previous not-yet-processed entries */

406 if (i > start) { 452 if (i > start) {
407 /* Can only merge when the last chunk ends on a page 453 /*
408 boundary and the new one doesn't have an offset. */ 454 * Can only merge when the last chunk ends on a
455 * page boundary and the new one doesn't have an
456 * offset.
457 */
409 if (!iommu_merge || !nextneed || !need || s->offset || 458 if (!iommu_merge || !nextneed || !need || s->offset ||
459 (s->length + seg_size > max_seg_size) ||
410 (ps->offset + ps->length) % PAGE_SIZE) { 460 (ps->offset + ps->length) % PAGE_SIZE) {
411 if (dma_map_cont(start_sg, i - start, sgmap, 461 if (dma_map_cont(dev, start_sg, i - start,
412 pages, need) < 0) 462 sgmap, pages, need) < 0)
413 goto error; 463 goto error;
414 out++; 464 out++;
465 seg_size = 0;
415 sgmap = sg_next(sgmap); 466 sgmap = sg_next(sgmap);
416 pages = 0; 467 pages = 0;
417 start = i; 468 start = i;
@@ -419,11 +470,12 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
419 } 470 }
420 } 471 }
421 472
473 seg_size += s->length;
422 need = nextneed; 474 need = nextneed;
423 pages += to_pages(s->offset, s->length); 475 pages += to_pages(s->offset, s->length);
424 ps = s; 476 ps = s;
425 } 477 }
426 if (dma_map_cont(start_sg, i - start, sgmap, pages, need) < 0) 478 if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0)
427 goto error; 479 goto error;
428 out++; 480 out++;
429 flush_gart(); 481 flush_gart();
@@ -436,6 +488,7 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
436error: 488error:
437 flush_gart(); 489 flush_gart();
438 gart_unmap_sg(dev, sg, out, dir); 490 gart_unmap_sg(dev, sg, out, dir);
491
439 /* When it was forced or merged try again in a dumb way */ 492 /* When it was forced or merged try again in a dumb way */
440 if (force_iommu || iommu_merge) { 493 if (force_iommu || iommu_merge) {
441 out = dma_map_sg_nonforce(dev, sg, nents, dir); 494 out = dma_map_sg_nonforce(dev, sg, nents, dir);
@@ -444,64 +497,68 @@ error:
444 } 497 }
445 if (panic_on_overflow) 498 if (panic_on_overflow)
446 panic("dma_map_sg: overflow on %lu pages\n", pages); 499 panic("dma_map_sg: overflow on %lu pages\n", pages);
500
447 iommu_full(dev, pages << PAGE_SHIFT, dir); 501 iommu_full(dev, pages << PAGE_SHIFT, dir);
448 for_each_sg(sg, s, nents, i) 502 for_each_sg(sg, s, nents, i)
449 s->dma_address = bad_dma_address; 503 s->dma_address = bad_dma_address;
450 return 0; 504 return 0;
451} 505}
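
One thing this hunk adds to gart_map_sg() is the seg_size/max_seg_size bookkeeping: a run of entries may only keep growing while the merged length stays within dma_get_max_seg_size(dev). A standalone restatement of the complete merge predicate as it reads after the patch, with plain booleans standing in for the kernel flags:

#include <stdbool.h>

#define PAGE_SIZE 4096UL

struct seg {
        unsigned long offset;           /* offset into the first page */
        unsigned long length;
};

/*
 * The next element can be merged into the running mapping only if merging
 * is enabled, both the previous and the next element need the IOMMU, the
 * next element starts at a page boundary, the previous one ended on a page
 * boundary, and the merged length stays within the device segment limit.
 */
static bool can_merge(bool iommu_merge, bool need, bool nextneed,
                      const struct seg *prev, const struct seg *next,
                      unsigned long seg_size, unsigned long max_seg_size)
{
        if (!iommu_merge || !need || !nextneed)
                return false;
        if (next->offset)
                return false;
        if (next->length + seg_size > max_seg_size)
                return false;
        if ((prev->offset + prev->length) % PAGE_SIZE)
                return false;
        return true;
}

int main(void)
{
        struct seg prev = { .offset = 0, .length = PAGE_SIZE };
        struct seg next = { .offset = 0, .length = 2 * PAGE_SIZE };

        /* 12 KB merged against a 64 KB device limit: the merge is allowed. */
        return !can_merge(true, true, true, &prev, &next, PAGE_SIZE, 65536);
}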
452 506
453static int no_agp; 507static int no_agp;
454 508
455static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) 509static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
456{ 510{
457 unsigned long a; 511 unsigned long a;
458 if (!iommu_size) { 512
459 iommu_size = aper_size; 513 if (!iommu_size) {
460 if (!no_agp) 514 iommu_size = aper_size;
461 iommu_size /= 2; 515 if (!no_agp)
462 } 516 iommu_size /= 2;
463 517 }
464 a = aper + iommu_size; 518
465 iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a; 519 a = aper + iommu_size;
466 520 iommu_size -= round_up(a, PMD_PAGE_SIZE) - a;
467 if (iommu_size < 64*1024*1024) 521
522 if (iommu_size < 64*1024*1024) {
468 printk(KERN_WARNING 523 printk(KERN_WARNING
469 "PCI-DMA: Warning: Small IOMMU %luMB. Consider increasing the AGP aperture in BIOS\n",iommu_size>>20); 524 "PCI-DMA: Warning: Small IOMMU %luMB."
470 525 " Consider increasing the AGP aperture in BIOS\n",
526 iommu_size >> 20);
527 }
528
471 return iommu_size; 529 return iommu_size;
472} 530}
473 531
474static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) 532static __init unsigned read_aperture(struct pci_dev *dev, u32 *size)
475{ 533{
476 unsigned aper_size = 0, aper_base_32; 534 unsigned aper_size = 0, aper_base_32, aper_order;
477 u64 aper_base; 535 u64 aper_base;
478 unsigned aper_order;
479 536
480 pci_read_config_dword(dev, 0x94, &aper_base_32); 537 pci_read_config_dword(dev, 0x94, &aper_base_32);
481 pci_read_config_dword(dev, 0x90, &aper_order); 538 pci_read_config_dword(dev, 0x90, &aper_order);
482 aper_order = (aper_order >> 1) & 7; 539 aper_order = (aper_order >> 1) & 7;
483 540
484 aper_base = aper_base_32 & 0x7fff; 541 aper_base = aper_base_32 & 0x7fff;
485 aper_base <<= 25; 542 aper_base <<= 25;
486 543
487 aper_size = (32 * 1024 * 1024) << aper_order; 544 aper_size = (32 * 1024 * 1024) << aper_order;
488 if (aper_base + aper_size > 0x100000000UL || !aper_size) 545 if (aper_base + aper_size > 0x100000000UL || !aper_size)
489 aper_base = 0; 546 aper_base = 0;
490 547
491 *size = aper_size; 548 *size = aper_size;
492 return aper_base; 549 return aper_base;
493} 550}
494 551
495/* 552/*
496 * Private Northbridge GATT initialization in case we cannot use the 553 * Private Northbridge GATT initialization in case we cannot use the
497 * AGP driver for some reason. 554 * AGP driver for some reason.
498 */ 555 */
499static __init int init_k8_gatt(struct agp_kern_info *info) 556static __init int init_k8_gatt(struct agp_kern_info *info)
500{ 557{
558 unsigned aper_size, gatt_size, new_aper_size;
559 unsigned aper_base, new_aper_base;
501 struct pci_dev *dev; 560 struct pci_dev *dev;
502 void *gatt; 561 void *gatt;
503 unsigned aper_base, new_aper_base;
504 unsigned aper_size, gatt_size, new_aper_size;
505 int i; 562 int i;
506 563
507 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); 564 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
@@ -509,75 +566,75 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
509 dev = NULL; 566 dev = NULL;
510 for (i = 0; i < num_k8_northbridges; i++) { 567 for (i = 0; i < num_k8_northbridges; i++) {
511 dev = k8_northbridges[i]; 568 dev = k8_northbridges[i];
512 new_aper_base = read_aperture(dev, &new_aper_size); 569 new_aper_base = read_aperture(dev, &new_aper_size);
513 if (!new_aper_base) 570 if (!new_aper_base)
514 goto nommu; 571 goto nommu;
515 572
516 if (!aper_base) { 573 if (!aper_base) {
517 aper_size = new_aper_size; 574 aper_size = new_aper_size;
518 aper_base = new_aper_base; 575 aper_base = new_aper_base;
519 } 576 }
520 if (aper_size != new_aper_size || aper_base != new_aper_base) 577 if (aper_size != new_aper_size || aper_base != new_aper_base)
521 goto nommu; 578 goto nommu;
522 } 579 }
523 if (!aper_base) 580 if (!aper_base)
524 goto nommu; 581 goto nommu;
525 info->aper_base = aper_base; 582 info->aper_base = aper_base;
526 info->aper_size = aper_size>>20; 583 info->aper_size = aper_size >> 20;
527 584
528 gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); 585 gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32);
529 gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); 586 gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size));
530 if (!gatt) 587 if (!gatt)
531 panic("Cannot allocate GATT table"); 588 panic("Cannot allocate GATT table");
532 if (change_page_attr_addr((unsigned long)gatt, gatt_size >> PAGE_SHIFT, PAGE_KERNEL_NOCACHE)) 589 if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT))
533 panic("Could not set GART PTEs to uncacheable pages"); 590 panic("Could not set GART PTEs to uncacheable pages");
534 global_flush_tlb();
535 591
536 memset(gatt, 0, gatt_size); 592 memset(gatt, 0, gatt_size);
537 agp_gatt_table = gatt; 593 agp_gatt_table = gatt;
538 594
539 for (i = 0; i < num_k8_northbridges; i++) { 595 for (i = 0; i < num_k8_northbridges; i++) {
540 u32 ctl; 596 u32 gatt_reg;
541 u32 gatt_reg; 597 u32 ctl;
542 598
543 dev = k8_northbridges[i]; 599 dev = k8_northbridges[i];
544 gatt_reg = __pa(gatt) >> 12; 600 gatt_reg = __pa(gatt) >> 12;
545 gatt_reg <<= 4; 601 gatt_reg <<= 4;
546 pci_write_config_dword(dev, 0x98, gatt_reg); 602 pci_write_config_dword(dev, 0x98, gatt_reg);
547 pci_read_config_dword(dev, 0x90, &ctl); 603 pci_read_config_dword(dev, 0x90, &ctl);
548 604
549 ctl |= 1; 605 ctl |= 1;
550 ctl &= ~((1<<4) | (1<<5)); 606 ctl &= ~((1<<4) | (1<<5));
551 607
552 pci_write_config_dword(dev, 0x90, ctl); 608 pci_write_config_dword(dev, 0x90, ctl);
553 } 609 }
554 flush_gart(); 610 flush_gart();
555 611
556 printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10); 612 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n",
613 aper_base, aper_size>>10);
557 return 0; 614 return 0;
558 615
559 nommu: 616 nommu:
560 /* Should not happen anymore */ 617 /* Should not happen anymore */
561 printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n" 618 printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n"
562 KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.\n"); 619 KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.\n");
563 return -1; 620 return -1;
564} 621}
565 622
566extern int agp_amd64_init(void); 623extern int agp_amd64_init(void);
567 624
568static const struct dma_mapping_ops gart_dma_ops = { 625static const struct dma_mapping_ops gart_dma_ops = {
569 .mapping_error = NULL, 626 .mapping_error = NULL,
570 .map_single = gart_map_single, 627 .map_single = gart_map_single,
571 .map_simple = gart_map_simple, 628 .map_simple = gart_map_simple,
572 .unmap_single = gart_unmap_single, 629 .unmap_single = gart_unmap_single,
573 .sync_single_for_cpu = NULL, 630 .sync_single_for_cpu = NULL,
574 .sync_single_for_device = NULL, 631 .sync_single_for_device = NULL,
575 .sync_single_range_for_cpu = NULL, 632 .sync_single_range_for_cpu = NULL,
576 .sync_single_range_for_device = NULL, 633 .sync_single_range_for_device = NULL,
577 .sync_sg_for_cpu = NULL, 634 .sync_sg_for_cpu = NULL,
578 .sync_sg_for_device = NULL, 635 .sync_sg_for_device = NULL,
579 .map_sg = gart_map_sg, 636 .map_sg = gart_map_sg,
580 .unmap_sg = gart_unmap_sg, 637 .unmap_sg = gart_unmap_sg,
581}; 638};
582 639
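
gart_dma_ops is only a dispatch table; once gart_iommu_init() points the global dma_ops at it, generic DMA mapping calls indirect through these entries. A much-simplified model of that pattern (not the real struct dma_mapping_ops layout, and the stub does no remapping):

#include <stdio.h>
#include <stddef.h>

/* Simplified model of a dma_mapping_ops-style dispatch table. */
struct mapping_ops {
        unsigned long (*map_single)(void *addr, size_t size);
};

static unsigned long gart_map_single_stub(void *addr, size_t size)
{
        (void)size;
        return (unsigned long)addr;     /* stand-in for the real remapping */
}

static const struct mapping_ops gart_ops = {
        .map_single = gart_map_single_stub,
};

static const struct mapping_ops *dma_ops = &gart_ops;

int main(void)
{
        char buf[64];
        printf("%#lx\n", dma_ops->map_single(buf, sizeof(buf)));
        return 0;
}
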
583void gart_iommu_shutdown(void) 640void gart_iommu_shutdown(void)
@@ -588,23 +645,23 @@ void gart_iommu_shutdown(void)
588 if (no_agp && (dma_ops != &gart_dma_ops)) 645 if (no_agp && (dma_ops != &gart_dma_ops))
589 return; 646 return;
590 647
591 for (i = 0; i < num_k8_northbridges; i++) { 648 for (i = 0; i < num_k8_northbridges; i++) {
592 u32 ctl; 649 u32 ctl;
593 650
594 dev = k8_northbridges[i]; 651 dev = k8_northbridges[i];
595 pci_read_config_dword(dev, 0x90, &ctl); 652 pci_read_config_dword(dev, 0x90, &ctl);
596 653
597 ctl &= ~1; 654 ctl &= ~1;
598 655
599 pci_write_config_dword(dev, 0x90, ctl); 656 pci_write_config_dword(dev, 0x90, ctl);
600 } 657 }
601} 658}
602 659
603void __init gart_iommu_init(void) 660void __init gart_iommu_init(void)
604{ 661{
605 struct agp_kern_info info; 662 struct agp_kern_info info;
606 unsigned long aper_size;
607 unsigned long iommu_start; 663 unsigned long iommu_start;
664 unsigned long aper_size;
608 unsigned long scratch; 665 unsigned long scratch;
609 long i; 666 long i;
610 667
@@ -614,14 +671,14 @@ void __init gart_iommu_init(void)
614 } 671 }
615 672
616#ifndef CONFIG_AGP_AMD64 673#ifndef CONFIG_AGP_AMD64
617 no_agp = 1; 674 no_agp = 1;
618#else 675#else
619 /* Makefile puts PCI initialization via subsys_initcall first. */ 676 /* Makefile puts PCI initialization via subsys_initcall first. */
620 /* Add other K8 AGP bridge drivers here */ 677 /* Add other K8 AGP bridge drivers here */
621 no_agp = no_agp || 678 no_agp = no_agp ||
622 (agp_amd64_init() < 0) || 679 (agp_amd64_init() < 0) ||
623 (agp_copy_info(agp_bridge, &info) < 0); 680 (agp_copy_info(agp_bridge, &info) < 0);
624#endif 681#endif
625 682
626 if (swiotlb) 683 if (swiotlb)
627 return; 684 return;
@@ -643,77 +700,79 @@ void __init gart_iommu_init(void)
643 } 700 }
644 701
645 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); 702 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
646 aper_size = info.aper_size * 1024 * 1024; 703 aper_size = info.aper_size * 1024 * 1024;
647 iommu_size = check_iommu_size(info.aper_base, aper_size); 704 iommu_size = check_iommu_size(info.aper_base, aper_size);
648 iommu_pages = iommu_size >> PAGE_SHIFT; 705 iommu_pages = iommu_size >> PAGE_SHIFT;
649 706
650 iommu_gart_bitmap = (void*)__get_free_pages(GFP_KERNEL, 707 iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL,
651 get_order(iommu_pages/8)); 708 get_order(iommu_pages/8));
652 if (!iommu_gart_bitmap) 709 if (!iommu_gart_bitmap)
653 panic("Cannot allocate iommu bitmap\n"); 710 panic("Cannot allocate iommu bitmap\n");
654 memset(iommu_gart_bitmap, 0, iommu_pages/8); 711 memset(iommu_gart_bitmap, 0, iommu_pages/8);
655 712
656#ifdef CONFIG_IOMMU_LEAK 713#ifdef CONFIG_IOMMU_LEAK
657 if (leak_trace) { 714 if (leak_trace) {
658 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, 715 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL,
659 get_order(iommu_pages*sizeof(void *))); 716 get_order(iommu_pages*sizeof(void *)));
660 if (iommu_leak_tab) 717 if (iommu_leak_tab)
661 memset(iommu_leak_tab, 0, iommu_pages * 8); 718 memset(iommu_leak_tab, 0, iommu_pages * 8);
662 else 719 else
663 printk("PCI-DMA: Cannot allocate leak trace area\n"); 720 printk(KERN_DEBUG
664 } 721 "PCI-DMA: Cannot allocate leak trace area\n");
722 }
665#endif 723#endif
666 724
667 /* 725 /*
668 * Out of IOMMU space handling. 726 * Out of IOMMU space handling.
669 * Reserve some invalid pages at the beginning of the GART. 727 * Reserve some invalid pages at the beginning of the GART.
670 */ 728 */
671 set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); 729 set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
672 730
673 agp_memory_reserved = iommu_size; 731 agp_memory_reserved = iommu_size;
674 printk(KERN_INFO 732 printk(KERN_INFO
675 "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", 733 "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
676 iommu_size>>20); 734 iommu_size >> 20);
677 735
678 iommu_start = aper_size - iommu_size; 736 iommu_start = aper_size - iommu_size;
679 iommu_bus_base = info.aper_base + iommu_start; 737 iommu_bus_base = info.aper_base + iommu_start;
680 bad_dma_address = iommu_bus_base; 738 bad_dma_address = iommu_bus_base;
681 iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); 739 iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
682 740
683 /* 741 /*
684 * Unmap the IOMMU part of the GART. The alias of the page is 742 * Unmap the IOMMU part of the GART. The alias of the page is
685 * always mapped with cache enabled and there is no full cache 743 * always mapped with cache enabled and there is no full cache
686 * coherency across the GART remapping. The unmapping avoids 744 * coherency across the GART remapping. The unmapping avoids
687 * automatic prefetches from the CPU allocating cache lines in 745 * automatic prefetches from the CPU allocating cache lines in
688 * there. All CPU accesses are done via the direct mapping to 746 * there. All CPU accesses are done via the direct mapping to
689 * the backing memory. The GART address is only used by PCI 747 * the backing memory. The GART address is only used by PCI
690 * devices. 748 * devices.
691 */ 749 */
692 clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size); 750 set_memory_np((unsigned long)__va(iommu_bus_base),
751 iommu_size >> PAGE_SHIFT);
693 752
694 /* 753 /*
695 * Try to work around a bug (thanks to BenH) 754 * Try to work around a bug (thanks to BenH)
696 * Set unmapped entries to a scratch page instead of 0. 755 * Set unmapped entries to a scratch page instead of 0.
697 * Any prefetches that hit unmapped entries won't get a bus abort 756 * Any prefetches that hit unmapped entries won't get a bus abort
698 * then. 757 * then.
699 */ 758 */
700 scratch = get_zeroed_page(GFP_KERNEL); 759 scratch = get_zeroed_page(GFP_KERNEL);
701 if (!scratch) 760 if (!scratch)
702 panic("Cannot allocate iommu scratch page"); 761 panic("Cannot allocate iommu scratch page");
703 gart_unmapped_entry = GPTE_ENCODE(__pa(scratch)); 762 gart_unmapped_entry = GPTE_ENCODE(__pa(scratch));
704 for (i = EMERGENCY_PAGES; i < iommu_pages; i++) 763 for (i = EMERGENCY_PAGES; i < iommu_pages; i++)
705 iommu_gatt_base[i] = gart_unmapped_entry; 764 iommu_gatt_base[i] = gart_unmapped_entry;
706 765
707 flush_gart(); 766 flush_gart();
708 dma_ops = &gart_dma_ops; 767 dma_ops = &gart_dma_ops;
709} 768}
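
For a feel of the numbers in the init path above: assuming a 64 MB aperture of which half is handed to the IOMMU (the default when an AGP driver owns the other half), the derived quantities come out as follows.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
        uint64_t aper_base  = 0xa0000000;        /* assumed aperture base       */
        uint64_t aper_size  = 64ULL << 20;       /* 64 MB aperture              */
        uint64_t iommu_size = 32ULL << 20;       /* share handed to the IOMMU   */

        uint64_t iommu_pages    = iommu_size >> PAGE_SHIFT;     /* 8192 GART entries       */
        uint64_t bitmap_bytes   = iommu_pages / 8;              /* 1 KB allocation bitmap  */
        uint64_t iommu_start    = aper_size - iommu_size;       /* top half of the aperture */
        uint64_t iommu_bus_base = aper_base + iommu_start;      /* 0xa2000000              */

        printf("pages=%llu bitmap=%lluB start=%#llx bus_base=%#llx\n",
               (unsigned long long)iommu_pages, (unsigned long long)bitmap_bytes,
               (unsigned long long)iommu_start, (unsigned long long)iommu_bus_base);
        return 0;
}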
710 769
711void __init gart_parse_options(char *p) 770void __init gart_parse_options(char *p)
712{ 771{
713 int arg; 772 int arg;
714 773
715#ifdef CONFIG_IOMMU_LEAK 774#ifdef CONFIG_IOMMU_LEAK
716 if (!strncmp(p,"leak",4)) { 775 if (!strncmp(p, "leak", 4)) {
717 leak_trace = 1; 776 leak_trace = 1;
718 p += 4; 777 p += 4;
719 if (*p == '=') ++p; 778 if (*p == '=') ++p;
@@ -723,18 +782,18 @@ void __init gart_parse_options(char *p)
723#endif 782#endif
724 if (isdigit(*p) && get_option(&p, &arg)) 783 if (isdigit(*p) && get_option(&p, &arg))
725 iommu_size = arg; 784 iommu_size = arg;
726 if (!strncmp(p, "fullflush",8)) 785 if (!strncmp(p, "fullflush", 8))
727 iommu_fullflush = 1; 786 iommu_fullflush = 1;
728 if (!strncmp(p, "nofullflush",11)) 787 if (!strncmp(p, "nofullflush", 11))
729 iommu_fullflush = 0; 788 iommu_fullflush = 0;
730 if (!strncmp(p,"noagp",5)) 789 if (!strncmp(p, "noagp", 5))
731 no_agp = 1; 790 no_agp = 1;
732 if (!strncmp(p, "noaperture",10)) 791 if (!strncmp(p, "noaperture", 10))
733 fix_aperture = 0; 792 fix_aperture = 0;
734 /* duplicated from pci-dma.c */ 793 /* duplicated from pci-dma.c */
735 if (!strncmp(p,"force",5)) 794 if (!strncmp(p, "force", 5))
736 gart_iommu_aperture_allowed = 1; 795 gart_iommu_aperture_allowed = 1;
737 if (!strncmp(p,"allowed",7)) 796 if (!strncmp(p, "allowed", 7))
738 gart_iommu_aperture_allowed = 1; 797 gart_iommu_aperture_allowed = 1;
739 if (!strncmp(p, "memaper", 7)) { 798 if (!strncmp(p, "memaper", 7)) {
740 fallback_aper_force = 1; 799 fallback_aper_force = 1;
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 102866d729a5..82a0a674a003 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -10,7 +10,6 @@
10#include <asm/dma.h> 10#include <asm/dma.h>
11 11
12int swiotlb __read_mostly; 12int swiotlb __read_mostly;
13EXPORT_SYMBOL(swiotlb);
14 13
15const struct dma_mapping_ops swiotlb_dma_ops = { 14const struct dma_mapping_ops swiotlb_dma_ops = {
16 .mapping_error = swiotlb_dma_mapping_error, 15 .mapping_error = swiotlb_dma_mapping_error,
diff --git a/arch/x86/kernel/pmtimer_64.c b/arch/x86/kernel/pmtimer_64.c
index ae8f91214f15..b112406f1996 100644
--- a/arch/x86/kernel/pmtimer_64.c
+++ b/arch/x86/kernel/pmtimer_64.c
@@ -19,13 +19,13 @@
19#include <linux/time.h> 19#include <linux/time.h>
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/cpumask.h> 21#include <linux/cpumask.h>
22#include <linux/acpi_pmtmr.h>
23
22#include <asm/io.h> 24#include <asm/io.h>
23#include <asm/proto.h> 25#include <asm/proto.h>
24#include <asm/msr.h> 26#include <asm/msr.h>
25#include <asm/vsyscall.h> 27#include <asm/vsyscall.h>
26 28
27#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */
28
29static inline u32 cyc2us(u32 cycles) 29static inline u32 cyc2us(u32 cycles)
30{ 30{
31 /* The Power Management Timer ticks at 3.579545 ticks per microsecond. 31 /* The Power Management Timer ticks at 3.579545 ticks per microsecond.
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 9663c2a74830..dabdbeff1f77 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -55,6 +55,7 @@
55 55
56#include <asm/tlbflush.h> 56#include <asm/tlbflush.h>
57#include <asm/cpu.h> 57#include <asm/cpu.h>
58#include <asm/kdebug.h>
58 59
59asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 60asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
60 61
@@ -74,7 +75,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number);
74 */ 75 */
75unsigned long thread_saved_pc(struct task_struct *tsk) 76unsigned long thread_saved_pc(struct task_struct *tsk)
76{ 77{
77 return ((unsigned long *)tsk->thread.esp)[3]; 78 return ((unsigned long *)tsk->thread.sp)[3];
78} 79}
79 80
80/* 81/*
@@ -113,10 +114,19 @@ void default_idle(void)
113 smp_mb(); 114 smp_mb();
114 115
115 local_irq_disable(); 116 local_irq_disable();
116 if (!need_resched()) 117 if (!need_resched()) {
118 ktime_t t0, t1;
119 u64 t0n, t1n;
120
121 t0 = ktime_get();
122 t0n = ktime_to_ns(t0);
117 safe_halt(); /* enables interrupts racelessly */ 123 safe_halt(); /* enables interrupts racelessly */
118 else 124 local_irq_disable();
119 local_irq_enable(); 125 t1 = ktime_get();
126 t1n = ktime_to_ns(t1);
127 sched_clock_idle_wakeup_event(t1n - t0n);
128 }
129 local_irq_enable();
120 current_thread_info()->status |= TS_POLLING; 130 current_thread_info()->status |= TS_POLLING;
121 } else { 131 } else {
122 /* loop is done by the caller */ 132 /* loop is done by the caller */
@@ -132,7 +142,7 @@ EXPORT_SYMBOL(default_idle);
132 * to poll the ->work.need_resched flag instead of waiting for the 142 * to poll the ->work.need_resched flag instead of waiting for the
133 * cross-CPU IPI to arrive. Use this option with caution. 143 * cross-CPU IPI to arrive. Use this option with caution.
134 */ 144 */
135static void poll_idle (void) 145static void poll_idle(void)
136{ 146{
137 cpu_relax(); 147 cpu_relax();
138} 148}
@@ -188,6 +198,9 @@ void cpu_idle(void)
188 rmb(); 198 rmb();
189 idle = pm_idle; 199 idle = pm_idle;
190 200
201 if (rcu_pending(cpu))
202 rcu_check_callbacks(cpu, 0);
203
191 if (!idle) 204 if (!idle)
192 idle = default_idle; 205 idle = default_idle;
193 206
@@ -204,6 +217,10 @@ void cpu_idle(void)
204 } 217 }
205} 218}
206 219
220static void do_nothing(void *unused)
221{
222}
223
207void cpu_idle_wait(void) 224void cpu_idle_wait(void)
208{ 225{
209 unsigned int cpu, this_cpu = get_cpu(); 226 unsigned int cpu, this_cpu = get_cpu();
@@ -228,6 +245,13 @@ void cpu_idle_wait(void)
228 cpu_clear(cpu, map); 245 cpu_clear(cpu, map);
229 } 246 }
230 cpus_and(map, map, cpu_online_map); 247 cpus_and(map, map, cpu_online_map);
248 /*
249 * We waited 1 sec; if a CPU still did not call idle,
250 * it may be because it is in idle and not waking up
251 * because it has nothing to do.
252 * Give all the remaining CPUs a kick.
253 */
254 smp_call_function_mask(map, do_nothing, NULL, 0);
231 } while (!cpus_empty(map)); 255 } while (!cpus_empty(map));
232 256
233 set_cpus_allowed(current, tmp); 257 set_cpus_allowed(current, tmp);
@@ -244,13 +268,13 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
244 * New with Core Duo processors, MWAIT can take some hints based on CPU 268 * New with Core Duo processors, MWAIT can take some hints based on CPU
245 * capability. 269 * capability.
246 */ 270 */
247void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) 271void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
248{ 272{
249 if (!need_resched()) { 273 if (!need_resched()) {
250 __monitor((void *)&current_thread_info()->flags, 0, 0); 274 __monitor((void *)&current_thread_info()->flags, 0, 0);
251 smp_mb(); 275 smp_mb();
252 if (!need_resched()) 276 if (!need_resched())
253 __mwait(eax, ecx); 277 __mwait(ax, cx);
254 } 278 }
255} 279}
256 280
@@ -261,19 +285,37 @@ static void mwait_idle(void)
261 mwait_idle_with_hints(0, 0); 285 mwait_idle_with_hints(0, 0);
262} 286}
263 287
288static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
289{
290 if (force_mwait)
291 return 1;
292 /* Any C1 states supported? */
293 return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0;
294}
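
mwait_usable() keys off CPUID leaf 5: EDX reports, four bits per C-state, how many MWAIT sub-states are available, so bits 7:4 cover C1. A userspace sketch of the same test; it assumes leaf 5 actually exists (the kernel guards this with the cpuid_level >= 5 check above):

#include <stdio.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        /* CPUID leaf 5: MONITOR/MWAIT parameters. */
        __asm__ volatile("cpuid"
                         : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
                         : "a"(5), "c"(0));
        (void)eax; (void)ebx; (void)ecx;

        unsigned int c1_substates = (edx >> 4) & 0xf;
        printf("C1 MWAIT sub-states: %u -> mwait %s\n",
               c1_substates, c1_substates ? "usable" : "not usable");
        return 0;
}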
295
264void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) 296void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
265{ 297{
266 if (cpu_has(c, X86_FEATURE_MWAIT)) { 298 static int selected;
267 printk("monitor/mwait feature present.\n"); 299
300 if (selected)
301 return;
302#ifdef CONFIG_X86_SMP
303 if (pm_idle == poll_idle && smp_num_siblings > 1) {
304 printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
305 " performance may degrade.\n");
306 }
307#endif
308 if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
268 /* 309 /*
269 * Skip, if setup has overridden idle. 310 * Skip, if setup has overridden idle.
270 * One CPU supports mwait => All CPUs support mwait 311 * One CPU supports mwait => All CPUs support mwait
271 */ 312 */
272 if (!pm_idle) { 313 if (!pm_idle) {
273 printk("using mwait in idle threads.\n"); 314 printk(KERN_INFO "using mwait in idle threads.\n");
274 pm_idle = mwait_idle; 315 pm_idle = mwait_idle;
275 } 316 }
276 } 317 }
318 selected = 1;
277} 319}
278 320
279static int __init idle_setup(char *str) 321static int __init idle_setup(char *str)
@@ -281,10 +323,6 @@ static int __init idle_setup(char *str)
281 if (!strcmp(str, "poll")) { 323 if (!strcmp(str, "poll")) {
282 printk("using polling idle threads.\n"); 324 printk("using polling idle threads.\n");
283 pm_idle = poll_idle; 325 pm_idle = poll_idle;
284#ifdef CONFIG_X86_SMP
285 if (smp_num_siblings > 1)
286 printk("WARNING: polling idle and HT enabled, performance may degrade.\n");
287#endif
288 } else if (!strcmp(str, "mwait")) 326 } else if (!strcmp(str, "mwait"))
289 force_mwait = 1; 327 force_mwait = 1;
290 else 328 else
@@ -299,15 +337,15 @@ void __show_registers(struct pt_regs *regs, int all)
299{ 337{
300 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; 338 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
301 unsigned long d0, d1, d2, d3, d6, d7; 339 unsigned long d0, d1, d2, d3, d6, d7;
302 unsigned long esp; 340 unsigned long sp;
303 unsigned short ss, gs; 341 unsigned short ss, gs;
304 342
305 if (user_mode_vm(regs)) { 343 if (user_mode_vm(regs)) {
306 esp = regs->esp; 344 sp = regs->sp;
307 ss = regs->xss & 0xffff; 345 ss = regs->ss & 0xffff;
308 savesegment(gs, gs); 346 savesegment(gs, gs);
309 } else { 347 } else {
310 esp = (unsigned long) (&regs->esp); 348 sp = (unsigned long) (&regs->sp);
311 savesegment(ss, ss); 349 savesegment(ss, ss);
312 savesegment(gs, gs); 350 savesegment(gs, gs);
313 } 351 }
@@ -320,17 +358,17 @@ void __show_registers(struct pt_regs *regs, int all)
320 init_utsname()->version); 358 init_utsname()->version);
321 359
322 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", 360 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
323 0xffff & regs->xcs, regs->eip, regs->eflags, 361 0xffff & regs->cs, regs->ip, regs->flags,
324 smp_processor_id()); 362 smp_processor_id());
325 print_symbol("EIP is at %s\n", regs->eip); 363 print_symbol("EIP is at %s\n", regs->ip);
326 364
327 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", 365 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
328 regs->eax, regs->ebx, regs->ecx, regs->edx); 366 regs->ax, regs->bx, regs->cx, regs->dx);
329 printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", 367 printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
330 regs->esi, regs->edi, regs->ebp, esp); 368 regs->si, regs->di, regs->bp, sp);
331 printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", 369 printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
332 regs->xds & 0xffff, regs->xes & 0xffff, 370 regs->ds & 0xffff, regs->es & 0xffff,
333 regs->xfs & 0xffff, gs, ss); 371 regs->fs & 0xffff, gs, ss);
334 372
335 if (!all) 373 if (!all)
336 return; 374 return;
@@ -358,12 +396,12 @@ void __show_registers(struct pt_regs *regs, int all)
358void show_regs(struct pt_regs *regs) 396void show_regs(struct pt_regs *regs)
359{ 397{
360 __show_registers(regs, 1); 398 __show_registers(regs, 1);
361 show_trace(NULL, regs, &regs->esp); 399 show_trace(NULL, regs, &regs->sp, regs->bp);
362} 400}
363 401
364/* 402/*
365 * This gets run with %ebx containing the 403 * This gets run with %bx containing the
366 * function to call, and %edx containing 404 * function to call, and %dx containing
367 * the "args". 405 * the "args".
368 */ 406 */
369extern void kernel_thread_helper(void); 407extern void kernel_thread_helper(void);
@@ -377,16 +415,16 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
377 415
378 memset(&regs, 0, sizeof(regs)); 416 memset(&regs, 0, sizeof(regs));
379 417
380 regs.ebx = (unsigned long) fn; 418 regs.bx = (unsigned long) fn;
381 regs.edx = (unsigned long) arg; 419 regs.dx = (unsigned long) arg;
382 420
383 regs.xds = __USER_DS; 421 regs.ds = __USER_DS;
384 regs.xes = __USER_DS; 422 regs.es = __USER_DS;
385 regs.xfs = __KERNEL_PERCPU; 423 regs.fs = __KERNEL_PERCPU;
386 regs.orig_eax = -1; 424 regs.orig_ax = -1;
387 regs.eip = (unsigned long) kernel_thread_helper; 425 regs.ip = (unsigned long) kernel_thread_helper;
388 regs.xcs = __KERNEL_CS | get_kernel_rpl(); 426 regs.cs = __KERNEL_CS | get_kernel_rpl();
389 regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; 427 regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
390 428
391 /* Ok, create the new process.. */ 429 /* Ok, create the new process.. */
392 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL); 430 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
@@ -424,7 +462,12 @@ void flush_thread(void)
424{ 462{
425 struct task_struct *tsk = current; 463 struct task_struct *tsk = current;
426 464
427 memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); 465 tsk->thread.debugreg0 = 0;
466 tsk->thread.debugreg1 = 0;
467 tsk->thread.debugreg2 = 0;
468 tsk->thread.debugreg3 = 0;
469 tsk->thread.debugreg6 = 0;
470 tsk->thread.debugreg7 = 0;
428 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); 471 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
429 clear_tsk_thread_flag(tsk, TIF_DEBUG); 472 clear_tsk_thread_flag(tsk, TIF_DEBUG);
430 /* 473 /*
@@ -449,7 +492,7 @@ void prepare_to_copy(struct task_struct *tsk)
449 unlazy_fpu(tsk); 492 unlazy_fpu(tsk);
450} 493}
451 494
452int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, 495int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
453 unsigned long unused, 496 unsigned long unused,
454 struct task_struct * p, struct pt_regs * regs) 497 struct task_struct * p, struct pt_regs * regs)
455{ 498{
@@ -459,15 +502,15 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
459 502
460 childregs = task_pt_regs(p); 503 childregs = task_pt_regs(p);
461 *childregs = *regs; 504 *childregs = *regs;
462 childregs->eax = 0; 505 childregs->ax = 0;
463 childregs->esp = esp; 506 childregs->sp = sp;
464 507
465 p->thread.esp = (unsigned long) childregs; 508 p->thread.sp = (unsigned long) childregs;
466 p->thread.esp0 = (unsigned long) (childregs+1); 509 p->thread.sp0 = (unsigned long) (childregs+1);
467 510
468 p->thread.eip = (unsigned long) ret_from_fork; 511 p->thread.ip = (unsigned long) ret_from_fork;
469 512
470 savesegment(gs,p->thread.gs); 513 savesegment(gs, p->thread.gs);
471 514
472 tsk = current; 515 tsk = current;
473 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { 516 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
@@ -480,32 +523,15 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
480 set_tsk_thread_flag(p, TIF_IO_BITMAP); 523 set_tsk_thread_flag(p, TIF_IO_BITMAP);
481 } 524 }
482 525
526 err = 0;
527
483 /* 528 /*
484 * Set a new TLS for the child thread? 529 * Set a new TLS for the child thread?
485 */ 530 */
486 if (clone_flags & CLONE_SETTLS) { 531 if (clone_flags & CLONE_SETTLS)
487 struct desc_struct *desc; 532 err = do_set_thread_area(p, -1,
488 struct user_desc info; 533 (struct user_desc __user *)childregs->si, 0);
489 int idx;
490
491 err = -EFAULT;
492 if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
493 goto out;
494 err = -EINVAL;
495 if (LDT_empty(&info))
496 goto out;
497
498 idx = info.entry_number;
499 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
500 goto out;
501
502 desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
503 desc->a = LDT_entry_a(&info);
504 desc->b = LDT_entry_b(&info);
505 }
506 534
507 err = 0;
508 out:
509 if (err && p->thread.io_bitmap_ptr) { 535 if (err && p->thread.io_bitmap_ptr) {
510 kfree(p->thread.io_bitmap_ptr); 536 kfree(p->thread.io_bitmap_ptr);
511 p->thread.io_bitmap_max = 0; 537 p->thread.io_bitmap_max = 0;
@@ -518,62 +544,52 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
518 */ 544 */
519void dump_thread(struct pt_regs * regs, struct user * dump) 545void dump_thread(struct pt_regs * regs, struct user * dump)
520{ 546{
521 int i; 547 u16 gs;
522 548
523/* changed the size calculations - should hopefully work better. lbt */ 549/* changed the size calculations - should hopefully work better. lbt */
524 dump->magic = CMAGIC; 550 dump->magic = CMAGIC;
525 dump->start_code = 0; 551 dump->start_code = 0;
526 dump->start_stack = regs->esp & ~(PAGE_SIZE - 1); 552 dump->start_stack = regs->sp & ~(PAGE_SIZE - 1);
527 dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; 553 dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
528 dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; 554 dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
529 dump->u_dsize -= dump->u_tsize; 555 dump->u_dsize -= dump->u_tsize;
530 dump->u_ssize = 0; 556 dump->u_ssize = 0;
531 for (i = 0; i < 8; i++) 557 dump->u_debugreg[0] = current->thread.debugreg0;
532 dump->u_debugreg[i] = current->thread.debugreg[i]; 558 dump->u_debugreg[1] = current->thread.debugreg1;
559 dump->u_debugreg[2] = current->thread.debugreg2;
560 dump->u_debugreg[3] = current->thread.debugreg3;
561 dump->u_debugreg[4] = 0;
562 dump->u_debugreg[5] = 0;
563 dump->u_debugreg[6] = current->thread.debugreg6;
564 dump->u_debugreg[7] = current->thread.debugreg7;
533 565
534 if (dump->start_stack < TASK_SIZE) 566 if (dump->start_stack < TASK_SIZE)
535 dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; 567 dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
536 568
537 dump->regs.ebx = regs->ebx; 569 dump->regs.bx = regs->bx;
538 dump->regs.ecx = regs->ecx; 570 dump->regs.cx = regs->cx;
539 dump->regs.edx = regs->edx; 571 dump->regs.dx = regs->dx;
540 dump->regs.esi = regs->esi; 572 dump->regs.si = regs->si;
541 dump->regs.edi = regs->edi; 573 dump->regs.di = regs->di;
542 dump->regs.ebp = regs->ebp; 574 dump->regs.bp = regs->bp;
543 dump->regs.eax = regs->eax; 575 dump->regs.ax = regs->ax;
544 dump->regs.ds = regs->xds; 576 dump->regs.ds = (u16)regs->ds;
545 dump->regs.es = regs->xes; 577 dump->regs.es = (u16)regs->es;
546 dump->regs.fs = regs->xfs; 578 dump->regs.fs = (u16)regs->fs;
547 savesegment(gs,dump->regs.gs); 579 savesegment(gs,gs);
548 dump->regs.orig_eax = regs->orig_eax; 580 dump->regs.orig_ax = regs->orig_ax;
549 dump->regs.eip = regs->eip; 581 dump->regs.ip = regs->ip;
550 dump->regs.cs = regs->xcs; 582 dump->regs.cs = (u16)regs->cs;
551 dump->regs.eflags = regs->eflags; 583 dump->regs.flags = regs->flags;
552 dump->regs.esp = regs->esp; 584 dump->regs.sp = regs->sp;
553 dump->regs.ss = regs->xss; 585 dump->regs.ss = (u16)regs->ss;
554 586
555 dump->u_fpvalid = dump_fpu (regs, &dump->i387); 587 dump->u_fpvalid = dump_fpu (regs, &dump->i387);
556} 588}
557EXPORT_SYMBOL(dump_thread); 589EXPORT_SYMBOL(dump_thread);
558 590
559/*
560 * Capture the user space registers if the task is not running (in user space)
561 */
562int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
563{
564 struct pt_regs ptregs = *task_pt_regs(tsk);
565 ptregs.xcs &= 0xffff;
566 ptregs.xds &= 0xffff;
567 ptregs.xes &= 0xffff;
568 ptregs.xss &= 0xffff;
569
570 elf_core_copy_regs(regs, &ptregs);
571
572 return 1;
573}
574
575#ifdef CONFIG_SECCOMP 591#ifdef CONFIG_SECCOMP
576void hard_disable_TSC(void) 592static void hard_disable_TSC(void)
577{ 593{
578 write_cr4(read_cr4() | X86_CR4_TSD); 594 write_cr4(read_cr4() | X86_CR4_TSD);
579} 595}
@@ -588,7 +604,7 @@ void disable_TSC(void)
588 hard_disable_TSC(); 604 hard_disable_TSC();
589 preempt_enable(); 605 preempt_enable();
590} 606}
591void hard_enable_TSC(void) 607static void hard_enable_TSC(void)
592{ 608{
593 write_cr4(read_cr4() & ~X86_CR4_TSD); 609 write_cr4(read_cr4() & ~X86_CR4_TSD);
594} 610}
@@ -598,18 +614,32 @@ static noinline void
598__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, 614__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
599 struct tss_struct *tss) 615 struct tss_struct *tss)
600{ 616{
601 struct thread_struct *next; 617 struct thread_struct *prev, *next;
618 unsigned long debugctl;
602 619
620 prev = &prev_p->thread;
603 next = &next_p->thread; 621 next = &next_p->thread;
604 622
623 debugctl = prev->debugctlmsr;
624 if (next->ds_area_msr != prev->ds_area_msr) {
625 /* we clear debugctl to make sure DS
626 * is not in use when we change it */
627 debugctl = 0;
628 wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
629 wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
630 }
631
632 if (next->debugctlmsr != debugctl)
633 wrmsr(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr, 0);
634
605 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { 635 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
606 set_debugreg(next->debugreg[0], 0); 636 set_debugreg(next->debugreg0, 0);
607 set_debugreg(next->debugreg[1], 1); 637 set_debugreg(next->debugreg1, 1);
608 set_debugreg(next->debugreg[2], 2); 638 set_debugreg(next->debugreg2, 2);
609 set_debugreg(next->debugreg[3], 3); 639 set_debugreg(next->debugreg3, 3);
610 /* no 4 and 5 */ 640 /* no 4 and 5 */
611 set_debugreg(next->debugreg[6], 6); 641 set_debugreg(next->debugreg6, 6);
612 set_debugreg(next->debugreg[7], 7); 642 set_debugreg(next->debugreg7, 7);
613 } 643 }
614 644
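
The ordering in the new MSR handling above is deliberate: DEBUGCTL is cleared before MSR_IA32_DS_AREA is repointed, so the debug store cannot be in use while its area is changed, and the incoming task's DEBUGCTL value is written only afterwards (and only if it differs). A compact restatement of that sequence with logging stand-ins for the wrmsr helpers:

#include <stdint.h>
#include <stdio.h>

/* Stand-ins for the kernel's wrmsr/wrmsrl helpers; they only log here. */
static void write_msr(const char *name, uint64_t val)
{
        printf("wrmsr %-22s <- %#llx\n", name, (unsigned long long)val);
}

static void switch_ds_area(uint64_t prev_debugctl, uint64_t prev_ds,
                           uint64_t next_debugctl, uint64_t next_ds)
{
        uint64_t debugctl = prev_debugctl;

        if (next_ds != prev_ds) {
                /* Quiesce tracing before moving the DS save area. */
                debugctl = 0;
                write_msr("MSR_IA32_DEBUGCTLMSR", 0);
                write_msr("MSR_IA32_DS_AREA", next_ds);
        }
        if (next_debugctl != debugctl)
                write_msr("MSR_IA32_DEBUGCTLMSR", next_debugctl);
}

int main(void)
{
        switch_ds_area(0x1, 0x1000, 0x1, 0x2000);
        return 0;
}
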
615#ifdef CONFIG_SECCOMP 645#ifdef CONFIG_SECCOMP
@@ -623,6 +653,13 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
623 } 653 }
624#endif 654#endif
625 655
656 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
657 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
658
659 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
660 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
661
662
626 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { 663 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
627 /* 664 /*
628 * Disable the bitmap via an invalid offset. We still cache 665 * Disable the bitmap via an invalid offset. We still cache
@@ -676,11 +713,11 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
676 * More important, however, is the fact that this allows us much 713 * More important, however, is the fact that this allows us much
677 * more flexibility. 714 * more flexibility.
678 * 715 *
679 * The return value (in %eax) will be the "prev" task after 716 * The return value (in %ax) will be the "prev" task after
680 * the task-switch, and shows up in ret_from_fork in entry.S, 717 * the task-switch, and shows up in ret_from_fork in entry.S,
681 * for example. 718 * for example.
682 */ 719 */
683struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) 720struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
684{ 721{
685 struct thread_struct *prev = &prev_p->thread, 722 struct thread_struct *prev = &prev_p->thread,
686 *next = &next_p->thread; 723 *next = &next_p->thread;
@@ -699,7 +736,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
699 /* 736 /*
700 * Reload esp0. 737 * Reload esp0.
701 */ 738 */
702 load_esp0(tss, next); 739 load_sp0(tss, next);
703 740
704 /* 741 /*
705 * Save away %gs. No need to save %fs, as it was saved on the 742 * Save away %gs. No need to save %fs, as it was saved on the
@@ -763,7 +800,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
763 800
764asmlinkage int sys_fork(struct pt_regs regs) 801asmlinkage int sys_fork(struct pt_regs regs)
765{ 802{
766 return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL); 803 return do_fork(SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
767} 804}
768 805
769asmlinkage int sys_clone(struct pt_regs regs) 806asmlinkage int sys_clone(struct pt_regs regs)
@@ -772,12 +809,12 @@ asmlinkage int sys_clone(struct pt_regs regs)
772 unsigned long newsp; 809 unsigned long newsp;
773 int __user *parent_tidptr, *child_tidptr; 810 int __user *parent_tidptr, *child_tidptr;
774 811
775 clone_flags = regs.ebx; 812 clone_flags = regs.bx;
776 newsp = regs.ecx; 813 newsp = regs.cx;
777 parent_tidptr = (int __user *)regs.edx; 814 parent_tidptr = (int __user *)regs.dx;
778 child_tidptr = (int __user *)regs.edi; 815 child_tidptr = (int __user *)regs.di;
779 if (!newsp) 816 if (!newsp)
780 newsp = regs.esp; 817 newsp = regs.sp;
781 return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr); 818 return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
782} 819}
783 820
@@ -793,7 +830,7 @@ asmlinkage int sys_clone(struct pt_regs regs)
793 */ 830 */
794asmlinkage int sys_vfork(struct pt_regs regs) 831asmlinkage int sys_vfork(struct pt_regs regs)
795{ 832{
796 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL); 833 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, &regs, 0, NULL, NULL);
797} 834}
798 835
799/* 836/*
@@ -804,18 +841,15 @@ asmlinkage int sys_execve(struct pt_regs regs)
804 int error; 841 int error;
805 char * filename; 842 char * filename;
806 843
807 filename = getname((char __user *) regs.ebx); 844 filename = getname((char __user *) regs.bx);
808 error = PTR_ERR(filename); 845 error = PTR_ERR(filename);
809 if (IS_ERR(filename)) 846 if (IS_ERR(filename))
810 goto out; 847 goto out;
811 error = do_execve(filename, 848 error = do_execve(filename,
812 (char __user * __user *) regs.ecx, 849 (char __user * __user *) regs.cx,
813 (char __user * __user *) regs.edx, 850 (char __user * __user *) regs.dx,
814 &regs); 851 &regs);
815 if (error == 0) { 852 if (error == 0) {
816 task_lock(current);
817 current->ptrace &= ~PT_DTRACE;
818 task_unlock(current);
819 /* Make sure we don't return using sysenter.. */ 853 /* Make sure we don't return using sysenter.. */
820 set_thread_flag(TIF_IRET); 854 set_thread_flag(TIF_IRET);
821 } 855 }
@@ -829,145 +863,37 @@ out:
829 863
830unsigned long get_wchan(struct task_struct *p) 864unsigned long get_wchan(struct task_struct *p)
831{ 865{
832 unsigned long ebp, esp, eip; 866 unsigned long bp, sp, ip;
833 unsigned long stack_page; 867 unsigned long stack_page;
834 int count = 0; 868 int count = 0;
835 if (!p || p == current || p->state == TASK_RUNNING) 869 if (!p || p == current || p->state == TASK_RUNNING)
836 return 0; 870 return 0;
837 stack_page = (unsigned long)task_stack_page(p); 871 stack_page = (unsigned long)task_stack_page(p);
838 esp = p->thread.esp; 872 sp = p->thread.sp;
839 if (!stack_page || esp < stack_page || esp > top_esp+stack_page) 873 if (!stack_page || sp < stack_page || sp > top_esp+stack_page)
840 return 0; 874 return 0;
841 /* include/asm-i386/system.h:switch_to() pushes ebp last. */ 875 /* include/asm-i386/system.h:switch_to() pushes bp last. */
842 ebp = *(unsigned long *) esp; 876 bp = *(unsigned long *) sp;
843 do { 877 do {
844 if (ebp < stack_page || ebp > top_ebp+stack_page) 878 if (bp < stack_page || bp > top_ebp+stack_page)
845 return 0; 879 return 0;
846 eip = *(unsigned long *) (ebp+4); 880 ip = *(unsigned long *) (bp+4);
847 if (!in_sched_functions(eip)) 881 if (!in_sched_functions(ip))
848 return eip; 882 return ip;
849 ebp = *(unsigned long *) ebp; 883 bp = *(unsigned long *) bp;
850 } while (count++ < 16); 884 } while (count++ < 16);
851 return 0; 885 return 0;
852} 886}
853 887
854/*
855 * sys_alloc_thread_area: get a yet unused TLS descriptor index.
856 */
857static int get_free_idx(void)
858{
859 struct thread_struct *t = &current->thread;
860 int idx;
861
862 for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
863 if (desc_empty(t->tls_array + idx))
864 return idx + GDT_ENTRY_TLS_MIN;
865 return -ESRCH;
866}
867
868/*
869 * Set a given TLS descriptor:
870 */
871asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
872{
873 struct thread_struct *t = &current->thread;
874 struct user_desc info;
875 struct desc_struct *desc;
876 int cpu, idx;
877
878 if (copy_from_user(&info, u_info, sizeof(info)))
879 return -EFAULT;
880 idx = info.entry_number;
881
882 /*
883 * index -1 means the kernel should try to find and
884 * allocate an empty descriptor:
885 */
886 if (idx == -1) {
887 idx = get_free_idx();
888 if (idx < 0)
889 return idx;
890 if (put_user(idx, &u_info->entry_number))
891 return -EFAULT;
892 }
893
894 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
895 return -EINVAL;
896
897 desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
898
899 /*
900 * We must not get preempted while modifying the TLS.
901 */
902 cpu = get_cpu();
903
904 if (LDT_empty(&info)) {
905 desc->a = 0;
906 desc->b = 0;
907 } else {
908 desc->a = LDT_entry_a(&info);
909 desc->b = LDT_entry_b(&info);
910 }
911 load_TLS(t, cpu);
912
913 put_cpu();
914
915 return 0;
916}
917
918/*
919 * Get the current Thread-Local Storage area:
920 */
921
922#define GET_BASE(desc) ( \
923 (((desc)->a >> 16) & 0x0000ffff) | \
924 (((desc)->b << 16) & 0x00ff0000) | \
925 ( (desc)->b & 0xff000000) )
926
927#define GET_LIMIT(desc) ( \
928 ((desc)->a & 0x0ffff) | \
929 ((desc)->b & 0xf0000) )
930
931#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
932#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
933#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
934#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
935#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
936#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
937
938asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
939{
940 struct user_desc info;
941 struct desc_struct *desc;
942 int idx;
943
944 if (get_user(idx, &u_info->entry_number))
945 return -EFAULT;
946 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
947 return -EINVAL;
948
949 memset(&info, 0, sizeof(info));
950
951 desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
952
953 info.entry_number = idx;
954 info.base_addr = GET_BASE(desc);
955 info.limit = GET_LIMIT(desc);
956 info.seg_32bit = GET_32BIT(desc);
957 info.contents = GET_CONTENTS(desc);
958 info.read_exec_only = !GET_WRITABLE(desc);
959 info.limit_in_pages = GET_LIMIT_PAGES(desc);
960 info.seg_not_present = !GET_PRESENT(desc);
961 info.useable = GET_USEABLE(desc);
962
963 if (copy_to_user(u_info, &info, sizeof(info)))
964 return -EFAULT;
965 return 0;
966}
967
968unsigned long arch_align_stack(unsigned long sp) 888unsigned long arch_align_stack(unsigned long sp)
969{ 889{
970 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) 890 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
971 sp -= get_random_int() % 8192; 891 sp -= get_random_int() % 8192;
972 return sp & ~0xf; 892 return sp & ~0xf;
973} 893}
894
895unsigned long arch_randomize_brk(struct mm_struct *mm)
896{
897 unsigned long range_end = mm->brk + 0x02000000;
898 return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
899}
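
Both randomisation helpers above are tightly bounded: arch_align_stack() lowers the stack pointer by at most 8 KB and then rounds down to 16 bytes, and arch_randomize_brk() picks the new brk somewhere in the 32 MB (0x02000000) window above the current one. A quick userspace illustration of those bounds, with rand() standing in for get_random_int()/randomize_range():

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

int main(void)
{
        srand(1);

        uint64_t sp  = 0x7ffffffff000ULL;       /* assumed incoming stack pointer */
        uint64_t brk = 0x555555600000ULL;       /* assumed current brk            */

        uint64_t rand_sp  = (sp - (rand() % 8192)) & ~0xfULL;   /* <= 8 KB lower, 16-byte aligned */
        uint64_t rand_brk = brk + (rand() % 0x02000000);        /* somewhere in the next 32 MB    */

        printf("sp  %#llx -> %#llx\n", (unsigned long long)sp,  (unsigned long long)rand_sp);
        printf("brk %#llx -> %#llx\n", (unsigned long long)brk, (unsigned long long)rand_brk);
        return 0;
}
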
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 6309b275cb9c..137a86171c39 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Pentium III FXSR, SSE support 4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000 5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 * 6 *
7 * X86-64 port 7 * X86-64 port
8 * Andi Kleen. 8 * Andi Kleen.
9 * 9 *
@@ -19,19 +19,19 @@
19#include <linux/cpu.h> 19#include <linux/cpu.h>
20#include <linux/errno.h> 20#include <linux/errno.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/fs.h>
22#include <linux/kernel.h> 23#include <linux/kernel.h>
23#include <linux/mm.h> 24#include <linux/mm.h>
24#include <linux/fs.h>
25#include <linux/elfcore.h> 25#include <linux/elfcore.h>
26#include <linux/smp.h> 26#include <linux/smp.h>
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/user.h> 28#include <linux/user.h>
29#include <linux/module.h>
30#include <linux/a.out.h> 29#include <linux/a.out.h>
31#include <linux/interrupt.h> 30#include <linux/interrupt.h>
31#include <linux/utsname.h>
32#include <linux/delay.h> 32#include <linux/delay.h>
33#include <linux/module.h>
33#include <linux/ptrace.h> 34#include <linux/ptrace.h>
34#include <linux/utsname.h>
35#include <linux/random.h> 35#include <linux/random.h>
36#include <linux/notifier.h> 36#include <linux/notifier.h>
37#include <linux/kprobes.h> 37#include <linux/kprobes.h>
@@ -72,13 +72,6 @@ void idle_notifier_register(struct notifier_block *n)
72{ 72{
73 atomic_notifier_chain_register(&idle_notifier, n); 73 atomic_notifier_chain_register(&idle_notifier, n);
74} 74}
75EXPORT_SYMBOL_GPL(idle_notifier_register);
76
77void idle_notifier_unregister(struct notifier_block *n)
78{
79 atomic_notifier_chain_unregister(&idle_notifier, n);
80}
81EXPORT_SYMBOL(idle_notifier_unregister);
82 75
83void enter_idle(void) 76void enter_idle(void)
84{ 77{
@@ -106,7 +99,7 @@ void exit_idle(void)
106 * We use this if we don't have any better 99 * We use this if we don't have any better
107 * idle routine.. 100 * idle routine..
108 */ 101 */
109static void default_idle(void) 102void default_idle(void)
110{ 103{
111 current_thread_info()->status &= ~TS_POLLING; 104 current_thread_info()->status &= ~TS_POLLING;
112 /* 105 /*
@@ -116,11 +109,18 @@ static void default_idle(void)
116 smp_mb(); 109 smp_mb();
117 local_irq_disable(); 110 local_irq_disable();
118 if (!need_resched()) { 111 if (!need_resched()) {
119 /* Enables interrupts one instruction before HLT. 112 ktime_t t0, t1;
120 x86 special cases this so there is no race. */ 113 u64 t0n, t1n;
121 safe_halt(); 114
122 } else 115 t0 = ktime_get();
123 local_irq_enable(); 116 t0n = ktime_to_ns(t0);
117 safe_halt(); /* enables interrupts racelessly */
118 local_irq_disable();
119 t1 = ktime_get();
120 t1n = ktime_to_ns(t1);
121 sched_clock_idle_wakeup_event(t1n - t0n);
122 }
123 local_irq_enable();
124 current_thread_info()->status |= TS_POLLING; 124 current_thread_info()->status |= TS_POLLING;
125} 125}
126 126
@@ -129,43 +129,12 @@ static void default_idle(void)
129 * to poll the ->need_resched flag instead of waiting for the 129 * to poll the ->need_resched flag instead of waiting for the
130 * cross-CPU IPI to arrive. Use this option with caution. 130 * cross-CPU IPI to arrive. Use this option with caution.
131 */ 131 */
132static void poll_idle (void) 132static void poll_idle(void)
133{ 133{
134 local_irq_enable(); 134 local_irq_enable();
135 cpu_relax(); 135 cpu_relax();
136} 136}
137 137
138void cpu_idle_wait(void)
139{
140 unsigned int cpu, this_cpu = get_cpu();
141 cpumask_t map, tmp = current->cpus_allowed;
142
143 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
144 put_cpu();
145
146 cpus_clear(map);
147 for_each_online_cpu(cpu) {
148 per_cpu(cpu_idle_state, cpu) = 1;
149 cpu_set(cpu, map);
150 }
151
152 __get_cpu_var(cpu_idle_state) = 0;
153
154 wmb();
155 do {
156 ssleep(1);
157 for_each_online_cpu(cpu) {
158 if (cpu_isset(cpu, map) &&
159 !per_cpu(cpu_idle_state, cpu))
160 cpu_clear(cpu, map);
161 }
162 cpus_and(map, map, cpu_online_map);
163 } while (!cpus_empty(map));
164
165 set_cpus_allowed(current, tmp);
166}
167EXPORT_SYMBOL_GPL(cpu_idle_wait);
168
169#ifdef CONFIG_HOTPLUG_CPU 138#ifdef CONFIG_HOTPLUG_CPU
170DECLARE_PER_CPU(int, cpu_state); 139DECLARE_PER_CPU(int, cpu_state);
171 140
@@ -196,19 +165,18 @@ static inline void play_dead(void)
196 * low exit latency (ie sit in a loop waiting for 165 * low exit latency (ie sit in a loop waiting for
197 * somebody to say that they'd like to reschedule) 166 * somebody to say that they'd like to reschedule)
198 */ 167 */
199void cpu_idle (void) 168void cpu_idle(void)
200{ 169{
201 current_thread_info()->status |= TS_POLLING; 170 current_thread_info()->status |= TS_POLLING;
202 /* endless idle loop with no priority at all */ 171 /* endless idle loop with no priority at all */
203 while (1) { 172 while (1) {
173 tick_nohz_stop_sched_tick();
204 while (!need_resched()) { 174 while (!need_resched()) {
205 void (*idle)(void); 175 void (*idle)(void);
206 176
207 if (__get_cpu_var(cpu_idle_state)) 177 if (__get_cpu_var(cpu_idle_state))
208 __get_cpu_var(cpu_idle_state) = 0; 178 __get_cpu_var(cpu_idle_state) = 0;
209 179
210 tick_nohz_stop_sched_tick();
211
212 rmb(); 180 rmb();
213 idle = pm_idle; 181 idle = pm_idle;
214 if (!idle) 182 if (!idle)
@@ -236,6 +204,47 @@ void cpu_idle (void)
236 } 204 }
237} 205}
238 206
207static void do_nothing(void *unused)
208{
209}
210
211void cpu_idle_wait(void)
212{
213 unsigned int cpu, this_cpu = get_cpu();
214 cpumask_t map, tmp = current->cpus_allowed;
215
216 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
217 put_cpu();
218
219 cpus_clear(map);
220 for_each_online_cpu(cpu) {
221 per_cpu(cpu_idle_state, cpu) = 1;
222 cpu_set(cpu, map);
223 }
224
225 __get_cpu_var(cpu_idle_state) = 0;
226
227 wmb();
228 do {
229 ssleep(1);
230 for_each_online_cpu(cpu) {
231 if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
232 cpu_clear(cpu, map);
233 }
234 cpus_and(map, map, cpu_online_map);
235 /*
 236 * We waited 1 sec; if a CPU still has not called idle,
 237 * it may be because it is in idle and not waking up
 238 * because it has nothing to do.
 239 * Give all the remaining CPUs a kick.
240 */
241 smp_call_function_mask(map, do_nothing, 0, 0);
242 } while (!cpus_empty(map));
243
244 set_cpus_allowed(current, tmp);
245}
246EXPORT_SYMBOL_GPL(cpu_idle_wait);
247
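The relocated cpu_idle_wait() adds a kick for CPUs that never clear their cpu_idle_state flag: with the tickless idle loop such a CPU may simply be asleep with nothing to do, so the empty cross-call (do_nothing via smp_call_function_mask) forces it through the idle loop once. A self-contained sketch of the mark/poll/kick bookkeeping, with a plain bitmask standing in for cpumask_t and the per-CPU flags simulated (names and values are illustrative only):

#include <stdio.h>

#define NCPUS 4

static unsigned int pending;		/* stands in for the cpumask 'map' */
static int idle_state[NCPUS];		/* stands in for per_cpu(cpu_idle_state) */

static void kick(int cpu)
{
	/* kernel: smp_call_function_mask(map, do_nothing, 0, 0) */
	idle_state[cpu] = 0;		/* kicked CPU runs the idle loop and clears its flag */
}

int main(void)
{
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++) {	/* mark every CPU pending */
		idle_state[cpu] = 1;
		pending |= 1u << cpu;
	}
	idle_state[1] = 0;			/* pretend CPU 1 went through idle on its own */

	while (pending) {
		for (cpu = 0; cpu < NCPUS; cpu++)	/* drop CPUs that reported in */
			if ((pending & (1u << cpu)) && !idle_state[cpu])
				pending &= ~(1u << cpu);
		for (cpu = 0; cpu < NCPUS; cpu++)	/* kick the stragglers */
			if (pending & (1u << cpu))
				kick(cpu);
	}
	printf("all CPUs passed through the idle loop\n");
	return 0;
}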
239/* 248/*
240 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, 249 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
241 * which can obviate IPI to trigger checking of need_resched. 250 * which can obviate IPI to trigger checking of need_resched.
@@ -246,13 +255,13 @@ void cpu_idle (void)
246 * New with Core Duo processors, MWAIT can take some hints based on CPU 255 * New with Core Duo processors, MWAIT can take some hints based on CPU
247 * capability. 256 * capability.
248 */ 257 */
249void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) 258void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
250{ 259{
251 if (!need_resched()) { 260 if (!need_resched()) {
252 __monitor((void *)&current_thread_info()->flags, 0, 0); 261 __monitor((void *)&current_thread_info()->flags, 0, 0);
253 smp_mb(); 262 smp_mb();
254 if (!need_resched()) 263 if (!need_resched())
255 __mwait(eax, ecx); 264 __mwait(ax, cx);
256 } 265 }
257} 266}
258 267
@@ -271,25 +280,41 @@ static void mwait_idle(void)
271 } 280 }
272} 281}
273 282
283
284static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
285{
286 if (force_mwait)
287 return 1;
288 /* Any C1 states supported? */
289 return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0;
290}
291
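mwait_usable() above gates MWAIT idle on CPUID leaf 5: EDX packs the number of MWAIT sub-states into 4-bit fields, and bits 7:4 give the C1 sub-states that the function tests for. A standalone decode of that field, assuming the standard nibble layout (the sample EDX value is made up):

#include <stdio.h>

/* Illustrative decode of CPUID leaf 5 (MONITOR/MWAIT) EDX: each 4-bit
 * nibble reports the sub-state count for one C-state; bits 7:4 are the
 * C1 sub-states that mwait_usable() checks. */
static int mwait_c1_substates(unsigned int edx)
{
	return (edx >> 4) & 0xf;
}

int main(void)
{
	unsigned int edx = 0x00000020;	/* hypothetical value: two C1 sub-states */

	printf("C1 sub-states: %d -> mwait usable: %s\n",
	       mwait_c1_substates(edx),
	       mwait_c1_substates(edx) > 0 ? "yes" : "no");
	return 0;
}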
274void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) 292void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
275{ 293{
276 static int printed; 294 static int selected;
277 if (cpu_has(c, X86_FEATURE_MWAIT)) { 295
296 if (selected)
297 return;
298#ifdef CONFIG_X86_SMP
299 if (pm_idle == poll_idle && smp_num_siblings > 1) {
300 printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
301 " performance may degrade.\n");
302 }
303#endif
304 if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
278 /* 305 /*
279 * Skip, if setup has overridden idle. 306 * Skip, if setup has overridden idle.
 280 * One CPU supports mwait => all CPUs support mwait 307 * One CPU supports mwait => all CPUs support mwait
281 */ 308 */
282 if (!pm_idle) { 309 if (!pm_idle) {
283 if (!printed) { 310 printk(KERN_INFO "using mwait in idle threads.\n");
284 printk(KERN_INFO "using mwait in idle threads.\n");
285 printed = 1;
286 }
287 pm_idle = mwait_idle; 311 pm_idle = mwait_idle;
288 } 312 }
289 } 313 }
314 selected = 1;
290} 315}
291 316
292static int __init idle_setup (char *str) 317static int __init idle_setup(char *str)
293{ 318{
294 if (!strcmp(str, "poll")) { 319 if (!strcmp(str, "poll")) {
295 printk("using polling idle threads.\n"); 320 printk("using polling idle threads.\n");
@@ -304,13 +329,13 @@ static int __init idle_setup (char *str)
304} 329}
305early_param("idle", idle_setup); 330early_param("idle", idle_setup);
306 331
 307/* Also prints some state that isn't saved in the pt_regs */ 332/* Also prints some state that isn't saved in the pt_regs */
308void __show_regs(struct pt_regs * regs) 333void __show_regs(struct pt_regs * regs)
309{ 334{
310 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; 335 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
311 unsigned long d0, d1, d2, d3, d6, d7; 336 unsigned long d0, d1, d2, d3, d6, d7;
312 unsigned int fsindex,gsindex; 337 unsigned int fsindex, gsindex;
313 unsigned int ds,cs,es; 338 unsigned int ds, cs, es;
314 339
315 printk("\n"); 340 printk("\n");
316 print_modules(); 341 print_modules();
@@ -319,16 +344,16 @@ void __show_regs(struct pt_regs * regs)
319 init_utsname()->release, 344 init_utsname()->release,
320 (int)strcspn(init_utsname()->version, " "), 345 (int)strcspn(init_utsname()->version, " "),
321 init_utsname()->version); 346 init_utsname()->version);
322 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); 347 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
323 printk_address(regs->rip); 348 printk_address(regs->ip, 1);
324 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, 349 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp,
325 regs->eflags); 350 regs->flags);
326 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", 351 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
327 regs->rax, regs->rbx, regs->rcx); 352 regs->ax, regs->bx, regs->cx);
328 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", 353 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
329 regs->rdx, regs->rsi, regs->rdi); 354 regs->dx, regs->si, regs->di);
330 printk("RBP: %016lx R08: %016lx R09: %016lx\n", 355 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
331 regs->rbp, regs->r8, regs->r9); 356 regs->bp, regs->r8, regs->r9);
332 printk("R10: %016lx R11: %016lx R12: %016lx\n", 357 printk("R10: %016lx R11: %016lx R12: %016lx\n",
333 regs->r10, regs->r11, regs->r12); 358 regs->r10, regs->r11, regs->r12);
334 printk("R13: %016lx R14: %016lx R15: %016lx\n", 359 printk("R13: %016lx R14: %016lx R15: %016lx\n",
@@ -368,7 +393,7 @@ void show_regs(struct pt_regs *regs)
368{ 393{
369 printk("CPU %d:", smp_processor_id()); 394 printk("CPU %d:", smp_processor_id());
370 __show_regs(regs); 395 __show_regs(regs);
371 show_trace(NULL, regs, (void *)(regs + 1)); 396 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
372} 397}
373 398
374/* 399/*
@@ -379,7 +404,7 @@ void exit_thread(void)
379 struct task_struct *me = current; 404 struct task_struct *me = current;
380 struct thread_struct *t = &me->thread; 405 struct thread_struct *t = &me->thread;
381 406
382 if (me->thread.io_bitmap_ptr) { 407 if (me->thread.io_bitmap_ptr) {
383 struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); 408 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
384 409
385 kfree(t->io_bitmap_ptr); 410 kfree(t->io_bitmap_ptr);
@@ -415,7 +440,7 @@ void flush_thread(void)
415 tsk->thread.debugreg3 = 0; 440 tsk->thread.debugreg3 = 0;
416 tsk->thread.debugreg6 = 0; 441 tsk->thread.debugreg6 = 0;
417 tsk->thread.debugreg7 = 0; 442 tsk->thread.debugreg7 = 0;
418 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); 443 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
419 /* 444 /*
420 * Forget coprocessor state.. 445 * Forget coprocessor state..
421 */ 446 */
@@ -438,26 +463,21 @@ void release_thread(struct task_struct *dead_task)
438 463
439static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) 464static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
440{ 465{
441 struct user_desc ud = { 466 struct user_desc ud = {
442 .base_addr = addr, 467 .base_addr = addr,
443 .limit = 0xfffff, 468 .limit = 0xfffff,
444 .seg_32bit = 1, 469 .seg_32bit = 1,
445 .limit_in_pages = 1, 470 .limit_in_pages = 1,
446 .useable = 1, 471 .useable = 1,
447 }; 472 };
448 struct n_desc_struct *desc = (void *)t->thread.tls_array; 473 struct desc_struct *desc = t->thread.tls_array;
449 desc += tls; 474 desc += tls;
450 desc->a = LDT_entry_a(&ud); 475 fill_ldt(desc, &ud);
451 desc->b = LDT_entry_b(&ud);
452} 476}
453 477
454static inline u32 read_32bit_tls(struct task_struct *t, int tls) 478static inline u32 read_32bit_tls(struct task_struct *t, int tls)
455{ 479{
456 struct desc_struct *desc = (void *)t->thread.tls_array; 480 return get_desc_base(&t->thread.tls_array[tls]);
457 desc += tls;
458 return desc->base0 |
459 (((u32)desc->base1) << 16) |
460 (((u32)desc->base2) << 24);
461} 481}
462 482
463/* 483/*
@@ -469,7 +489,7 @@ void prepare_to_copy(struct task_struct *tsk)
469 unlazy_fpu(tsk); 489 unlazy_fpu(tsk);
470} 490}
471 491
472int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, 492int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
473 unsigned long unused, 493 unsigned long unused,
474 struct task_struct * p, struct pt_regs * regs) 494 struct task_struct * p, struct pt_regs * regs)
475{ 495{
@@ -481,14 +501,14 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
481 (THREAD_SIZE + task_stack_page(p))) - 1; 501 (THREAD_SIZE + task_stack_page(p))) - 1;
482 *childregs = *regs; 502 *childregs = *regs;
483 503
484 childregs->rax = 0; 504 childregs->ax = 0;
485 childregs->rsp = rsp; 505 childregs->sp = sp;
486 if (rsp == ~0UL) 506 if (sp == ~0UL)
487 childregs->rsp = (unsigned long)childregs; 507 childregs->sp = (unsigned long)childregs;
488 508
489 p->thread.rsp = (unsigned long) childregs; 509 p->thread.sp = (unsigned long) childregs;
490 p->thread.rsp0 = (unsigned long) (childregs+1); 510 p->thread.sp0 = (unsigned long) (childregs+1);
491 p->thread.userrsp = me->thread.userrsp; 511 p->thread.usersp = me->thread.usersp;
492 512
493 set_tsk_thread_flag(p, TIF_FORK); 513 set_tsk_thread_flag(p, TIF_FORK);
494 514
@@ -509,7 +529,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
509 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, 529 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
510 IO_BITMAP_BYTES); 530 IO_BITMAP_BYTES);
511 set_tsk_thread_flag(p, TIF_IO_BITMAP); 531 set_tsk_thread_flag(p, TIF_IO_BITMAP);
512 } 532 }
513 533
514 /* 534 /*
515 * Set a new TLS for the child thread? 535 * Set a new TLS for the child thread?
@@ -517,7 +537,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
517 if (clone_flags & CLONE_SETTLS) { 537 if (clone_flags & CLONE_SETTLS) {
518#ifdef CONFIG_IA32_EMULATION 538#ifdef CONFIG_IA32_EMULATION
519 if (test_thread_flag(TIF_IA32)) 539 if (test_thread_flag(TIF_IA32))
520 err = ia32_child_tls(p, childregs); 540 err = do_set_thread_area(p, -1,
541 (struct user_desc __user *)childregs->si, 0);
521 else 542 else
522#endif 543#endif
523 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); 544 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
@@ -536,17 +557,30 @@ out:
536/* 557/*
537 * This special macro can be used to load a debugging register 558 * This special macro can be used to load a debugging register
538 */ 559 */
539#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r) 560#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
540 561
541static inline void __switch_to_xtra(struct task_struct *prev_p, 562static inline void __switch_to_xtra(struct task_struct *prev_p,
542 struct task_struct *next_p, 563 struct task_struct *next_p,
543 struct tss_struct *tss) 564 struct tss_struct *tss)
544{ 565{
545 struct thread_struct *prev, *next; 566 struct thread_struct *prev, *next;
567 unsigned long debugctl;
546 568
547 prev = &prev_p->thread, 569 prev = &prev_p->thread,
548 next = &next_p->thread; 570 next = &next_p->thread;
549 571
572 debugctl = prev->debugctlmsr;
573 if (next->ds_area_msr != prev->ds_area_msr) {
574 /* we clear debugctl to make sure DS
575 * is not in use when we change it */
576 debugctl = 0;
577 wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
578 wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
579 }
580
581 if (next->debugctlmsr != debugctl)
582 wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr);
583
550 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { 584 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
551 loaddebug(next, 0); 585 loaddebug(next, 0);
552 loaddebug(next, 1); 586 loaddebug(next, 1);
@@ -570,12 +604,18 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
570 */ 604 */
571 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); 605 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
572 } 606 }
607
608 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
609 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
610
611 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
612 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
573} 613}
574 614
575/* 615/*
576 * switch_to(x,y) should switch tasks from x to y. 616 * switch_to(x,y) should switch tasks from x to y.
577 * 617 *
578 * This could still be optimized: 618 * This could still be optimized:
579 * - fold all the options into a flag word and test it with a single test. 619 * - fold all the options into a flag word and test it with a single test.
580 * - could test fs/gs bitsliced 620 * - could test fs/gs bitsliced
581 * 621 *
@@ -586,7 +626,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
586{ 626{
587 struct thread_struct *prev = &prev_p->thread, 627 struct thread_struct *prev = &prev_p->thread,
588 *next = &next_p->thread; 628 *next = &next_p->thread;
589 int cpu = smp_processor_id(); 629 int cpu = smp_processor_id();
590 struct tss_struct *tss = &per_cpu(init_tss, cpu); 630 struct tss_struct *tss = &per_cpu(init_tss, cpu);
591 631
592 /* we're going to use this soon, after a few expensive things */ 632 /* we're going to use this soon, after a few expensive things */
@@ -596,7 +636,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
596 /* 636 /*
597 * Reload esp0, LDT and the page table pointer: 637 * Reload esp0, LDT and the page table pointer:
598 */ 638 */
599 tss->rsp0 = next->rsp0; 639 load_sp0(tss, next);
600 640
601 /* 641 /*
602 * Switch DS and ES. 642 * Switch DS and ES.
@@ -655,8 +695,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
655 /* 695 /*
656 * Switch the PDA and FPU contexts. 696 * Switch the PDA and FPU contexts.
657 */ 697 */
658 prev->userrsp = read_pda(oldrsp); 698 prev->usersp = read_pda(oldrsp);
659 write_pda(oldrsp, next->userrsp); 699 write_pda(oldrsp, next->usersp);
660 write_pda(pcurrent, next_p); 700 write_pda(pcurrent, next_p);
661 701
662 write_pda(kernelstack, 702 write_pda(kernelstack,
@@ -673,8 +713,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
673 /* 713 /*
674 * Now maybe reload the debug registers and handle I/O bitmaps 714 * Now maybe reload the debug registers and handle I/O bitmaps
675 */ 715 */
676 if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)) 716 if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
677 || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) 717 task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
678 __switch_to_xtra(prev_p, next_p, tss); 718 __switch_to_xtra(prev_p, next_p, tss);
679 719
680 /* If the task has used fpu the last 5 timeslices, just do a full 720 /* If the task has used fpu the last 5 timeslices, just do a full
@@ -689,7 +729,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
689/* 729/*
690 * sys_execve() executes a new program. 730 * sys_execve() executes a new program.
691 */ 731 */
692asmlinkage 732asmlinkage
693long sys_execve(char __user *name, char __user * __user *argv, 733long sys_execve(char __user *name, char __user * __user *argv,
694 char __user * __user *envp, struct pt_regs regs) 734 char __user * __user *envp, struct pt_regs regs)
695{ 735{
@@ -701,11 +741,6 @@ long sys_execve(char __user *name, char __user * __user *argv,
701 if (IS_ERR(filename)) 741 if (IS_ERR(filename))
702 return error; 742 return error;
703 error = do_execve(filename, argv, envp, &regs); 743 error = do_execve(filename, argv, envp, &regs);
704 if (error == 0) {
705 task_lock(current);
706 current->ptrace &= ~PT_DTRACE;
707 task_unlock(current);
708 }
709 putname(filename); 744 putname(filename);
710 return error; 745 return error;
711} 746}
@@ -715,18 +750,18 @@ void set_personality_64bit(void)
715 /* inherit personality from parent */ 750 /* inherit personality from parent */
716 751
717 /* Make sure to be in 64bit mode */ 752 /* Make sure to be in 64bit mode */
718 clear_thread_flag(TIF_IA32); 753 clear_thread_flag(TIF_IA32);
719 754
720 /* TBD: overwrites user setup. Should have two bits. 755 /* TBD: overwrites user setup. Should have two bits.
721 But 64bit processes have always behaved this way, 756 But 64bit processes have always behaved this way,
722 so it's not too bad. The main problem is just that 757 so it's not too bad. The main problem is just that
 723 32bit children are affected again. */ 758 32bit children are affected again. */
724 current->personality &= ~READ_IMPLIES_EXEC; 759 current->personality &= ~READ_IMPLIES_EXEC;
725} 760}
726 761
727asmlinkage long sys_fork(struct pt_regs *regs) 762asmlinkage long sys_fork(struct pt_regs *regs)
728{ 763{
729 return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL); 764 return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
730} 765}
731 766
732asmlinkage long 767asmlinkage long
@@ -734,7 +769,7 @@ sys_clone(unsigned long clone_flags, unsigned long newsp,
734 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) 769 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
735{ 770{
736 if (!newsp) 771 if (!newsp)
737 newsp = regs->rsp; 772 newsp = regs->sp;
738 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); 773 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
739} 774}
740 775
@@ -750,29 +785,29 @@ sys_clone(unsigned long clone_flags, unsigned long newsp,
750 */ 785 */
751asmlinkage long sys_vfork(struct pt_regs *regs) 786asmlinkage long sys_vfork(struct pt_regs *regs)
752{ 787{
753 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0, 788 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
754 NULL, NULL); 789 NULL, NULL);
755} 790}
756 791
757unsigned long get_wchan(struct task_struct *p) 792unsigned long get_wchan(struct task_struct *p)
758{ 793{
759 unsigned long stack; 794 unsigned long stack;
760 u64 fp,rip; 795 u64 fp,ip;
761 int count = 0; 796 int count = 0;
762 797
763 if (!p || p == current || p->state==TASK_RUNNING) 798 if (!p || p == current || p->state==TASK_RUNNING)
764 return 0; 799 return 0;
765 stack = (unsigned long)task_stack_page(p); 800 stack = (unsigned long)task_stack_page(p);
766 if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE) 801 if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
767 return 0; 802 return 0;
768 fp = *(u64 *)(p->thread.rsp); 803 fp = *(u64 *)(p->thread.sp);
769 do { 804 do {
770 if (fp < (unsigned long)stack || 805 if (fp < (unsigned long)stack ||
771 fp > (unsigned long)stack+THREAD_SIZE) 806 fp > (unsigned long)stack+THREAD_SIZE)
772 return 0; 807 return 0;
773 rip = *(u64 *)(fp+8); 808 ip = *(u64 *)(fp+8);
774 if (!in_sched_functions(rip)) 809 if (!in_sched_functions(ip))
775 return rip; 810 return ip;
776 fp = *(u64 *)fp; 811 fp = *(u64 *)fp;
777 } while (count++ < 16); 812 } while (count++ < 16);
778 return 0; 813 return 0;
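get_wchan() above walks the sleeping task's saved frame-pointer chain: each frame stores the previous frame pointer at fp and the return address at fp+8, and the walk returns the first return address that is not in scheduler code. A userspace model of the same bounded walk over hand-built frames, with in_sched_functions() replaced by a trivial predicate (addresses are made up):

#include <stdio.h>
#include <stdint.h>

struct frame {
	struct frame *prev;	/* what the kernel reads at *fp    */
	uint64_t ret;		/* what the kernel reads at fp + 8 */
};

static int in_sched_functions(uint64_t ip)
{
	return ip < 0x1000;	/* pretend low addresses are scheduler code */
}

static uint64_t wchan(struct frame *fp)
{
	int count = 0;

	while (fp && count++ < 16) {		/* bounded walk, as in get_wchan() */
		if (!in_sched_functions(fp->ret))
			return fp->ret;		/* first caller outside the scheduler */
		fp = fp->prev;
	}
	return 0;
}

int main(void)
{
	struct frame outer = { NULL,   0x400123 };	/* e.g. the syscall that blocked */
	struct frame inner = { &outer, 0x0800 };	/* e.g. schedule() */

	printf("wchan = %#llx\n", (unsigned long long)wchan(&inner));
	return 0;
}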
@@ -813,19 +848,19 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
813 /* Not strictly needed for fs, but do it for symmetry 848 /* Not strictly needed for fs, but do it for symmetry
814 with gs */ 849 with gs */
815 if (addr >= TASK_SIZE_OF(task)) 850 if (addr >= TASK_SIZE_OF(task))
816 return -EPERM; 851 return -EPERM;
817 cpu = get_cpu(); 852 cpu = get_cpu();
818 /* handle small bases via the GDT because that's faster to 853 /* handle small bases via the GDT because that's faster to
819 switch. */ 854 switch. */
820 if (addr <= 0xffffffff) { 855 if (addr <= 0xffffffff) {
821 set_32bit_tls(task, FS_TLS, addr); 856 set_32bit_tls(task, FS_TLS, addr);
822 if (doit) { 857 if (doit) {
823 load_TLS(&task->thread, cpu); 858 load_TLS(&task->thread, cpu);
824 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL)); 859 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
825 } 860 }
826 task->thread.fsindex = FS_TLS_SEL; 861 task->thread.fsindex = FS_TLS_SEL;
827 task->thread.fs = 0; 862 task->thread.fs = 0;
828 } else { 863 } else {
829 task->thread.fsindex = 0; 864 task->thread.fsindex = 0;
830 task->thread.fs = addr; 865 task->thread.fs = addr;
831 if (doit) { 866 if (doit) {
@@ -837,24 +872,24 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
837 } 872 }
838 put_cpu(); 873 put_cpu();
839 break; 874 break;
840 case ARCH_GET_FS: { 875 case ARCH_GET_FS: {
841 unsigned long base; 876 unsigned long base;
842 if (task->thread.fsindex == FS_TLS_SEL) 877 if (task->thread.fsindex == FS_TLS_SEL)
843 base = read_32bit_tls(task, FS_TLS); 878 base = read_32bit_tls(task, FS_TLS);
844 else if (doit) 879 else if (doit)
845 rdmsrl(MSR_FS_BASE, base); 880 rdmsrl(MSR_FS_BASE, base);
846 else 881 else
847 base = task->thread.fs; 882 base = task->thread.fs;
848 ret = put_user(base, (unsigned long __user *)addr); 883 ret = put_user(base, (unsigned long __user *)addr);
849 break; 884 break;
850 } 885 }
851 case ARCH_GET_GS: { 886 case ARCH_GET_GS: {
852 unsigned long base; 887 unsigned long base;
853 unsigned gsindex; 888 unsigned gsindex;
854 if (task->thread.gsindex == GS_TLS_SEL) 889 if (task->thread.gsindex == GS_TLS_SEL)
855 base = read_32bit_tls(task, GS_TLS); 890 base = read_32bit_tls(task, GS_TLS);
856 else if (doit) { 891 else if (doit) {
857 asm("movl %%gs,%0" : "=r" (gsindex)); 892 asm("movl %%gs,%0" : "=r" (gsindex));
858 if (gsindex) 893 if (gsindex)
859 rdmsrl(MSR_KERNEL_GS_BASE, base); 894 rdmsrl(MSR_KERNEL_GS_BASE, base);
860 else 895 else
@@ -862,39 +897,21 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
862 } 897 }
863 else 898 else
864 base = task->thread.gs; 899 base = task->thread.gs;
865 ret = put_user(base, (unsigned long __user *)addr); 900 ret = put_user(base, (unsigned long __user *)addr);
866 break; 901 break;
867 } 902 }
868 903
869 default: 904 default:
870 ret = -EINVAL; 905 ret = -EINVAL;
871 break; 906 break;
872 } 907 }
873 908
874 return ret; 909 return ret;
875} 910}
876 911
877long sys_arch_prctl(int code, unsigned long addr) 912long sys_arch_prctl(int code, unsigned long addr)
878{ 913{
879 return do_arch_prctl(current, code, addr); 914 return do_arch_prctl(current, code, addr);
880}
881
882/*
883 * Capture the user space registers if the task is not running (in user space)
884 */
885int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
886{
887 struct pt_regs *pp, ptregs;
888
889 pp = task_pt_regs(tsk);
890
891 ptregs = *pp;
892 ptregs.cs &= 0xffff;
893 ptregs.ss &= 0xffff;
894
895 elf_core_copy_regs(regs, &ptregs);
896
897 return 1;
898} 915}
899 916
900unsigned long arch_align_stack(unsigned long sp) 917unsigned long arch_align_stack(unsigned long sp)
@@ -903,3 +920,9 @@ unsigned long arch_align_stack(unsigned long sp)
903 sp -= get_random_int() % 8192; 920 sp -= get_random_int() % 8192;
904 return sp & ~0xf; 921 return sp & ~0xf;
905} 922}
923
924unsigned long arch_randomize_brk(struct mm_struct *mm)
925{
926 unsigned long range_end = mm->brk + 0x02000000;
927 return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
928}
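arch_randomize_brk() above places the heap start at a random address within 32 MB (0x02000000 bytes) above the unrandomized brk, falling back to mm->brk when randomize_range() yields 0. A userspace sketch of the same range selection; the page-alignment behaviour attributed to randomize_range() here is an assumption, not taken from this diff:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define PAGE_SIZE 4096UL
#define BRK_RND_RANGE 0x02000000UL	/* 32 MB, as in arch_randomize_brk() */

/* Sketch of randomize_range(start, end, 0): a page-granular random
 * address in [start, end), or 0 on failure (alignment assumed). */
static unsigned long randomize_range_sketch(unsigned long start, unsigned long end)
{
	unsigned long span = (end - start) / PAGE_SIZE;

	return start + ((unsigned long)rand() % span) * PAGE_SIZE;
}

int main(void)
{
	unsigned long brk = 0x00602000UL;	/* hypothetical unrandomized brk */
	unsigned long r;

	srand((unsigned)time(NULL));
	r = randomize_range_sketch(brk, brk + BRK_RND_RANGE);
	printf("brk %#lx -> randomized brk %#lx\n", brk, r ? r : brk);
	return 0;
}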
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
new file mode 100644
index 000000000000..702c33efea84
--- /dev/null
+++ b/arch/x86/kernel/ptrace.c
@@ -0,0 +1,1566 @@
1/* By Ross Biro 1/23/92 */
2/*
3 * Pentium III FXSR, SSE support
4 * Gareth Hughes <gareth@valinux.com>, May 2000
5 *
6 * BTS tracing
7 * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
8 */
9
10#include <linux/kernel.h>
11#include <linux/sched.h>
12#include <linux/mm.h>
13#include <linux/smp.h>
14#include <linux/errno.h>
15#include <linux/ptrace.h>
16#include <linux/regset.h>
17#include <linux/user.h>
18#include <linux/elf.h>
19#include <linux/security.h>
20#include <linux/audit.h>
21#include <linux/seccomp.h>
22#include <linux/signal.h>
23
24#include <asm/uaccess.h>
25#include <asm/pgtable.h>
26#include <asm/system.h>
27#include <asm/processor.h>
28#include <asm/i387.h>
29#include <asm/debugreg.h>
30#include <asm/ldt.h>
31#include <asm/desc.h>
32#include <asm/prctl.h>
33#include <asm/proto.h>
34#include <asm/ds.h>
35
36#include "tls.h"
37
38enum x86_regset {
39 REGSET_GENERAL,
40 REGSET_FP,
41 REGSET_XFP,
42 REGSET_TLS,
43};
44
45/*
 46 * This does not yet catch signals sent when the child dies;
 47 * see exit.c and signal.c.
48 */
49
50/*
51 * Determines which flags the user has access to [1 = access, 0 = no access].
52 */
53#define FLAG_MASK_32 ((unsigned long) \
54 (X86_EFLAGS_CF | X86_EFLAGS_PF | \
55 X86_EFLAGS_AF | X86_EFLAGS_ZF | \
56 X86_EFLAGS_SF | X86_EFLAGS_TF | \
57 X86_EFLAGS_DF | X86_EFLAGS_OF | \
58 X86_EFLAGS_RF | X86_EFLAGS_AC))
59
60/*
61 * Determines whether a value may be installed in a segment register.
62 */
63static inline bool invalid_selector(u16 value)
64{
65 return unlikely(value != 0 && (value & SEGMENT_RPL_MASK) != USER_RPL);
66}
67
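invalid_selector() above accepts a user-supplied selector only if it is null or its low two bits carry the user RPL (ring 3). A tiny standalone version with the x86 constants written out; the numeric values are stated here for illustration rather than quoted from the kernel headers:

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

#define SEGMENT_RPL_MASK 0x3	/* low two selector bits: requested privilege level */
#define USER_RPL         0x3	/* ring 3 */

static bool invalid_selector(uint16_t value)
{
	return value != 0 && (value & SEGMENT_RPL_MASK) != USER_RPL;
}

int main(void)
{
	printf("0x0000 invalid? %d\n", invalid_selector(0x0000));	/* 0: null is fine */
	printf("0x0033 invalid? %d\n", invalid_selector(0x0033));	/* 0: RPL 3 */
	printf("0x0010 invalid? %d\n", invalid_selector(0x0010));	/* 1: RPL 0 */
	return 0;
}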
68#ifdef CONFIG_X86_32
69
70#define FLAG_MASK FLAG_MASK_32
71
72static long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
73{
74 BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);
75 regno >>= 2;
76 if (regno > FS)
77 --regno;
78 return &regs->bx + regno;
79}
80
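The 32-bit pt_regs_access() above turns a byte offset into struct user_regs_struct into a pointer inside struct pt_regs; because pt_regs has no slot for gs, any index past FS is shifted down by one. A compact model of that mapping using abbreviated, made-up struct layouts:

#include <stdio.h>
#include <stddef.h>

/* Cut-down model of the 32-bit layouts: the user-visible struct has a gs
 * slot that the in-kernel register frame does not, so word indices past
 * fs must be shifted down by one.  Field sets are abbreviated. */
struct mini_user_regs { long bx, cx, fs, gs, orig_ax, ip; };
struct mini_pt_regs   { long bx, cx, fs,     orig_ax, ip; };

#define MINI_FS (offsetof(struct mini_user_regs, fs) / sizeof(long))

static long *mini_pt_regs_access(struct mini_pt_regs *regs, unsigned long offset)
{
	unsigned long regno = offset / sizeof(long);	/* the kernel's "regno >>= 2" */

	if (regno > MINI_FS)	/* skip the gs slot that pt_regs lacks */
		--regno;
	return &regs->bx + regno;
}

int main(void)
{
	struct mini_pt_regs regs = { .ip = 0x1234 };
	long *p = mini_pt_regs_access(&regs, offsetof(struct mini_user_regs, ip));

	printf("ip via offset lookup: %#lx\n", *p);
	return 0;
}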
81static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
82{
83 /*
84 * Returning the value truncates it to 16 bits.
85 */
86 unsigned int retval;
87 if (offset != offsetof(struct user_regs_struct, gs))
88 retval = *pt_regs_access(task_pt_regs(task), offset);
89 else {
90 retval = task->thread.gs;
91 if (task == current)
92 savesegment(gs, retval);
93 }
94 return retval;
95}
96
97static int set_segment_reg(struct task_struct *task,
98 unsigned long offset, u16 value)
99{
100 /*
101 * The value argument was already truncated to 16 bits.
102 */
103 if (invalid_selector(value))
104 return -EIO;
105
106 /*
107 * For %cs and %ss we cannot permit a null selector.
108 * We can permit a bogus selector as long as it has USER_RPL.
109 * Null selectors are fine for other segment registers, but
110 * we will never get back to user mode with invalid %cs or %ss
111 * and will take the trap in iret instead. Much code relies
112 * on user_mode() to distinguish a user trap frame (which can
113 * safely use invalid selectors) from a kernel trap frame.
114 */
115 switch (offset) {
116 case offsetof(struct user_regs_struct, cs):
117 case offsetof(struct user_regs_struct, ss):
118 if (unlikely(value == 0))
119 return -EIO;
120
121 default:
122 *pt_regs_access(task_pt_regs(task), offset) = value;
123 break;
124
125 case offsetof(struct user_regs_struct, gs):
126 task->thread.gs = value;
127 if (task == current)
128 /*
129 * The user-mode %gs is not affected by
130 * kernel entry, so we must update the CPU.
131 */
132 loadsegment(gs, value);
133 }
134
135 return 0;
136}
137
138static unsigned long debugreg_addr_limit(struct task_struct *task)
139{
140 return TASK_SIZE - 3;
141}
142
143#else /* CONFIG_X86_64 */
144
145#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT)
146
147static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long offset)
148{
149 BUILD_BUG_ON(offsetof(struct pt_regs, r15) != 0);
150 return &regs->r15 + (offset / sizeof(regs->r15));
151}
152
153static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
154{
155 /*
156 * Returning the value truncates it to 16 bits.
157 */
158 unsigned int seg;
159
160 switch (offset) {
161 case offsetof(struct user_regs_struct, fs):
162 if (task == current) {
163 /* Older gas can't assemble movq %?s,%r?? */
164 asm("movl %%fs,%0" : "=r" (seg));
165 return seg;
166 }
167 return task->thread.fsindex;
168 case offsetof(struct user_regs_struct, gs):
169 if (task == current) {
170 asm("movl %%gs,%0" : "=r" (seg));
171 return seg;
172 }
173 return task->thread.gsindex;
174 case offsetof(struct user_regs_struct, ds):
175 if (task == current) {
176 asm("movl %%ds,%0" : "=r" (seg));
177 return seg;
178 }
179 return task->thread.ds;
180 case offsetof(struct user_regs_struct, es):
181 if (task == current) {
182 asm("movl %%es,%0" : "=r" (seg));
183 return seg;
184 }
185 return task->thread.es;
186
187 case offsetof(struct user_regs_struct, cs):
188 case offsetof(struct user_regs_struct, ss):
189 break;
190 }
191 return *pt_regs_access(task_pt_regs(task), offset);
192}
193
194static int set_segment_reg(struct task_struct *task,
195 unsigned long offset, u16 value)
196{
197 /*
198 * The value argument was already truncated to 16 bits.
199 */
200 if (invalid_selector(value))
201 return -EIO;
202
203 switch (offset) {
204 case offsetof(struct user_regs_struct,fs):
205 /*
206 * If this is setting fs as for normal 64-bit use but
207 * setting fs_base has implicitly changed it, leave it.
208 */
209 if ((value == FS_TLS_SEL && task->thread.fsindex == 0 &&
210 task->thread.fs != 0) ||
211 (value == 0 && task->thread.fsindex == FS_TLS_SEL &&
212 task->thread.fs == 0))
213 break;
214 task->thread.fsindex = value;
215 if (task == current)
216 loadsegment(fs, task->thread.fsindex);
217 break;
218 case offsetof(struct user_regs_struct,gs):
219 /*
220 * If this is setting gs as for normal 64-bit use but
221 * setting gs_base has implicitly changed it, leave it.
222 */
223 if ((value == GS_TLS_SEL && task->thread.gsindex == 0 &&
224 task->thread.gs != 0) ||
225 (value == 0 && task->thread.gsindex == GS_TLS_SEL &&
226 task->thread.gs == 0))
227 break;
228 task->thread.gsindex = value;
229 if (task == current)
230 load_gs_index(task->thread.gsindex);
231 break;
232 case offsetof(struct user_regs_struct,ds):
233 task->thread.ds = value;
234 if (task == current)
235 loadsegment(ds, task->thread.ds);
236 break;
237 case offsetof(struct user_regs_struct,es):
238 task->thread.es = value;
239 if (task == current)
240 loadsegment(es, task->thread.es);
241 break;
242
243 /*
244 * Can't actually change these in 64-bit mode.
245 */
246 case offsetof(struct user_regs_struct,cs):
247 if (unlikely(value == 0))
248 return -EIO;
249#ifdef CONFIG_IA32_EMULATION
250 if (test_tsk_thread_flag(task, TIF_IA32))
251 task_pt_regs(task)->cs = value;
252#endif
253 break;
254 case offsetof(struct user_regs_struct,ss):
255 if (unlikely(value == 0))
256 return -EIO;
257#ifdef CONFIG_IA32_EMULATION
258 if (test_tsk_thread_flag(task, TIF_IA32))
259 task_pt_regs(task)->ss = value;
260#endif
261 break;
262 }
263
264 return 0;
265}
266
267static unsigned long debugreg_addr_limit(struct task_struct *task)
268{
269#ifdef CONFIG_IA32_EMULATION
270 if (test_tsk_thread_flag(task, TIF_IA32))
271 return IA32_PAGE_OFFSET - 3;
272#endif
273 return TASK_SIZE64 - 7;
274}
275
276#endif /* CONFIG_X86_32 */
277
278static unsigned long get_flags(struct task_struct *task)
279{
280 unsigned long retval = task_pt_regs(task)->flags;
281
282 /*
283 * If the debugger set TF, hide it from the readout.
284 */
285 if (test_tsk_thread_flag(task, TIF_FORCED_TF))
286 retval &= ~X86_EFLAGS_TF;
287
288 return retval;
289}
290
291static int set_flags(struct task_struct *task, unsigned long value)
292{
293 struct pt_regs *regs = task_pt_regs(task);
294
295 /*
296 * If the user value contains TF, mark that
297 * it was not "us" (the debugger) that set it.
298 * If not, make sure it stays set if we had.
299 */
300 if (value & X86_EFLAGS_TF)
301 clear_tsk_thread_flag(task, TIF_FORCED_TF);
302 else if (test_tsk_thread_flag(task, TIF_FORCED_TF))
303 value |= X86_EFLAGS_TF;
304
305 regs->flags = (regs->flags & ~FLAG_MASK) | (value & FLAG_MASK);
306
307 return 0;
308}
309
310static int putreg(struct task_struct *child,
311 unsigned long offset, unsigned long value)
312{
313 switch (offset) {
314 case offsetof(struct user_regs_struct, cs):
315 case offsetof(struct user_regs_struct, ds):
316 case offsetof(struct user_regs_struct, es):
317 case offsetof(struct user_regs_struct, fs):
318 case offsetof(struct user_regs_struct, gs):
319 case offsetof(struct user_regs_struct, ss):
320 return set_segment_reg(child, offset, value);
321
322 case offsetof(struct user_regs_struct, flags):
323 return set_flags(child, value);
324
325#ifdef CONFIG_X86_64
326 case offsetof(struct user_regs_struct,fs_base):
327 if (value >= TASK_SIZE_OF(child))
328 return -EIO;
329 /*
330 * When changing the segment base, use do_arch_prctl
331 * to set either thread.fs or thread.fsindex and the
332 * corresponding GDT slot.
333 */
334 if (child->thread.fs != value)
335 return do_arch_prctl(child, ARCH_SET_FS, value);
336 return 0;
337 case offsetof(struct user_regs_struct,gs_base):
338 /*
339 * Exactly the same here as the %fs handling above.
340 */
341 if (value >= TASK_SIZE_OF(child))
342 return -EIO;
343 if (child->thread.gs != value)
344 return do_arch_prctl(child, ARCH_SET_GS, value);
345 return 0;
346#endif
347 }
348
349 *pt_regs_access(task_pt_regs(child), offset) = value;
350 return 0;
351}
352
353static unsigned long getreg(struct task_struct *task, unsigned long offset)
354{
355 switch (offset) {
356 case offsetof(struct user_regs_struct, cs):
357 case offsetof(struct user_regs_struct, ds):
358 case offsetof(struct user_regs_struct, es):
359 case offsetof(struct user_regs_struct, fs):
360 case offsetof(struct user_regs_struct, gs):
361 case offsetof(struct user_regs_struct, ss):
362 return get_segment_reg(task, offset);
363
364 case offsetof(struct user_regs_struct, flags):
365 return get_flags(task);
366
367#ifdef CONFIG_X86_64
368 case offsetof(struct user_regs_struct, fs_base): {
369 /*
370 * do_arch_prctl may have used a GDT slot instead of
371 * the MSR. To userland, it appears the same either
372 * way, except the %fs segment selector might not be 0.
373 */
374 unsigned int seg = task->thread.fsindex;
375 if (task->thread.fs != 0)
376 return task->thread.fs;
377 if (task == current)
378 asm("movl %%fs,%0" : "=r" (seg));
379 if (seg != FS_TLS_SEL)
380 return 0;
381 return get_desc_base(&task->thread.tls_array[FS_TLS]);
382 }
383 case offsetof(struct user_regs_struct, gs_base): {
384 /*
385 * Exactly the same here as the %fs handling above.
386 */
387 unsigned int seg = task->thread.gsindex;
388 if (task->thread.gs != 0)
389 return task->thread.gs;
390 if (task == current)
391 asm("movl %%gs,%0" : "=r" (seg));
392 if (seg != GS_TLS_SEL)
393 return 0;
394 return get_desc_base(&task->thread.tls_array[GS_TLS]);
395 }
396#endif
397 }
398
399 return *pt_regs_access(task_pt_regs(task), offset);
400}
401
402static int genregs_get(struct task_struct *target,
403 const struct user_regset *regset,
404 unsigned int pos, unsigned int count,
405 void *kbuf, void __user *ubuf)
406{
407 if (kbuf) {
408 unsigned long *k = kbuf;
409 while (count > 0) {
410 *k++ = getreg(target, pos);
411 count -= sizeof(*k);
412 pos += sizeof(*k);
413 }
414 } else {
415 unsigned long __user *u = ubuf;
416 while (count > 0) {
417 if (__put_user(getreg(target, pos), u++))
418 return -EFAULT;
419 count -= sizeof(*u);
420 pos += sizeof(*u);
421 }
422 }
423
424 return 0;
425}
426
427static int genregs_set(struct task_struct *target,
428 const struct user_regset *regset,
429 unsigned int pos, unsigned int count,
430 const void *kbuf, const void __user *ubuf)
431{
432 int ret = 0;
433 if (kbuf) {
434 const unsigned long *k = kbuf;
435 while (count > 0 && !ret) {
436 ret = putreg(target, pos, *k++);
437 count -= sizeof(*k);
438 pos += sizeof(*k);
439 }
440 } else {
441 const unsigned long __user *u = ubuf;
442 while (count > 0 && !ret) {
443 unsigned long word;
444 ret = __get_user(word, u++);
445 if (ret)
446 break;
447 ret = putreg(target, pos, word);
448 count -= sizeof(*u);
449 pos += sizeof(*u);
450 }
451 }
452 return ret;
453}
454
455/*
456 * This function is trivial and will be inlined by the compiler.
457 * Having it separates the implementation details of debug
458 * registers from the interface details of ptrace.
459 */
460static unsigned long ptrace_get_debugreg(struct task_struct *child, int n)
461{
462 switch (n) {
463 case 0: return child->thread.debugreg0;
464 case 1: return child->thread.debugreg1;
465 case 2: return child->thread.debugreg2;
466 case 3: return child->thread.debugreg3;
467 case 6: return child->thread.debugreg6;
468 case 7: return child->thread.debugreg7;
469 }
470 return 0;
471}
472
473static int ptrace_set_debugreg(struct task_struct *child,
474 int n, unsigned long data)
475{
476 int i;
477
478 if (unlikely(n == 4 || n == 5))
479 return -EIO;
480
481 if (n < 4 && unlikely(data >= debugreg_addr_limit(child)))
482 return -EIO;
483
484 switch (n) {
485 case 0: child->thread.debugreg0 = data; break;
486 case 1: child->thread.debugreg1 = data; break;
487 case 2: child->thread.debugreg2 = data; break;
488 case 3: child->thread.debugreg3 = data; break;
489
490 case 6:
491 if ((data & ~0xffffffffUL) != 0)
492 return -EIO;
493 child->thread.debugreg6 = data;
494 break;
495
496 case 7:
497 /*
498 * Sanity-check data. Take one half-byte at once with
499 * check = (val >> (16 + 4*i)) & 0xf. It contains the
500 * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
501 * 2 and 3 are LENi. Given a list of invalid values,
502 * we do mask |= 1 << invalid_value, so that
503 * (mask >> check) & 1 is a correct test for invalid
504 * values.
505 *
506 * R/Wi contains the type of the breakpoint /
507 * watchpoint, LENi contains the length of the watched
508 * data in the watchpoint case.
509 *
510 * The invalid values are:
511 * - LENi == 0x10 (undefined), so mask |= 0x0f00. [32-bit]
512 * - R/Wi == 0x10 (break on I/O reads or writes), so
513 * mask |= 0x4444.
514 * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
515 * 0x1110.
516 *
517 * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
518 *
519 * See the Intel Manual "System Programming Guide",
520 * 15.2.4
521 *
522 * Note that LENi == 0x10 is defined on x86_64 in long
523 * mode (i.e. even for 32-bit userspace software, but
524 * 64-bit kernel), so the x86_64 mask value is 0x5454.
525 * See the AMD manual no. 24593 (AMD64 System Programming)
526 */
527#ifdef CONFIG_X86_32
528#define DR7_MASK 0x5f54
529#else
530#define DR7_MASK 0x5554
531#endif
532 data &= ~DR_CONTROL_RESERVED;
533 for (i = 0; i < 4; i++)
534 if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1)
535 return -EIO;
536 child->thread.debugreg7 = data;
537 if (data)
538 set_tsk_thread_flag(child, TIF_DEBUG);
539 else
540 clear_tsk_thread_flag(child, TIF_DEBUG);
541 break;
542 }
543
544 return 0;
545}
546
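The DR7 check in ptrace_set_debugreg() encodes the per-breakpoint rule in a 16-bit lookup mask: nibble i at bits 16+4*i of DR7 holds R/Wi in its low two bits and LENi in its high two bits, and the nibble is rejected when the corresponding bit of the mask is set. A standalone version of the same test using the 32-bit mask 0x5f54 from the comment above (the sample DR7 values are made up):

#include <stdio.h>
#include <stdbool.h>

#define DR7_MASK_32 0x5f54	/* invalid R/W+LEN combinations, 32-bit rules */

/* Return true if every breakpoint nibble in dr7 describes a valid
 * R/W type and length, per the lookup-mask trick in ptrace_set_debugreg(). */
static bool dr7_valid(unsigned long dr7)
{
	int i;

	for (i = 0; i < 4; i++) {
		unsigned int nibble = (dr7 >> (16 + 4 * i)) & 0xf;

		if ((DR7_MASK_32 >> nibble) & 1)
			return false;
	}
	return true;
}

int main(void)
{
	/* nibble 0x1 = R/W "write", LEN "1 byte": accepted */
	printf("dr7=0x00010001 valid? %d\n", dr7_valid(0x00010001UL));
	/* nibble 0x2 = R/W "I/O read/write": rejected (mask bit 2 set) */
	printf("dr7=0x00020001 valid? %d\n", dr7_valid(0x00020001UL));
	return 0;
}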
547static int ptrace_bts_get_size(struct task_struct *child)
548{
549 if (!child->thread.ds_area_msr)
550 return -ENXIO;
551
552 return ds_get_bts_index((void *)child->thread.ds_area_msr);
553}
554
555static int ptrace_bts_read_record(struct task_struct *child,
556 long index,
557 struct bts_struct __user *out)
558{
559 struct bts_struct ret;
560 int retval;
561 int bts_end;
562 int bts_index;
563
564 if (!child->thread.ds_area_msr)
565 return -ENXIO;
566
567 if (index < 0)
568 return -EINVAL;
569
570 bts_end = ds_get_bts_end((void *)child->thread.ds_area_msr);
571 if (bts_end <= index)
572 return -EINVAL;
573
574 /* translate the ptrace bts index into the ds bts index */
575 bts_index = ds_get_bts_index((void *)child->thread.ds_area_msr);
576 bts_index -= (index + 1);
577 if (bts_index < 0)
578 bts_index += bts_end;
579
580 retval = ds_read_bts((void *)child->thread.ds_area_msr,
581 bts_index, &ret);
582 if (retval < 0)
583 return retval;
584
585 if (copy_to_user(out, &ret, sizeof(ret)))
586 return -EFAULT;
587
588 return sizeof(ret);
589}
590
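ptrace_bts_read_record() above numbers records from the most recent one (ptrace index 0), while the DS area indexes them in write order, so the translation subtracts (index + 1) from the current write position and wraps by the buffer size. A small sketch of that index arithmetic over a 4-entry ring (values are illustrative):

#include <stdio.h>

/* Map a "ptrace" index (0 = newest record) onto a DS-style ring index,
 * mirroring the arithmetic in ptrace_bts_read_record(). */
static int bts_ring_index(int write_pos, int ring_size, int ptrace_index)
{
	int idx = write_pos - (ptrace_index + 1);

	if (idx < 0)
		idx += ring_size;	/* wrap around the ring */
	return idx;
}

int main(void)
{
	int write_pos = 1, ring_size = 4, i;

	/* With the writer at slot 1, the newest record is slot 0, then 3, 2, 1. */
	for (i = 0; i < ring_size; i++)
		printf("ptrace index %d -> ring slot %d\n",
		       i, bts_ring_index(write_pos, ring_size, i));
	return 0;
}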
591static int ptrace_bts_write_record(struct task_struct *child,
592 const struct bts_struct *in)
593{
594 int retval;
595
596 if (!child->thread.ds_area_msr)
597 return -ENXIO;
598
599 retval = ds_write_bts((void *)child->thread.ds_area_msr, in);
600 if (retval)
601 return retval;
602
603 return sizeof(*in);
604}
605
606static int ptrace_bts_clear(struct task_struct *child)
607{
608 if (!child->thread.ds_area_msr)
609 return -ENXIO;
610
611 return ds_clear((void *)child->thread.ds_area_msr);
612}
613
614static int ptrace_bts_drain(struct task_struct *child,
615 long size,
616 struct bts_struct __user *out)
617{
618 int end, i;
619 void *ds = (void *)child->thread.ds_area_msr;
620
621 if (!ds)
622 return -ENXIO;
623
624 end = ds_get_bts_index(ds);
625 if (end <= 0)
626 return end;
627
628 if (size < (end * sizeof(struct bts_struct)))
629 return -EIO;
630
631 for (i = 0; i < end; i++, out++) {
632 struct bts_struct ret;
633 int retval;
634
635 retval = ds_read_bts(ds, i, &ret);
636 if (retval < 0)
637 return retval;
638
639 if (copy_to_user(out, &ret, sizeof(ret)))
640 return -EFAULT;
641 }
642
643 ds_clear(ds);
644
645 return end;
646}
647
648static int ptrace_bts_realloc(struct task_struct *child,
649 int size, int reduce_size)
650{
651 unsigned long rlim, vm;
652 int ret, old_size;
653
654 if (size < 0)
655 return -EINVAL;
656
657 old_size = ds_get_bts_size((void *)child->thread.ds_area_msr);
658 if (old_size < 0)
659 return old_size;
660
661 ret = ds_free((void **)&child->thread.ds_area_msr);
662 if (ret < 0)
663 goto out;
664
665 size >>= PAGE_SHIFT;
666 old_size >>= PAGE_SHIFT;
667
668 current->mm->total_vm -= old_size;
669 current->mm->locked_vm -= old_size;
670
671 if (size == 0)
672 goto out;
673
674 rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
675 vm = current->mm->total_vm + size;
676 if (rlim < vm) {
677 ret = -ENOMEM;
678
679 if (!reduce_size)
680 goto out;
681
682 size = rlim - current->mm->total_vm;
683 if (size <= 0)
684 goto out;
685 }
686
687 rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
688 vm = current->mm->locked_vm + size;
689 if (rlim < vm) {
690 ret = -ENOMEM;
691
692 if (!reduce_size)
693 goto out;
694
695 size = rlim - current->mm->locked_vm;
696 if (size <= 0)
697 goto out;
698 }
699
700 ret = ds_allocate((void **)&child->thread.ds_area_msr,
701 size << PAGE_SHIFT);
702 if (ret < 0)
703 goto out;
704
705 current->mm->total_vm += size;
706 current->mm->locked_vm += size;
707
708out:
709 if (child->thread.ds_area_msr)
710 set_tsk_thread_flag(child, TIF_DS_AREA_MSR);
711 else
712 clear_tsk_thread_flag(child, TIF_DS_AREA_MSR);
713
714 return ret;
715}
716
717static int ptrace_bts_config(struct task_struct *child,
718 long cfg_size,
719 const struct ptrace_bts_config __user *ucfg)
720{
721 struct ptrace_bts_config cfg;
722 int bts_size, ret = 0;
723 void *ds;
724
725 if (cfg_size < sizeof(cfg))
726 return -EIO;
727
728 if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
729 return -EFAULT;
730
731 if ((int)cfg.size < 0)
732 return -EINVAL;
733
734 bts_size = 0;
735 ds = (void *)child->thread.ds_area_msr;
736 if (ds) {
737 bts_size = ds_get_bts_size(ds);
738 if (bts_size < 0)
739 return bts_size;
740 }
741 cfg.size = PAGE_ALIGN(cfg.size);
742
743 if (bts_size != cfg.size) {
744 ret = ptrace_bts_realloc(child, cfg.size,
745 cfg.flags & PTRACE_BTS_O_CUT_SIZE);
746 if (ret < 0)
747 goto errout;
748
749 ds = (void *)child->thread.ds_area_msr;
750 }
751
752 if (cfg.flags & PTRACE_BTS_O_SIGNAL)
753 ret = ds_set_overflow(ds, DS_O_SIGNAL);
754 else
755 ret = ds_set_overflow(ds, DS_O_WRAP);
756 if (ret < 0)
757 goto errout;
758
759 if (cfg.flags & PTRACE_BTS_O_TRACE)
760 child->thread.debugctlmsr |= ds_debugctl_mask();
761 else
762 child->thread.debugctlmsr &= ~ds_debugctl_mask();
763
764 if (cfg.flags & PTRACE_BTS_O_SCHED)
765 set_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
766 else
767 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
768
769 ret = sizeof(cfg);
770
771out:
772 if (child->thread.debugctlmsr)
773 set_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
774 else
775 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
776
777 return ret;
778
779errout:
780 child->thread.debugctlmsr &= ~ds_debugctl_mask();
781 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
782 goto out;
783}
784
785static int ptrace_bts_status(struct task_struct *child,
786 long cfg_size,
787 struct ptrace_bts_config __user *ucfg)
788{
789 void *ds = (void *)child->thread.ds_area_msr;
790 struct ptrace_bts_config cfg;
791
792 if (cfg_size < sizeof(cfg))
793 return -EIO;
794
795 memset(&cfg, 0, sizeof(cfg));
796
797 if (ds) {
798 cfg.size = ds_get_bts_size(ds);
799
800 if (ds_get_overflow(ds) == DS_O_SIGNAL)
801 cfg.flags |= PTRACE_BTS_O_SIGNAL;
802
803 if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) &&
804 child->thread.debugctlmsr & ds_debugctl_mask())
805 cfg.flags |= PTRACE_BTS_O_TRACE;
806
807 if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS))
808 cfg.flags |= PTRACE_BTS_O_SCHED;
809 }
810
811 cfg.bts_size = sizeof(struct bts_struct);
812
813 if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
814 return -EFAULT;
815
816 return sizeof(cfg);
817}
818
819void ptrace_bts_take_timestamp(struct task_struct *tsk,
820 enum bts_qualifier qualifier)
821{
822 struct bts_struct rec = {
823 .qualifier = qualifier,
824 .variant.jiffies = jiffies_64
825 };
826
827 ptrace_bts_write_record(tsk, &rec);
828}
829
830/*
831 * Called by kernel/ptrace.c when detaching..
832 *
833 * Make sure the single step bit is not set.
834 */
835void ptrace_disable(struct task_struct *child)
836{
837 user_disable_single_step(child);
838#ifdef TIF_SYSCALL_EMU
839 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
840#endif
841 if (child->thread.ds_area_msr) {
842 ptrace_bts_realloc(child, 0, 0);
843 child->thread.debugctlmsr &= ~ds_debugctl_mask();
844 if (!child->thread.debugctlmsr)
845 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
846 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
847 }
848}
849
850#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
851static const struct user_regset_view user_x86_32_view; /* Initialized below. */
852#endif
853
854long arch_ptrace(struct task_struct *child, long request, long addr, long data)
855{
856 int ret;
857 unsigned long __user *datap = (unsigned long __user *)data;
858
859 switch (request) {
860 /* read the word at location addr in the USER area. */
861 case PTRACE_PEEKUSR: {
862 unsigned long tmp;
863
864 ret = -EIO;
865 if ((addr & (sizeof(data) - 1)) || addr < 0 ||
866 addr >= sizeof(struct user))
867 break;
868
869 tmp = 0; /* Default return condition */
870 if (addr < sizeof(struct user_regs_struct))
871 tmp = getreg(child, addr);
872 else if (addr >= offsetof(struct user, u_debugreg[0]) &&
873 addr <= offsetof(struct user, u_debugreg[7])) {
874 addr -= offsetof(struct user, u_debugreg[0]);
875 tmp = ptrace_get_debugreg(child, addr / sizeof(data));
876 }
877 ret = put_user(tmp, datap);
878 break;
879 }
880
881 case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
882 ret = -EIO;
883 if ((addr & (sizeof(data) - 1)) || addr < 0 ||
884 addr >= sizeof(struct user))
885 break;
886
887 if (addr < sizeof(struct user_regs_struct))
888 ret = putreg(child, addr, data);
889 else if (addr >= offsetof(struct user, u_debugreg[0]) &&
890 addr <= offsetof(struct user, u_debugreg[7])) {
891 addr -= offsetof(struct user, u_debugreg[0]);
892 ret = ptrace_set_debugreg(child,
893 addr / sizeof(data), data);
894 }
895 break;
896
897 case PTRACE_GETREGS: /* Get all gp regs from the child. */
898 return copy_regset_to_user(child,
899 task_user_regset_view(current),
900 REGSET_GENERAL,
901 0, sizeof(struct user_regs_struct),
902 datap);
903
904 case PTRACE_SETREGS: /* Set all gp regs in the child. */
905 return copy_regset_from_user(child,
906 task_user_regset_view(current),
907 REGSET_GENERAL,
908 0, sizeof(struct user_regs_struct),
909 datap);
910
911 case PTRACE_GETFPREGS: /* Get the child FPU state. */
912 return copy_regset_to_user(child,
913 task_user_regset_view(current),
914 REGSET_FP,
915 0, sizeof(struct user_i387_struct),
916 datap);
917
918 case PTRACE_SETFPREGS: /* Set the child FPU state. */
919 return copy_regset_from_user(child,
920 task_user_regset_view(current),
921 REGSET_FP,
922 0, sizeof(struct user_i387_struct),
923 datap);
924
925#ifdef CONFIG_X86_32
926 case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */
927 return copy_regset_to_user(child, &user_x86_32_view,
928 REGSET_XFP,
929 0, sizeof(struct user_fxsr_struct),
930 datap);
931
932 case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */
933 return copy_regset_from_user(child, &user_x86_32_view,
934 REGSET_XFP,
935 0, sizeof(struct user_fxsr_struct),
936 datap);
937#endif
938
939#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
940 case PTRACE_GET_THREAD_AREA:
941 if (addr < 0)
942 return -EIO;
943 ret = do_get_thread_area(child, addr,
944 (struct user_desc __user *) data);
945 break;
946
947 case PTRACE_SET_THREAD_AREA:
948 if (addr < 0)
949 return -EIO;
950 ret = do_set_thread_area(child, addr,
951 (struct user_desc __user *) data, 0);
952 break;
953#endif
954
955#ifdef CONFIG_X86_64
956 /* normal 64bit interface to access TLS data.
957 Works just like arch_prctl, except that the arguments
958 are reversed. */
959 case PTRACE_ARCH_PRCTL:
960 ret = do_arch_prctl(child, data, addr);
961 break;
962#endif
963
964 case PTRACE_BTS_CONFIG:
965 ret = ptrace_bts_config
966 (child, data, (struct ptrace_bts_config __user *)addr);
967 break;
968
969 case PTRACE_BTS_STATUS:
970 ret = ptrace_bts_status
971 (child, data, (struct ptrace_bts_config __user *)addr);
972 break;
973
974 case PTRACE_BTS_SIZE:
975 ret = ptrace_bts_get_size(child);
976 break;
977
978 case PTRACE_BTS_GET:
979 ret = ptrace_bts_read_record
980 (child, data, (struct bts_struct __user *) addr);
981 break;
982
983 case PTRACE_BTS_CLEAR:
984 ret = ptrace_bts_clear(child);
985 break;
986
987 case PTRACE_BTS_DRAIN:
988 ret = ptrace_bts_drain
989 (child, data, (struct bts_struct __user *) addr);
990 break;
991
992 default:
993 ret = ptrace_request(child, request, addr, data);
994 break;
995 }
996
997 return ret;
998}
999
1000#ifdef CONFIG_IA32_EMULATION
1001
1002#include <linux/compat.h>
1003#include <linux/syscalls.h>
1004#include <asm/ia32.h>
1005#include <asm/user32.h>
1006
1007#define R32(l,q) \
1008 case offsetof(struct user32, regs.l): \
1009 regs->q = value; break
1010
1011#define SEG32(rs) \
1012 case offsetof(struct user32, regs.rs): \
1013 return set_segment_reg(child, \
1014 offsetof(struct user_regs_struct, rs), \
1015 value); \
1016 break
1017
1018static int putreg32(struct task_struct *child, unsigned regno, u32 value)
1019{
1020 struct pt_regs *regs = task_pt_regs(child);
1021
1022 switch (regno) {
1023
1024 SEG32(cs);
1025 SEG32(ds);
1026 SEG32(es);
1027 SEG32(fs);
1028 SEG32(gs);
1029 SEG32(ss);
1030
1031 R32(ebx, bx);
1032 R32(ecx, cx);
1033 R32(edx, dx);
1034 R32(edi, di);
1035 R32(esi, si);
1036 R32(ebp, bp);
1037 R32(eax, ax);
1038 R32(orig_eax, orig_ax);
1039 R32(eip, ip);
1040 R32(esp, sp);
1041
1042 case offsetof(struct user32, regs.eflags):
1043 return set_flags(child, value);
1044
1045 case offsetof(struct user32, u_debugreg[0]) ...
1046 offsetof(struct user32, u_debugreg[7]):
1047 regno -= offsetof(struct user32, u_debugreg[0]);
1048 return ptrace_set_debugreg(child, regno / 4, value);
1049
1050 default:
1051 if (regno > sizeof(struct user32) || (regno & 3))
1052 return -EIO;
1053
1054 /*
1055 * Other dummy fields in the virtual user structure
1056 * are ignored
1057 */
1058 break;
1059 }
1060 return 0;
1061}
1062
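putreg32() above is generated almost entirely by the R32() and SEG32() macros: each expands to a case on the field's byte offset within the 32-bit struct user32 and either stores into the matching native pt_regs member or forwards to set_segment_reg(). A compact model of the same offsetof-driven dispatch with abbreviated, made-up structs:

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct mini_user32 { uint32_t ebx, ecx, eip; };	/* 32-bit user-visible layout */
struct mini_regs   { uint64_t bx, cx, ip; };	/* native register frame */

#define R32(l, q) \
	case offsetof(struct mini_user32, l): regs->q = value; break

static int mini_putreg32(struct mini_regs *regs, unsigned int regno, uint32_t value)
{
	switch (regno) {
	R32(ebx, bx);
	R32(ecx, cx);
	R32(eip, ip);
	default:
		return -1;	/* unknown or unwritable offset */
	}
	return 0;
}

int main(void)
{
	struct mini_regs regs = { 0 };

	mini_putreg32(&regs, offsetof(struct mini_user32, eip), 0x8048000u);
	printf("ip = %#llx\n", (unsigned long long)regs.ip);
	return 0;
}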
1063#undef R32
1064#undef SEG32
1065
1066#define R32(l,q) \
1067 case offsetof(struct user32, regs.l): \
1068 *val = regs->q; break
1069
1070#define SEG32(rs) \
1071 case offsetof(struct user32, regs.rs): \
1072 *val = get_segment_reg(child, \
1073 offsetof(struct user_regs_struct, rs)); \
1074 break
1075
1076static int getreg32(struct task_struct *child, unsigned regno, u32 *val)
1077{
1078 struct pt_regs *regs = task_pt_regs(child);
1079
1080 switch (regno) {
1081
1082 SEG32(ds);
1083 SEG32(es);
1084 SEG32(fs);
1085 SEG32(gs);
1086
1087 R32(cs, cs);
1088 R32(ss, ss);
1089 R32(ebx, bx);
1090 R32(ecx, cx);
1091 R32(edx, dx);
1092 R32(edi, di);
1093 R32(esi, si);
1094 R32(ebp, bp);
1095 R32(eax, ax);
1096 R32(orig_eax, orig_ax);
1097 R32(eip, ip);
1098 R32(esp, sp);
1099
1100 case offsetof(struct user32, regs.eflags):
1101 *val = get_flags(child);
1102 break;
1103
1104 case offsetof(struct user32, u_debugreg[0]) ...
1105 offsetof(struct user32, u_debugreg[7]):
1106 regno -= offsetof(struct user32, u_debugreg[0]);
1107 *val = ptrace_get_debugreg(child, regno / 4);
1108 break;
1109
1110 default:
1111 if (regno > sizeof(struct user32) || (regno & 3))
1112 return -EIO;
1113
1114 /*
1115 * Other dummy fields in the virtual user structure
1116 * are ignored
1117 */
1118 *val = 0;
1119 break;
1120 }
1121 return 0;
1122}
1123
1124#undef R32
1125#undef SEG32
1126
1127static int genregs32_get(struct task_struct *target,
1128 const struct user_regset *regset,
1129 unsigned int pos, unsigned int count,
1130 void *kbuf, void __user *ubuf)
1131{
1132 if (kbuf) {
1133 compat_ulong_t *k = kbuf;
1134 while (count > 0) {
1135 getreg32(target, pos, k++);
1136 count -= sizeof(*k);
1137 pos += sizeof(*k);
1138 }
1139 } else {
1140 compat_ulong_t __user *u = ubuf;
1141 while (count > 0) {
1142 compat_ulong_t word;
1143 getreg32(target, pos, &word);
1144 if (__put_user(word, u++))
1145 return -EFAULT;
1146 count -= sizeof(*u);
1147 pos += sizeof(*u);
1148 }
1149 }
1150
1151 return 0;
1152}
1153
1154static int genregs32_set(struct task_struct *target,
1155 const struct user_regset *regset,
1156 unsigned int pos, unsigned int count,
1157 const void *kbuf, const void __user *ubuf)
1158{
1159 int ret = 0;
1160 if (kbuf) {
1161 const compat_ulong_t *k = kbuf;
1162 while (count > 0 && !ret) {
1163 ret = putreg(target, pos, *k++);
1164 count -= sizeof(*k);
1165 pos += sizeof(*k);
1166 }
1167 } else {
1168 const compat_ulong_t __user *u = ubuf;
1169 while (count > 0 && !ret) {
1170 compat_ulong_t word;
1171 ret = __get_user(word, u++);
1172 if (ret)
1173 break;
1174 ret = putreg(target, pos, word);
1175 count -= sizeof(*u);
1176 pos += sizeof(*u);
1177 }
1178 }
1179 return ret;
1180}
1181
1182static long ptrace32_siginfo(unsigned request, u32 pid, u32 addr, u32 data)
1183{
1184 siginfo_t __user *si = compat_alloc_user_space(sizeof(siginfo_t));
1185 compat_siginfo_t __user *si32 = compat_ptr(data);
1186 siginfo_t ssi;
1187 int ret;
1188
1189 if (request == PTRACE_SETSIGINFO) {
1190 memset(&ssi, 0, sizeof(siginfo_t));
1191 ret = copy_siginfo_from_user32(&ssi, si32);
1192 if (ret)
1193 return ret;
1194 if (copy_to_user(si, &ssi, sizeof(siginfo_t)))
1195 return -EFAULT;
1196 }
1197 ret = sys_ptrace(request, pid, addr, (unsigned long)si);
1198 if (ret)
1199 return ret;
1200 if (request == PTRACE_GETSIGINFO) {
1201 if (copy_from_user(&ssi, si, sizeof(siginfo_t)))
1202 return -EFAULT;
1203 ret = copy_siginfo_to_user32(si32, &ssi);
1204 }
1205 return ret;
1206}
1207
1208asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
1209{
1210 struct task_struct *child;
1211 struct pt_regs *childregs;
1212 void __user *datap = compat_ptr(data);
1213 int ret;
1214 __u32 val;
1215
1216 switch (request) {
1217 case PTRACE_TRACEME:
1218 case PTRACE_ATTACH:
1219 case PTRACE_KILL:
1220 case PTRACE_CONT:
1221 case PTRACE_SINGLESTEP:
1222 case PTRACE_SINGLEBLOCK:
1223 case PTRACE_DETACH:
1224 case PTRACE_SYSCALL:
1225 case PTRACE_OLDSETOPTIONS:
1226 case PTRACE_SETOPTIONS:
1227 case PTRACE_SET_THREAD_AREA:
1228 case PTRACE_GET_THREAD_AREA:
1229 case PTRACE_BTS_CONFIG:
1230 case PTRACE_BTS_STATUS:
1231 case PTRACE_BTS_SIZE:
1232 case PTRACE_BTS_GET:
1233 case PTRACE_BTS_CLEAR:
1234 case PTRACE_BTS_DRAIN:
1235 return sys_ptrace(request, pid, addr, data);
1236
1237 default:
1238 return -EINVAL;
1239
1240 case PTRACE_PEEKTEXT:
1241 case PTRACE_PEEKDATA:
1242 case PTRACE_POKEDATA:
1243 case PTRACE_POKETEXT:
1244 case PTRACE_POKEUSR:
1245 case PTRACE_PEEKUSR:
1246 case PTRACE_GETREGS:
1247 case PTRACE_SETREGS:
1248 case PTRACE_SETFPREGS:
1249 case PTRACE_GETFPREGS:
1250 case PTRACE_SETFPXREGS:
1251 case PTRACE_GETFPXREGS:
1252 case PTRACE_GETEVENTMSG:
1253 break;
1254
1255 case PTRACE_SETSIGINFO:
1256 case PTRACE_GETSIGINFO:
1257 return ptrace32_siginfo(request, pid, addr, data);
1258 }
1259
1260 child = ptrace_get_task_struct(pid);
1261 if (IS_ERR(child))
1262 return PTR_ERR(child);
1263
1264 ret = ptrace_check_attach(child, request == PTRACE_KILL);
1265 if (ret < 0)
1266 goto out;
1267
1268 childregs = task_pt_regs(child);
1269
1270 switch (request) {
1271 case PTRACE_PEEKUSR:
1272 ret = getreg32(child, addr, &val);
1273 if (ret == 0)
1274 ret = put_user(val, (__u32 __user *)datap);
1275 break;
1276
1277 case PTRACE_POKEUSR:
1278 ret = putreg32(child, addr, data);
1279 break;
1280
1281 case PTRACE_GETREGS: /* Get all gp regs from the child. */
1282 return copy_regset_to_user(child, &user_x86_32_view,
1283 REGSET_GENERAL,
1284 0, sizeof(struct user_regs_struct32),
1285 datap);
1286
1287 case PTRACE_SETREGS: /* Set all gp regs in the child. */
1288 return copy_regset_from_user(child, &user_x86_32_view,
1289 REGSET_GENERAL, 0,
1290 sizeof(struct user_regs_struct32),
1291 datap);
1292
1293 case PTRACE_GETFPREGS: /* Get the child FPU state. */
1294 return copy_regset_to_user(child, &user_x86_32_view,
1295 REGSET_FP, 0,
1296 sizeof(struct user_i387_ia32_struct),
1297 datap);
1298
1299 case PTRACE_SETFPREGS: /* Set the child FPU state. */
1300 return copy_regset_from_user(
1301 child, &user_x86_32_view, REGSET_FP,
1302 0, sizeof(struct user_i387_ia32_struct), datap);
1303
1304 case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */
1305 return copy_regset_to_user(child, &user_x86_32_view,
1306 REGSET_XFP, 0,
1307 sizeof(struct user32_fxsr_struct),
1308 datap);
1309
1310 case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */
1311 return copy_regset_from_user(child, &user_x86_32_view,
1312 REGSET_XFP, 0,
1313 sizeof(struct user32_fxsr_struct),
1314 datap);
1315
1316 default:
1317 return compat_ptrace_request(child, request, addr, data);
1318 }
1319
1320 out:
1321 put_task_struct(child);
1322 return ret;
1323}
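
For orientation, here is a minimal sketch of the 32-bit tracer side that exercises the PTRACE_GETREGS path handled above (illustrative only; read_child_eip() is a hypothetical helper and the field names follow the i386 glibc sys/user.h layout):

#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>

/* Read the stopped child's instruction pointer; returns -1 on error. */
static long read_child_eip(pid_t pid)
{
	struct user_regs_struct regs;

	/* On a 64-bit kernel this request is routed through sys32_ptrace()
	 * and served by the 32-bit REGSET_GENERAL regset defined below. */
	if (ptrace(PTRACE_GETREGS, pid, NULL, &regs) < 0)
		return -1;
	return regs.eip;
}
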
1324
1325#endif /* CONFIG_IA32_EMULATION */
1326
1327#ifdef CONFIG_X86_64
1328
1329static const struct user_regset x86_64_regsets[] = {
1330 [REGSET_GENERAL] = {
1331 .core_note_type = NT_PRSTATUS,
1332 .n = sizeof(struct user_regs_struct) / sizeof(long),
1333 .size = sizeof(long), .align = sizeof(long),
1334 .get = genregs_get, .set = genregs_set
1335 },
1336 [REGSET_FP] = {
1337 .core_note_type = NT_PRFPREG,
1338 .n = sizeof(struct user_i387_struct) / sizeof(long),
1339 .size = sizeof(long), .align = sizeof(long),
1340 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
1341 },
1342};
1343
1344static const struct user_regset_view user_x86_64_view = {
1345 .name = "x86_64", .e_machine = EM_X86_64,
1346 .regsets = x86_64_regsets, .n = ARRAY_SIZE(x86_64_regsets)
1347};
1348
1349#else /* CONFIG_X86_32 */
1350
1351#define user_regs_struct32 user_regs_struct
1352#define genregs32_get genregs_get
1353#define genregs32_set genregs_set
1354
1355#endif /* CONFIG_X86_64 */
1356
1357#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
1358static const struct user_regset x86_32_regsets[] = {
1359 [REGSET_GENERAL] = {
1360 .core_note_type = NT_PRSTATUS,
1361 .n = sizeof(struct user_regs_struct32) / sizeof(u32),
1362 .size = sizeof(u32), .align = sizeof(u32),
1363 .get = genregs32_get, .set = genregs32_set
1364 },
1365 [REGSET_FP] = {
1366 .core_note_type = NT_PRFPREG,
1367 .n = sizeof(struct user_i387_struct) / sizeof(u32),
1368 .size = sizeof(u32), .align = sizeof(u32),
1369 .active = fpregs_active, .get = fpregs_get, .set = fpregs_set
1370 },
1371 [REGSET_XFP] = {
1372 .core_note_type = NT_PRXFPREG,
1373 .n = sizeof(struct user_i387_struct) / sizeof(u32),
1374 .size = sizeof(u32), .align = sizeof(u32),
1375 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
1376 },
1377 [REGSET_TLS] = {
1378 .core_note_type = NT_386_TLS,
1379 .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN,
1380 .size = sizeof(struct user_desc),
1381 .align = sizeof(struct user_desc),
1382 .active = regset_tls_active,
1383 .get = regset_tls_get, .set = regset_tls_set
1384 },
1385};
1386
1387static const struct user_regset_view user_x86_32_view = {
1388 .name = "i386", .e_machine = EM_386,
1389 .regsets = x86_32_regsets, .n = ARRAY_SIZE(x86_32_regsets)
1390};
1391#endif
1392
1393const struct user_regset_view *task_user_regset_view(struct task_struct *task)
1394{
1395#ifdef CONFIG_IA32_EMULATION
1396 if (test_tsk_thread_flag(task, TIF_IA32))
1397#endif
1398#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
1399 return &user_x86_32_view;
1400#endif
1401#ifdef CONFIG_X86_64
1402 return &user_x86_64_view;
1403#endif
1404}
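
A rough sketch of how a regset consumer might use the view returned here; in the kernel, copy_regset_to_user() and the ELF core-dump code wrap this pattern. The helper dump_general_regs() is hypothetical, and the sketch assumes the surrounding kernel context (<linux/regset.h> already included):

static int dump_general_regs(struct task_struct *task, void *buf,
			     unsigned int size)
{
	const struct user_regset_view *view = task_user_regset_view(task);
	const struct user_regset *rs = &view->regsets[REGSET_GENERAL];

	if (size < rs->n * rs->size)
		return -EINVAL;
	/* kbuf form of the ->get() call; ubuf would be a user pointer instead */
	return rs->get(task, rs, 0, rs->n * rs->size, buf, NULL);
}
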
1405
1406#ifdef CONFIG_X86_32
1407
1408void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1409{
1410 struct siginfo info;
1411
1412 tsk->thread.trap_no = 1;
1413 tsk->thread.error_code = error_code;
1414
1415 memset(&info, 0, sizeof(info));
1416 info.si_signo = SIGTRAP;
1417 info.si_code = TRAP_BRKPT;
1418
1419 /* User-mode ip? */
1420 info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL;
1421
1422 /* Send us the fake SIGTRAP */
1423 force_sig_info(SIGTRAP, &info, tsk);
1424}
1425
1426/* notification of system call entry/exit
1427 * - triggered by current->work.syscall_trace
1428 */
1429__attribute__((regparm(3)))
1430int do_syscall_trace(struct pt_regs *regs, int entryexit)
1431{
1432 int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
1433 /*
1434 * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall
1435 * interception
1436 */
1437 int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
1438 int ret = 0;
1439
1440 /* do the secure computing check first */
1441 if (!entryexit)
1442 secure_computing(regs->orig_ax);
1443
1444 if (unlikely(current->audit_context)) {
1445 if (entryexit)
1446 audit_syscall_exit(AUDITSC_RESULT(regs->ax),
1447 regs->ax);
1448 /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
1449 * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
1450 * not used, entry.S will call us only on syscall exit, not
1451 * entry; so when TIF_SYSCALL_AUDIT is used we must avoid
1452 * calling send_sigtrap() on syscall entry.
1453 *
1454 * Note that when PTRACE_SYSEMU_SINGLESTEP is used,
1455 * is_singlestep is false, despite its name, so we will still do

1456 * the correct thing.
1457 */
1458 else if (is_singlestep)
1459 goto out;
1460 }
1461
1462 if (!(current->ptrace & PT_PTRACED))
1463 goto out;
1464
1465 /* If a process stops on the 1st tracepoint with SYSCALL_TRACE
1466 * and then is resumed with SYSEMU_SINGLESTEP, it will come in
1467 * here. We have to check this and return */
1468 if (is_sysemu && entryexit)
1469 return 0;
1470
1471 /* Fake a debug trap */
1472 if (is_singlestep)
1473 send_sigtrap(current, regs, 0);
1474
1475 if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
1476 goto out;
1477
1478 /* the 0x80 provides a way for the tracing parent to distinguish
1479 between a syscall stop and SIGTRAP delivery */
1480 /* Note that the debugger could change the result of test_thread_flag!*/
1481 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0));
1482
1483 /*
1484 * this isn't the same as continuing with a signal, but it will do
1485 * for normal use. strace only continues with a signal if the
1486 * stopping signal is not SIGTRAP. -brl
1487 */
1488 if (current->exit_code) {
1489 send_sig(current->exit_code, current, 1);
1490 current->exit_code = 0;
1491 }
1492 ret = is_sysemu;
1493out:
1494 if (unlikely(current->audit_context) && !entryexit)
1495 audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax,
1496 regs->bx, regs->cx, regs->dx, regs->si);
1497 if (ret == 0)
1498 return 0;
1499
1500 regs->orig_ax = -1; /* force skip of syscall restarting */
1501 if (unlikely(current->audit_context))
1502 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1503 return 1;
1504}
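
The PTRACE_SYSEMU handling above is what user-mode tracers such as UML rely on to intercept syscalls without executing them. A minimal sketch of such a tracer loop follows (assumption: an x86 kernel with the SYSEMU support shown here; PTRACE_SYSEMU is 31 on x86 in case the libc header does not define it):

#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <signal.h>
#ifndef PTRACE_SYSEMU
#define PTRACE_SYSEMU 31	/* x86 value; assumption if the header lacks it */
#endif

static void sysemu_loop(pid_t child)
{
	int status;

	for (;;) {
		/* Resume the child; it stops at the next syscall entry
		 * without the syscall being executed. */
		if (ptrace(PTRACE_SYSEMU, child, NULL, NULL) < 0)
			break;
		if (waitpid(child, &status, 0) < 0)
			break;
		if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGTRAP)
			break;
		/* Emulate the intercepted syscall here. */
	}
}
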
1505
1506#else /* CONFIG_X86_64 */
1507
1508static void syscall_trace(struct pt_regs *regs)
1509{
1510
1511#if 0
1512 printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n",
1513 current->comm,
1514 regs->ip, regs->sp, regs->ax, regs->orig_ax, __builtin_return_address(0),
1515 current_thread_info()->flags, current->ptrace);
1516#endif
1517
1518 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
1519 ? 0x80 : 0));
1520 /*
1521 * this isn't the same as continuing with a signal, but it will do
1522 * for normal use. strace only continues with a signal if the
1523 * stopping signal is not SIGTRAP. -brl
1524 */
1525 if (current->exit_code) {
1526 send_sig(current->exit_code, current, 1);
1527 current->exit_code = 0;
1528 }
1529}
1530
1531asmlinkage void syscall_trace_enter(struct pt_regs *regs)
1532{
1533 /* do the secure computing check first */
1534 secure_computing(regs->orig_ax);
1535
1536 if (test_thread_flag(TIF_SYSCALL_TRACE)
1537 && (current->ptrace & PT_PTRACED))
1538 syscall_trace(regs);
1539
1540 if (unlikely(current->audit_context)) {
1541 if (test_thread_flag(TIF_IA32)) {
1542 audit_syscall_entry(AUDIT_ARCH_I386,
1543 regs->orig_ax,
1544 regs->bx, regs->cx,
1545 regs->dx, regs->si);
1546 } else {
1547 audit_syscall_entry(AUDIT_ARCH_X86_64,
1548 regs->orig_ax,
1549 regs->di, regs->si,
1550 regs->dx, regs->r10);
1551 }
1552 }
1553}
1554
1555asmlinkage void syscall_trace_leave(struct pt_regs *regs)
1556{
1557 if (unlikely(current->audit_context))
1558 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1559
1560 if ((test_thread_flag(TIF_SYSCALL_TRACE)
1561 || test_thread_flag(TIF_SINGLESTEP))
1562 && (current->ptrace & PT_PTRACED))
1563 syscall_trace(regs);
1564}
1565
1566#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/kernel/ptrace_32.c b/arch/x86/kernel/ptrace_32.c
deleted file mode 100644
index ff5431cc03ee..000000000000
--- a/arch/x86/kernel/ptrace_32.c
+++ /dev/null
@@ -1,717 +0,0 @@
1/* By Ross Biro 1/23/92 */
2/*
3 * Pentium III FXSR, SSE support
4 * Gareth Hughes <gareth@valinux.com>, May 2000
5 */
6
7#include <linux/kernel.h>
8#include <linux/sched.h>
9#include <linux/mm.h>
10#include <linux/smp.h>
11#include <linux/errno.h>
12#include <linux/ptrace.h>
13#include <linux/user.h>
14#include <linux/security.h>
15#include <linux/audit.h>
16#include <linux/seccomp.h>
17#include <linux/signal.h>
18
19#include <asm/uaccess.h>
20#include <asm/pgtable.h>
21#include <asm/system.h>
22#include <asm/processor.h>
23#include <asm/i387.h>
24#include <asm/debugreg.h>
25#include <asm/ldt.h>
26#include <asm/desc.h>
27
28/*
29 * does not yet catch signals sent when the child dies.
30 * in exit.c or in signal.c.
31 */
32
33/*
34 * Determines which flags the user has access to [1 = access, 0 = no access].
35 * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), NT(14), IOPL(12-13), IF(9).
36 * Also masks reserved bits (31-22, 15, 5, 3, 1).
37 */
38#define FLAG_MASK 0x00050dd5
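
A worked check of the constant (not part of the original file): it is simply the OR of the flag bits a tracer is allowed to change.

/*
 *   CF(0)  = 0x00001   PF(2)  = 0x00004   AF(4)  = 0x00010   ZF(6)  = 0x00040
 *   SF(7)  = 0x00080   TF(8)  = 0x00100   DF(10) = 0x00400   OF(11) = 0x00800
 *   RF(16) = 0x10000   AC(18) = 0x40000
 *
 *   0x1|0x4|0x10|0x40|0x80|0x100|0x400|0x800|0x10000|0x40000 = 0x00050dd5
 */
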
39
40/* sets the trap flag. */
41#define TRAP_FLAG 0x100
42
43/*
44 * Offset of eflags on child stack..
45 */
46#define EFL_OFFSET offsetof(struct pt_regs, eflags)
47
48static inline struct pt_regs *get_child_regs(struct task_struct *task)
49{
50 void *stack_top = (void *)task->thread.esp0;
51 return stack_top - sizeof(struct pt_regs);
52}
53
54/*
55 * This routine will get a word off the process's privileged stack.
56 * The offset is bytes into the pt_regs structure on the stack.
57 * This routine assumes that all the privileged stacks are in our
58 * data space.
59 */
60static inline int get_stack_long(struct task_struct *task, int offset)
61{
62 unsigned char *stack;
63
64 stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs);
65 stack += offset;
66 return (*((int *)stack));
67}
68
69/*
70 * This routine will put a word on the process's privileged stack.
71 * The offset is bytes into the pt_regs structure on the stack.
72 * This routine assumes that all the privileged stacks are in our
73 * data space.
74 */
75static inline int put_stack_long(struct task_struct *task, int offset,
76 unsigned long data)
77{
78 unsigned char * stack;
79
80 stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs);
81 stack += offset;
82 *(unsigned long *) stack = data;
83 return 0;
84}
85
86static int putreg(struct task_struct *child,
87 unsigned long regno, unsigned long value)
88{
89 switch (regno >> 2) {
90 case GS:
91 if (value && (value & 3) != 3)
92 return -EIO;
93 child->thread.gs = value;
94 return 0;
95 case DS:
96 case ES:
97 case FS:
98 if (value && (value & 3) != 3)
99 return -EIO;
100 value &= 0xffff;
101 break;
102 case SS:
103 case CS:
104 if ((value & 3) != 3)
105 return -EIO;
106 value &= 0xffff;
107 break;
108 case EFL:
109 value &= FLAG_MASK;
110 value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK;
111 break;
112 }
113 if (regno > FS*4)
114 regno -= 1*4;
115 put_stack_long(child, regno, value);
116 return 0;
117}
118
119static unsigned long getreg(struct task_struct *child,
120 unsigned long regno)
121{
122 unsigned long retval = ~0UL;
123
124 switch (regno >> 2) {
125 case GS:
126 retval = child->thread.gs;
127 break;
128 case DS:
129 case ES:
130 case FS:
131 case SS:
132 case CS:
133 retval = 0xffff;
134 /* fall through */
135 default:
136 if (regno > FS*4)
137 regno -= 1*4;
138 retval &= get_stack_long(child, regno);
139 }
140 return retval;
141}
142
143#define LDT_SEGMENT 4
144
145static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_regs *regs)
146{
147 unsigned long addr, seg;
148
149 addr = regs->eip;
150 seg = regs->xcs & 0xffff;
151 if (regs->eflags & VM_MASK) {
152 addr = (addr & 0xffff) + (seg << 4);
153 return addr;
154 }
155
156 /*
157 * We'll assume that the code segments in the GDT
158 * are all zero-based. That is largely true: the
159 * TLS segments are used for data, and the PNPBIOS
160 * and APM bios ones we just ignore here.
161 */
162 if (seg & LDT_SEGMENT) {
163 u32 *desc;
164 unsigned long base;
165
166 seg &= ~7UL;
167
168 mutex_lock(&child->mm->context.lock);
169 if (unlikely((seg >> 3) >= child->mm->context.size))
170 addr = -1L; /* bogus selector, access would fault */
171 else {
172 desc = child->mm->context.ldt + seg;
173 base = ((desc[0] >> 16) |
174 ((desc[1] & 0xff) << 16) |
175 (desc[1] & 0xff000000));
176
177 /* 16-bit code segment? */
178 if (!((desc[1] >> 22) & 1))
179 addr &= 0xffff;
180 addr += base;
181 }
182 mutex_unlock(&child->mm->context.lock);
183 }
184 return addr;
185}
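
The base-address assembly above pulls bytes from two 32-bit descriptor words. An illustrative helper (not in the original file, mirroring the expression in the function) plus a worked value makes the bit layout explicit:

/*
 * desc[0] bits 31..16 -> base bits 15..0
 * desc[1] bits  7..0  -> base bits 23..16
 * desc[1] bits 31..24 -> base bits 31..24
 *
 * e.g. desc[0] = 0x12340000, desc[1] = 0xab0000cd
 *      base = 0x1234 | 0x00cd0000 | 0xab000000 = 0xabcd1234
 */
static inline unsigned long ldt_desc_base(const u32 *desc)
{
	return (desc[0] >> 16) |
	       ((desc[1] & 0xff) << 16) |
	       (desc[1] & 0xff000000);
}
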
186
187static inline int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
188{
189 int i, copied;
190 unsigned char opcode[15];
191 unsigned long addr = convert_eip_to_linear(child, regs);
192
193 copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
194 for (i = 0; i < copied; i++) {
195 switch (opcode[i]) {
196 /* popf and iret */
197 case 0x9d: case 0xcf:
198 return 1;
199 /* opcode and address size prefixes */
200 case 0x66: case 0x67:
201 continue;
202 /* irrelevant prefixes (segment overrides and repeats) */
203 case 0x26: case 0x2e:
204 case 0x36: case 0x3e:
205 case 0x64: case 0x65:
206 case 0xf0: case 0xf2: case 0xf3:
207 continue;
208
209 /*
210 * pushf: NOTE! We should probably not let
211 * the user see the TF bit being set. But
212 * it's more pain than it's worth to avoid
213 * it, and a debugger could emulate this
214 * all in user space if it _really_ cares.
215 */
216 case 0x9c:
217 default:
218 return 0;
219 }
220 }
221 return 0;
222}
223
224static void set_singlestep(struct task_struct *child)
225{
226 struct pt_regs *regs = get_child_regs(child);
227
228 /*
229 * Always set TIF_SINGLESTEP - this guarantees that
230 * we single-step system calls etc.. This will also
231 * cause us to set TF when returning to user mode.
232 */
233 set_tsk_thread_flag(child, TIF_SINGLESTEP);
234
235 /*
236 * If TF was already set, don't do anything else
237 */
238 if (regs->eflags & TRAP_FLAG)
239 return;
240
241 /* Set TF on the kernel stack.. */
242 regs->eflags |= TRAP_FLAG;
243
244 /*
245 * ..but if TF is changed by the instruction we will trace,
246 * don't mark it as being "us" that set it, so that we
247 * won't clear it by hand later.
248 */
249 if (is_setting_trap_flag(child, regs))
250 return;
251
252 child->ptrace |= PT_DTRACE;
253}
254
255static void clear_singlestep(struct task_struct *child)
256{
257 /* Always clear TIF_SINGLESTEP... */
258 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
259
260 /* But touch TF only if it was set by us.. */
261 if (child->ptrace & PT_DTRACE) {
262 struct pt_regs *regs = get_child_regs(child);
263 regs->eflags &= ~TRAP_FLAG;
264 child->ptrace &= ~PT_DTRACE;
265 }
266}
267
268/*
269 * Called by kernel/ptrace.c when detaching..
270 *
271 * Make sure the single step bit is not set.
272 */
273void ptrace_disable(struct task_struct *child)
274{
275 clear_singlestep(child);
276 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
277}
278
279/*
280 * Perform get_thread_area on behalf of the traced child.
281 */
282static int
283ptrace_get_thread_area(struct task_struct *child,
284 int idx, struct user_desc __user *user_desc)
285{
286 struct user_desc info;
287 struct desc_struct *desc;
288
289/*
290 * Get the current Thread-Local Storage area:
291 */
292
293#define GET_BASE(desc) ( \
294 (((desc)->a >> 16) & 0x0000ffff) | \
295 (((desc)->b << 16) & 0x00ff0000) | \
296 ( (desc)->b & 0xff000000) )
297
298#define GET_LIMIT(desc) ( \
299 ((desc)->a & 0x0ffff) | \
300 ((desc)->b & 0xf0000) )
301
302#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
303#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
304#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
305#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
306#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
307#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
308
309 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
310 return -EINVAL;
311
312 desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
313
314 info.entry_number = idx;
315 info.base_addr = GET_BASE(desc);
316 info.limit = GET_LIMIT(desc);
317 info.seg_32bit = GET_32BIT(desc);
318 info.contents = GET_CONTENTS(desc);
319 info.read_exec_only = !GET_WRITABLE(desc);
320 info.limit_in_pages = GET_LIMIT_PAGES(desc);
321 info.seg_not_present = !GET_PRESENT(desc);
322 info.useable = GET_USEABLE(desc);
323
324 if (copy_to_user(user_desc, &info, sizeof(info)))
325 return -EFAULT;
326
327 return 0;
328}
329
330/*
331 * Perform set_thread_area on behalf of the traced child.
332 */
333static int
334ptrace_set_thread_area(struct task_struct *child,
335 int idx, struct user_desc __user *user_desc)
336{
337 struct user_desc info;
338 struct desc_struct *desc;
339
340 if (copy_from_user(&info, user_desc, sizeof(info)))
341 return -EFAULT;
342
343 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
344 return -EINVAL;
345
346 desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
347 if (LDT_empty(&info)) {
348 desc->a = 0;
349 desc->b = 0;
350 } else {
351 desc->a = LDT_entry_a(&info);
352 desc->b = LDT_entry_b(&info);
353 }
354
355 return 0;
356}
357
358long arch_ptrace(struct task_struct *child, long request, long addr, long data)
359{
360 struct user * dummy = NULL;
361 int i, ret;
362 unsigned long __user *datap = (unsigned long __user *)data;
363
364 switch (request) {
365 /* when I and D space are separate, these will need to be fixed. */
366 case PTRACE_PEEKTEXT: /* read word at location addr. */
367 case PTRACE_PEEKDATA:
368 ret = generic_ptrace_peekdata(child, addr, data);
369 break;
370
371 /* read the word at location addr in the USER area. */
372 case PTRACE_PEEKUSR: {
373 unsigned long tmp;
374
375 ret = -EIO;
376 if ((addr & 3) || addr < 0 ||
377 addr > sizeof(struct user) - 3)
378 break;
379
380 tmp = 0; /* Default return condition */
381 if(addr < FRAME_SIZE*sizeof(long))
382 tmp = getreg(child, addr);
383 if(addr >= (long) &dummy->u_debugreg[0] &&
384 addr <= (long) &dummy->u_debugreg[7]){
385 addr -= (long) &dummy->u_debugreg[0];
386 addr = addr >> 2;
387 tmp = child->thread.debugreg[addr];
388 }
389 ret = put_user(tmp, datap);
390 break;
391 }
392
393 /* when I and D space are separate, this will have to be fixed. */
394 case PTRACE_POKETEXT: /* write the word at location addr. */
395 case PTRACE_POKEDATA:
396 ret = generic_ptrace_pokedata(child, addr, data);
397 break;
398
399 case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
400 ret = -EIO;
401 if ((addr & 3) || addr < 0 ||
402 addr > sizeof(struct user) - 3)
403 break;
404
405 if (addr < FRAME_SIZE*sizeof(long)) {
406 ret = putreg(child, addr, data);
407 break;
408 }
409 /* We need to be very careful here. We implicitly
410 want to modify a portion of the task_struct, and we
411 have to be selective about what portions we allow someone
412 to modify. */
413
414 ret = -EIO;
415 if(addr >= (long) &dummy->u_debugreg[0] &&
416 addr <= (long) &dummy->u_debugreg[7]){
417
418 if(addr == (long) &dummy->u_debugreg[4]) break;
419 if(addr == (long) &dummy->u_debugreg[5]) break;
420 if(addr < (long) &dummy->u_debugreg[4] &&
421 ((unsigned long) data) >= TASK_SIZE-3) break;
422
423 /* Sanity-check data. Take one half-byte at once with
424 * check = (val >> (16 + 4*i)) & 0xf. It contains the
425 * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
426 * 2 and 3 are LENi. Given a list of invalid values,
427 * we do mask |= 1 << invalid_value, so that
428 * (mask >> check) & 1 is a correct test for invalid
429 * values.
430 *
431 * R/Wi contains the type of the breakpoint /
432 * watchpoint, LENi contains the length of the watched
433 * data in the watchpoint case.
434 *
435 * The invalid values are:
436 * - LENi == 0x10 (undefined), so mask |= 0x0f00.
437 * - R/Wi == 0x10 (break on I/O reads or writes), so
438 * mask |= 0x4444.
439 * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
440 * 0x1110.
441 *
442 * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
443 *
444 * See the Intel Manual "System Programming Guide",
445 * 15.2.4
446 *
447 * Note that LENi == 0x10 is defined on x86_64 in long
448 * mode (i.e. even for 32-bit userspace software, but
449 * 64-bit kernel), so the x86_64 mask value is 0x5554.
450 * See the AMD manual no. 24593 (AMD64 System
451 * Programming)*/
452
453 if(addr == (long) &dummy->u_debugreg[7]) {
454 data &= ~DR_CONTROL_RESERVED;
455 for(i=0; i<4; i++)
456 if ((0x5f54 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
457 goto out_tsk;
458 if (data)
459 set_tsk_thread_flag(child, TIF_DEBUG);
460 else
461 clear_tsk_thread_flag(child, TIF_DEBUG);
462 }
463 addr -= (long) &dummy->u_debugreg;
464 addr = addr >> 2;
465 child->thread.debugreg[addr] = data;
466 ret = 0;
467 }
468 break;
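
The 0x5f54 constant can be rebuilt mechanically from the comment above; the following illustrative helper (not in the original file) sets one bit per invalid type/length nibble and yields the same mask:

static unsigned int dr7_invalid_mask(void)
{
	unsigned int mask = 0, check;

	for (check = 0; check < 16; check++) {
		unsigned int rw  = check & 3;		/* bits 0-1: R/Wi */
		unsigned int len = (check >> 2) & 3;	/* bits 2-3: LENi */

		if (len == 2)			/* undefined on i386 */
			mask |= 1 << check;
		if (rw == 2)			/* I/O breakpoints */
			mask |= 1 << check;
		if (rw == 0 && len != 0)	/* exec breakpoint needs len 0 */
			mask |= 1 << check;
	}
	return mask;	/* == 0x5f54; dropping the len == 2 rule gives the x86_64 value */
}
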
469
470 case PTRACE_SYSEMU: /* continue and stop at next syscall, which will not be executed */
471 case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
472 case PTRACE_CONT: /* restart after signal. */
473 ret = -EIO;
474 if (!valid_signal(data))
475 break;
476 if (request == PTRACE_SYSEMU) {
477 set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
478 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
479 } else if (request == PTRACE_SYSCALL) {
480 set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
481 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
482 } else {
483 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
484 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
485 }
486 child->exit_code = data;
487 /* make sure the single step bit is not set. */
488 clear_singlestep(child);
489 wake_up_process(child);
490 ret = 0;
491 break;
492
493/*
494 * make the child exit. Best I can do is send it a sigkill.
495 * perhaps it should be put in the status that it wants to
496 * exit.
497 */
498 case PTRACE_KILL:
499 ret = 0;
500 if (child->exit_state == EXIT_ZOMBIE) /* already dead */
501 break;
502 child->exit_code = SIGKILL;
503 /* make sure the single step bit is not set. */
504 clear_singlestep(child);
505 wake_up_process(child);
506 break;
507
508 case PTRACE_SYSEMU_SINGLESTEP: /* Same as SYSEMU, but singlestep if not syscall */
509 case PTRACE_SINGLESTEP: /* set the trap flag. */
510 ret = -EIO;
511 if (!valid_signal(data))
512 break;
513
514 if (request == PTRACE_SYSEMU_SINGLESTEP)
515 set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
516 else
517 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
518
519 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
520 set_singlestep(child);
521 child->exit_code = data;
522 /* give it a chance to run. */
523 wake_up_process(child);
524 ret = 0;
525 break;
526
527 case PTRACE_GETREGS: { /* Get all gp regs from the child. */
528 if (!access_ok(VERIFY_WRITE, datap, FRAME_SIZE*sizeof(long))) {
529 ret = -EIO;
530 break;
531 }
532 for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) {
533 __put_user(getreg(child, i), datap);
534 datap++;
535 }
536 ret = 0;
537 break;
538 }
539
540 case PTRACE_SETREGS: { /* Set all gp regs in the child. */
541 unsigned long tmp;
542 if (!access_ok(VERIFY_READ, datap, FRAME_SIZE*sizeof(long))) {
543 ret = -EIO;
544 break;
545 }
546 for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) {
547 __get_user(tmp, datap);
548 putreg(child, i, tmp);
549 datap++;
550 }
551 ret = 0;
552 break;
553 }
554
555 case PTRACE_GETFPREGS: { /* Get the child FPU state. */
556 if (!access_ok(VERIFY_WRITE, datap,
557 sizeof(struct user_i387_struct))) {
558 ret = -EIO;
559 break;
560 }
561 ret = 0;
562 if (!tsk_used_math(child))
563 init_fpu(child);
564 get_fpregs((struct user_i387_struct __user *)data, child);
565 break;
566 }
567
568 case PTRACE_SETFPREGS: { /* Set the child FPU state. */
569 if (!access_ok(VERIFY_READ, datap,
570 sizeof(struct user_i387_struct))) {
571 ret = -EIO;
572 break;
573 }
574 set_stopped_child_used_math(child);
575 set_fpregs(child, (struct user_i387_struct __user *)data);
576 ret = 0;
577 break;
578 }
579
580 case PTRACE_GETFPXREGS: { /* Get the child extended FPU state. */
581 if (!access_ok(VERIFY_WRITE, datap,
582 sizeof(struct user_fxsr_struct))) {
583 ret = -EIO;
584 break;
585 }
586 if (!tsk_used_math(child))
587 init_fpu(child);
588 ret = get_fpxregs((struct user_fxsr_struct __user *)data, child);
589 break;
590 }
591
592 case PTRACE_SETFPXREGS: { /* Set the child extended FPU state. */
593 if (!access_ok(VERIFY_READ, datap,
594 sizeof(struct user_fxsr_struct))) {
595 ret = -EIO;
596 break;
597 }
598 set_stopped_child_used_math(child);
599 ret = set_fpxregs(child, (struct user_fxsr_struct __user *)data);
600 break;
601 }
602
603 case PTRACE_GET_THREAD_AREA:
604 ret = ptrace_get_thread_area(child, addr,
605 (struct user_desc __user *) data);
606 break;
607
608 case PTRACE_SET_THREAD_AREA:
609 ret = ptrace_set_thread_area(child, addr,
610 (struct user_desc __user *) data);
611 break;
612
613 default:
614 ret = ptrace_request(child, request, addr, data);
615 break;
616 }
617 out_tsk:
618 return ret;
619}
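
For completeness, a small illustrative user-space counterpart to the PTRACE_PEEKUSR debug-register handling above (read_child_dr6() is a hypothetical helper; glibc spells the request PTRACE_PEEKUSER, and errno should be checked since -1 can also be a legitimate register value):

#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>
#include <stddef.h>

/* Read DR6 of a stopped, traced child. */
static long read_child_dr6(pid_t pid)
{
	return ptrace(PTRACE_PEEKUSER, pid,
		      (void *)offsetof(struct user, u_debugreg[6]), NULL);
}
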
620
621void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
622{
623 struct siginfo info;
624
625 tsk->thread.trap_no = 1;
626 tsk->thread.error_code = error_code;
627
628 memset(&info, 0, sizeof(info));
629 info.si_signo = SIGTRAP;
630 info.si_code = TRAP_BRKPT;
631
632 /* User-mode eip? */
633 info.si_addr = user_mode_vm(regs) ? (void __user *) regs->eip : NULL;
634
635 /* Send us the fake SIGTRAP */
636 force_sig_info(SIGTRAP, &info, tsk);
637}
638
639/* notification of system call entry/exit
640 * - triggered by current->work.syscall_trace
641 */
642__attribute__((regparm(3)))
643int do_syscall_trace(struct pt_regs *regs, int entryexit)
644{
645 int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
646 /*
647 * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall
648 * interception
649 */
650 int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
651 int ret = 0;
652
653 /* do the secure computing check first */
654 if (!entryexit)
655 secure_computing(regs->orig_eax);
656
657 if (unlikely(current->audit_context)) {
658 if (entryexit)
659 audit_syscall_exit(AUDITSC_RESULT(regs->eax),
660 regs->eax);
661 /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
662 * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
663 * not used, entry.S will call us only on syscall exit, not
664 * entry; so when TIF_SYSCALL_AUDIT is used we must avoid
665 * calling send_sigtrap() on syscall entry.
666 *
667 * Note that when PTRACE_SYSEMU_SINGLESTEP is used,
668 * is_singlestep is false, despite its name, so we will still do
669 * the correct thing.
670 */
671 else if (is_singlestep)
672 goto out;
673 }
674
675 if (!(current->ptrace & PT_PTRACED))
676 goto out;
677
678 /* If a process stops on the 1st tracepoint with SYSCALL_TRACE
679 * and then is resumed with SYSEMU_SINGLESTEP, it will come in
680 * here. We have to check this and return */
681 if (is_sysemu && entryexit)
682 return 0;
683
684 /* Fake a debug trap */
685 if (is_singlestep)
686 send_sigtrap(current, regs, 0);
687
688 if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
689 goto out;
690
691 /* the 0x80 provides a way for the tracing parent to distinguish
692 between a syscall stop and SIGTRAP delivery */
693 /* Note that the debugger could change the result of test_thread_flag!*/
694 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0));
695
696 /*
697 * this isn't the same as continuing with a signal, but it will do
698 * for normal use. strace only continues with a signal if the
699 * stopping signal is not SIGTRAP. -brl
700 */
701 if (current->exit_code) {
702 send_sig(current->exit_code, current, 1);
703 current->exit_code = 0;
704 }
705 ret = is_sysemu;
706out:
707 if (unlikely(current->audit_context) && !entryexit)
708 audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_eax,
709 regs->ebx, regs->ecx, regs->edx, regs->esi);
710 if (ret == 0)
711 return 0;
712
713 regs->orig_eax = -1; /* force skip of syscall restarting */
714 if (unlikely(current->audit_context))
715 audit_syscall_exit(AUDITSC_RESULT(regs->eax), regs->eax);
716 return 1;
717}
diff --git a/arch/x86/kernel/ptrace_64.c b/arch/x86/kernel/ptrace_64.c
deleted file mode 100644
index 607085f3f08a..000000000000
--- a/arch/x86/kernel/ptrace_64.c
+++ /dev/null
@@ -1,621 +0,0 @@
1/* By Ross Biro 1/23/92 */
2/*
3 * Pentium III FXSR, SSE support
4 * Gareth Hughes <gareth@valinux.com>, May 2000
5 *
6 * x86-64 port 2000-2002 Andi Kleen
7 */
8
9#include <linux/kernel.h>
10#include <linux/sched.h>
11#include <linux/mm.h>
12#include <linux/smp.h>
13#include <linux/errno.h>
14#include <linux/ptrace.h>
15#include <linux/user.h>
16#include <linux/security.h>
17#include <linux/audit.h>
18#include <linux/seccomp.h>
19#include <linux/signal.h>
20
21#include <asm/uaccess.h>
22#include <asm/pgtable.h>
23#include <asm/system.h>
24#include <asm/processor.h>
25#include <asm/i387.h>
26#include <asm/debugreg.h>
27#include <asm/ldt.h>
28#include <asm/desc.h>
29#include <asm/proto.h>
30#include <asm/ia32.h>
31
32/*
33 * does not yet catch signals sent when the child dies.
34 * in exit.c or in signal.c.
35 */
36
37/*
38 * Determines which flags the user has access to [1 = access, 0 = no access].
39 * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9).
40 * Also masks reserved bits (63-22, 15, 5, 3, 1).
41 */
42#define FLAG_MASK 0x54dd5UL
43
44/* sets the trap flag. */
45#define TRAP_FLAG 0x100UL
46
47/*
48 * eflags and offset of eflags on child stack..
49 */
50#define EFLAGS offsetof(struct pt_regs, eflags)
51#define EFL_OFFSET ((int)(EFLAGS-sizeof(struct pt_regs)))
52
53/*
54 * This routine will get a word off the process's privileged stack.
55 * The offset is how far from the base address stored in the TSS.
56 * this routine assumes that all the privileged stacks are in our
57 * data space.
58 */
59static inline unsigned long get_stack_long(struct task_struct *task, int offset)
60{
61 unsigned char *stack;
62
63 stack = (unsigned char *)task->thread.rsp0;
64 stack += offset;
65 return (*((unsigned long *)stack));
66}
67
68/*
69 * This routine will put a word on the process's privileged stack.
70 * The offset is how far from the base address stored in the TSS.
71 * this routine assumes that all the privileged stacks are in our
72 * data space.
73 */
74static inline long put_stack_long(struct task_struct *task, int offset,
75 unsigned long data)
76{
77 unsigned char * stack;
78
79 stack = (unsigned char *) task->thread.rsp0;
80 stack += offset;
81 *(unsigned long *) stack = data;
82 return 0;
83}
84
85#define LDT_SEGMENT 4
86
87unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *regs)
88{
89 unsigned long addr, seg;
90
91 addr = regs->rip;
92 seg = regs->cs & 0xffff;
93
94 /*
95 * We'll assume that the code segments in the GDT
96 * are all zero-based. That is largely true: the
97 * TLS segments are used for data, and the PNPBIOS
98 * and APM bios ones we just ignore here.
99 */
100 if (seg & LDT_SEGMENT) {
101 u32 *desc;
102 unsigned long base;
103
104 seg &= ~7UL;
105
106 mutex_lock(&child->mm->context.lock);
107 if (unlikely((seg >> 3) >= child->mm->context.size))
108 addr = -1L; /* bogus selector, access would fault */
109 else {
110 desc = child->mm->context.ldt + seg;
111 base = ((desc[0] >> 16) |
112 ((desc[1] & 0xff) << 16) |
113 (desc[1] & 0xff000000));
114
115 /* 16-bit code segment? */
116 if (!((desc[1] >> 22) & 1))
117 addr &= 0xffff;
118 addr += base;
119 }
120 mutex_unlock(&child->mm->context.lock);
121 }
122
123 return addr;
124}
125
126static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
127{
128 int i, copied;
129 unsigned char opcode[15];
130 unsigned long addr = convert_rip_to_linear(child, regs);
131
132 copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
133 for (i = 0; i < copied; i++) {
134 switch (opcode[i]) {
135 /* popf and iret */
136 case 0x9d: case 0xcf:
137 return 1;
138
139 /* CHECKME: 64 65 */
140
141 /* opcode and address size prefixes */
142 case 0x66: case 0x67:
143 continue;
144 /* irrelevant prefixes (segment overrides and repeats) */
145 case 0x26: case 0x2e:
146 case 0x36: case 0x3e:
147 case 0x64: case 0x65:
148 case 0xf2: case 0xf3:
149 continue;
150
151 case 0x40 ... 0x4f:
152 if (regs->cs != __USER_CS)
153 /* 32-bit mode: register increment */
154 return 0;
155 /* 64-bit mode: REX prefix */
156 continue;
157
158 /* CHECKME: f2, f3 */
159
160 /*
161 * pushf: NOTE! We should probably not let
162 * the user see the TF bit being set. But
163 * it's more pain than it's worth to avoid
164 * it, and a debugger could emulate this
165 * all in user space if it _really_ cares.
166 */
167 case 0x9c:
168 default:
169 return 0;
170 }
171 }
172 return 0;
173}
174
175static void set_singlestep(struct task_struct *child)
176{
177 struct pt_regs *regs = task_pt_regs(child);
178
179 /*
180 * Always set TIF_SINGLESTEP - this guarantees that
181 * we single-step system calls etc.. This will also
182 * cause us to set TF when returning to user mode.
183 */
184 set_tsk_thread_flag(child, TIF_SINGLESTEP);
185
186 /*
187 * If TF was already set, don't do anything else
188 */
189 if (regs->eflags & TRAP_FLAG)
190 return;
191
192 /* Set TF on the kernel stack.. */
193 regs->eflags |= TRAP_FLAG;
194
195 /*
196 * ..but if TF is changed by the instruction we will trace,
197 * don't mark it as being "us" that set it, so that we
198 * won't clear it by hand later.
199 */
200 if (is_setting_trap_flag(child, regs))
201 return;
202
203 child->ptrace |= PT_DTRACE;
204}
205
206static void clear_singlestep(struct task_struct *child)
207{
208 /* Always clear TIF_SINGLESTEP... */
209 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
210
211 /* But touch TF only if it was set by us.. */
212 if (child->ptrace & PT_DTRACE) {
213 struct pt_regs *regs = task_pt_regs(child);
214 regs->eflags &= ~TRAP_FLAG;
215 child->ptrace &= ~PT_DTRACE;
216 }
217}
218
219/*
220 * Called by kernel/ptrace.c when detaching..
221 *
222 * Make sure the single step bit is not set.
223 */
224void ptrace_disable(struct task_struct *child)
225{
226 clear_singlestep(child);
227}
228
229static int putreg(struct task_struct *child,
230 unsigned long regno, unsigned long value)
231{
232 unsigned long tmp;
233
234 switch (regno) {
235 case offsetof(struct user_regs_struct,fs):
236 if (value && (value & 3) != 3)
237 return -EIO;
238 child->thread.fsindex = value & 0xffff;
239 return 0;
240 case offsetof(struct user_regs_struct,gs):
241 if (value && (value & 3) != 3)
242 return -EIO;
243 child->thread.gsindex = value & 0xffff;
244 return 0;
245 case offsetof(struct user_regs_struct,ds):
246 if (value && (value & 3) != 3)
247 return -EIO;
248 child->thread.ds = value & 0xffff;
249 return 0;
250 case offsetof(struct user_regs_struct,es):
251 if (value && (value & 3) != 3)
252 return -EIO;
253 child->thread.es = value & 0xffff;
254 return 0;
255 case offsetof(struct user_regs_struct,ss):
256 if ((value & 3) != 3)
257 return -EIO;
258 value &= 0xffff;
259 return 0;
260 case offsetof(struct user_regs_struct,fs_base):
261 if (value >= TASK_SIZE_OF(child))
262 return -EIO;
263 child->thread.fs = value;
264 return 0;
265 case offsetof(struct user_regs_struct,gs_base):
266 if (value >= TASK_SIZE_OF(child))
267 return -EIO;
268 child->thread.gs = value;
269 return 0;
270 case offsetof(struct user_regs_struct, eflags):
271 value &= FLAG_MASK;
272 tmp = get_stack_long(child, EFL_OFFSET);
273 tmp &= ~FLAG_MASK;
274 value |= tmp;
275 break;
276 case offsetof(struct user_regs_struct,cs):
277 if ((value & 3) != 3)
278 return -EIO;
279 value &= 0xffff;
280 break;
281 }
282 put_stack_long(child, regno - sizeof(struct pt_regs), value);
283 return 0;
284}
285
286static unsigned long getreg(struct task_struct *child, unsigned long regno)
287{
288 unsigned long val;
289 switch (regno) {
290 case offsetof(struct user_regs_struct, fs):
291 return child->thread.fsindex;
292 case offsetof(struct user_regs_struct, gs):
293 return child->thread.gsindex;
294 case offsetof(struct user_regs_struct, ds):
295 return child->thread.ds;
296 case offsetof(struct user_regs_struct, es):
297 return child->thread.es;
298 case offsetof(struct user_regs_struct, fs_base):
299 return child->thread.fs;
300 case offsetof(struct user_regs_struct, gs_base):
301 return child->thread.gs;
302 default:
303 regno = regno - sizeof(struct pt_regs);
304 val = get_stack_long(child, regno);
305 if (test_tsk_thread_flag(child, TIF_IA32))
306 val &= 0xffffffff;
307 return val;
308 }
309
310}
311
312long arch_ptrace(struct task_struct *child, long request, long addr, long data)
313{
314 long i, ret;
315 unsigned ui;
316
317 switch (request) {
318 /* when I and D space are separate, these will need to be fixed. */
319 case PTRACE_PEEKTEXT: /* read word at location addr. */
320 case PTRACE_PEEKDATA:
321 ret = generic_ptrace_peekdata(child, addr, data);
322 break;
323
324 /* read the word at location addr in the USER area. */
325 case PTRACE_PEEKUSR: {
326 unsigned long tmp;
327
328 ret = -EIO;
329 if ((addr & 7) ||
330 addr > sizeof(struct user) - 7)
331 break;
332
333 switch (addr) {
334 case 0 ... sizeof(struct user_regs_struct) - sizeof(long):
335 tmp = getreg(child, addr);
336 break;
337 case offsetof(struct user, u_debugreg[0]):
338 tmp = child->thread.debugreg0;
339 break;
340 case offsetof(struct user, u_debugreg[1]):
341 tmp = child->thread.debugreg1;
342 break;
343 case offsetof(struct user, u_debugreg[2]):
344 tmp = child->thread.debugreg2;
345 break;
346 case offsetof(struct user, u_debugreg[3]):
347 tmp = child->thread.debugreg3;
348 break;
349 case offsetof(struct user, u_debugreg[6]):
350 tmp = child->thread.debugreg6;
351 break;
352 case offsetof(struct user, u_debugreg[7]):
353 tmp = child->thread.debugreg7;
354 break;
355 default:
356 tmp = 0;
357 break;
358 }
359 ret = put_user(tmp,(unsigned long __user *) data);
360 break;
361 }
362
363 /* when I and D space are separate, this will have to be fixed. */
364 case PTRACE_POKETEXT: /* write the word at location addr. */
365 case PTRACE_POKEDATA:
366 ret = generic_ptrace_pokedata(child, addr, data);
367 break;
368
369 case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
370 {
371 int dsize = test_tsk_thread_flag(child, TIF_IA32) ? 3 : 7;
372 ret = -EIO;
373 if ((addr & 7) ||
374 addr > sizeof(struct user) - 7)
375 break;
376
377 switch (addr) {
378 case 0 ... sizeof(struct user_regs_struct) - sizeof(long):
379 ret = putreg(child, addr, data);
380 break;
381 /* Disallow setting a breakpoint in the vsyscall area */
382 case offsetof(struct user, u_debugreg[0]):
383 if (data >= TASK_SIZE_OF(child) - dsize) break;
384 child->thread.debugreg0 = data;
385 ret = 0;
386 break;
387 case offsetof(struct user, u_debugreg[1]):
388 if (data >= TASK_SIZE_OF(child) - dsize) break;
389 child->thread.debugreg1 = data;
390 ret = 0;
391 break;
392 case offsetof(struct user, u_debugreg[2]):
393 if (data >= TASK_SIZE_OF(child) - dsize) break;
394 child->thread.debugreg2 = data;
395 ret = 0;
396 break;
397 case offsetof(struct user, u_debugreg[3]):
398 if (data >= TASK_SIZE_OF(child) - dsize) break;
399 child->thread.debugreg3 = data;
400 ret = 0;
401 break;
402 case offsetof(struct user, u_debugreg[6]):
403 if (data >> 32)
404 break;
405 child->thread.debugreg6 = data;
406 ret = 0;
407 break;
408 case offsetof(struct user, u_debugreg[7]):
409 /* See arch/i386/kernel/ptrace.c for an explanation of
410 * this awkward check.*/
411 data &= ~DR_CONTROL_RESERVED;
412 for(i=0; i<4; i++)
413 if ((0x5554 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
414 break;
415 if (i == 4) {
416 child->thread.debugreg7 = data;
417 if (data)
418 set_tsk_thread_flag(child, TIF_DEBUG);
419 else
420 clear_tsk_thread_flag(child, TIF_DEBUG);
421 ret = 0;
422 }
423 break;
424 }
425 break;
426 }
427 case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
428 case PTRACE_CONT: /* restart after signal. */
429
430 ret = -EIO;
431 if (!valid_signal(data))
432 break;
433 if (request == PTRACE_SYSCALL)
434 set_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
435 else
436 clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
437 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
438 child->exit_code = data;
439 /* make sure the single step bit is not set. */
440 clear_singlestep(child);
441 wake_up_process(child);
442 ret = 0;
443 break;
444
445#ifdef CONFIG_IA32_EMULATION
446 /* This only makes sense with 32-bit programs. Allow a
447 64-bit debugger to fully examine them too. Better not
448 to use it against 64-bit processes; use
449 PTRACE_ARCH_PRCTL instead. */
450 case PTRACE_SET_THREAD_AREA: {
451 struct user_desc __user *p;
452 int old;
453 p = (struct user_desc __user *)data;
454 get_user(old, &p->entry_number);
455 put_user(addr, &p->entry_number);
456 ret = do_set_thread_area(&child->thread, p);
457 put_user(old, &p->entry_number);
458 break;
459 case PTRACE_GET_THREAD_AREA:
460 p = (struct user_desc __user *)data;
461 get_user(old, &p->entry_number);
462 put_user(addr, &p->entry_number);
463 ret = do_get_thread_area(&child->thread, p);
464 put_user(old, &p->entry_number);
465 break;
466 }
467#endif
468 /* normal 64bit interface to access TLS data.
469 Works just like arch_prctl, except that the arguments
470 are reversed. */
471 case PTRACE_ARCH_PRCTL:
472 ret = do_arch_prctl(child, data, addr);
473 break;
474
475/*
476 * make the child exit. Best I can do is send it a sigkill.
477 * perhaps it should be put in the status that it wants to
478 * exit.
479 */
480 case PTRACE_KILL:
481 ret = 0;
482 if (child->exit_state == EXIT_ZOMBIE) /* already dead */
483 break;
484 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
485 child->exit_code = SIGKILL;
486 /* make sure the single step bit is not set. */
487 clear_singlestep(child);
488 wake_up_process(child);
489 break;
490
491 case PTRACE_SINGLESTEP: /* set the trap flag. */
492 ret = -EIO;
493 if (!valid_signal(data))
494 break;
495 clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
496 set_singlestep(child);
497 child->exit_code = data;
498 /* give it a chance to run. */
499 wake_up_process(child);
500 ret = 0;
501 break;
502
503 case PTRACE_GETREGS: { /* Get all gp regs from the child. */
504 if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
505 sizeof(struct user_regs_struct))) {
506 ret = -EIO;
507 break;
508 }
509 ret = 0;
510 for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
511 ret |= __put_user(getreg(child, ui),(unsigned long __user *) data);
512 data += sizeof(long);
513 }
514 break;
515 }
516
517 case PTRACE_SETREGS: { /* Set all gp regs in the child. */
518 unsigned long tmp;
519 if (!access_ok(VERIFY_READ, (unsigned __user *)data,
520 sizeof(struct user_regs_struct))) {
521 ret = -EIO;
522 break;
523 }
524 ret = 0;
525 for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
526 ret = __get_user(tmp, (unsigned long __user *) data);
527 if (ret)
528 break;
529 ret = putreg(child, ui, tmp);
530 if (ret)
531 break;
532 data += sizeof(long);
533 }
534 break;
535 }
536
537 case PTRACE_GETFPREGS: { /* Get the child extended FPU state. */
538 if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
539 sizeof(struct user_i387_struct))) {
540 ret = -EIO;
541 break;
542 }
543 ret = get_fpregs((struct user_i387_struct __user *)data, child);
544 break;
545 }
546
547 case PTRACE_SETFPREGS: { /* Set the child extended FPU state. */
548 if (!access_ok(VERIFY_READ, (unsigned __user *)data,
549 sizeof(struct user_i387_struct))) {
550 ret = -EIO;
551 break;
552 }
553 set_stopped_child_used_math(child);
554 ret = set_fpregs(child, (struct user_i387_struct __user *)data);
555 break;
556 }
557
558 default:
559 ret = ptrace_request(child, request, addr, data);
560 break;
561 }
562 return ret;
563}
564
565static void syscall_trace(struct pt_regs *regs)
566{
567
568#if 0
569 printk("trace %s rip %lx rsp %lx rax %d origrax %d caller %lx tiflags %x ptrace %x\n",
570 current->comm,
571 regs->rip, regs->rsp, regs->rax, regs->orig_rax, __builtin_return_address(0),
572 current_thread_info()->flags, current->ptrace);
573#endif
574
575 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
576 ? 0x80 : 0));
577 /*
578 * this isn't the same as continuing with a signal, but it will do
579 * for normal use. strace only continues with a signal if the
580 * stopping signal is not SIGTRAP. -brl
581 */
582 if (current->exit_code) {
583 send_sig(current->exit_code, current, 1);
584 current->exit_code = 0;
585 }
586}
587
588asmlinkage void syscall_trace_enter(struct pt_regs *regs)
589{
590 /* do the secure computing check first */
591 secure_computing(regs->orig_rax);
592
593 if (test_thread_flag(TIF_SYSCALL_TRACE)
594 && (current->ptrace & PT_PTRACED))
595 syscall_trace(regs);
596
597 if (unlikely(current->audit_context)) {
598 if (test_thread_flag(TIF_IA32)) {
599 audit_syscall_entry(AUDIT_ARCH_I386,
600 regs->orig_rax,
601 regs->rbx, regs->rcx,
602 regs->rdx, regs->rsi);
603 } else {
604 audit_syscall_entry(AUDIT_ARCH_X86_64,
605 regs->orig_rax,
606 regs->rdi, regs->rsi,
607 regs->rdx, regs->r10);
608 }
609 }
610}
611
612asmlinkage void syscall_trace_leave(struct pt_regs *regs)
613{
614 if (unlikely(current->audit_context))
615 audit_syscall_exit(AUDITSC_RESULT(regs->rax), regs->rax);
616
617 if ((test_thread_flag(TIF_SYSCALL_TRACE)
618 || test_thread_flag(TIF_SINGLESTEP))
619 && (current->ptrace & PT_PTRACED))
620 syscall_trace(regs);
621}
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index fab30e134836..6ba33ca8715a 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -30,8 +30,8 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
30 raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); 30 raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
31 31
32 if (!(word & (1 << 13))) { 32 if (!(word & (1 << 13))) {
33 printk(KERN_INFO "Intel E7520/7320/7525 detected. " 33 dev_info(&dev->dev, "Intel E7520/7320/7525 detected; "
34 "Disabling irq balancing and affinity\n"); 34 "disabling irq balancing and affinity\n");
35#ifdef CONFIG_IRQBALANCE 35#ifdef CONFIG_IRQBALANCE
36 irqbalance_disable(""); 36 irqbalance_disable("");
37#endif 37#endif
@@ -104,14 +104,16 @@ static void ich_force_enable_hpet(struct pci_dev *dev)
104 pci_read_config_dword(dev, 0xF0, &rcba); 104 pci_read_config_dword(dev, 0xF0, &rcba);
105 rcba &= 0xFFFFC000; 105 rcba &= 0xFFFFC000;
106 if (rcba == 0) { 106 if (rcba == 0) {
107 printk(KERN_DEBUG "RCBA disabled. Cannot force enable HPET\n"); 107 dev_printk(KERN_DEBUG, &dev->dev, "RCBA disabled; "
108 "cannot force enable HPET\n");
108 return; 109 return;
109 } 110 }
110 111
111 /* use bits 31:14, 16 kB aligned */ 112 /* use bits 31:14, 16 kB aligned */
112 rcba_base = ioremap_nocache(rcba, 0x4000); 113 rcba_base = ioremap_nocache(rcba, 0x4000);
113 if (rcba_base == NULL) { 114 if (rcba_base == NULL) {
114 printk(KERN_DEBUG "ioremap failed. Cannot force enable HPET\n"); 115 dev_printk(KERN_DEBUG, &dev->dev, "ioremap failed; "
116 "cannot force enable HPET\n");
115 return; 117 return;
116 } 118 }
117 119
@@ -122,8 +124,8 @@ static void ich_force_enable_hpet(struct pci_dev *dev)
122 /* HPET is enabled in HPTC. Just not reported by BIOS */ 124 /* HPET is enabled in HPTC. Just not reported by BIOS */
123 val = val & 0x3; 125 val = val & 0x3;
124 force_hpet_address = 0xFED00000 | (val << 12); 126 force_hpet_address = 0xFED00000 | (val << 12);
125 printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", 127 dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
126 force_hpet_address); 128 "0x%lx\n", force_hpet_address);
127 iounmap(rcba_base); 129 iounmap(rcba_base);
128 return; 130 return;
129 } 131 }
@@ -142,11 +144,12 @@ static void ich_force_enable_hpet(struct pci_dev *dev)
142 if (err) { 144 if (err) {
143 force_hpet_address = 0; 145 force_hpet_address = 0;
144 iounmap(rcba_base); 146 iounmap(rcba_base);
145 printk(KERN_DEBUG "Failed to force enable HPET\n"); 147 dev_printk(KERN_DEBUG, &dev->dev,
148 "Failed to force enable HPET\n");
146 } else { 149 } else {
147 force_hpet_resume_type = ICH_FORCE_HPET_RESUME; 150 force_hpet_resume_type = ICH_FORCE_HPET_RESUME;
148 printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", 151 dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
149 force_hpet_address); 152 "0x%lx\n", force_hpet_address);
150 } 153 }
151} 154}
152 155
@@ -162,6 +165,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31,
162 ich_force_enable_hpet); 165 ich_force_enable_hpet);
163DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1, 166DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1,
164 ich_force_enable_hpet); 167 ich_force_enable_hpet);
168DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
169 ich_force_enable_hpet);
165 170
166 171
167static struct pci_dev *cached_dev; 172static struct pci_dev *cached_dev;
@@ -206,8 +211,8 @@ static void old_ich_force_enable_hpet(struct pci_dev *dev)
206 if (val & 0x4) { 211 if (val & 0x4) {
207 val &= 0x3; 212 val &= 0x3;
208 force_hpet_address = 0xFED00000 | (val << 12); 213 force_hpet_address = 0xFED00000 | (val << 12);
209 printk(KERN_DEBUG "HPET at base address 0x%lx\n", 214 dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n",
210 force_hpet_address); 215 force_hpet_address);
211 return; 216 return;
212 } 217 }
213 218
@@ -227,14 +232,14 @@ static void old_ich_force_enable_hpet(struct pci_dev *dev)
227 /* HPET is enabled in HPTC. Just not reported by BIOS */ 232 /* HPET is enabled in HPTC. Just not reported by BIOS */
228 val &= 0x3; 233 val &= 0x3;
229 force_hpet_address = 0xFED00000 | (val << 12); 234 force_hpet_address = 0xFED00000 | (val << 12);
230 printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", 235 dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
231 force_hpet_address); 236 "0x%lx\n", force_hpet_address);
232 cached_dev = dev; 237 cached_dev = dev;
233 force_hpet_resume_type = OLD_ICH_FORCE_HPET_RESUME; 238 force_hpet_resume_type = OLD_ICH_FORCE_HPET_RESUME;
234 return; 239 return;
235 } 240 }
236 241
237 printk(KERN_DEBUG "Failed to force enable HPET\n"); 242 dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n");
238} 243}
239 244
240/* 245/*
@@ -292,8 +297,8 @@ static void vt8237_force_enable_hpet(struct pci_dev *dev)
292 */ 297 */
293 if (val & 0x80) { 298 if (val & 0x80) {
294 force_hpet_address = (val & ~0x3ff); 299 force_hpet_address = (val & ~0x3ff);
295 printk(KERN_DEBUG "HPET at base address 0x%lx\n", 300 dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n",
296 force_hpet_address); 301 force_hpet_address);
297 return; 302 return;
298 } 303 }
299 304
@@ -307,14 +312,14 @@ static void vt8237_force_enable_hpet(struct pci_dev *dev)
307 pci_read_config_dword(dev, 0x68, &val); 312 pci_read_config_dword(dev, 0x68, &val);
308 if (val & 0x80) { 313 if (val & 0x80) {
309 force_hpet_address = (val & ~0x3ff); 314 force_hpet_address = (val & ~0x3ff);
310 printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", 315 dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at "
311 force_hpet_address); 316 "0x%lx\n", force_hpet_address);
312 cached_dev = dev; 317 cached_dev = dev;
313 force_hpet_resume_type = VT8237_FORCE_HPET_RESUME; 318 force_hpet_resume_type = VT8237_FORCE_HPET_RESUME;
314 return; 319 return;
315 } 320 }
316 321
317 printk(KERN_DEBUG "Failed to force enable HPET\n"); 322 dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n");
318} 323}
319 324
320DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, 325DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235,
@@ -342,7 +347,7 @@ static void nvidia_force_enable_hpet(struct pci_dev *dev)
342 pci_read_config_dword(dev, 0x44, &val); 347 pci_read_config_dword(dev, 0x44, &val);
343 force_hpet_address = val & 0xfffffffe; 348 force_hpet_address = val & 0xfffffffe;
344 force_hpet_resume_type = NVIDIA_FORCE_HPET_RESUME; 349 force_hpet_resume_type = NVIDIA_FORCE_HPET_RESUME;
345 printk(KERN_DEBUG "Force enabled HPET at base address 0x%lx\n", 350 dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
346 force_hpet_address); 351 force_hpet_address);
347 cached_dev = dev; 352 cached_dev = dev;
348 return; 353 return;
@@ -375,19 +380,19 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0367,
375void force_hpet_resume(void) 380void force_hpet_resume(void)
376{ 381{
377 switch (force_hpet_resume_type) { 382 switch (force_hpet_resume_type) {
378 case ICH_FORCE_HPET_RESUME: 383 case ICH_FORCE_HPET_RESUME:
379 return ich_force_hpet_resume(); 384 ich_force_hpet_resume();
380 385 return;
381 case OLD_ICH_FORCE_HPET_RESUME: 386 case OLD_ICH_FORCE_HPET_RESUME:
382 return old_ich_force_hpet_resume(); 387 old_ich_force_hpet_resume();
383 388 return;
384 case VT8237_FORCE_HPET_RESUME: 389 case VT8237_FORCE_HPET_RESUME:
385 return vt8237_force_hpet_resume(); 390 vt8237_force_hpet_resume();
386 391 return;
387 case NVIDIA_FORCE_HPET_RESUME: 392 case NVIDIA_FORCE_HPET_RESUME:
388 return nvidia_force_hpet_resume(); 393 nvidia_force_hpet_resume();
389 394 return;
390 default: 395 default:
391 break; 396 break;
392 } 397 }
393} 398}
diff --git a/arch/x86/kernel/reboot_32.c b/arch/x86/kernel/reboot.c
index bb1a0f889c5e..5818dc28167d 100644
--- a/arch/x86/kernel/reboot_32.c
+++ b/arch/x86/kernel/reboot.c
@@ -1,64 +1,94 @@
1#include <linux/mm.h>
2#include <linux/module.h> 1#include <linux/module.h>
3#include <linux/delay.h>
4#include <linux/init.h> 2#include <linux/init.h>
5#include <linux/interrupt.h>
6#include <linux/mc146818rtc.h>
7#include <linux/efi.h>
8#include <linux/dmi.h>
9#include <linux/ctype.h>
10#include <linux/pm.h>
11#include <linux/reboot.h> 3#include <linux/reboot.h>
12#include <asm/uaccess.h> 4#include <linux/init.h>
5#include <linux/pm.h>
6#include <linux/efi.h>
7#include <acpi/reboot.h>
8#include <asm/io.h>
13#include <asm/apic.h> 9#include <asm/apic.h>
14#include <asm/hpet.h>
15#include <asm/desc.h> 10#include <asm/desc.h>
16#include "mach_reboot.h" 11#include <asm/hpet.h>
17#include <asm/reboot_fixups.h> 12#include <asm/reboot_fixups.h>
18#include <asm/reboot.h> 13#include <asm/reboot.h>
19 14
15#ifdef CONFIG_X86_32
16# include <linux/dmi.h>
17# include <linux/ctype.h>
18# include <linux/mc146818rtc.h>
19# include <asm/pgtable.h>
20#else
21# include <asm/iommu.h>
22#endif
23
20/* 24/*
21 * Power off function, if any 25 * Power off function, if any
22 */ 26 */
23void (*pm_power_off)(void); 27void (*pm_power_off)(void);
24EXPORT_SYMBOL(pm_power_off); 28EXPORT_SYMBOL(pm_power_off);
25 29
30static long no_idt[3];
26static int reboot_mode; 31static int reboot_mode;
27static int reboot_thru_bios; 32enum reboot_type reboot_type = BOOT_KBD;
33int reboot_force;
28 34
29#ifdef CONFIG_SMP 35#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
30static int reboot_cpu = -1; 36static int reboot_cpu = -1;
31#endif 37#endif
38
39/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | a[cpi] | e[fi] [, [w]arm | [c]old]
40 warm Don't set the cold reboot flag
41 cold Set the cold reboot flag
42 bios Reboot by jumping through the BIOS (only for X86_32)
43 smp Reboot by executing reset on BSP or other CPU (only for X86_32)
44 triple Force a triple fault (init)
45 kbd Use the keyboard controller. cold reset (default)
46 acpi Use the RESET_REG in the FADT
47 efi Use efi reset_system runtime service
48 force Avoid anything that could hang.
49 */
32static int __init reboot_setup(char *str) 50static int __init reboot_setup(char *str)
33{ 51{
34 while(1) { 52 for (;;) {
35 switch (*str) { 53 switch (*str) {
36 case 'w': /* "warm" reboot (no memory testing etc) */ 54 case 'w':
37 reboot_mode = 0x1234; 55 reboot_mode = 0x1234;
38 break; 56 break;
39 case 'c': /* "cold" reboot (with memory testing etc) */ 57
40 reboot_mode = 0x0; 58 case 'c':
41 break; 59 reboot_mode = 0;
42 case 'b': /* "bios" reboot by jumping through the BIOS */
43 reboot_thru_bios = 1;
44 break;
45 case 'h': /* "hard" reboot by toggling RESET and/or crashing the CPU */
46 reboot_thru_bios = 0;
47 break; 60 break;
61
62#ifdef CONFIG_X86_32
48#ifdef CONFIG_SMP 63#ifdef CONFIG_SMP
49 case 's': /* "smp" reboot by executing reset on BSP or other CPU*/ 64 case 's':
50 if (isdigit(*(str+1))) { 65 if (isdigit(*(str+1))) {
51 reboot_cpu = (int) (*(str+1) - '0'); 66 reboot_cpu = (int) (*(str+1) - '0');
52 if (isdigit(*(str+2))) 67 if (isdigit(*(str+2)))
53 reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0'); 68 reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0');
54 } 69 }
55 /* we will leave sorting out the final value 70 /* we will leave sorting out the final value
56 when we are ready to reboot, since we might not 71 when we are ready to reboot, since we might not
57 have set up boot_cpu_id or smp_num_cpu */ 72 have set up boot_cpu_id or smp_num_cpu */
58 break; 73 break;
74#endif /* CONFIG_SMP */
75
76 case 'b':
59#endif 77#endif
78 case 'a':
79 case 'k':
80 case 't':
81 case 'e':
82 reboot_type = *str;
83 break;
84
85 case 'f':
86 reboot_force = 1;
87 break;
60 } 88 }
61 if((str = strchr(str,',')) != NULL) 89
90 str = strchr(str, ',');
91 if (str)
62 str++; 92 str++;
63 else 93 else
64 break; 94 break;
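
The comment block and reboot_setup() hunk above document how the unified reboot= option string is walked: only the first character of each comma-separated token is examined, and the cursor then jumps past the next comma. As a rough standalone illustration of that walk (this is a hypothetical userspace demo, not the kernel function), the following sketch interprets a string such as "efi,warm" the same way:

#include <stdio.h>
#include <string.h>

/* Illustrative stand-in for the kernel's parser: look only at the first
 * character of each comma-separated token, as reboot_setup() does above.
 * All names here are made up for the example. */
static void parse_reboot_opts(const char *str)
{
	for (;;) {
		switch (*str) {
		case 'w': printf("warm reboot flag\n"); break;
		case 'c': printf("cold reboot flag\n"); break;
		case 'b': case 'a': case 'k': case 't': case 'e':
			printf("reboot type '%c'\n", *str); break;
		case 'f': printf("force: skip shutdown work\n"); break;
		}
		str = strchr(str, ',');
		if (str)
			str++;		/* step past the comma to the next token */
		else
			break;
	}
}

int main(void)
{
	parse_reboot_opts("efi,warm");	/* -> reboot type 'e', then warm flag */
	return 0;
}
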
@@ -68,18 +98,21 @@ static int __init reboot_setup(char *str)
68 98
69__setup("reboot=", reboot_setup); 99__setup("reboot=", reboot_setup);
70 100
101
102#ifdef CONFIG_X86_32
71/* 103/*
72 * Reboot options and system auto-detection code provided by 104 * Reboot options and system auto-detection code provided by
73 * Dell Inc. so their systems "just work". :-) 105 * Dell Inc. so their systems "just work". :-)
74 */ 106 */
75 107
76/* 108/*
77 * Some machines require the "reboot=b" commandline option, this quirk makes that automatic. 109 * Some machines require the "reboot=b" commandline option,
110 * this quirk makes that automatic.
78 */ 111 */
79static int __init set_bios_reboot(const struct dmi_system_id *d) 112static int __init set_bios_reboot(const struct dmi_system_id *d)
80{ 113{
81 if (!reboot_thru_bios) { 114 if (reboot_type != BOOT_BIOS) {
82 reboot_thru_bios = 1; 115 reboot_type = BOOT_BIOS;
83 printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident); 116 printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident);
84 } 117 }
85 return 0; 118 return 0;
@@ -143,7 +176,6 @@ static int __init reboot_init(void)
143 dmi_check_system(reboot_dmi_table); 176 dmi_check_system(reboot_dmi_table);
144 return 0; 177 return 0;
145} 178}
146
147core_initcall(reboot_init); 179core_initcall(reboot_init);
148 180
149/* The following code and data reboots the machine by switching to real 181/* The following code and data reboots the machine by switching to real
@@ -152,7 +184,6 @@ core_initcall(reboot_init);
152 controller to pulse the CPU reset line, which is more thorough, but 184 controller to pulse the CPU reset line, which is more thorough, but
153 doesn't work with at least one type of 486 motherboard. It is easy 185 doesn't work with at least one type of 486 motherboard. It is easy
154 to stop this code working; hence the copious comments. */ 186 to stop this code working; hence the copious comments. */
155
156static unsigned long long 187static unsigned long long
157real_mode_gdt_entries [3] = 188real_mode_gdt_entries [3] =
158{ 189{
@@ -161,11 +192,9 @@ real_mode_gdt_entries [3] =
161 0x000092000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */ 192 0x000092000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */
162}; 193};
163 194
164static struct Xgt_desc_struct 195static struct desc_ptr
165real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries }, 196real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries },
166real_mode_idt = { 0x3ff, 0 }, 197real_mode_idt = { 0x3ff, 0 };
167no_idt = { 0, 0 };
168
169 198
170/* This is 16-bit protected mode code to disable paging and the cache, 199/* This is 16-bit protected mode code to disable paging and the cache,
171 switch to real mode and jump to the BIOS reset code. 200 switch to real mode and jump to the BIOS reset code.
@@ -185,7 +214,6 @@ no_idt = { 0, 0 };
185 214
186 More could be done here to set up the registers as if a CPU reset had 215 More could be done here to set up the registers as if a CPU reset had
187 occurred; hopefully real BIOSs don't assume much. */ 216 occurred; hopefully real BIOSs don't assume much. */
188
189static unsigned char real_mode_switch [] = 217static unsigned char real_mode_switch [] =
190{ 218{
191 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */ 219 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */
@@ -223,7 +251,6 @@ void machine_real_restart(unsigned char *code, int length)
223 `outb_p' is needed instead of just `outb'. Use it to be on the 251 `outb_p' is needed instead of just `outb'. Use it to be on the
224 safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.) 252 safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.)
225 */ 253 */
226
227 spin_lock(&rtc_lock); 254 spin_lock(&rtc_lock);
228 CMOS_WRITE(0x00, 0x8f); 255 CMOS_WRITE(0x00, 0x8f);
229 spin_unlock(&rtc_lock); 256 spin_unlock(&rtc_lock);
@@ -231,9 +258,8 @@ void machine_real_restart(unsigned char *code, int length)
231 /* Remap the kernel at virtual address zero, as well as offset zero 258 /* Remap the kernel at virtual address zero, as well as offset zero
232 from the kernel segment. This assumes the kernel segment starts at 259 from the kernel segment. This assumes the kernel segment starts at
233 virtual address PAGE_OFFSET. */ 260 virtual address PAGE_OFFSET. */
234 261 memcpy(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
235 memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, 262 sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS);
236 sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS);
237 263
238 /* 264 /*
239 * Use `swapper_pg_dir' as our page directory. 265 * Use `swapper_pg_dir' as our page directory.
@@ -245,7 +271,6 @@ void machine_real_restart(unsigned char *code, int length)
245 boot)". This seems like a fairly standard thing that gets set by 271 boot)". This seems like a fairly standard thing that gets set by
246 REBOOT.COM programs, and the previous reset routine did this 272 REBOOT.COM programs, and the previous reset routine did this
247 too. */ 273 too. */
248
249 *((unsigned short *)0x472) = reboot_mode; 274 *((unsigned short *)0x472) = reboot_mode;
250 275
251 /* For the switch to real mode, copy some code to low memory. It has 276 /* For the switch to real mode, copy some code to low memory. It has
@@ -253,19 +278,16 @@ void machine_real_restart(unsigned char *code, int length)
253 has to have the same physical and virtual address, because it turns 278 has to have the same physical and virtual address, because it turns
254 off paging. Copy it near the end of the first page, out of the way 279 off paging. Copy it near the end of the first page, out of the way
255 of BIOS variables. */ 280 of BIOS variables. */
256 281 memcpy((void *)(0x1000 - sizeof(real_mode_switch) - 100),
257 memcpy ((void *) (0x1000 - sizeof (real_mode_switch) - 100),
258 real_mode_switch, sizeof (real_mode_switch)); 282 real_mode_switch, sizeof (real_mode_switch));
259 memcpy ((void *) (0x1000 - 100), code, length); 283 memcpy((void *)(0x1000 - 100), code, length);
260 284
261 /* Set up the IDT for real mode. */ 285 /* Set up the IDT for real mode. */
262
263 load_idt(&real_mode_idt); 286 load_idt(&real_mode_idt);
264 287
265 /* Set up a GDT from which we can load segment descriptors for real 288 /* Set up a GDT from which we can load segment descriptors for real
266 mode. The GDT is not used in real mode; it is just needed here to 289 mode. The GDT is not used in real mode; it is just needed here to
267 prepare the descriptors. */ 290 prepare the descriptors. */
268
269 load_gdt(&real_mode_gdt); 291 load_gdt(&real_mode_gdt);
270 292
271 /* Load the data segment registers, and thus the descriptors ready for 293 /* Load the data segment registers, and thus the descriptors ready for
@@ -273,7 +295,6 @@ void machine_real_restart(unsigned char *code, int length)
273 selector value being loaded here. This is so that the segment 295 selector value being loaded here. This is so that the segment
274 registers don't have to be reloaded after switching to real mode: 296 registers don't have to be reloaded after switching to real mode:
275 the values are consistent for real mode operation already. */ 297 the values are consistent for real mode operation already. */
276
277 __asm__ __volatile__ ("movl $0x0010,%%eax\n" 298 __asm__ __volatile__ ("movl $0x0010,%%eax\n"
278 "\tmovl %%eax,%%ds\n" 299 "\tmovl %%eax,%%ds\n"
279 "\tmovl %%eax,%%es\n" 300 "\tmovl %%eax,%%es\n"
@@ -284,130 +305,147 @@ void machine_real_restart(unsigned char *code, int length)
284 /* Jump to the 16-bit code that we copied earlier. It disables paging 305 /* Jump to the 16-bit code that we copied earlier. It disables paging
285 and the cache, switches to real mode, and jumps to the BIOS reset 306 and the cache, switches to real mode, and jumps to the BIOS reset
286 entry point. */ 307 entry point. */
287
288 __asm__ __volatile__ ("ljmp $0x0008,%0" 308 __asm__ __volatile__ ("ljmp $0x0008,%0"
289 : 309 :
290 : "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100))); 310 : "i" ((void *)(0x1000 - sizeof (real_mode_switch) - 100)));
291} 311}
292#ifdef CONFIG_APM_MODULE 312#ifdef CONFIG_APM_MODULE
293EXPORT_SYMBOL(machine_real_restart); 313EXPORT_SYMBOL(machine_real_restart);
294#endif 314#endif
295 315
296static void native_machine_shutdown(void) 316#endif /* CONFIG_X86_32 */
317
318static inline void kb_wait(void)
319{
320 int i;
321
322 for (i = 0; i < 0x10000; i++) {
323 if ((inb(0x64) & 0x02) == 0)
324 break;
325 udelay(2);
326 }
327}
328
329void machine_emergency_restart(void)
330{
331 int i;
332
333 /* Tell the BIOS if we want cold or warm reboot */
334 *((unsigned short *)__va(0x472)) = reboot_mode;
335
336 for (;;) {
337 /* Could also try the reset bit in the Hammer NB */
338 switch (reboot_type) {
339 case BOOT_KBD:
340 for (i = 0; i < 10; i++) {
341 kb_wait();
342 udelay(50);
343 outb(0xfe, 0x64); /* pulse reset low */
344 udelay(50);
345 }
346
347 case BOOT_TRIPLE:
348 load_idt((const struct desc_ptr *)&no_idt);
349 __asm__ __volatile__("int3");
350
351 reboot_type = BOOT_KBD;
352 break;
353
354#ifdef CONFIG_X86_32
355 case BOOT_BIOS:
356 machine_real_restart(jump_to_bios, sizeof(jump_to_bios));
357
358 reboot_type = BOOT_KBD;
359 break;
360#endif
361
362 case BOOT_ACPI:
363 acpi_reboot();
364 reboot_type = BOOT_KBD;
365 break;
366
367
368 case BOOT_EFI:
369 if (efi_enabled)
370 efi.reset_system(reboot_mode ? EFI_RESET_WARM : EFI_RESET_COLD,
371 EFI_SUCCESS, 0, NULL);
372
373 reboot_type = BOOT_KBD;
374 break;
375 }
376 }
377}
378
379void machine_shutdown(void)
297{ 380{
381 /* Stop the cpus and apics */
298#ifdef CONFIG_SMP 382#ifdef CONFIG_SMP
299 int reboot_cpu_id; 383 int reboot_cpu_id;
300 384
301 /* The boot cpu is always logical cpu 0 */ 385 /* The boot cpu is always logical cpu 0 */
302 reboot_cpu_id = 0; 386 reboot_cpu_id = 0;
303 387
388#ifdef CONFIG_X86_32
304 /* See if there has been given a command line override */ 389 /* See if there has been given a command line override */
305 if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) && 390 if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) &&
306 cpu_isset(reboot_cpu, cpu_online_map)) { 391 cpu_isset(reboot_cpu, cpu_online_map))
307 reboot_cpu_id = reboot_cpu; 392 reboot_cpu_id = reboot_cpu;
308 } 393#endif
309 394
310 /* Make certain the cpu I'm rebooting on is online */ 395 /* Make certain the cpu I'm about to reboot on is online */
311 if (!cpu_isset(reboot_cpu_id, cpu_online_map)) { 396 if (!cpu_isset(reboot_cpu_id, cpu_online_map))
312 reboot_cpu_id = smp_processor_id(); 397 reboot_cpu_id = smp_processor_id();
313 }
314 398
315 /* Make certain I only run on the appropriate processor */ 399 /* Make certain I only run on the appropriate processor */
316 set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id)); 400 set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
317 401
318 /* O.K. Now that I'm on the appropriate processor, stop 402 /* O.K Now that I'm on the appropriate processor,
319 * all of the others, and disable their local APICs. 403 * stop all of the others.
320 */ 404 */
321
322 smp_send_stop(); 405 smp_send_stop();
323#endif /* CONFIG_SMP */ 406#endif
324 407
325 lapic_shutdown(); 408 lapic_shutdown();
326 409
327#ifdef CONFIG_X86_IO_APIC 410#ifdef CONFIG_X86_IO_APIC
328 disable_IO_APIC(); 411 disable_IO_APIC();
329#endif 412#endif
413
330#ifdef CONFIG_HPET_TIMER 414#ifdef CONFIG_HPET_TIMER
331 hpet_disable(); 415 hpet_disable();
332#endif 416#endif
333}
334 417
335void __attribute__((weak)) mach_reboot_fixups(void) 418#ifdef CONFIG_X86_64
336{ 419 pci_iommu_shutdown();
420#endif
337} 421}
338 422
339static void native_machine_emergency_restart(void) 423void machine_restart(char *__unused)
340{ 424{
341 if (!reboot_thru_bios) { 425 printk("machine restart\n");
342 if (efi_enabled) {
343 efi.reset_system(EFI_RESET_COLD, EFI_SUCCESS, 0, NULL);
344 load_idt(&no_idt);
345 __asm__ __volatile__("int3");
346 }
347 /* rebooting needs to touch the page at absolute addr 0 */
348 *((unsigned short *)__va(0x472)) = reboot_mode;
349 for (;;) {
350 mach_reboot_fixups(); /* for board specific fixups */
351 mach_reboot();
352 /* That didn't work - force a triple fault.. */
353 load_idt(&no_idt);
354 __asm__ __volatile__("int3");
355 }
356 }
357 if (efi_enabled)
358 efi.reset_system(EFI_RESET_WARM, EFI_SUCCESS, 0, NULL);
359 426
360 machine_real_restart(jump_to_bios, sizeof(jump_to_bios)); 427 if (!reboot_force)
361} 428 machine_shutdown();
362
363static void native_machine_restart(char * __unused)
364{
365 machine_shutdown();
366 machine_emergency_restart(); 429 machine_emergency_restart();
367} 430}
368 431
369static void native_machine_halt(void) 432void machine_halt(void)
370{ 433{
371} 434}
372 435
373static void native_machine_power_off(void) 436void machine_power_off(void)
374{ 437{
375 if (pm_power_off) { 438 if (pm_power_off) {
376 machine_shutdown(); 439 if (!reboot_force)
440 machine_shutdown();
377 pm_power_off(); 441 pm_power_off();
378 } 442 }
379} 443}
380 444
381
382struct machine_ops machine_ops = { 445struct machine_ops machine_ops = {
383 .power_off = native_machine_power_off, 446 .power_off = machine_power_off,
384 .shutdown = native_machine_shutdown, 447 .shutdown = machine_shutdown,
385 .emergency_restart = native_machine_emergency_restart, 448 .emergency_restart = machine_emergency_restart,
386 .restart = native_machine_restart, 449 .restart = machine_restart,
387 .halt = native_machine_halt, 450 .halt = machine_halt
388}; 451};
389
390void machine_power_off(void)
391{
392 machine_ops.power_off();
393}
394
395void machine_shutdown(void)
396{
397 machine_ops.shutdown();
398}
399
400void machine_emergency_restart(void)
401{
402 machine_ops.emergency_restart();
403}
404
405void machine_restart(char *cmd)
406{
407 machine_ops.restart(cmd);
408}
409
410void machine_halt(void)
411{
412 machine_ops.halt();
413}
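
The merged machine_emergency_restart() above is a cascade of fallbacks: if ten pulses of the keyboard-controller reset line (command 0xfe written to port 0x64 after kb_wait()) do not reset the box, the BOOT_KBD case falls through to BOOT_TRIPLE, which loads an empty IDT and raises an exception that cannot be delivered, escalating to a double and then a triple fault that resets the CPU. A minimal sketch of that last trick is below; it only works in ring 0, and the struct and inline assembly are written out by hand here for illustration (the kernel uses struct desc_ptr and load_idt()):

/* Illustrative only: make every exception unhandleable, then raise one. */
struct idt_ptr {
	unsigned short limit;	/* size of the IDT minus one */
	unsigned long  base;	/* linear address of the IDT */
} __attribute__((packed));

static void force_triple_fault(void)
{
	static const struct idt_ptr empty_idt = { 0, 0 };

	/* Load a zero-length IDT: every vector is now out of bounds. */
	asm volatile("lidt %0" : : "m" (empty_idt));

	/* int3 -> #BP cannot be delivered -> #DF cannot be delivered
	 * -> triple fault, and the processor resets. */
	asm volatile("int3");
}
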
diff --git a/arch/x86/kernel/reboot_64.c b/arch/x86/kernel/reboot_64.c
deleted file mode 100644
index 53620a92a8fd..000000000000
--- a/arch/x86/kernel/reboot_64.c
+++ /dev/null
@@ -1,176 +0,0 @@
1/* Various gunk just to reboot the machine. */
2#include <linux/module.h>
3#include <linux/reboot.h>
4#include <linux/init.h>
5#include <linux/smp.h>
6#include <linux/kernel.h>
7#include <linux/ctype.h>
8#include <linux/string.h>
9#include <linux/pm.h>
10#include <linux/kdebug.h>
11#include <linux/sched.h>
12#include <asm/io.h>
13#include <asm/delay.h>
14#include <asm/desc.h>
15#include <asm/hw_irq.h>
16#include <asm/system.h>
17#include <asm/pgtable.h>
18#include <asm/tlbflush.h>
19#include <asm/apic.h>
20#include <asm/hpet.h>
21#include <asm/gart.h>
22
23/*
24 * Power off function, if any
25 */
26void (*pm_power_off)(void);
27EXPORT_SYMBOL(pm_power_off);
28
29static long no_idt[3];
30static enum {
31 BOOT_TRIPLE = 't',
32 BOOT_KBD = 'k'
33} reboot_type = BOOT_KBD;
34static int reboot_mode = 0;
35int reboot_force;
36
37/* reboot=t[riple] | k[bd] [, [w]arm | [c]old]
38 warm Don't set the cold reboot flag
39 cold Set the cold reboot flag
40 triple Force a triple fault (init)
41 kbd Use the keyboard controller. cold reset (default)
42 force Avoid anything that could hang.
43 */
44static int __init reboot_setup(char *str)
45{
46 for (;;) {
47 switch (*str) {
48 case 'w':
49 reboot_mode = 0x1234;
50 break;
51
52 case 'c':
53 reboot_mode = 0;
54 break;
55
56 case 't':
57 case 'b':
58 case 'k':
59 reboot_type = *str;
60 break;
61 case 'f':
62 reboot_force = 1;
63 break;
64 }
65 if((str = strchr(str,',')) != NULL)
66 str++;
67 else
68 break;
69 }
70 return 1;
71}
72
73__setup("reboot=", reboot_setup);
74
75static inline void kb_wait(void)
76{
77 int i;
78
79 for (i=0; i<0x10000; i++)
80 if ((inb_p(0x64) & 0x02) == 0)
81 break;
82}
83
84void machine_shutdown(void)
85{
86 unsigned long flags;
87
88 /* Stop the cpus and apics */
89#ifdef CONFIG_SMP
90 int reboot_cpu_id;
91
92 /* The boot cpu is always logical cpu 0 */
93 reboot_cpu_id = 0;
94
95 /* Make certain the cpu I'm about to reboot on is online */
96 if (!cpu_isset(reboot_cpu_id, cpu_online_map)) {
97 reboot_cpu_id = smp_processor_id();
98 }
99
100 /* Make certain I only run on the appropriate processor */
101 set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
102
103 /* O.K Now that I'm on the appropriate processor,
104 * stop all of the others.
105 */
106 smp_send_stop();
107#endif
108
109 local_irq_save(flags);
110
111#ifndef CONFIG_SMP
112 disable_local_APIC();
113#endif
114
115 disable_IO_APIC();
116
117#ifdef CONFIG_HPET_TIMER
118 hpet_disable();
119#endif
120 local_irq_restore(flags);
121
122 pci_iommu_shutdown();
123}
124
125void machine_emergency_restart(void)
126{
127 int i;
128
129 /* Tell the BIOS if we want cold or warm reboot */
130 *((unsigned short *)__va(0x472)) = reboot_mode;
131
132 for (;;) {
133 /* Could also try the reset bit in the Hammer NB */
134 switch (reboot_type) {
135 case BOOT_KBD:
136 for (i=0; i<10; i++) {
137 kb_wait();
138 udelay(50);
139 outb(0xfe,0x64); /* pulse reset low */
140 udelay(50);
141 }
142
143 case BOOT_TRIPLE:
144 load_idt((const struct desc_ptr *)&no_idt);
145 __asm__ __volatile__("int3");
146
147 reboot_type = BOOT_KBD;
148 break;
149 }
150 }
151}
152
153void machine_restart(char * __unused)
154{
155 printk("machine restart\n");
156
157 if (!reboot_force) {
158 machine_shutdown();
159 }
160 machine_emergency_restart();
161}
162
163void machine_halt(void)
164{
165}
166
167void machine_power_off(void)
168{
169 if (pm_power_off) {
170 if (!reboot_force) {
171 machine_shutdown();
172 }
173 pm_power_off();
174 }
175}
176
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
index f452726c0fe2..dec0b5ec25c2 100644
--- a/arch/x86/kernel/reboot_fixups_32.c
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -30,6 +30,19 @@ static void cs5536_warm_reset(struct pci_dev *dev)
30 udelay(50); /* shouldn't get here but be safe and spin a while */ 30 udelay(50); /* shouldn't get here but be safe and spin a while */
31} 31}
32 32
33static void rdc321x_reset(struct pci_dev *dev)
34{
35 unsigned i;
36 /* Voluntary reset the watchdog timer */
37 outl(0x80003840, 0xCF8);
38 /* Generate a CPU reset on next tick */
39 i = inl(0xCFC);
40 /* Use the minimum timer resolution */
41 i |= 0x1600;
42 outl(i, 0xCFC);
43 outb(1, 0x92);
44}
45
33struct device_fixup { 46struct device_fixup {
34 unsigned int vendor; 47 unsigned int vendor;
35 unsigned int device; 48 unsigned int device;
@@ -40,6 +53,7 @@ static struct device_fixup fixups_table[] = {
40{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset }, 53{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset },
41{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset }, 54{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset },
42{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset }, 55{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset },
56{ PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030, rdc321x_reset },
43}; 57};
44 58
45/* 59/*
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
new file mode 100644
index 000000000000..eb9b1a198f5e
--- /dev/null
+++ b/arch/x86/kernel/rtc.c
@@ -0,0 +1,204 @@
1/*
2 * RTC related functions
3 */
4#include <linux/acpi.h>
5#include <linux/bcd.h>
6#include <linux/mc146818rtc.h>
7
8#include <asm/time.h>
9#include <asm/vsyscall.h>
10
11#ifdef CONFIG_X86_32
12# define CMOS_YEARS_OFFS 1900
13/*
14 * This is a special lock that is owned by the CPU and holds the index
15 * register we are working with. It is required for NMI access to the
16 * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
17 */
18volatile unsigned long cmos_lock = 0;
19EXPORT_SYMBOL(cmos_lock);
20#else
21/*
22 * x86-64 systems only exists since 2002.
23 * This will work up to Dec 31, 2100
24 */
25# define CMOS_YEARS_OFFS 2000
26#endif
27
28DEFINE_SPINLOCK(rtc_lock);
29EXPORT_SYMBOL(rtc_lock);
30
31/*
32 * In order to set the CMOS clock precisely, set_rtc_mmss has to be
33 * called 500 ms after the second nowtime has started, because when
34 * nowtime is written into the registers of the CMOS clock, it will
35 * jump to the next second precisely 500 ms later. Check the Motorola
36 * MC146818A or Dallas DS12887 data sheet for details.
37 *
38 * BUG: This routine does not handle hour overflow properly; it just
39 * sets the minutes. Usually you'll only notice that after reboot!
40 */
41int mach_set_rtc_mmss(unsigned long nowtime)
42{
43 int retval = 0;
44 int real_seconds, real_minutes, cmos_minutes;
45 unsigned char save_control, save_freq_select;
46
47 /* tell the clock it's being set */
48 save_control = CMOS_READ(RTC_CONTROL);
49 CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL);
50
51 /* stop and reset prescaler */
52 save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
53 CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
54
55 cmos_minutes = CMOS_READ(RTC_MINUTES);
56 if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
57 BCD_TO_BIN(cmos_minutes);
58
59 /*
60 * since we're only adjusting minutes and seconds,
61 * don't interfere with hour overflow. This avoids
62 * messing with unknown time zones but requires your
63 * RTC not to be off by more than 15 minutes
64 */
65 real_seconds = nowtime % 60;
66 real_minutes = nowtime / 60;
67 /* correct for half hour time zone */
68 if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1)
69 real_minutes += 30;
70 real_minutes %= 60;
71
72 if (abs(real_minutes - cmos_minutes) < 30) {
73 if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
74 BIN_TO_BCD(real_seconds);
75 BIN_TO_BCD(real_minutes);
76 }
77 CMOS_WRITE(real_seconds,RTC_SECONDS);
78 CMOS_WRITE(real_minutes,RTC_MINUTES);
79 } else {
80 printk(KERN_WARNING
81 "set_rtc_mmss: can't update from %d to %d\n",
82 cmos_minutes, real_minutes);
83 retval = -1;
84 }
85
86 /* The following flags have to be released exactly in this order,
87 * otherwise the DS12887 (popular MC146818A clone with integrated
88 * battery and quartz) will not reset the oscillator and will not
89 * update precisely 500 ms later. You won't find this mentioned in
90 * the Dallas Semiconductor data sheets, but who believes data
91 * sheets anyway ... -- Markus Kuhn
92 */
93 CMOS_WRITE(save_control, RTC_CONTROL);
94 CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
95
96 return retval;
97}
98
99unsigned long mach_get_cmos_time(void)
100{
101 unsigned int year, mon, day, hour, min, sec, century = 0;
102
103 /*
104 * If UIP is clear, then we have >= 244 microseconds before
105 * RTC registers will be updated. Spec sheet says that this
106 * is the reliable way to read RTC - registers. If UIP is set
107 * then the register access might be invalid.
108 */
109 while ((CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP))
110 cpu_relax();
111
112 sec = CMOS_READ(RTC_SECONDS);
113 min = CMOS_READ(RTC_MINUTES);
114 hour = CMOS_READ(RTC_HOURS);
115 day = CMOS_READ(RTC_DAY_OF_MONTH);
116 mon = CMOS_READ(RTC_MONTH);
117 year = CMOS_READ(RTC_YEAR);
118
119#if defined(CONFIG_ACPI) && defined(CONFIG_X86_64)
120 /* CHECKME: Is this really 64bit only ??? */
121 if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
122 acpi_gbl_FADT.century)
123 century = CMOS_READ(acpi_gbl_FADT.century);
124#endif
125
126 if (RTC_ALWAYS_BCD || !(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY)) {
127 BCD_TO_BIN(sec);
128 BCD_TO_BIN(min);
129 BCD_TO_BIN(hour);
130 BCD_TO_BIN(day);
131 BCD_TO_BIN(mon);
132 BCD_TO_BIN(year);
133 }
134
135 if (century) {
136 BCD_TO_BIN(century);
137 year += century * 100;
138 printk(KERN_INFO "Extended CMOS year: %d\n", century * 100);
139 } else {
140 year += CMOS_YEARS_OFFS;
141 if (year < 1970)
142 year += 100;
143 }
144
145 return mktime(year, mon, day, hour, min, sec);
146}
147
148/* Routines for accessing the CMOS RAM/RTC. */
149unsigned char rtc_cmos_read(unsigned char addr)
150{
151 unsigned char val;
152
153 lock_cmos_prefix(addr);
154 outb_p(addr, RTC_PORT(0));
155 val = inb_p(RTC_PORT(1));
156 lock_cmos_suffix(addr);
157 return val;
158}
159EXPORT_SYMBOL(rtc_cmos_read);
160
161void rtc_cmos_write(unsigned char val, unsigned char addr)
162{
163 lock_cmos_prefix(addr);
164 outb_p(addr, RTC_PORT(0));
165 outb_p(val, RTC_PORT(1));
166 lock_cmos_suffix(addr);
167}
168EXPORT_SYMBOL(rtc_cmos_write);
169
170static int set_rtc_mmss(unsigned long nowtime)
171{
172 int retval;
173 unsigned long flags;
174
175 spin_lock_irqsave(&rtc_lock, flags);
176 retval = set_wallclock(nowtime);
177 spin_unlock_irqrestore(&rtc_lock, flags);
178
179 return retval;
180}
181
182/* not static: needed by APM */
183unsigned long read_persistent_clock(void)
184{
185 unsigned long retval, flags;
186
187 spin_lock_irqsave(&rtc_lock, flags);
188 retval = get_wallclock();
189 spin_unlock_irqrestore(&rtc_lock, flags);
190
191 return retval;
192}
193
194int update_persistent_clock(struct timespec now)
195{
196 return set_rtc_mmss(now.tv_sec);
197}
198
199unsigned long long native_read_tsc(void)
200{
201 return __native_read_tsc();
202}
203EXPORT_SYMBOL(native_read_tsc);
204
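
The new rtc.c leans on BCD_TO_BIN()/BIN_TO_BCD() from <linux/bcd.h> because the MC146818 stores each time field as binary-coded decimal unless RTC_DM_BINARY is set in the control register. For readers unfamiliar with the encoding, here is a minimal sketch of the two conversions; note the kernel macros modify their argument in place, whereas these helpers simply return the converted value:

#include <stdint.h>

/* One BCD byte holds two decimal digits: high nibble = tens, low = units. */
static inline uint8_t bcd_to_bin(uint8_t bcd)
{
	return (bcd & 0x0f) + (bcd >> 4) * 10;
}

static inline uint8_t bin_to_bcd(uint8_t bin)
{
	return ((bin / 10) << 4) | (bin % 10);
}

/* e.g. a CMOS minutes register reading 0x59 means 59 decimal minutes. */
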
diff --git a/arch/x86/kernel/scx200_32.c b/arch/x86/kernel/scx200_32.c
index 87bc159d29df..7e004acbe526 100644
--- a/arch/x86/kernel/scx200_32.c
+++ b/arch/x86/kernel/scx200_32.c
@@ -65,7 +65,7 @@ static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_
65 base = pci_resource_start(pdev, 0); 65 base = pci_resource_start(pdev, 0);
66 printk(KERN_INFO NAME ": GPIO base 0x%x\n", base); 66 printk(KERN_INFO NAME ": GPIO base 0x%x\n", base);
67 67
68 if (request_region(base, SCx200_GPIO_SIZE, "NatSemi SCx200 GPIO") == 0) { 68 if (!request_region(base, SCx200_GPIO_SIZE, "NatSemi SCx200 GPIO")) {
69 printk(KERN_ERR NAME ": can't allocate I/O for GPIOs\n"); 69 printk(KERN_ERR NAME ": can't allocate I/O for GPIOs\n");
70 return -EBUSY; 70 return -EBUSY;
71 } 71 }
diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c
index 3558ac78c926..309366f8f603 100644
--- a/arch/x86/kernel/setup64.c
+++ b/arch/x86/kernel/setup64.c
@@ -24,7 +24,11 @@
24#include <asm/sections.h> 24#include <asm/sections.h>
25#include <asm/setup.h> 25#include <asm/setup.h>
26 26
27#ifndef CONFIG_DEBUG_BOOT_PARAMS
27struct boot_params __initdata boot_params; 28struct boot_params __initdata boot_params;
29#else
30struct boot_params boot_params;
31#endif
28 32
29cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; 33cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
30 34
@@ -37,6 +41,8 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
37char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); 41char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
38 42
39unsigned long __supported_pte_mask __read_mostly = ~0UL; 43unsigned long __supported_pte_mask __read_mostly = ~0UL;
44EXPORT_SYMBOL_GPL(__supported_pte_mask);
45
40static int do_not_nx __cpuinitdata = 0; 46static int do_not_nx __cpuinitdata = 0;
41 47
42/* noexec=on|off 48/* noexec=on|off
@@ -80,6 +86,43 @@ static int __init nonx32_setup(char *str)
80__setup("noexec32=", nonx32_setup); 86__setup("noexec32=", nonx32_setup);
81 87
82/* 88/*
89 * Copy data used in early init routines from the initial arrays to the
90 * per cpu data areas. These arrays then become expendable and the
91 * *_early_ptr's are zeroed indicating that the static arrays are gone.
92 */
93static void __init setup_per_cpu_maps(void)
94{
95 int cpu;
96
97 for_each_possible_cpu(cpu) {
98#ifdef CONFIG_SMP
99 if (per_cpu_offset(cpu)) {
100#endif
101 per_cpu(x86_cpu_to_apicid, cpu) =
102 x86_cpu_to_apicid_init[cpu];
103 per_cpu(x86_bios_cpu_apicid, cpu) =
104 x86_bios_cpu_apicid_init[cpu];
105#ifdef CONFIG_NUMA
106 per_cpu(x86_cpu_to_node_map, cpu) =
107 x86_cpu_to_node_map_init[cpu];
108#endif
109#ifdef CONFIG_SMP
110 }
111 else
112 printk(KERN_NOTICE "per_cpu_offset zero for cpu %d\n",
113 cpu);
114#endif
115 }
116
117 /* indicate the early static arrays will soon be gone */
118 x86_cpu_to_apicid_early_ptr = NULL;
119 x86_bios_cpu_apicid_early_ptr = NULL;
120#ifdef CONFIG_NUMA
121 x86_cpu_to_node_map_early_ptr = NULL;
122#endif
123}
124
125/*
83 * Great future plan: 126 * Great future plan:
84 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. 127 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
85 * Always point %gs to its beginning 128 * Always point %gs to its beginning
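
The new setup_per_cpu_maps() above captures a common early-boot pattern: data needed before the per-cpu areas exist lives in static init-time arrays, is copied into per_cpu() variables once setup_per_cpu_areas() has allocated memory for each possible CPU, and the *_early_ptr globals are then cleared so later code knows the static copies are gone. A rough sketch of that pattern with entirely hypothetical names (not the kernel's symbols or per-cpu machinery):

#include <stddef.h>

#define MAX_CPUS 64

/* Filled in before the real per-CPU storage exists. */
static int early_widget_map[MAX_CPUS];
static int *early_widget_ptr = early_widget_map;

/* Stand-in for the per-CPU destination. */
static int percpu_widget[MAX_CPUS];

static void copy_early_maps(int nr_cpus)
{
	int cpu;

	for (cpu = 0; cpu < nr_cpus; cpu++)
		percpu_widget[cpu] = early_widget_map[cpu];

	/* Signal that the early static array may now be discarded. */
	early_widget_ptr = NULL;
}
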
@@ -100,18 +143,21 @@ void __init setup_per_cpu_areas(void)
100 for_each_cpu_mask (i, cpu_possible_map) { 143 for_each_cpu_mask (i, cpu_possible_map) {
101 char *ptr; 144 char *ptr;
102 145
103 if (!NODE_DATA(cpu_to_node(i))) { 146 if (!NODE_DATA(early_cpu_to_node(i))) {
104 printk("cpu with no node %d, num_online_nodes %d\n", 147 printk("cpu with no node %d, num_online_nodes %d\n",
105 i, num_online_nodes()); 148 i, num_online_nodes());
106 ptr = alloc_bootmem_pages(size); 149 ptr = alloc_bootmem_pages(size);
107 } else { 150 } else {
108 ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size); 151 ptr = alloc_bootmem_pages_node(NODE_DATA(early_cpu_to_node(i)), size);
109 } 152 }
110 if (!ptr) 153 if (!ptr)
111 panic("Cannot allocate cpu data for CPU %d\n", i); 154 panic("Cannot allocate cpu data for CPU %d\n", i);
112 cpu_pda(i)->data_offset = ptr - __per_cpu_start; 155 cpu_pda(i)->data_offset = ptr - __per_cpu_start;
113 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); 156 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
114 } 157 }
158
159 /* setup percpu data maps early */
160 setup_per_cpu_maps();
115} 161}
116 162
117void pda_init(int cpu) 163void pda_init(int cpu)
@@ -169,7 +215,8 @@ void syscall_init(void)
169#endif 215#endif
170 216
171 /* Flags to clear on syscall */ 217 /* Flags to clear on syscall */
172 wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); 218 wrmsrl(MSR_SYSCALL_MASK,
219 X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
173} 220}
174 221
175void __cpuinit check_efer(void) 222void __cpuinit check_efer(void)
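
The syscall_init() hunk above only renames constants: EF_TF/EF_DF/EF_IE plus the literal 0x3000 are the same bits as X86_EFLAGS_TF, X86_EFLAGS_DF, X86_EFLAGS_IF and X86_EFLAGS_IOPL, so the mask written to MSR_SYSCALL_MASK (the RFLAGS bits cleared on SYSCALL entry) is unchanged. A quick check of the arithmetic using the architectural RFLAGS bit positions (the macro names below are local to this sketch):

/* RFLAGS bits cleared on SYSCALL entry via MSR_SYSCALL_MASK (IA32_FMASK).
 * Architectural positions: TF=bit 8, IF=bit 9, DF=bit 10, IOPL=bits 12-13. */
#define FL_TF	(1u << 8)	/* 0x0100 */
#define FL_IF	(1u << 9)	/* 0x0200 */
#define FL_DF	(1u << 10)	/* 0x0400 */
#define FL_IOPL	(3u << 12)	/* 0x3000 */

static const unsigned int syscall_fmask = FL_TF | FL_IF | FL_DF | FL_IOPL;
/* == 0x3700, identical to the old EF_TF | EF_DF | EF_IE | 0x3000 value. */
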
@@ -227,7 +274,7 @@ void __cpuinit cpu_init (void)
227 * and set up the GDT descriptor: 274 * and set up the GDT descriptor:
228 */ 275 */
229 if (cpu) 276 if (cpu)
230 memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE); 277 memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE);
231 278
232 cpu_gdt_descr[cpu].size = GDT_SIZE; 279 cpu_gdt_descr[cpu].size = GDT_SIZE;
233 load_gdt((const struct desc_ptr *)&cpu_gdt_descr[cpu]); 280 load_gdt((const struct desc_ptr *)&cpu_gdt_descr[cpu]);
@@ -257,10 +304,10 @@ void __cpuinit cpu_init (void)
257 v, cpu); 304 v, cpu);
258 } 305 }
259 estacks += PAGE_SIZE << order[v]; 306 estacks += PAGE_SIZE << order[v];
260 orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks; 307 orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
261 } 308 }
262 309
263 t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap); 310 t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
264 /* 311 /*
265 * <= is required because the CPU will access up to 312 * <= is required because the CPU will access up to
266 * 8 bits beyond the end of the IO permission bitmap. 313 * 8 bits beyond the end of the IO permission bitmap.
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 9c24b45b513c..62adc5f20be5 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -44,9 +44,12 @@
44#include <linux/crash_dump.h> 44#include <linux/crash_dump.h>
45#include <linux/dmi.h> 45#include <linux/dmi.h>
46#include <linux/pfn.h> 46#include <linux/pfn.h>
47#include <linux/pci.h>
48#include <linux/init_ohci1394_dma.h>
47 49
48#include <video/edid.h> 50#include <video/edid.h>
49 51
52#include <asm/mtrr.h>
50#include <asm/apic.h> 53#include <asm/apic.h>
51#include <asm/e820.h> 54#include <asm/e820.h>
52#include <asm/mpspec.h> 55#include <asm/mpspec.h>
@@ -67,14 +70,83 @@
67 address, and must not be in the .bss segment! */ 70 address, and must not be in the .bss segment! */
68unsigned long init_pg_tables_end __initdata = ~0UL; 71unsigned long init_pg_tables_end __initdata = ~0UL;
69 72
70int disable_pse __cpuinitdata = 0;
71
72/* 73/*
73 * Machine setup.. 74 * Machine setup..
74 */ 75 */
75extern struct resource code_resource; 76static struct resource data_resource = {
76extern struct resource data_resource; 77 .name = "Kernel data",
77extern struct resource bss_resource; 78 .start = 0,
79 .end = 0,
80 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
81};
82
83static struct resource code_resource = {
84 .name = "Kernel code",
85 .start = 0,
86 .end = 0,
87 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
88};
89
90static struct resource bss_resource = {
91 .name = "Kernel bss",
92 .start = 0,
93 .end = 0,
94 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
95};
96
97static struct resource video_ram_resource = {
98 .name = "Video RAM area",
99 .start = 0xa0000,
100 .end = 0xbffff,
101 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
102};
103
104static struct resource standard_io_resources[] = { {
105 .name = "dma1",
106 .start = 0x0000,
107 .end = 0x001f,
108 .flags = IORESOURCE_BUSY | IORESOURCE_IO
109}, {
110 .name = "pic1",
111 .start = 0x0020,
112 .end = 0x0021,
113 .flags = IORESOURCE_BUSY | IORESOURCE_IO
114}, {
115 .name = "timer0",
116 .start = 0x0040,
117 .end = 0x0043,
118 .flags = IORESOURCE_BUSY | IORESOURCE_IO
119}, {
120 .name = "timer1",
121 .start = 0x0050,
122 .end = 0x0053,
123 .flags = IORESOURCE_BUSY | IORESOURCE_IO
124}, {
125 .name = "keyboard",
126 .start = 0x0060,
127 .end = 0x006f,
128 .flags = IORESOURCE_BUSY | IORESOURCE_IO
129}, {
130 .name = "dma page reg",
131 .start = 0x0080,
132 .end = 0x008f,
133 .flags = IORESOURCE_BUSY | IORESOURCE_IO
134}, {
135 .name = "pic2",
136 .start = 0x00a0,
137 .end = 0x00a1,
138 .flags = IORESOURCE_BUSY | IORESOURCE_IO
139}, {
140 .name = "dma2",
141 .start = 0x00c0,
142 .end = 0x00df,
143 .flags = IORESOURCE_BUSY | IORESOURCE_IO
144}, {
145 .name = "fpu",
146 .start = 0x00f0,
147 .end = 0x00ff,
148 .flags = IORESOURCE_BUSY | IORESOURCE_IO
149} };
78 150
79/* cpu data as detected by the assembly code in head.S */ 151/* cpu data as detected by the assembly code in head.S */
80struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; 152struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
@@ -116,13 +188,17 @@ extern int root_mountflags;
116 188
117unsigned long saved_videomode; 189unsigned long saved_videomode;
118 190
119#define RAMDISK_IMAGE_START_MASK 0x07FF 191#define RAMDISK_IMAGE_START_MASK 0x07FF
120#define RAMDISK_PROMPT_FLAG 0x8000 192#define RAMDISK_PROMPT_FLAG 0x8000
121#define RAMDISK_LOAD_FLAG 0x4000 193#define RAMDISK_LOAD_FLAG 0x4000
122 194
123static char __initdata command_line[COMMAND_LINE_SIZE]; 195static char __initdata command_line[COMMAND_LINE_SIZE];
124 196
197#ifndef CONFIG_DEBUG_BOOT_PARAMS
125struct boot_params __initdata boot_params; 198struct boot_params __initdata boot_params;
199#else
200struct boot_params boot_params;
201#endif
126 202
127#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) 203#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
128struct edd edd; 204struct edd edd;
@@ -166,8 +242,7 @@ static int __init parse_mem(char *arg)
166 return -EINVAL; 242 return -EINVAL;
167 243
168 if (strcmp(arg, "nopentium") == 0) { 244 if (strcmp(arg, "nopentium") == 0) {
169 clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); 245 setup_clear_cpu_cap(X86_FEATURE_PSE);
170 disable_pse = 1;
171 } else { 246 } else {
172 /* If the user specifies memory size, we 247 /* If the user specifies memory size, we
173 * limit the BIOS-provided memory map to 248 * limit the BIOS-provided memory map to
@@ -176,7 +251,7 @@ static int __init parse_mem(char *arg)
176 * trim the existing memory map. 251 * trim the existing memory map.
177 */ 252 */
178 unsigned long long mem_size; 253 unsigned long long mem_size;
179 254
180 mem_size = memparse(arg, &arg); 255 mem_size = memparse(arg, &arg);
181 limit_regions(mem_size); 256 limit_regions(mem_size);
182 user_defined_memmap = 1; 257 user_defined_memmap = 1;
@@ -315,7 +390,7 @@ static void __init reserve_ebda_region(void)
315 unsigned int addr; 390 unsigned int addr;
316 addr = get_bios_ebda(); 391 addr = get_bios_ebda();
317 if (addr) 392 if (addr)
318 reserve_bootmem(addr, PAGE_SIZE); 393 reserve_bootmem(addr, PAGE_SIZE);
319} 394}
320 395
321#ifndef CONFIG_NEED_MULTIPLE_NODES 396#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -420,6 +495,100 @@ static inline void __init reserve_crashkernel(void)
420{} 495{}
421#endif 496#endif
422 497
498#ifdef CONFIG_BLK_DEV_INITRD
499
500static bool do_relocate_initrd = false;
501
502static void __init reserve_initrd(void)
503{
504 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
505 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
506 unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
507 unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
508 unsigned long ramdisk_here;
509
510 initrd_start = 0;
511
512 if (!boot_params.hdr.type_of_loader ||
513 !ramdisk_image || !ramdisk_size)
514 return; /* No initrd provided by bootloader */
515
516 if (ramdisk_end < ramdisk_image) {
517 printk(KERN_ERR "initrd wraps around end of memory, "
518 "disabling initrd\n");
519 return;
520 }
521 if (ramdisk_size >= end_of_lowmem/2) {
522 printk(KERN_ERR "initrd too large to handle, "
523 "disabling initrd\n");
524 return;
525 }
526 if (ramdisk_end <= end_of_lowmem) {
527 /* All in lowmem, easy case */
528 reserve_bootmem(ramdisk_image, ramdisk_size);
529 initrd_start = ramdisk_image + PAGE_OFFSET;
530 initrd_end = initrd_start+ramdisk_size;
531 return;
532 }
533
534 /* We need to move the initrd down into lowmem */
535 ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK;
536
537 /* Note: this includes all the lowmem currently occupied by
538 the initrd, we rely on that fact to keep the data intact. */
539 reserve_bootmem(ramdisk_here, ramdisk_size);
540 initrd_start = ramdisk_here + PAGE_OFFSET;
541 initrd_end = initrd_start + ramdisk_size;
542
543 do_relocate_initrd = true;
544}
545
546#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
547
548static void __init relocate_initrd(void)
549{
550 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
551 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
552 unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
553 unsigned long ramdisk_here;
554 unsigned long slop, clen, mapaddr;
555 char *p, *q;
556
557 if (!do_relocate_initrd)
558 return;
559
560 ramdisk_here = initrd_start - PAGE_OFFSET;
561
562 q = (char *)initrd_start;
563
564 /* Copy any lowmem portion of the initrd */
565 if (ramdisk_image < end_of_lowmem) {
566 clen = end_of_lowmem - ramdisk_image;
567 p = (char *)__va(ramdisk_image);
568 memcpy(q, p, clen);
569 q += clen;
570 ramdisk_image += clen;
571 ramdisk_size -= clen;
572 }
573
574 /* Copy the highmem portion of the initrd */
575 while (ramdisk_size) {
576 slop = ramdisk_image & ~PAGE_MASK;
577 clen = ramdisk_size;
578 if (clen > MAX_MAP_CHUNK-slop)
579 clen = MAX_MAP_CHUNK-slop;
580 mapaddr = ramdisk_image & PAGE_MASK;
581 p = early_ioremap(mapaddr, clen+slop);
582 memcpy(q, p+slop, clen);
583 early_iounmap(p, clen+slop);
584 q += clen;
585 ramdisk_image += clen;
586 ramdisk_size -= clen;
587 }
588}
589
590#endif /* CONFIG_BLK_DEV_INITRD */
591
423void __init setup_bootmem_allocator(void) 592void __init setup_bootmem_allocator(void)
424{ 593{
425 unsigned long bootmap_size; 594 unsigned long bootmap_size;
@@ -475,26 +644,10 @@ void __init setup_bootmem_allocator(void)
475 */ 644 */
476 find_smp_config(); 645 find_smp_config();
477#endif 646#endif
478 numa_kva_reserve();
479#ifdef CONFIG_BLK_DEV_INITRD 647#ifdef CONFIG_BLK_DEV_INITRD
480 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { 648 reserve_initrd();
481 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
482 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
483 unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
484 unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
485
486 if (ramdisk_end <= end_of_lowmem) {
487 reserve_bootmem(ramdisk_image, ramdisk_size);
488 initrd_start = ramdisk_image + PAGE_OFFSET;
489 initrd_end = initrd_start+ramdisk_size;
490 } else {
491 printk(KERN_ERR "initrd extends beyond end of memory "
492 "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
493 ramdisk_end, end_of_lowmem);
494 initrd_start = 0;
495 }
496 }
497#endif 649#endif
650 numa_kva_reserve();
498 reserve_crashkernel(); 651 reserve_crashkernel();
499} 652}
500 653
@@ -545,17 +698,11 @@ void __init setup_arch(char **cmdline_p)
545 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); 698 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
546 pre_setup_arch_hook(); 699 pre_setup_arch_hook();
547 early_cpu_init(); 700 early_cpu_init();
701 early_ioremap_init();
548 702
549 /*
550 * FIXME: This isn't an official loader_type right
551 * now but does currently work with elilo.
552 * If we were configured as an EFI kernel, check to make
553 * sure that we were loaded correctly from elilo and that
554 * the system table is valid. If not, then initialize normally.
555 */
556#ifdef CONFIG_EFI 703#ifdef CONFIG_EFI
557 if ((boot_params.hdr.type_of_loader == 0x50) && 704 if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
558 boot_params.efi_info.efi_systab) 705 "EL32", 4))
559 efi_enabled = 1; 706 efi_enabled = 1;
560#endif 707#endif
561 708
@@ -579,12 +726,9 @@ void __init setup_arch(char **cmdline_p)
579 rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0); 726 rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
580#endif 727#endif
581 ARCH_SETUP 728 ARCH_SETUP
582 if (efi_enabled) 729
583 efi_init(); 730 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
584 else { 731 print_memory_map(memory_setup());
585 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
586 print_memory_map(memory_setup());
587 }
588 732
589 copy_edd(); 733 copy_edd();
590 734
@@ -612,8 +756,16 @@ void __init setup_arch(char **cmdline_p)
612 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); 756 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
613 *cmdline_p = command_line; 757 *cmdline_p = command_line;
614 758
759 if (efi_enabled)
760 efi_init();
761
615 max_low_pfn = setup_memory(); 762 max_low_pfn = setup_memory();
616 763
764 /* update e820 for memory not covered by WB MTRRs */
765 mtrr_bp_init();
766 if (mtrr_trim_uncached_memory(max_pfn))
767 max_low_pfn = setup_memory();
768
617#ifdef CONFIG_VMI 769#ifdef CONFIG_VMI
618 /* 770 /*
619 * Must be after max_low_pfn is determined, and before kernel 771 * Must be after max_low_pfn is determined, and before kernel
@@ -636,6 +788,16 @@ void __init setup_arch(char **cmdline_p)
636 smp_alloc_memory(); /* AP processor realmode stacks in low memory*/ 788 smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
637#endif 789#endif
638 paging_init(); 790 paging_init();
791
792 /*
793 * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
794 */
795
796#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
797 if (init_ohci1394_dma_early)
798 init_ohci1394_dma_on_all_controllers();
799#endif
800
639 remapped_pgdat_init(); 801 remapped_pgdat_init();
640 sparse_init(); 802 sparse_init();
641 zone_sizes_init(); 803 zone_sizes_init();
@@ -644,15 +806,19 @@ void __init setup_arch(char **cmdline_p)
644 * NOTE: at this point the bootmem allocator is fully available. 806 * NOTE: at this point the bootmem allocator is fully available.
645 */ 807 */
646 808
809#ifdef CONFIG_BLK_DEV_INITRD
810 relocate_initrd();
811#endif
812
647 paravirt_post_allocator_init(); 813 paravirt_post_allocator_init();
648 814
649 dmi_scan_machine(); 815 dmi_scan_machine();
650 816
817 io_delay_init();
818
651#ifdef CONFIG_X86_GENERICARCH 819#ifdef CONFIG_X86_GENERICARCH
652 generic_apic_probe(); 820 generic_apic_probe();
653#endif 821#endif
654 if (efi_enabled)
655 efi_map_memmap();
656 822
657#ifdef CONFIG_ACPI 823#ifdef CONFIG_ACPI
658 /* 824 /*
@@ -661,9 +827,7 @@ void __init setup_arch(char **cmdline_p)
661 acpi_boot_table_init(); 827 acpi_boot_table_init();
662#endif 828#endif
663 829
664#ifdef CONFIG_PCI
665 early_quirks(); 830 early_quirks();
666#endif
667 831
668#ifdef CONFIG_ACPI 832#ifdef CONFIG_ACPI
669 acpi_boot_init(); 833 acpi_boot_init();
@@ -692,3 +856,26 @@ void __init setup_arch(char **cmdline_p)
692#endif 856#endif
693#endif 857#endif
694} 858}
859
860/*
861 * Request address space for all standard resources
862 *
863 * This is called just before pcibios_init(), which is also a
864 * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
865 */
866static int __init request_standard_resources(void)
867{
868 int i;
869
870 printk(KERN_INFO "Setting up standard PCI resources\n");
871 init_iomem_resources(&code_resource, &data_resource, &bss_resource);
872
873 request_resource(&iomem_resource, &video_ram_resource);
874
875 /* request I/O space for devices used on all i[345]86 PCs */
876 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
877 request_resource(&ioport_resource, &standard_io_resources[i]);
878 return 0;
879}
880
881subsys_initcall(request_standard_resources);
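
Among the setup_32.c changes above, relocate_initrd() copies the highmem portion of the ramdisk down into the reserved lowmem window in bounded chunks, because early_ioremap() can only map MAX_MAP_CHUNK bytes at a time and the source need not be page aligned. The shape of that loop, reduced to its essentials with hypothetical map_window()/unmap_window() helpers standing in for early_ioremap()/early_iounmap() (here they just return the address so the sketch compiles):

#include <string.h>

/* Hypothetical stand-ins for early_ioremap()/early_iounmap(). */
static void *map_window(unsigned long phys, unsigned long len)
{
	(void)len;
	return (void *)phys;
}
static void unmap_window(void *addr, unsigned long len)
{
	(void)addr; (void)len;
}

#define MAP_CHUNK (64 * 1024)	/* stand-in for MAX_MAP_CHUNK */

/* Copy size bytes from a "physical" source that is only reachable through
 * a temporary mapping of at most MAP_CHUNK bytes at a time. */
static void copy_through_window(char *dst, unsigned long src_phys,
				unsigned long size)
{
	while (size) {
		unsigned long slop = src_phys & 0xfff;	/* offset within page */
		unsigned long clen = size;
		char *p;

		if (clen > MAP_CHUNK - slop)
			clen = MAP_CHUNK - slop;

		p = map_window(src_phys & ~0xffful, clen + slop);
		memcpy(dst, p + slop, clen);
		unmap_window(p, clen + slop);

		dst      += clen;
		src_phys += clen;
		size     -= clen;
	}
}
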
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 30d94d1d5f5f..c8939dfddfba 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -30,6 +30,7 @@
30#include <linux/crash_dump.h> 30#include <linux/crash_dump.h>
31#include <linux/root_dev.h> 31#include <linux/root_dev.h>
32#include <linux/pci.h> 32#include <linux/pci.h>
33#include <linux/efi.h>
33#include <linux/acpi.h> 34#include <linux/acpi.h>
34#include <linux/kallsyms.h> 35#include <linux/kallsyms.h>
35#include <linux/edd.h> 36#include <linux/edd.h>
@@ -39,10 +40,13 @@
39#include <linux/dmi.h> 40#include <linux/dmi.h>
40#include <linux/dma-mapping.h> 41#include <linux/dma-mapping.h>
41#include <linux/ctype.h> 42#include <linux/ctype.h>
43#include <linux/uaccess.h>
44#include <linux/init_ohci1394_dma.h>
42 45
43#include <asm/mtrr.h> 46#include <asm/mtrr.h>
44#include <asm/uaccess.h> 47#include <asm/uaccess.h>
45#include <asm/system.h> 48#include <asm/system.h>
49#include <asm/vsyscall.h>
46#include <asm/io.h> 50#include <asm/io.h>
47#include <asm/smp.h> 51#include <asm/smp.h>
48#include <asm/msr.h> 52#include <asm/msr.h>
@@ -50,6 +54,7 @@
50#include <video/edid.h> 54#include <video/edid.h>
51#include <asm/e820.h> 55#include <asm/e820.h>
52#include <asm/dma.h> 56#include <asm/dma.h>
57#include <asm/gart.h>
53#include <asm/mpspec.h> 58#include <asm/mpspec.h>
54#include <asm/mmu_context.h> 59#include <asm/mmu_context.h>
55#include <asm/proto.h> 60#include <asm/proto.h>
@@ -59,6 +64,15 @@
59#include <asm/sections.h> 64#include <asm/sections.h>
60#include <asm/dmi.h> 65#include <asm/dmi.h>
61#include <asm/cacheflush.h> 66#include <asm/cacheflush.h>
67#include <asm/mce.h>
68#include <asm/ds.h>
69#include <asm/topology.h>
70
71#ifdef CONFIG_PARAVIRT
72#include <asm/paravirt.h>
73#else
74#define ARCH_SETUP
75#endif
62 76
63/* 77/*
64 * Machine setup.. 78 * Machine setup..
@@ -67,6 +81,8 @@
67struct cpuinfo_x86 boot_cpu_data __read_mostly; 81struct cpuinfo_x86 boot_cpu_data __read_mostly;
68EXPORT_SYMBOL(boot_cpu_data); 82EXPORT_SYMBOL(boot_cpu_data);
69 83
84__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
85
70unsigned long mmu_cr4_features; 86unsigned long mmu_cr4_features;
71 87
72/* Boot loader ID as an integer, for the benefit of proc_dointvec */ 88/* Boot loader ID as an integer, for the benefit of proc_dointvec */
@@ -76,7 +92,7 @@ unsigned long saved_video_mode;
76 92
77int force_mwait __cpuinitdata; 93int force_mwait __cpuinitdata;
78 94
79/* 95/*
80 * Early DMI memory 96 * Early DMI memory
81 */ 97 */
82int dmi_alloc_index; 98int dmi_alloc_index;
@@ -122,25 +138,27 @@ struct resource standard_io_resources[] = {
122 138
123#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM) 139#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
124 140
125struct resource data_resource = { 141static struct resource data_resource = {
126 .name = "Kernel data", 142 .name = "Kernel data",
127 .start = 0, 143 .start = 0,
128 .end = 0, 144 .end = 0,
129 .flags = IORESOURCE_RAM, 145 .flags = IORESOURCE_RAM,
130}; 146};
131struct resource code_resource = { 147static struct resource code_resource = {
132 .name = "Kernel code", 148 .name = "Kernel code",
133 .start = 0, 149 .start = 0,
134 .end = 0, 150 .end = 0,
135 .flags = IORESOURCE_RAM, 151 .flags = IORESOURCE_RAM,
136}; 152};
137struct resource bss_resource = { 153static struct resource bss_resource = {
138 .name = "Kernel bss", 154 .name = "Kernel bss",
139 .start = 0, 155 .start = 0,
140 .end = 0, 156 .end = 0,
141 .flags = IORESOURCE_RAM, 157 .flags = IORESOURCE_RAM,
142}; 158};
143 159
160static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
161
144#ifdef CONFIG_PROC_VMCORE 162#ifdef CONFIG_PROC_VMCORE
145/* elfcorehdr= specifies the location of elf core header 163/* elfcorehdr= specifies the location of elf core header
146 * stored by the crashed kernel. This option will be passed 164 * stored by the crashed kernel. This option will be passed
@@ -164,14 +182,15 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
164 unsigned long bootmap_size, bootmap; 182 unsigned long bootmap_size, bootmap;
165 183
166 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; 184 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
167 bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size); 185 bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
186 PAGE_SIZE);
168 if (bootmap == -1L) 187 if (bootmap == -1L)
169 panic("Cannot find bootmem map of size %ld\n",bootmap_size); 188 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
170 bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); 189 bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
171 e820_register_active_regions(0, start_pfn, end_pfn); 190 e820_register_active_regions(0, start_pfn, end_pfn);
172 free_bootmem_with_active_regions(0, end_pfn); 191 free_bootmem_with_active_regions(0, end_pfn);
173 reserve_bootmem(bootmap, bootmap_size); 192 reserve_bootmem(bootmap, bootmap_size);
174} 193}
175#endif 194#endif
176 195
177#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) 196#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
@@ -205,7 +224,8 @@ static void __init reserve_crashkernel(void)
205 unsigned long long crash_size, crash_base; 224 unsigned long long crash_size, crash_base;
206 int ret; 225 int ret;
207 226
208 free_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT; 227 free_mem =
228 ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT;
209 229
210 ret = parse_crashkernel(boot_command_line, free_mem, 230 ret = parse_crashkernel(boot_command_line, free_mem,
211 &crash_size, &crash_base); 231 &crash_size, &crash_base);
@@ -229,33 +249,21 @@ static inline void __init reserve_crashkernel(void)
229{} 249{}
230#endif 250#endif
231 251
232#define EBDA_ADDR_POINTER 0x40E 252/* Overridden in paravirt.c if CONFIG_PARAVIRT */
233 253void __attribute__((weak)) __init memory_setup(void)
234unsigned __initdata ebda_addr;
235unsigned __initdata ebda_size;
236
237static void discover_ebda(void)
238{ 254{
239 /* 255 machine_specific_memory_setup();
240 * there is a real-mode segmented pointer pointing to the
241 * 4K EBDA area at 0x40E
242 */
243 ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
244 ebda_addr <<= 4;
245
246 ebda_size = *(unsigned short *)__va(ebda_addr);
247
248 /* Round EBDA up to pages */
249 if (ebda_size == 0)
250 ebda_size = 1;
251 ebda_size <<= 10;
252 ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
253 if (ebda_size > 64*1024)
254 ebda_size = 64*1024;
255} 256}
256 257
258/*
259 * setup_arch - architecture-specific boot-time initializations
260 *
261 * Note: On x86_64, fixmaps are ready for use even before this is called.
262 */
257void __init setup_arch(char **cmdline_p) 263void __init setup_arch(char **cmdline_p)
258{ 264{
265 unsigned i;
266
259 printk(KERN_INFO "Command line: %s\n", boot_command_line); 267 printk(KERN_INFO "Command line: %s\n", boot_command_line);
260 268
261 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); 269 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
@@ -269,7 +277,15 @@ void __init setup_arch(char **cmdline_p)
269 rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0); 277 rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0);
270 rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0); 278 rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0);
271#endif 279#endif
272 setup_memory_region(); 280#ifdef CONFIG_EFI
281 if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
282 "EL64", 4))
283 efi_enabled = 1;
284#endif
285
286 ARCH_SETUP
287
288 memory_setup();
273 copy_edd(); 289 copy_edd();
274 290
275 if (!boot_params.hdr.root_flags) 291 if (!boot_params.hdr.root_flags)
@@ -293,27 +309,47 @@ void __init setup_arch(char **cmdline_p)
293 309
294 parse_early_param(); 310 parse_early_param();
295 311
312#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
313 if (init_ohci1394_dma_early)
314 init_ohci1394_dma_on_all_controllers();
315#endif
316
296 finish_e820_parsing(); 317 finish_e820_parsing();
297 318
319 early_gart_iommu_check();
320
298 e820_register_active_regions(0, 0, -1UL); 321 e820_register_active_regions(0, 0, -1UL);
299 /* 322 /*
300 * partially used pages are not usable - thus 323 * partially used pages are not usable - thus
301 * we are rounding upwards: 324 * we are rounding upwards:
302 */ 325 */
303 end_pfn = e820_end_of_ram(); 326 end_pfn = e820_end_of_ram();
327 /* update e820 for memory not covered by WB MTRRs */
328 mtrr_bp_init();
329 if (mtrr_trim_uncached_memory(end_pfn)) {
330 e820_register_active_regions(0, 0, -1UL);
331 end_pfn = e820_end_of_ram();
332 }
333
304 num_physpages = end_pfn; 334 num_physpages = end_pfn;
305 335
306 check_efer(); 336 check_efer();
307 337
308 discover_ebda();
309
310 init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT)); 338 init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
339 if (efi_enabled)
340 efi_init();
311 341
312 dmi_scan_machine(); 342 dmi_scan_machine();
313 343
344 io_delay_init();
345
314#ifdef CONFIG_SMP 346#ifdef CONFIG_SMP
315 /* setup to use the static apicid table during kernel startup */ 347 /* setup to use the early static init tables during kernel startup */
316 x86_cpu_to_apicid_ptr = (void *)&x86_cpu_to_apicid_init; 348 x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
349 x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
350#ifdef CONFIG_NUMA
351 x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
352#endif
317#endif 353#endif
318 354
319#ifdef CONFIG_ACPI 355#ifdef CONFIG_ACPI
@@ -340,48 +376,26 @@ void __init setup_arch(char **cmdline_p)
340#endif 376#endif
341 377
342#ifdef CONFIG_NUMA 378#ifdef CONFIG_NUMA
343 numa_initmem_init(0, end_pfn); 379 numa_initmem_init(0, end_pfn);
344#else 380#else
345 contig_initmem_init(0, end_pfn); 381 contig_initmem_init(0, end_pfn);
346#endif 382#endif
347 383
348 /* Reserve direct mapping */ 384 early_res_to_bootmem();
349 reserve_bootmem_generic(table_start << PAGE_SHIFT,
350 (table_end - table_start) << PAGE_SHIFT);
351
352 /* reserve kernel */
353 reserve_bootmem_generic(__pa_symbol(&_text),
354 __pa_symbol(&_end) - __pa_symbol(&_text));
355 385
386#ifdef CONFIG_ACPI_SLEEP
356 /* 387 /*
357 * reserve physical page 0 - it's a special BIOS page on many boxes, 388 * Reserve low memory region for sleep support.
358 * enabling clean reboots, SMP operation, laptop functions.
359 */ 389 */
360 reserve_bootmem_generic(0, PAGE_SIZE); 390 acpi_reserve_bootmem();
361
362 /* reserve ebda region */
363 if (ebda_addr)
364 reserve_bootmem_generic(ebda_addr, ebda_size);
365#ifdef CONFIG_NUMA
366 /* reserve nodemap region */
367 if (nodemap_addr)
368 reserve_bootmem_generic(nodemap_addr, nodemap_size);
369#endif 391#endif
370 392
371#ifdef CONFIG_SMP 393 if (efi_enabled)
372 /* Reserve SMP trampoline */ 394 efi_reserve_bootmem();
373 reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE);
374#endif
375 395
376#ifdef CONFIG_ACPI_SLEEP
377 /* 396 /*
378 * Reserve low memory region for sleep support. 397 * Find and reserve possible boot-time SMP configuration:
379 */ 398 */
380 acpi_reserve_bootmem();
381#endif
382 /*
383 * Find and reserve possible boot-time SMP configuration:
384 */
385 find_smp_config(); 399 find_smp_config();
386#ifdef CONFIG_BLK_DEV_INITRD 400#ifdef CONFIG_BLK_DEV_INITRD
387 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { 401 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
@@ -395,6 +409,8 @@ void __init setup_arch(char **cmdline_p)
395 initrd_start = ramdisk_image + PAGE_OFFSET; 409 initrd_start = ramdisk_image + PAGE_OFFSET;
396 initrd_end = initrd_start+ramdisk_size; 410 initrd_end = initrd_start+ramdisk_size;
397 } else { 411 } else {
412 /* Assumes everything on node 0 */
413 free_bootmem(ramdisk_image, ramdisk_size);
398 printk(KERN_ERR "initrd extends beyond end of memory " 414 printk(KERN_ERR "initrd extends beyond end of memory "
399 "(0x%08lx > 0x%08lx)\ndisabling initrd\n", 415 "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
400 ramdisk_end, end_of_mem); 416 ramdisk_end, end_of_mem);
@@ -404,17 +420,10 @@ void __init setup_arch(char **cmdline_p)
404#endif 420#endif
405 reserve_crashkernel(); 421 reserve_crashkernel();
406 paging_init(); 422 paging_init();
423 map_vsyscall();
407 424
408#ifdef CONFIG_PCI
409 early_quirks(); 425 early_quirks();
410#endif
411 426
412 /*
413 * set this early, so we dont allocate cpu0
414 * if MADT list doesnt list BSP first
415 * mpparse.c/MP_processor_info() allocates logical cpu numbers.
416 */
417 cpu_set(0, cpu_present_map);
418#ifdef CONFIG_ACPI 427#ifdef CONFIG_ACPI
419 /* 428 /*
420 * Read APIC and some other early information from ACPI tables. 429 * Read APIC and some other early information from ACPI tables.
@@ -430,25 +439,24 @@ void __init setup_arch(char **cmdline_p)
430 if (smp_found_config) 439 if (smp_found_config)
431 get_smp_config(); 440 get_smp_config();
432 init_apic_mappings(); 441 init_apic_mappings();
442 ioapic_init_mappings();
433 443
434 /* 444 /*
435 * We trust e820 completely. No explicit ROM probing in memory. 445 * We trust e820 completely. No explicit ROM probing in memory.
436 */ 446 */
437 e820_reserve_resources(); 447 e820_reserve_resources(&code_resource, &data_resource, &bss_resource);
438 e820_mark_nosave_regions(); 448 e820_mark_nosave_regions();
439 449
440 {
441 unsigned i;
442 /* request I/O space for devices used on all i[345]86 PCs */ 450 /* request I/O space for devices used on all i[345]86 PCs */
443 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) 451 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
444 request_resource(&ioport_resource, &standard_io_resources[i]); 452 request_resource(&ioport_resource, &standard_io_resources[i]);
445 }
446 453
447 e820_setup_gap(); 454 e820_setup_gap();
448 455
449#ifdef CONFIG_VT 456#ifdef CONFIG_VT
450#if defined(CONFIG_VGA_CONSOLE) 457#if defined(CONFIG_VGA_CONSOLE)
451 conswitchp = &vga_con; 458 if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
459 conswitchp = &vga_con;
452#elif defined(CONFIG_DUMMY_CONSOLE) 460#elif defined(CONFIG_DUMMY_CONSOLE)
453 conswitchp = &dummy_con; 461 conswitchp = &dummy_con;
454#endif 462#endif
@@ -479,9 +487,10 @@ static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
479 487
480 if (n >= 0x80000005) { 488 if (n >= 0x80000005) {
481 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); 489 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
482 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", 490 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
483 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); 491 "D cache %dK (%d bytes/line)\n",
484 c->x86_cache_size=(ecx>>24)+(edx>>24); 492 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
493 c->x86_cache_size = (ecx>>24) + (edx>>24);
485 /* On K8 L1 TLB is inclusive, so don't count it */ 494 /* On K8 L1 TLB is inclusive, so don't count it */
486 c->x86_tlbsize = 0; 495 c->x86_tlbsize = 0;
487 } 496 }
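The reflowed printk above decodes CPUID leaf 0x80000005: EDX describes the L1 instruction cache and ECX the L1 data cache, with the size in KB in bits 31..24 and the line size in bits 7..0 (the AMD layout; Intel parts report this leaf as zero). A hedged userspace equivalent using GCC's <cpuid.h>:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(0x80000005, &eax, &ebx, &ecx, &edx)) {
                fprintf(stderr, "extended leaf 0x80000005 not supported\n");
                return 1;
        }

        printf("L1 I cache: %uK (%u bytes/line)\n", edx >> 24, edx & 0xff);
        printf("L1 D cache: %uK (%u bytes/line)\n", ecx >> 24, ecx & 0xff);
        return 0;
}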
@@ -495,11 +504,8 @@ static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
495 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", 504 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
496 c->x86_cache_size, ecx & 0xFF); 505 c->x86_cache_size, ecx & 0xFF);
497 } 506 }
498
499 if (n >= 0x80000007)
500 cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power);
501 if (n >= 0x80000008) { 507 if (n >= 0x80000008) {
502 cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); 508 cpuid(0x80000008, &eax, &dummy, &dummy, &dummy);
503 c->x86_virt_bits = (eax >> 8) & 0xff; 509 c->x86_virt_bits = (eax >> 8) & 0xff;
504 c->x86_phys_bits = eax & 0xff; 510 c->x86_phys_bits = eax & 0xff;
505 } 511 }
@@ -508,14 +514,15 @@ static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
508#ifdef CONFIG_NUMA 514#ifdef CONFIG_NUMA
509static int nearby_node(int apicid) 515static int nearby_node(int apicid)
510{ 516{
511 int i; 517 int i, node;
518
512 for (i = apicid - 1; i >= 0; i--) { 519 for (i = apicid - 1; i >= 0; i--) {
513 int node = apicid_to_node[i]; 520 node = apicid_to_node[i];
514 if (node != NUMA_NO_NODE && node_online(node)) 521 if (node != NUMA_NO_NODE && node_online(node))
515 return node; 522 return node;
516 } 523 }
517 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { 524 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
518 int node = apicid_to_node[i]; 525 node = apicid_to_node[i];
519 if (node != NUMA_NO_NODE && node_online(node)) 526 if (node != NUMA_NO_NODE && node_online(node))
520 return node; 527 return node;
521 } 528 }
@@ -527,7 +534,7 @@ static int nearby_node(int apicid)
527 * On a AMD dual core setup the lower bits of the APIC id distingush the cores. 534 * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
528 * Assumes number of cores is a power of two. 535 * Assumes number of cores is a power of two.
529 */ 536 */
530static void __init amd_detect_cmp(struct cpuinfo_x86 *c) 537static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
531{ 538{
532#ifdef CONFIG_SMP 539#ifdef CONFIG_SMP
533 unsigned bits; 540 unsigned bits;
@@ -536,7 +543,54 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
536 int node = 0; 543 int node = 0;
537 unsigned apicid = hard_smp_processor_id(); 544 unsigned apicid = hard_smp_processor_id();
538#endif 545#endif
539 unsigned ecx = cpuid_ecx(0x80000008); 546 bits = c->x86_coreid_bits;
547
548 /* Low order bits define the core id (index of core in socket) */
549 c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
550 /* Convert the APIC ID into the socket ID */
551 c->phys_proc_id = phys_pkg_id(bits);
552
553#ifdef CONFIG_NUMA
554 node = c->phys_proc_id;
555 if (apicid_to_node[apicid] != NUMA_NO_NODE)
556 node = apicid_to_node[apicid];
557 if (!node_online(node)) {
558 /* Two possibilities here:
559 - The CPU is missing memory and no node was created.
560 In that case try picking one from a nearby CPU
561 - The APIC IDs differ from the HyperTransport node IDs
562 which the K8 northbridge parsing fills in.
563 Assume they are all increased by a constant offset,
564 but in the same order as the HT nodeids.
565 If that doesn't result in a usable node fall back to the
566 path for the previous case. */
567
568 int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
569
570 if (ht_nodeid >= 0 &&
571 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
572 node = apicid_to_node[ht_nodeid];
573 /* Pick a nearby node */
574 if (!node_online(node))
575 node = nearby_node(apicid);
576 }
577 numa_set_node(cpu, node);
578
579 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
580#endif
581#endif
582}
583
584static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
585{
586#ifdef CONFIG_SMP
587 unsigned bits, ecx;
588
589 /* Multi core CPU? */
590 if (c->extended_cpuid_level < 0x80000008)
591 return;
592
593 ecx = cpuid_ecx(0x80000008);
540 594
541 c->x86_max_cores = (ecx & 0xff) + 1; 595 c->x86_max_cores = (ecx & 0xff) + 1;
542 596
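amd_detect_cmp() now reuses the cached c->x86_coreid_bits: the low bits of the APIC id select the core within the socket, and the remaining bits identify the package. A standalone sketch of that split, with the APIC id passed in directly instead of being read from the local APIC the way phys_pkg_id() does in the kernel:

#include <stdio.h>

static void split_apicid(unsigned apicid, unsigned coreid_bits,
                         unsigned *core_id, unsigned *pkg_id)
{
        *core_id = apicid & ((1u << coreid_bits) - 1);  /* index of core in socket */
        *pkg_id  = apicid >> coreid_bits;               /* socket / package id */
}

int main(void)
{
        unsigned core, pkg;

        split_apicid(5, 1, &core, &pkg);   /* APIC id 5 on a dual-core part */
        printf("core %u, package %u\n", core, pkg);
        return 0;
}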
@@ -549,37 +603,8 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
549 bits++; 603 bits++;
550 } 604 }
551 605
552 /* Low order bits define the core id (index of core in socket) */ 606 c->x86_coreid_bits = bits;
553 c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
554 /* Convert the APIC ID into the socket ID */
555 c->phys_proc_id = phys_pkg_id(bits);
556
557#ifdef CONFIG_NUMA
558 node = c->phys_proc_id;
559 if (apicid_to_node[apicid] != NUMA_NO_NODE)
560 node = apicid_to_node[apicid];
561 if (!node_online(node)) {
562 /* Two possibilities here:
563 - The CPU is missing memory and no node was created.
564 In that case try picking one from a nearby CPU
565 - The APIC IDs differ from the HyperTransport node IDs
566 which the K8 northbridge parsing fills in.
567 Assume they are all increased by a constant offset,
568 but in the same order as the HT nodeids.
569 If that doesn't result in a usable node fall back to the
570 path for the previous case. */
571 int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
572 if (ht_nodeid >= 0 &&
573 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
574 node = apicid_to_node[ht_nodeid];
575 /* Pick a nearby node */
576 if (!node_online(node))
577 node = nearby_node(apicid);
578 }
579 numa_set_node(cpu, node);
580 607
581 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
582#endif
583#endif 608#endif
584} 609}
585 610
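The new early_init_amd_mc() computes those bits once and caches them in x86_coreid_bits. It prefers the ApicIdCoreIdSize field of CPUID 0x80000008 (ECX bits 15:12, an assumption about the AMD leaf layout) and falls back to the smallest power of two that covers the reported core count, which is what the removed loop above did. A userspace sketch of the fallback path:

#include <stdio.h>

static unsigned coreid_bits(unsigned ecx_80000008)
{
        unsigned cores = (ecx_80000008 & 0xff) + 1;     /* NC field: cores - 1 */
        unsigned bits  = (ecx_80000008 >> 12) & 0xf;    /* ApicIdCoreIdSize, may be 0 */

        if (bits == 0) {
                while ((1u << bits) < cores)            /* round up to a power of two */
                        bits++;
        }
        return bits;
}

int main(void)
{
        printf("3 cores -> %u core-id bits\n", coreid_bits(2));  /* ECX[7:0]=2 means 3 cores */
        return 0;
}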
@@ -595,8 +620,8 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
595/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */ 620/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
596static __cpuinit int amd_apic_timer_broken(void) 621static __cpuinit int amd_apic_timer_broken(void)
597{ 622{
598 u32 lo, hi; 623 u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
599 u32 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); 624
600 switch (eax & CPUID_XFAM) { 625 switch (eax & CPUID_XFAM) {
601 case CPUID_XFAM_K8: 626 case CPUID_XFAM_K8:
602 if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F) 627 if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
@@ -614,6 +639,15 @@ static __cpuinit int amd_apic_timer_broken(void)
614 return 0; 639 return 0;
615} 640}
616 641
642static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
643{
644 early_init_amd_mc(c);
645
646 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
647 if (c->x86_power & (1<<8))
648 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
649}
650
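The new early_init_amd() can test bit 8 of CPUID leaf 0x80000007 EDX (the invariant-TSC bit) because x86_power is now filled in during early identification rather than in display_cacheinfo(). A quick userspace probe of the same bit:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;

        if (__get_cpuid(0x80000007, &eax, &ebx, &ecx, &edx))
                printf("constant/invariant TSC: %s\n",
                       (edx & (1u << 8)) ? "yes" : "no");
        return 0;
}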
617static void __cpuinit init_amd(struct cpuinfo_x86 *c) 651static void __cpuinit init_amd(struct cpuinfo_x86 *c)
618{ 652{
619 unsigned level; 653 unsigned level;
@@ -624,7 +658,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
624 /* 658 /*
625 * Disable TLB flush filter by setting HWCR.FFDIS on K8 659 * Disable TLB flush filter by setting HWCR.FFDIS on K8
626 * bit 6 of msr C001_0015 660 * bit 6 of msr C001_0015
627 * 661 *
628 * Errata 63 for SH-B3 steppings 662 * Errata 63 for SH-B3 steppings
629 * Errata 122 for all steppings (F+ have it disabled by default) 663 * Errata 122 for all steppings (F+ have it disabled by default)
630 */ 664 */
@@ -637,35 +671,32 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
637 671
638 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; 672 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
639 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ 673 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
640 clear_bit(0*32+31, &c->x86_capability); 674 clear_bit(0*32+31, (unsigned long *)&c->x86_capability);
641 675
642 /* On C+ stepping K8 rep microcode works well for copy/memset */ 676 /* On C+ stepping K8 rep microcode works well for copy/memset */
643 level = cpuid_eax(1); 677 level = cpuid_eax(1);
644 if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)) 678 if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
645 set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); 679 level >= 0x0f58))
680 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
646 if (c->x86 == 0x10 || c->x86 == 0x11) 681 if (c->x86 == 0x10 || c->x86 == 0x11)
647 set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); 682 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
648 683
649 /* Enable workaround for FXSAVE leak */ 684 /* Enable workaround for FXSAVE leak */
650 if (c->x86 >= 6) 685 if (c->x86 >= 6)
651 set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability); 686 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
652 687
653 level = get_model_name(c); 688 level = get_model_name(c);
654 if (!level) { 689 if (!level) {
655 switch (c->x86) { 690 switch (c->x86) {
656 case 15: 691 case 15:
657 /* Should distinguish Models here, but this is only 692 /* Should distinguish Models here, but this is only
658 a fallback anyways. */ 693 a fallback anyways. */
659 strcpy(c->x86_model_id, "Hammer"); 694 strcpy(c->x86_model_id, "Hammer");
660 break; 695 break;
661 } 696 }
662 } 697 }
663 display_cacheinfo(c); 698 display_cacheinfo(c);
664 699
665 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
666 if (c->x86_power & (1<<8))
667 set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
668
669 /* Multi core CPU? */ 700 /* Multi core CPU? */
670 if (c->extended_cpuid_level >= 0x80000008) 701 if (c->extended_cpuid_level >= 0x80000008)
671 amd_detect_cmp(c); 702 amd_detect_cmp(c);
@@ -677,41 +708,38 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
677 num_cache_leaves = 3; 708 num_cache_leaves = 3;
678 709
679 if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11) 710 if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
680 set_bit(X86_FEATURE_K8, &c->x86_capability); 711 set_cpu_cap(c, X86_FEATURE_K8);
681
682 /* RDTSC can be speculated around */
683 clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
684 712
685 /* Family 10 doesn't support C states in MWAIT so don't use it */ 713 /* MFENCE stops RDTSC speculation */
686 if (c->x86 == 0x10 && !force_mwait) 714 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
687 clear_bit(X86_FEATURE_MWAIT, &c->x86_capability);
688 715
689 if (amd_apic_timer_broken()) 716 if (amd_apic_timer_broken())
690 disable_apic_timer = 1; 717 disable_apic_timer = 1;
691} 718}
692 719
693static void __cpuinit detect_ht(struct cpuinfo_x86 *c) 720void __cpuinit detect_ht(struct cpuinfo_x86 *c)
694{ 721{
695#ifdef CONFIG_SMP 722#ifdef CONFIG_SMP
696 u32 eax, ebx, ecx, edx; 723 u32 eax, ebx, ecx, edx;
697 int index_msb, core_bits; 724 int index_msb, core_bits;
698 725
699 cpuid(1, &eax, &ebx, &ecx, &edx); 726 cpuid(1, &eax, &ebx, &ecx, &edx);
700 727
701 728
702 if (!cpu_has(c, X86_FEATURE_HT)) 729 if (!cpu_has(c, X86_FEATURE_HT))
703 return; 730 return;
704 if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) 731 if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
705 goto out; 732 goto out;
706 733
707 smp_num_siblings = (ebx & 0xff0000) >> 16; 734 smp_num_siblings = (ebx & 0xff0000) >> 16;
708 735
709 if (smp_num_siblings == 1) { 736 if (smp_num_siblings == 1) {
710 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); 737 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
711 } else if (smp_num_siblings > 1 ) { 738 } else if (smp_num_siblings > 1) {
712 739
713 if (smp_num_siblings > NR_CPUS) { 740 if (smp_num_siblings > NR_CPUS) {
714 printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings); 741 printk(KERN_WARNING "CPU: Unsupported number of "
742 "siblings %d", smp_num_siblings);
715 smp_num_siblings = 1; 743 smp_num_siblings = 1;
716 return; 744 return;
717 } 745 }
@@ -721,7 +749,7 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
721 749
722 smp_num_siblings = smp_num_siblings / c->x86_max_cores; 750 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
723 751
724 index_msb = get_count_order(smp_num_siblings) ; 752 index_msb = get_count_order(smp_num_siblings);
725 753
726 core_bits = get_count_order(c->x86_max_cores); 754 core_bits = get_count_order(c->x86_max_cores);
727 755
@@ -730,8 +758,10 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
730 } 758 }
731out: 759out:
732 if ((c->x86_max_cores * smp_num_siblings) > 1) { 760 if ((c->x86_max_cores * smp_num_siblings) > 1) {
733 printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id); 761 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
734 printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id); 762 c->phys_proc_id);
763 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
764 c->cpu_core_id);
735 } 765 }
736 766
737#endif 767#endif
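detect_ht() takes the logical-CPU count per package from CPUID leaf 1 EBX[23:16] and uses get_count_order() (effectively ceil(log2)) to size the thread and core fields of the APIC id; the kernel additionally divides out x86_max_cores, which the sketch below omits for brevity. A hedged userspace illustration:

#include <stdio.h>
#include <cpuid.h>

static unsigned count_order(unsigned n)    /* ceil(log2(n)); 0 for n <= 1 */
{
        unsigned order = 0;

        while ((1u << order) < n)
                order++;
        return order;
}

int main(void)
{
        unsigned eax, ebx, ecx, edx;
        unsigned siblings;

        __get_cpuid(1, &eax, &ebx, &ecx, &edx);
        siblings = (ebx >> 16) & 0xff;     /* logical processors per package */
        if (siblings == 0)
                siblings = 1;

        printf("siblings per package: %u, index_msb: %u\n",
               siblings, count_order(siblings));
        return 0;
}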
@@ -773,28 +803,39 @@ static void srat_detect_node(void)
773#endif 803#endif
774} 804}
775 805
806static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
807{
808 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
809 (c->x86 == 0x6 && c->x86_model >= 0x0e))
810 set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
811}
812
776static void __cpuinit init_intel(struct cpuinfo_x86 *c) 813static void __cpuinit init_intel(struct cpuinfo_x86 *c)
777{ 814{
778 /* Cache sizes */ 815 /* Cache sizes */
779 unsigned n; 816 unsigned n;
780 817
781 init_intel_cacheinfo(c); 818 init_intel_cacheinfo(c);
782 if (c->cpuid_level > 9 ) { 819 if (c->cpuid_level > 9) {
783 unsigned eax = cpuid_eax(10); 820 unsigned eax = cpuid_eax(10);
784 /* Check for version and the number of counters */ 821 /* Check for version and the number of counters */
785 if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) 822 if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
786 set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability); 823 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
787 } 824 }
788 825
789 if (cpu_has_ds) { 826 if (cpu_has_ds) {
790 unsigned int l1, l2; 827 unsigned int l1, l2;
791 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); 828 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
792 if (!(l1 & (1<<11))) 829 if (!(l1 & (1<<11)))
793 set_bit(X86_FEATURE_BTS, c->x86_capability); 830 set_cpu_cap(c, X86_FEATURE_BTS);
794 if (!(l1 & (1<<12))) 831 if (!(l1 & (1<<12)))
795 set_bit(X86_FEATURE_PEBS, c->x86_capability); 832 set_cpu_cap(c, X86_FEATURE_PEBS);
796 } 833 }
797 834
835
836 if (cpu_has_bts)
837 ds_init_intel(c);
838
798 n = c->extended_cpuid_level; 839 n = c->extended_cpuid_level;
799 if (n >= 0x80000008) { 840 if (n >= 0x80000008) {
800 unsigned eax = cpuid_eax(0x80000008); 841 unsigned eax = cpuid_eax(0x80000008);
@@ -811,14 +852,11 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
811 c->x86_cache_alignment = c->x86_clflush_size * 2; 852 c->x86_cache_alignment = c->x86_clflush_size * 2;
812 if ((c->x86 == 0xf && c->x86_model >= 0x03) || 853 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
813 (c->x86 == 0x6 && c->x86_model >= 0x0e)) 854 (c->x86 == 0x6 && c->x86_model >= 0x0e))
814 set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); 855 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
815 if (c->x86 == 6) 856 if (c->x86 == 6)
816 set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); 857 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
817 if (c->x86 == 15) 858 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
818 set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); 859 c->x86_max_cores = intel_num_cpu_cores(c);
819 else
820 clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
821 c->x86_max_cores = intel_num_cpu_cores(c);
822 860
823 srat_detect_node(); 861 srat_detect_node();
824} 862}
@@ -835,18 +873,12 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
835 c->x86_vendor = X86_VENDOR_UNKNOWN; 873 c->x86_vendor = X86_VENDOR_UNKNOWN;
836} 874}
837 875
838struct cpu_model_info {
839 int vendor;
840 int family;
841 char *model_names[16];
842};
843
844/* Do some early cpuid on the boot CPU to get some parameter that are 876/* Do some early cpuid on the boot CPU to get some parameter that are
845 needed before check_bugs. Everything advanced is in identify_cpu 877 needed before check_bugs. Everything advanced is in identify_cpu
846 below. */ 878 below. */
847void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) 879static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
848{ 880{
849 u32 tfms; 881 u32 tfms, xlvl;
850 882
851 c->loops_per_jiffy = loops_per_jiffy; 883 c->loops_per_jiffy = loops_per_jiffy;
852 c->x86_cache_size = -1; 884 c->x86_cache_size = -1;
@@ -857,6 +889,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
857 c->x86_clflush_size = 64; 889 c->x86_clflush_size = 64;
858 c->x86_cache_alignment = c->x86_clflush_size; 890 c->x86_cache_alignment = c->x86_clflush_size;
859 c->x86_max_cores = 1; 891 c->x86_max_cores = 1;
892 c->x86_coreid_bits = 0;
860 c->extended_cpuid_level = 0; 893 c->extended_cpuid_level = 0;
861 memset(&c->x86_capability, 0, sizeof c->x86_capability); 894 memset(&c->x86_capability, 0, sizeof c->x86_capability);
862 895
@@ -865,7 +898,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
865 (unsigned int *)&c->x86_vendor_id[0], 898 (unsigned int *)&c->x86_vendor_id[0],
866 (unsigned int *)&c->x86_vendor_id[8], 899 (unsigned int *)&c->x86_vendor_id[8],
867 (unsigned int *)&c->x86_vendor_id[4]); 900 (unsigned int *)&c->x86_vendor_id[4]);
868 901
869 get_cpu_vendor(c); 902 get_cpu_vendor(c);
870 903
871 /* Initialize the standard set of capabilities */ 904 /* Initialize the standard set of capabilities */
@@ -883,7 +916,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
883 c->x86 += (tfms >> 20) & 0xff; 916 c->x86 += (tfms >> 20) & 0xff;
884 if (c->x86 >= 0x6) 917 if (c->x86 >= 0x6)
885 c->x86_model += ((tfms >> 16) & 0xF) << 4; 918 c->x86_model += ((tfms >> 16) & 0xF) << 4;
886 if (c->x86_capability[0] & (1<<19)) 919 if (c->x86_capability[0] & (1<<19))
887 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; 920 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
888 } else { 921 } else {
889 /* Have CPUID level 0 only - unheard of */ 922 /* Have CPUID level 0 only - unheard of */
@@ -893,18 +926,6 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
893#ifdef CONFIG_SMP 926#ifdef CONFIG_SMP
894 c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; 927 c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
895#endif 928#endif
896}
897
898/*
899 * This does the hard work of actually picking apart the CPU stuff...
900 */
901void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
902{
903 int i;
904 u32 xlvl;
905
906 early_identify_cpu(c);
907
908 /* AMD-defined flags: level 0x80000001 */ 929 /* AMD-defined flags: level 0x80000001 */
909 xlvl = cpuid_eax(0x80000000); 930 xlvl = cpuid_eax(0x80000000);
910 c->extended_cpuid_level = xlvl; 931 c->extended_cpuid_level = xlvl;
@@ -925,6 +946,30 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
925 c->x86_capability[2] = cpuid_edx(0x80860001); 946 c->x86_capability[2] = cpuid_edx(0x80860001);
926 } 947 }
927 948
949 c->extended_cpuid_level = cpuid_eax(0x80000000);
950 if (c->extended_cpuid_level >= 0x80000007)
951 c->x86_power = cpuid_edx(0x80000007);
952
953 switch (c->x86_vendor) {
954 case X86_VENDOR_AMD:
955 early_init_amd(c);
956 break;
957 case X86_VENDOR_INTEL:
958 early_init_intel(c);
959 break;
960 }
961
962}
963
964/*
965 * This does the hard work of actually picking apart the CPU stuff...
966 */
967void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
968{
969 int i;
970
971 early_identify_cpu(c);
972
928 init_scattered_cpuid_features(c); 973 init_scattered_cpuid_features(c);
929 974
930 c->apicid = phys_pkg_id(0); 975 c->apicid = phys_pkg_id(0);
@@ -954,8 +999,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
954 break; 999 break;
955 } 1000 }
956 1001
957 select_idle_routine(c); 1002 detect_ht(c);
958 detect_ht(c);
959 1003
960 /* 1004 /*
961 * On SMP, boot_cpu_data holds the common feature set between 1005 * On SMP, boot_cpu_data holds the common feature set between
@@ -965,31 +1009,55 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
965 */ 1009 */
966 if (c != &boot_cpu_data) { 1010 if (c != &boot_cpu_data) {
967 /* AND the already accumulated flags with these */ 1011 /* AND the already accumulated flags with these */
968 for (i = 0 ; i < NCAPINTS ; i++) 1012 for (i = 0; i < NCAPINTS; i++)
969 boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; 1013 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
970 } 1014 }
971 1015
 1016 /* Clear all flags overridden by options */

1017 for (i = 0; i < NCAPINTS; i++)
1018 c->x86_capability[i] ^= cleared_cpu_caps[i];
1019
972#ifdef CONFIG_X86_MCE 1020#ifdef CONFIG_X86_MCE
973 mcheck_init(c); 1021 mcheck_init(c);
974#endif 1022#endif
1023 select_idle_routine(c);
1024
975 if (c != &boot_cpu_data) 1025 if (c != &boot_cpu_data)
976 mtrr_ap_init(); 1026 mtrr_ap_init();
977#ifdef CONFIG_NUMA 1027#ifdef CONFIG_NUMA
978 numa_add_cpu(smp_processor_id()); 1028 numa_add_cpu(smp_processor_id());
979#endif 1029#endif
1030
980} 1031}
981 1032
1033static __init int setup_noclflush(char *arg)
1034{
1035 setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
1036 return 1;
1037}
1038__setup("noclflush", setup_noclflush);
982 1039
983void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) 1040void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
984{ 1041{
985 if (c->x86_model_id[0]) 1042 if (c->x86_model_id[0])
986 printk("%s", c->x86_model_id); 1043 printk(KERN_INFO "%s", c->x86_model_id);
987 1044
988 if (c->x86_mask || c->cpuid_level >= 0) 1045 if (c->x86_mask || c->cpuid_level >= 0)
989 printk(" stepping %02x\n", c->x86_mask); 1046 printk(KERN_CONT " stepping %02x\n", c->x86_mask);
990 else 1047 else
991 printk("\n"); 1048 printk(KERN_CONT "\n");
1049}
1050
1051static __init int setup_disablecpuid(char *arg)
1052{
1053 int bit;
1054 if (get_option(&arg, &bit) && bit < NCAPINTS*32)
1055 setup_clear_cpu_cap(bit);
1056 else
1057 return 0;
1058 return 1;
992} 1059}
1060__setup("clearcpuid=", setup_disablecpuid);
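Both new __setup() handlers parse a plain integer from the command line and arrange for the matching capability bit to be cleared before the feature tables are finalised. A hedged userspace re-implementation of the clearcpuid= bounds check (NCAPINTS was 8 32-bit words in kernels of this vintage; treat that value as an assumption):

#include <stdio.h>
#include <stdlib.h>

#define NCAPINTS 8      /* number of 32-bit capability words, per the old cpufeature.h */

int main(int argc, char **argv)
{
        long bit;

        if (argc < 2)
                return 1;
        bit = strtol(argv[1], NULL, 0);
        if (bit < 0 || bit >= NCAPINTS * 32) {
                fprintf(stderr, "clearcpuid=%ld: out of range\n", bit);
                return 1;
        }
        printf("would clear capability word %ld, bit %ld\n", bit / 32, bit % 32);
        return 0;
}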
993 1061
994/* 1062/*
995 * Get CPU information for use by the procfs. 1063 * Get CPU information for use by the procfs.
@@ -998,116 +1066,41 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
998static int show_cpuinfo(struct seq_file *m, void *v) 1066static int show_cpuinfo(struct seq_file *m, void *v)
999{ 1067{
1000 struct cpuinfo_x86 *c = v; 1068 struct cpuinfo_x86 *c = v;
1001 int cpu = 0; 1069 int cpu = 0, i;
1002
1003 /*
1004 * These flag bits must match the definitions in <asm/cpufeature.h>.
1005 * NULL means this bit is undefined or reserved; either way it doesn't
1006 * have meaning as far as Linux is concerned. Note that it's important
1007 * to realize there is a difference between this table and CPUID -- if
1008 * applications want to get the raw CPUID data, they should access
1009 * /dev/cpu/<cpu_nr>/cpuid instead.
1010 */
1011 static const char *const x86_cap_flags[] = {
1012 /* Intel-defined */
1013 "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
1014 "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
1015 "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
1016 "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
1017
1018 /* AMD-defined */
1019 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1020 NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
1021 NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
1022 NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
1023 "3dnowext", "3dnow",
1024
1025 /* Transmeta-defined */
1026 "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
1027 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1028 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1029 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1030
1031 /* Other (Linux-defined) */
1032 "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
1033 NULL, NULL, NULL, NULL,
1034 "constant_tsc", "up", NULL, "arch_perfmon",
1035 "pebs", "bts", NULL, "sync_rdtsc",
1036 "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1037 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1038
1039 /* Intel-defined (#2) */
1040 "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
1041 "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
1042 NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt",
1043 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1044
1045 /* VIA/Cyrix/Centaur-defined */
1046 NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
1047 "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
1048 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1049 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1050
1051 /* AMD-defined (#2) */
1052 "lahf_lm", "cmp_legacy", "svm", "extapic",
1053 "cr8_legacy", "abm", "sse4a", "misalignsse",
1054 "3dnowprefetch", "osvw", "ibs", "sse5",
1055 "skinit", "wdt", NULL, NULL,
1056 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1057 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1058
1059 /* Auxiliary (Linux-defined) */
1060 "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1061 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1062 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1063 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
1064 };
1065 static const char *const x86_power_flags[] = {
1066 "ts", /* temperature sensor */
1067 "fid", /* frequency id control */
1068 "vid", /* voltage id control */
1069 "ttp", /* thermal trip */
1070 "tm",
1071 "stc",
1072 "100mhzsteps",
1073 "hwpstate",
1074 "", /* tsc invariant mapped to constant_tsc */
1075 /* nothing */
1076 };
1077
1078 1070
1079#ifdef CONFIG_SMP 1071#ifdef CONFIG_SMP
1080 cpu = c->cpu_index; 1072 cpu = c->cpu_index;
1081#endif 1073#endif
1082 1074
1083 seq_printf(m,"processor\t: %u\n" 1075 seq_printf(m, "processor\t: %u\n"
1084 "vendor_id\t: %s\n" 1076 "vendor_id\t: %s\n"
1085 "cpu family\t: %d\n" 1077 "cpu family\t: %d\n"
1086 "model\t\t: %d\n" 1078 "model\t\t: %d\n"
1087 "model name\t: %s\n", 1079 "model name\t: %s\n",
1088 (unsigned)cpu, 1080 (unsigned)cpu,
1089 c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", 1081 c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
1090 c->x86, 1082 c->x86,
1091 (int)c->x86_model, 1083 (int)c->x86_model,
1092 c->x86_model_id[0] ? c->x86_model_id : "unknown"); 1084 c->x86_model_id[0] ? c->x86_model_id : "unknown");
1093 1085
1094 if (c->x86_mask || c->cpuid_level >= 0) 1086 if (c->x86_mask || c->cpuid_level >= 0)
1095 seq_printf(m, "stepping\t: %d\n", c->x86_mask); 1087 seq_printf(m, "stepping\t: %d\n", c->x86_mask);
1096 else 1088 else
1097 seq_printf(m, "stepping\t: unknown\n"); 1089 seq_printf(m, "stepping\t: unknown\n");
1098 1090
1099 if (cpu_has(c,X86_FEATURE_TSC)) { 1091 if (cpu_has(c, X86_FEATURE_TSC)) {
1100 unsigned int freq = cpufreq_quick_get((unsigned)cpu); 1092 unsigned int freq = cpufreq_quick_get((unsigned)cpu);
1093
1101 if (!freq) 1094 if (!freq)
1102 freq = cpu_khz; 1095 freq = cpu_khz;
1103 seq_printf(m, "cpu MHz\t\t: %u.%03u\n", 1096 seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
1104 freq / 1000, (freq % 1000)); 1097 freq / 1000, (freq % 1000));
1105 } 1098 }
1106 1099
1107 /* Cache size */ 1100 /* Cache size */
1108 if (c->x86_cache_size >= 0) 1101 if (c->x86_cache_size >= 0)
1109 seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); 1102 seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
1110 1103
1111#ifdef CONFIG_SMP 1104#ifdef CONFIG_SMP
1112 if (smp_num_siblings * c->x86_max_cores > 1) { 1105 if (smp_num_siblings * c->x86_max_cores > 1) {
1113 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); 1106 seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
@@ -1116,48 +1109,43 @@ static int show_cpuinfo(struct seq_file *m, void *v)
1116 seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); 1109 seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
1117 seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); 1110 seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
1118 } 1111 }
1119#endif 1112#endif
1120 1113
1121 seq_printf(m, 1114 seq_printf(m,
1122 "fpu\t\t: yes\n" 1115 "fpu\t\t: yes\n"
1123 "fpu_exception\t: yes\n" 1116 "fpu_exception\t: yes\n"
1124 "cpuid level\t: %d\n" 1117 "cpuid level\t: %d\n"
1125 "wp\t\t: yes\n" 1118 "wp\t\t: yes\n"
1126 "flags\t\t:", 1119 "flags\t\t:",
1127 c->cpuid_level); 1120 c->cpuid_level);
1128 1121
1129 { 1122 for (i = 0; i < 32*NCAPINTS; i++)
1130 int i; 1123 if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
1131 for ( i = 0 ; i < 32*NCAPINTS ; i++ ) 1124 seq_printf(m, " %s", x86_cap_flags[i]);
1132 if (cpu_has(c, i) && x86_cap_flags[i] != NULL) 1125
1133 seq_printf(m, " %s", x86_cap_flags[i]);
1134 }
1135
1136 seq_printf(m, "\nbogomips\t: %lu.%02lu\n", 1126 seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
1137 c->loops_per_jiffy/(500000/HZ), 1127 c->loops_per_jiffy/(500000/HZ),
1138 (c->loops_per_jiffy/(5000/HZ)) % 100); 1128 (c->loops_per_jiffy/(5000/HZ)) % 100);
1139 1129
1140 if (c->x86_tlbsize > 0) 1130 if (c->x86_tlbsize > 0)
1141 seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); 1131 seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
1142 seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size); 1132 seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
1143 seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); 1133 seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
1144 1134
1145 seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", 1135 seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
1146 c->x86_phys_bits, c->x86_virt_bits); 1136 c->x86_phys_bits, c->x86_virt_bits);
1147 1137
1148 seq_printf(m, "power management:"); 1138 seq_printf(m, "power management:");
1149 { 1139 for (i = 0; i < 32; i++) {
1150 unsigned i; 1140 if (c->x86_power & (1 << i)) {
1151 for (i = 0; i < 32; i++) 1141 if (i < ARRAY_SIZE(x86_power_flags) &&
1152 if (c->x86_power & (1 << i)) { 1142 x86_power_flags[i])
1153 if (i < ARRAY_SIZE(x86_power_flags) && 1143 seq_printf(m, "%s%s",
1154 x86_power_flags[i]) 1144 x86_power_flags[i][0]?" ":"",
1155 seq_printf(m, "%s%s", 1145 x86_power_flags[i]);
1156 x86_power_flags[i][0]?" ":"", 1146 else
1157 x86_power_flags[i]); 1147 seq_printf(m, " [%d]", i);
1158 else 1148 }
1159 seq_printf(m, " [%d]", i);
1160 }
1161 } 1149 }
1162 1150
1163 seq_printf(m, "\n\n"); 1151 seq_printf(m, "\n\n");
@@ -1184,8 +1172,8 @@ static void c_stop(struct seq_file *m, void *v)
1184{ 1172{
1185} 1173}
1186 1174
1187struct seq_operations cpuinfo_op = { 1175const struct seq_operations cpuinfo_op = {
1188 .start =c_start, 1176 .start = c_start,
1189 .next = c_next, 1177 .next = c_next,
1190 .stop = c_stop, 1178 .stop = c_stop,
1191 .show = show_cpuinfo, 1179 .show = show_cpuinfo,
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index 9bdd83022f5f..caee1f002fed 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -23,6 +23,7 @@
23#include <asm/ucontext.h> 23#include <asm/ucontext.h>
24#include <asm/uaccess.h> 24#include <asm/uaccess.h>
25#include <asm/i387.h> 25#include <asm/i387.h>
26#include <asm/vdso.h>
26#include "sigframe_32.h" 27#include "sigframe_32.h"
27 28
28#define DEBUG_SIG 0 29#define DEBUG_SIG 0
@@ -81,14 +82,14 @@ sys_sigaction(int sig, const struct old_sigaction __user *act,
81} 82}
82 83
83asmlinkage int 84asmlinkage int
84sys_sigaltstack(unsigned long ebx) 85sys_sigaltstack(unsigned long bx)
85{ 86{
86 /* This is needed to make gcc realize it doesn't own the "struct pt_regs" */ 87 /* This is needed to make gcc realize it doesn't own the "struct pt_regs" */
87 struct pt_regs *regs = (struct pt_regs *)&ebx; 88 struct pt_regs *regs = (struct pt_regs *)&bx;
88 const stack_t __user *uss = (const stack_t __user *)ebx; 89 const stack_t __user *uss = (const stack_t __user *)bx;
89 stack_t __user *uoss = (stack_t __user *)regs->ecx; 90 stack_t __user *uoss = (stack_t __user *)regs->cx;
90 91
91 return do_sigaltstack(uss, uoss, regs->esp); 92 return do_sigaltstack(uss, uoss, regs->sp);
92} 93}
93 94
94 95
@@ -109,12 +110,12 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax
109#define COPY_SEG(seg) \ 110#define COPY_SEG(seg) \
110 { unsigned short tmp; \ 111 { unsigned short tmp; \
111 err |= __get_user(tmp, &sc->seg); \ 112 err |= __get_user(tmp, &sc->seg); \
112 regs->x##seg = tmp; } 113 regs->seg = tmp; }
113 114
114#define COPY_SEG_STRICT(seg) \ 115#define COPY_SEG_STRICT(seg) \
115 { unsigned short tmp; \ 116 { unsigned short tmp; \
116 err |= __get_user(tmp, &sc->seg); \ 117 err |= __get_user(tmp, &sc->seg); \
117 regs->x##seg = tmp|3; } 118 regs->seg = tmp|3; }
118 119
119#define GET_SEG(seg) \ 120#define GET_SEG(seg) \
120 { unsigned short tmp; \ 121 { unsigned short tmp; \
@@ -130,22 +131,22 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax
130 COPY_SEG(fs); 131 COPY_SEG(fs);
131 COPY_SEG(es); 132 COPY_SEG(es);
132 COPY_SEG(ds); 133 COPY_SEG(ds);
133 COPY(edi); 134 COPY(di);
134 COPY(esi); 135 COPY(si);
135 COPY(ebp); 136 COPY(bp);
136 COPY(esp); 137 COPY(sp);
137 COPY(ebx); 138 COPY(bx);
138 COPY(edx); 139 COPY(dx);
139 COPY(ecx); 140 COPY(cx);
140 COPY(eip); 141 COPY(ip);
141 COPY_SEG_STRICT(cs); 142 COPY_SEG_STRICT(cs);
142 COPY_SEG_STRICT(ss); 143 COPY_SEG_STRICT(ss);
143 144
144 { 145 {
145 unsigned int tmpflags; 146 unsigned int tmpflags;
146 err |= __get_user(tmpflags, &sc->eflags); 147 err |= __get_user(tmpflags, &sc->flags);
147 regs->eflags = (regs->eflags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); 148 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
148 regs->orig_eax = -1; /* disable syscall checks */ 149 regs->orig_ax = -1; /* disable syscall checks */
149 } 150 }
150 151
151 { 152 {
@@ -164,7 +165,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax
164 } 165 }
165 } 166 }
166 167
167 err |= __get_user(*peax, &sc->eax); 168 err |= __get_user(*peax, &sc->ax);
168 return err; 169 return err;
169 170
170badframe: 171badframe:
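The register rename running through this file (eax/esp/xds become ax/sp/ds) is what lets the COPY()/COPY_SEG() macros paste the field name straight into pt_regs instead of prefixing it with "x". A toy model of the pattern, with invented stand-in structs rather than the real pt_regs/sigcontext:

#include <stdio.h>

struct fake_regs       { unsigned long di, si, ip, sp; };
struct fake_sigcontext { unsigned long di, si, ip, sp; };

#define COPY(x) (regs->x = sc->x)   /* field name used verbatim on both sides */

int main(void)
{
        struct fake_sigcontext frame = { .di = 1, .si = 2, .ip = 3, .sp = 4 };
        struct fake_sigcontext *sc = &frame;
        struct fake_regs r = { 0 }, *regs = &r;

        COPY(di); COPY(si); COPY(ip); COPY(sp);
        printf("ip=%lu sp=%lu\n", regs->ip, regs->sp);
        return 0;
}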
@@ -174,9 +175,9 @@ badframe:
174asmlinkage int sys_sigreturn(unsigned long __unused) 175asmlinkage int sys_sigreturn(unsigned long __unused)
175{ 176{
176 struct pt_regs *regs = (struct pt_regs *) &__unused; 177 struct pt_regs *regs = (struct pt_regs *) &__unused;
177 struct sigframe __user *frame = (struct sigframe __user *)(regs->esp - 8); 178 struct sigframe __user *frame = (struct sigframe __user *)(regs->sp - 8);
178 sigset_t set; 179 sigset_t set;
179 int eax; 180 int ax;
180 181
181 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) 182 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
182 goto badframe; 183 goto badframe;
@@ -192,17 +193,20 @@ asmlinkage int sys_sigreturn(unsigned long __unused)
192 recalc_sigpending(); 193 recalc_sigpending();
193 spin_unlock_irq(&current->sighand->siglock); 194 spin_unlock_irq(&current->sighand->siglock);
194 195
195 if (restore_sigcontext(regs, &frame->sc, &eax)) 196 if (restore_sigcontext(regs, &frame->sc, &ax))
196 goto badframe; 197 goto badframe;
197 return eax; 198 return ax;
198 199
199badframe: 200badframe:
200 if (show_unhandled_signals && printk_ratelimit()) 201 if (show_unhandled_signals && printk_ratelimit()) {
201 printk("%s%s[%d] bad frame in sigreturn frame:%p eip:%lx" 202 printk("%s%s[%d] bad frame in sigreturn frame:%p ip:%lx"
202 " esp:%lx oeax:%lx\n", 203 " sp:%lx oeax:%lx",
203 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, 204 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
204 current->comm, task_pid_nr(current), frame, regs->eip, 205 current->comm, task_pid_nr(current), frame, regs->ip,
205 regs->esp, regs->orig_eax); 206 regs->sp, regs->orig_ax);
207 print_vma_addr(" in ", regs->ip);
208 printk("\n");
209 }
206 210
207 force_sig(SIGSEGV, current); 211 force_sig(SIGSEGV, current);
208 return 0; 212 return 0;
@@ -211,9 +215,9 @@ badframe:
211asmlinkage int sys_rt_sigreturn(unsigned long __unused) 215asmlinkage int sys_rt_sigreturn(unsigned long __unused)
212{ 216{
213 struct pt_regs *regs = (struct pt_regs *) &__unused; 217 struct pt_regs *regs = (struct pt_regs *) &__unused;
214 struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(regs->esp - 4); 218 struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(regs->sp - 4);
215 sigset_t set; 219 sigset_t set;
216 int eax; 220 int ax;
217 221
218 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) 222 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
219 goto badframe; 223 goto badframe;
@@ -226,13 +230,13 @@ asmlinkage int sys_rt_sigreturn(unsigned long __unused)
226 recalc_sigpending(); 230 recalc_sigpending();
227 spin_unlock_irq(&current->sighand->siglock); 231 spin_unlock_irq(&current->sighand->siglock);
228 232
229 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) 233 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
230 goto badframe; 234 goto badframe;
231 235
232 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->esp) == -EFAULT) 236 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
233 goto badframe; 237 goto badframe;
234 238
235 return eax; 239 return ax;
236 240
237badframe: 241badframe:
238 force_sig(SIGSEGV, current); 242 force_sig(SIGSEGV, current);
@@ -249,27 +253,27 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
249{ 253{
250 int tmp, err = 0; 254 int tmp, err = 0;
251 255
252 err |= __put_user(regs->xfs, (unsigned int __user *)&sc->fs); 256 err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs);
253 savesegment(gs, tmp); 257 savesegment(gs, tmp);
254 err |= __put_user(tmp, (unsigned int __user *)&sc->gs); 258 err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
255 259
256 err |= __put_user(regs->xes, (unsigned int __user *)&sc->es); 260 err |= __put_user(regs->es, (unsigned int __user *)&sc->es);
257 err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds); 261 err |= __put_user(regs->ds, (unsigned int __user *)&sc->ds);
258 err |= __put_user(regs->edi, &sc->edi); 262 err |= __put_user(regs->di, &sc->di);
259 err |= __put_user(regs->esi, &sc->esi); 263 err |= __put_user(regs->si, &sc->si);
260 err |= __put_user(regs->ebp, &sc->ebp); 264 err |= __put_user(regs->bp, &sc->bp);
261 err |= __put_user(regs->esp, &sc->esp); 265 err |= __put_user(regs->sp, &sc->sp);
262 err |= __put_user(regs->ebx, &sc->ebx); 266 err |= __put_user(regs->bx, &sc->bx);
263 err |= __put_user(regs->edx, &sc->edx); 267 err |= __put_user(regs->dx, &sc->dx);
264 err |= __put_user(regs->ecx, &sc->ecx); 268 err |= __put_user(regs->cx, &sc->cx);
265 err |= __put_user(regs->eax, &sc->eax); 269 err |= __put_user(regs->ax, &sc->ax);
266 err |= __put_user(current->thread.trap_no, &sc->trapno); 270 err |= __put_user(current->thread.trap_no, &sc->trapno);
267 err |= __put_user(current->thread.error_code, &sc->err); 271 err |= __put_user(current->thread.error_code, &sc->err);
268 err |= __put_user(regs->eip, &sc->eip); 272 err |= __put_user(regs->ip, &sc->ip);
269 err |= __put_user(regs->xcs, (unsigned int __user *)&sc->cs); 273 err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs);
270 err |= __put_user(regs->eflags, &sc->eflags); 274 err |= __put_user(regs->flags, &sc->flags);
271 err |= __put_user(regs->esp, &sc->esp_at_signal); 275 err |= __put_user(regs->sp, &sc->sp_at_signal);
272 err |= __put_user(regs->xss, (unsigned int __user *)&sc->ss); 276 err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss);
273 277
274 tmp = save_i387(fpstate); 278 tmp = save_i387(fpstate);
275 if (tmp < 0) 279 if (tmp < 0)
@@ -290,29 +294,36 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
290static inline void __user * 294static inline void __user *
291get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) 295get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size)
292{ 296{
293 unsigned long esp; 297 unsigned long sp;
294 298
295 /* Default to using normal stack */ 299 /* Default to using normal stack */
296 esp = regs->esp; 300 sp = regs->sp;
301
302 /*
303 * If we are on the alternate signal stack and would overflow it, don't.
304 * Return an always-bogus address instead so we will die with SIGSEGV.
305 */
306 if (on_sig_stack(sp) && !likely(on_sig_stack(sp - frame_size)))
307 return (void __user *) -1L;
297 308
298 /* This is the X/Open sanctioned signal stack switching. */ 309 /* This is the X/Open sanctioned signal stack switching. */
299 if (ka->sa.sa_flags & SA_ONSTACK) { 310 if (ka->sa.sa_flags & SA_ONSTACK) {
300 if (sas_ss_flags(esp) == 0) 311 if (sas_ss_flags(sp) == 0)
301 esp = current->sas_ss_sp + current->sas_ss_size; 312 sp = current->sas_ss_sp + current->sas_ss_size;
302 } 313 }
303 314
304 /* This is the legacy signal stack switching. */ 315 /* This is the legacy signal stack switching. */
305 else if ((regs->xss & 0xffff) != __USER_DS && 316 else if ((regs->ss & 0xffff) != __USER_DS &&
306 !(ka->sa.sa_flags & SA_RESTORER) && 317 !(ka->sa.sa_flags & SA_RESTORER) &&
307 ka->sa.sa_restorer) { 318 ka->sa.sa_restorer) {
308 esp = (unsigned long) ka->sa.sa_restorer; 319 sp = (unsigned long) ka->sa.sa_restorer;
309 } 320 }
310 321
311 esp -= frame_size; 322 sp -= frame_size;
312 /* Align the stack pointer according to the i386 ABI, 323 /* Align the stack pointer according to the i386 ABI,
313 * i.e. so that on function entry ((sp + 4) & 15) == 0. */ 324 * i.e. so that on function entry ((sp + 4) & 15) == 0. */
314 esp = ((esp + 4) & -16ul) - 4; 325 sp = ((sp + 4) & -16ul) - 4;
315 return (void __user *) esp; 326 return (void __user *) sp;
316} 327}
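get_sigframe() now also refuses to overflow the alternate signal stack, and the final realignment preserves the i386 ABI invariant that ((sp + 4) & 15) == 0 at handler entry. The arithmetic in isolation, with an arbitrary example stack pointer and frame size:

#include <stdio.h>

static unsigned long align_sigframe(unsigned long sp, unsigned long frame_size)
{
        sp -= frame_size;
        sp = ((sp + 4) & -16ul) - 4;    /* same expression as in get_sigframe() */
        return sp;
}

int main(void)
{
        unsigned long sp = align_sigframe(0xbffff123ul, 0x2e0);

        printf("sp=%#lx, (sp + 4) %% 16 = %lu\n", sp, (sp + 4) % 16);
        return 0;
}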
317 328
318/* These symbols are defined with the addresses in the vsyscall page. 329/* These symbols are defined with the addresses in the vsyscall page.
@@ -355,9 +366,9 @@ static int setup_frame(int sig, struct k_sigaction *ka,
355 } 366 }
356 367
357 if (current->binfmt->hasvdso) 368 if (current->binfmt->hasvdso)
358 restorer = (void *)VDSO_SYM(&__kernel_sigreturn); 369 restorer = VDSO32_SYMBOL(current->mm->context.vdso, sigreturn);
359 else 370 else
360 restorer = (void *)&frame->retcode; 371 restorer = &frame->retcode;
361 if (ka->sa.sa_flags & SA_RESTORER) 372 if (ka->sa.sa_flags & SA_RESTORER)
362 restorer = ka->sa.sa_restorer; 373 restorer = ka->sa.sa_restorer;
363 374
@@ -379,16 +390,16 @@ static int setup_frame(int sig, struct k_sigaction *ka,
379 goto give_sigsegv; 390 goto give_sigsegv;
380 391
381 /* Set up registers for signal handler */ 392 /* Set up registers for signal handler */
382 regs->esp = (unsigned long) frame; 393 regs->sp = (unsigned long) frame;
383 regs->eip = (unsigned long) ka->sa.sa_handler; 394 regs->ip = (unsigned long) ka->sa.sa_handler;
384 regs->eax = (unsigned long) sig; 395 regs->ax = (unsigned long) sig;
385 regs->edx = (unsigned long) 0; 396 regs->dx = (unsigned long) 0;
386 regs->ecx = (unsigned long) 0; 397 regs->cx = (unsigned long) 0;
387 398
388 regs->xds = __USER_DS; 399 regs->ds = __USER_DS;
389 regs->xes = __USER_DS; 400 regs->es = __USER_DS;
390 regs->xss = __USER_DS; 401 regs->ss = __USER_DS;
391 regs->xcs = __USER_CS; 402 regs->cs = __USER_CS;
392 403
393 /* 404 /*
394 * Clear TF when entering the signal handler, but 405 * Clear TF when entering the signal handler, but
@@ -396,13 +407,13 @@ static int setup_frame(int sig, struct k_sigaction *ka,
396 * The tracer may want to single-step inside the 407 * The tracer may want to single-step inside the
397 * handler too. 408 * handler too.
398 */ 409 */
399 regs->eflags &= ~TF_MASK; 410 regs->flags &= ~TF_MASK;
400 if (test_thread_flag(TIF_SINGLESTEP)) 411 if (test_thread_flag(TIF_SINGLESTEP))
401 ptrace_notify(SIGTRAP); 412 ptrace_notify(SIGTRAP);
402 413
403#if DEBUG_SIG 414#if DEBUG_SIG
404 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", 415 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
405 current->comm, current->pid, frame, regs->eip, frame->pretcode); 416 current->comm, current->pid, frame, regs->ip, frame->pretcode);
406#endif 417#endif
407 418
408 return 0; 419 return 0;
@@ -442,7 +453,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
442 err |= __put_user(0, &frame->uc.uc_flags); 453 err |= __put_user(0, &frame->uc.uc_flags);
443 err |= __put_user(0, &frame->uc.uc_link); 454 err |= __put_user(0, &frame->uc.uc_link);
444 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 455 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
445 err |= __put_user(sas_ss_flags(regs->esp), 456 err |= __put_user(sas_ss_flags(regs->sp),
446 &frame->uc.uc_stack.ss_flags); 457 &frame->uc.uc_stack.ss_flags);
447 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); 458 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
448 err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, 459 err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate,
@@ -452,13 +463,13 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
452 goto give_sigsegv; 463 goto give_sigsegv;
453 464
454 /* Set up to return from userspace. */ 465 /* Set up to return from userspace. */
455 restorer = (void *)VDSO_SYM(&__kernel_rt_sigreturn); 466 restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
456 if (ka->sa.sa_flags & SA_RESTORER) 467 if (ka->sa.sa_flags & SA_RESTORER)
457 restorer = ka->sa.sa_restorer; 468 restorer = ka->sa.sa_restorer;
458 err |= __put_user(restorer, &frame->pretcode); 469 err |= __put_user(restorer, &frame->pretcode);
459 470
460 /* 471 /*
461 * This is movl $,%eax ; int $0x80 472 * This is movl $,%ax ; int $0x80
462 * 473 *
463 * WE DO NOT USE IT ANY MORE! It's only left here for historical 474 * WE DO NOT USE IT ANY MORE! It's only left here for historical
464 * reasons and because gdb uses it as a signature to notice 475 * reasons and because gdb uses it as a signature to notice
@@ -472,16 +483,16 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
472 goto give_sigsegv; 483 goto give_sigsegv;
473 484
474 /* Set up registers for signal handler */ 485 /* Set up registers for signal handler */
475 regs->esp = (unsigned long) frame; 486 regs->sp = (unsigned long) frame;
476 regs->eip = (unsigned long) ka->sa.sa_handler; 487 regs->ip = (unsigned long) ka->sa.sa_handler;
477 regs->eax = (unsigned long) usig; 488 regs->ax = (unsigned long) usig;
478 regs->edx = (unsigned long) &frame->info; 489 regs->dx = (unsigned long) &frame->info;
479 regs->ecx = (unsigned long) &frame->uc; 490 regs->cx = (unsigned long) &frame->uc;
480 491
481 regs->xds = __USER_DS; 492 regs->ds = __USER_DS;
482 regs->xes = __USER_DS; 493 regs->es = __USER_DS;
483 regs->xss = __USER_DS; 494 regs->ss = __USER_DS;
484 regs->xcs = __USER_CS; 495 regs->cs = __USER_CS;
485 496
486 /* 497 /*
487 * Clear TF when entering the signal handler, but 498 * Clear TF when entering the signal handler, but
@@ -489,13 +500,13 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
489 * The tracer may want to single-step inside the 500 * The tracer may want to single-step inside the
490 * handler too. 501 * handler too.
491 */ 502 */
492 regs->eflags &= ~TF_MASK; 503 regs->flags &= ~TF_MASK;
493 if (test_thread_flag(TIF_SINGLESTEP)) 504 if (test_thread_flag(TIF_SINGLESTEP))
494 ptrace_notify(SIGTRAP); 505 ptrace_notify(SIGTRAP);
495 506
496#if DEBUG_SIG 507#if DEBUG_SIG
497 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", 508 printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
498 current->comm, current->pid, frame, regs->eip, frame->pretcode); 509 current->comm, current->pid, frame, regs->ip, frame->pretcode);
499#endif 510#endif
500 511
501 return 0; 512 return 0;
@@ -516,35 +527,33 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
516 int ret; 527 int ret;
517 528
518 /* Are we from a system call? */ 529 /* Are we from a system call? */
519 if (regs->orig_eax >= 0) { 530 if (regs->orig_ax >= 0) {
520 /* If so, check system call restarting.. */ 531 /* If so, check system call restarting.. */
521 switch (regs->eax) { 532 switch (regs->ax) {
522 case -ERESTART_RESTARTBLOCK: 533 case -ERESTART_RESTARTBLOCK:
523 case -ERESTARTNOHAND: 534 case -ERESTARTNOHAND:
524 regs->eax = -EINTR; 535 regs->ax = -EINTR;
525 break; 536 break;
526 537
527 case -ERESTARTSYS: 538 case -ERESTARTSYS:
528 if (!(ka->sa.sa_flags & SA_RESTART)) { 539 if (!(ka->sa.sa_flags & SA_RESTART)) {
529 regs->eax = -EINTR; 540 regs->ax = -EINTR;
530 break; 541 break;
531 } 542 }
532 /* fallthrough */ 543 /* fallthrough */
533 case -ERESTARTNOINTR: 544 case -ERESTARTNOINTR:
534 regs->eax = regs->orig_eax; 545 regs->ax = regs->orig_ax;
535 regs->eip -= 2; 546 regs->ip -= 2;
536 } 547 }
537 } 548 }
538 549
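The restart handling above is unchanged in substance by the rename: on -ERESTARTSYS and friends the saved ax is reset to orig_ax and ip is backed up by two bytes, the length of int $0x80 (and of sysenter/syscall), so the interrupted call is re-issued once the handler returns. A small userspace model of the decision table; the ERESTART* constants are the kernel-internal values and appear here only for illustration:

#include <stdio.h>

#define EINTR                   4
#define ERESTARTSYS           512
#define ERESTARTNOINTR        513
#define ERESTARTNOHAND        514
#define ERESTART_RESTARTBLOCK 516

static long restart_ax(long ax, long orig_ax, int sa_restart, unsigned long *ip)
{
        switch (-ax) {
        case ERESTART_RESTARTBLOCK:
        case ERESTARTNOHAND:
                return -EINTR;                  /* never restarted through a handler */
        case ERESTARTSYS:
                if (!sa_restart)
                        return -EINTR;          /* only restarted with SA_RESTART */
                /* fall through */
        case ERESTARTNOINTR:
                *ip -= 2;                       /* back up over the 2-byte trap */
                return orig_ax;                 /* re-arm the original syscall number */
        }
        return ax;
}

int main(void)
{
        unsigned long ip = 0x08048100;
        long ax = restart_ax(-ERESTARTNOINTR, 4 /* e.g. __NR_write */, 0, &ip);

        printf("ax=%ld ip=%#lx\n", ax, ip);
        return 0;
}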
539 /* 550 /*
540 * If TF is set due to a debugger (PT_DTRACE), clear the TF flag so 551 * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF
541 * that register information in the sigcontext is correct. 552 * flag so that register information in the sigcontext is correct.
542 */ 553 */
543 if (unlikely(regs->eflags & TF_MASK) 554 if (unlikely(regs->flags & X86_EFLAGS_TF) &&
544 && likely(current->ptrace & PT_DTRACE)) { 555 likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
545 current->ptrace &= ~PT_DTRACE; 556 regs->flags &= ~X86_EFLAGS_TF;
546 regs->eflags &= ~TF_MASK;
547 }
548 557
549 /* Set up the stack frame */ 558 /* Set up the stack frame */
550 if (ka->sa.sa_flags & SA_SIGINFO) 559 if (ka->sa.sa_flags & SA_SIGINFO)
@@ -569,7 +578,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
569 * want to handle. Thus you cannot kill init even with a SIGKILL even by 578 * want to handle. Thus you cannot kill init even with a SIGKILL even by
570 * mistake. 579 * mistake.
571 */ 580 */
572static void fastcall do_signal(struct pt_regs *regs) 581static void do_signal(struct pt_regs *regs)
573{ 582{
574 siginfo_t info; 583 siginfo_t info;
575 int signr; 584 int signr;
@@ -599,8 +608,8 @@ static void fastcall do_signal(struct pt_regs *regs)
599 * have been cleared if the watchpoint triggered 608 * have been cleared if the watchpoint triggered
600 * inside the kernel. 609 * inside the kernel.
601 */ 610 */
602 if (unlikely(current->thread.debugreg[7])) 611 if (unlikely(current->thread.debugreg7))
603 set_debugreg(current->thread.debugreg[7], 7); 612 set_debugreg(current->thread.debugreg7, 7);
604 613
605 /* Whee! Actually deliver the signal. */ 614 /* Whee! Actually deliver the signal. */
606 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { 615 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
@@ -616,19 +625,19 @@ static void fastcall do_signal(struct pt_regs *regs)
616 } 625 }
617 626
618 /* Did we come from a system call? */ 627 /* Did we come from a system call? */
619 if (regs->orig_eax >= 0) { 628 if (regs->orig_ax >= 0) {
620 /* Restart the system call - no handlers present */ 629 /* Restart the system call - no handlers present */
621 switch (regs->eax) { 630 switch (regs->ax) {
622 case -ERESTARTNOHAND: 631 case -ERESTARTNOHAND:
623 case -ERESTARTSYS: 632 case -ERESTARTSYS:
624 case -ERESTARTNOINTR: 633 case -ERESTARTNOINTR:
625 regs->eax = regs->orig_eax; 634 regs->ax = regs->orig_ax;
626 regs->eip -= 2; 635 regs->ip -= 2;
627 break; 636 break;
628 637
629 case -ERESTART_RESTARTBLOCK: 638 case -ERESTART_RESTARTBLOCK:
630 regs->eax = __NR_restart_syscall; 639 regs->ax = __NR_restart_syscall;
631 regs->eip -= 2; 640 regs->ip -= 2;
632 break; 641 break;
633 } 642 }
634 } 643 }
@@ -651,13 +660,16 @@ void do_notify_resume(struct pt_regs *regs, void *_unused,
651{ 660{
652 /* Pending single-step? */ 661 /* Pending single-step? */
653 if (thread_info_flags & _TIF_SINGLESTEP) { 662 if (thread_info_flags & _TIF_SINGLESTEP) {
654 regs->eflags |= TF_MASK; 663 regs->flags |= TF_MASK;
655 clear_thread_flag(TIF_SINGLESTEP); 664 clear_thread_flag(TIF_SINGLESTEP);
656 } 665 }
657 666
658 /* deal with pending signal delivery */ 667 /* deal with pending signal delivery */
659 if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK)) 668 if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK))
660 do_signal(regs); 669 do_signal(regs);
670
671 if (thread_info_flags & _TIF_HRTICK_RESCHED)
672 hrtick_resched();
661 673
662 clear_thread_flag(TIF_IRET); 674 clear_thread_flag(TIF_IRET);
663} 675}
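
Note: the restart hunks above rely on a fixed convention: orig_ax preserves the syscall number while ax carries the (negative errno) return value, and because int $0x80, sysenter and syscall all encode in two bytes, rewinding ip by 2 re-issues the trapped call. Below is a condensed user-space sketch of the handler-present decision table; fake_regs and restart_after_handler are stand-ins, and the ERESTART* values mirror include/linux/errno.h.

#include <stdio.h>

/* Values mirroring include/linux/errno.h; the ERESTART* codes are kernel-internal. */
#define EINTR			4
#define ERESTARTSYS		512
#define ERESTARTNOINTR		513
#define ERESTARTNOHAND		514
#define ERESTART_RESTARTBLOCK	516

struct fake_regs {
	long ax;		/* syscall return value (negative errno) */
	long orig_ax;		/* syscall number as originally invoked */
	unsigned long ip;	/* user instruction pointer */
};

/* Condensed restart rules applied when a handler will run; sa_restart models SA_RESTART. */
static void restart_after_handler(struct fake_regs *regs, int sa_restart)
{
	switch (regs->ax) {
	case -ERESTART_RESTARTBLOCK:
	case -ERESTARTNOHAND:
		regs->ax = -EINTR;
		break;
	case -ERESTARTSYS:
		if (!sa_restart) {
			regs->ax = -EINTR;
			break;
		}
		/* fall through */
	case -ERESTARTNOINTR:
		regs->ax = regs->orig_ax;
		regs->ip -= 2;	/* back up over the 2-byte trap instruction */
		break;
	}
}

int main(void)
{
	struct fake_regs r = { .ax = -ERESTARTSYS, .orig_ax = 3, .ip = 0x400082 };

	restart_after_handler(&r, 1);	/* handler was installed with SA_RESTART */
	printf("ax=%ld ip=%#lx\n", r.ax, r.ip);
	return 0;
}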
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index ab086b0357fc..7347bb14e306 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -39,7 +39,7 @@ asmlinkage long
39sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, 39sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
40 struct pt_regs *regs) 40 struct pt_regs *regs)
41{ 41{
42 return do_sigaltstack(uss, uoss, regs->rsp); 42 return do_sigaltstack(uss, uoss, regs->sp);
43} 43}
44 44
45 45
@@ -64,8 +64,8 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned
64 64
65#define COPY(x) err |= __get_user(regs->x, &sc->x) 65#define COPY(x) err |= __get_user(regs->x, &sc->x)
66 66
67 COPY(rdi); COPY(rsi); COPY(rbp); COPY(rsp); COPY(rbx); 67 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
68 COPY(rdx); COPY(rcx); COPY(rip); 68 COPY(dx); COPY(cx); COPY(ip);
69 COPY(r8); 69 COPY(r8);
70 COPY(r9); 70 COPY(r9);
71 COPY(r10); 71 COPY(r10);
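
Note: the COPY() hunk above is representative of most of this patch: the pt_regs and sigcontext members lose their e/r prefixes, so one token-pasting helper works for both word sizes. A toy stand-alone version of the idea follows; the structure and member names are placeholders, not the kernel types.

#include <stdio.h>

/* Toy stand-ins: both structures deliberately use the same member names. */
struct toy_regs       { long di, si, ip; };
struct toy_sigcontext { long di, si, ip; };

/* Copy one identically named member, the way the COPY() helper does. */
#define COPY(dst, src, field)	((dst)->field = (src)->field)

int main(void)
{
	struct toy_sigcontext sc = { .di = 1, .si = 2, .ip = 0x400080 };
	struct toy_regs regs = { 0 };

	COPY(&regs, &sc, di);
	COPY(&regs, &sc, si);
	COPY(&regs, &sc, ip);

	printf("di=%ld si=%ld ip=%#lx\n", regs.di, regs.si, regs.ip);
	return 0;
}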
@@ -86,9 +86,9 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned
86 86
87 { 87 {
88 unsigned int tmpflags; 88 unsigned int tmpflags;
89 err |= __get_user(tmpflags, &sc->eflags); 89 err |= __get_user(tmpflags, &sc->flags);
90 regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5); 90 regs->flags = (regs->flags & ~0x40DD5) | (tmpflags & 0x40DD5);
91 regs->orig_rax = -1; /* disable syscall checks */ 91 regs->orig_ax = -1; /* disable syscall checks */
92 } 92 }
93 93
94 { 94 {
@@ -108,7 +108,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned
108 } 108 }
109 } 109 }
110 110
111 err |= __get_user(*prax, &sc->rax); 111 err |= __get_user(*prax, &sc->ax);
112 return err; 112 return err;
113 113
114badframe: 114badframe:
@@ -119,9 +119,9 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
119{ 119{
120 struct rt_sigframe __user *frame; 120 struct rt_sigframe __user *frame;
121 sigset_t set; 121 sigset_t set;
122 unsigned long eax; 122 unsigned long ax;
123 123
124 frame = (struct rt_sigframe __user *)(regs->rsp - 8); 124 frame = (struct rt_sigframe __user *)(regs->sp - 8);
125 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) { 125 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) {
126 goto badframe; 126 goto badframe;
127 } 127 }
@@ -135,17 +135,17 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
135 recalc_sigpending(); 135 recalc_sigpending();
136 spin_unlock_irq(&current->sighand->siglock); 136 spin_unlock_irq(&current->sighand->siglock);
137 137
138 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) 138 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
139 goto badframe; 139 goto badframe;
140 140
141#ifdef DEBUG_SIG 141#ifdef DEBUG_SIG
142 printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs->rip,regs->rsp,frame,eax); 142 printk("%d sigreturn ip:%lx sp:%lx frame:%p ax:%lx\n",current->pid,regs->ip,regs->sp,frame,ax);
143#endif 143#endif
144 144
145 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->rsp) == -EFAULT) 145 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
146 goto badframe; 146 goto badframe;
147 147
148 return eax; 148 return ax;
149 149
150badframe: 150badframe:
151 signal_fault(regs,frame,"sigreturn"); 151 signal_fault(regs,frame,"sigreturn");
@@ -165,14 +165,14 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned lo
165 err |= __put_user(0, &sc->gs); 165 err |= __put_user(0, &sc->gs);
166 err |= __put_user(0, &sc->fs); 166 err |= __put_user(0, &sc->fs);
167 167
168 err |= __put_user(regs->rdi, &sc->rdi); 168 err |= __put_user(regs->di, &sc->di);
169 err |= __put_user(regs->rsi, &sc->rsi); 169 err |= __put_user(regs->si, &sc->si);
170 err |= __put_user(regs->rbp, &sc->rbp); 170 err |= __put_user(regs->bp, &sc->bp);
171 err |= __put_user(regs->rsp, &sc->rsp); 171 err |= __put_user(regs->sp, &sc->sp);
172 err |= __put_user(regs->rbx, &sc->rbx); 172 err |= __put_user(regs->bx, &sc->bx);
173 err |= __put_user(regs->rdx, &sc->rdx); 173 err |= __put_user(regs->dx, &sc->dx);
174 err |= __put_user(regs->rcx, &sc->rcx); 174 err |= __put_user(regs->cx, &sc->cx);
175 err |= __put_user(regs->rax, &sc->rax); 175 err |= __put_user(regs->ax, &sc->ax);
176 err |= __put_user(regs->r8, &sc->r8); 176 err |= __put_user(regs->r8, &sc->r8);
177 err |= __put_user(regs->r9, &sc->r9); 177 err |= __put_user(regs->r9, &sc->r9);
178 err |= __put_user(regs->r10, &sc->r10); 178 err |= __put_user(regs->r10, &sc->r10);
@@ -183,8 +183,8 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned lo
183 err |= __put_user(regs->r15, &sc->r15); 183 err |= __put_user(regs->r15, &sc->r15);
184 err |= __put_user(me->thread.trap_no, &sc->trapno); 184 err |= __put_user(me->thread.trap_no, &sc->trapno);
185 err |= __put_user(me->thread.error_code, &sc->err); 185 err |= __put_user(me->thread.error_code, &sc->err);
186 err |= __put_user(regs->rip, &sc->rip); 186 err |= __put_user(regs->ip, &sc->ip);
187 err |= __put_user(regs->eflags, &sc->eflags); 187 err |= __put_user(regs->flags, &sc->flags);
188 err |= __put_user(mask, &sc->oldmask); 188 err |= __put_user(mask, &sc->oldmask);
189 err |= __put_user(me->thread.cr2, &sc->cr2); 189 err |= __put_user(me->thread.cr2, &sc->cr2);
190 190
@@ -198,18 +198,18 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned lo
198static void __user * 198static void __user *
199get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) 199get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
200{ 200{
201 unsigned long rsp; 201 unsigned long sp;
202 202
203 /* Default to using normal stack - redzone*/ 203 /* Default to using normal stack - redzone*/
204 rsp = regs->rsp - 128; 204 sp = regs->sp - 128;
205 205
206 /* This is the X/Open sanctioned signal stack switching. */ 206 /* This is the X/Open sanctioned signal stack switching. */
207 if (ka->sa.sa_flags & SA_ONSTACK) { 207 if (ka->sa.sa_flags & SA_ONSTACK) {
208 if (sas_ss_flags(rsp) == 0) 208 if (sas_ss_flags(sp) == 0)
209 rsp = current->sas_ss_sp + current->sas_ss_size; 209 sp = current->sas_ss_sp + current->sas_ss_size;
210 } 210 }
211 211
212 return (void __user *)round_down(rsp - size, 16); 212 return (void __user *)round_down(sp - size, 16);
213} 213}
214 214
215static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 215static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
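
Note: get_stack() bakes in two x86-64 ABI details that survive the rename: the 128-byte red zone below the user stack pointer is skipped, and the frame start is rounded down to 16 bytes (SA_ONSTACK handling aside). A minimal user-space sketch of the same arithmetic, with align_down() standing in for the kernel's round_down():

#include <stdint.h>
#include <stdio.h>

#define REDZONE		128	/* x86-64 ABI: bytes below sp the kernel must not touch */
#define FRAME_ALIGN	16

/* Align down to a power-of-two boundary, like the kernel's round_down(). */
static uintptr_t align_down(uintptr_t x, uintptr_t a)
{
	return x & ~(a - 1);
}

/* Start address of a 'size'-byte signal frame, given the user sp. */
static uintptr_t frame_address(uintptr_t sp, uintptr_t size)
{
	return align_down(sp - REDZONE - size, FRAME_ALIGN);
}

int main(void)
{
	printf("%#lx\n", (unsigned long)frame_address(0x7fffffffe010UL, 440));
	return 0;
}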
@@ -246,7 +246,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
246 err |= __put_user(0, &frame->uc.uc_flags); 246 err |= __put_user(0, &frame->uc.uc_flags);
247 err |= __put_user(0, &frame->uc.uc_link); 247 err |= __put_user(0, &frame->uc.uc_link);
248 err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 248 err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
249 err |= __put_user(sas_ss_flags(regs->rsp), 249 err |= __put_user(sas_ss_flags(regs->sp),
250 &frame->uc.uc_stack.ss_flags); 250 &frame->uc.uc_stack.ss_flags);
251 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); 251 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
252 err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me); 252 err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
@@ -271,21 +271,21 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
271 goto give_sigsegv; 271 goto give_sigsegv;
272 272
273#ifdef DEBUG_SIG 273#ifdef DEBUG_SIG
274 printk("%d old rip %lx old rsp %lx old rax %lx\n", current->pid,regs->rip,regs->rsp,regs->rax); 274 printk("%d old ip %lx old sp %lx old ax %lx\n", current->pid,regs->ip,regs->sp,regs->ax);
275#endif 275#endif
276 276
277 /* Set up registers for signal handler */ 277 /* Set up registers for signal handler */
278 regs->rdi = sig; 278 regs->di = sig;
279 /* In case the signal handler was declared without prototypes */ 279 /* In case the signal handler was declared without prototypes */
280 regs->rax = 0; 280 regs->ax = 0;
281 281
282 /* This also works for non SA_SIGINFO handlers because they expect the 282 /* This also works for non SA_SIGINFO handlers because they expect the
283 next argument after the signal number on the stack. */ 283 next argument after the signal number on the stack. */
284 regs->rsi = (unsigned long)&frame->info; 284 regs->si = (unsigned long)&frame->info;
285 regs->rdx = (unsigned long)&frame->uc; 285 regs->dx = (unsigned long)&frame->uc;
286 regs->rip = (unsigned long) ka->sa.sa_handler; 286 regs->ip = (unsigned long) ka->sa.sa_handler;
287 287
288 regs->rsp = (unsigned long)frame; 288 regs->sp = (unsigned long)frame;
289 289
290 /* Set up the CS register to run signal handlers in 64-bit mode, 290 /* Set up the CS register to run signal handlers in 64-bit mode,
291 even if the handler happens to be interrupting 32-bit code. */ 291 even if the handler happens to be interrupting 32-bit code. */
@@ -295,12 +295,12 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
295 see include/asm-x86_64/uaccess.h for details. */ 295 see include/asm-x86_64/uaccess.h for details. */
296 set_fs(USER_DS); 296 set_fs(USER_DS);
297 297
298 regs->eflags &= ~TF_MASK; 298 regs->flags &= ~X86_EFLAGS_TF;
299 if (test_thread_flag(TIF_SINGLESTEP)) 299 if (test_thread_flag(TIF_SINGLESTEP))
300 ptrace_notify(SIGTRAP); 300 ptrace_notify(SIGTRAP);
301#ifdef DEBUG_SIG 301#ifdef DEBUG_SIG
302 printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%p\n", 302 printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%p\n",
303 current->comm, current->pid, frame, regs->rip, frame->pretcode); 303 current->comm, current->pid, frame, regs->ip, frame->pretcode);
304#endif 304#endif
305 305
306 return 0; 306 return 0;
@@ -321,44 +321,40 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
321 int ret; 321 int ret;
322 322
323#ifdef DEBUG_SIG 323#ifdef DEBUG_SIG
324 printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n", 324 printk("handle_signal pid:%d sig:%lu ip:%lx sp:%lx regs=%p\n",
325 current->pid, sig, 325 current->pid, sig,
326 regs->rip, regs->rsp, regs); 326 regs->ip, regs->sp, regs);
327#endif 327#endif
328 328
329 /* Are we from a system call? */ 329 /* Are we from a system call? */
330 if ((long)regs->orig_rax >= 0) { 330 if ((long)regs->orig_ax >= 0) {
331 /* If so, check system call restarting.. */ 331 /* If so, check system call restarting.. */
332 switch (regs->rax) { 332 switch (regs->ax) {
333 case -ERESTART_RESTARTBLOCK: 333 case -ERESTART_RESTARTBLOCK:
334 case -ERESTARTNOHAND: 334 case -ERESTARTNOHAND:
335 regs->rax = -EINTR; 335 regs->ax = -EINTR;
336 break; 336 break;
337 337
338 case -ERESTARTSYS: 338 case -ERESTARTSYS:
339 if (!(ka->sa.sa_flags & SA_RESTART)) { 339 if (!(ka->sa.sa_flags & SA_RESTART)) {
340 regs->rax = -EINTR; 340 regs->ax = -EINTR;
341 break; 341 break;
342 } 342 }
343 /* fallthrough */ 343 /* fallthrough */
344 case -ERESTARTNOINTR: 344 case -ERESTARTNOINTR:
345 regs->rax = regs->orig_rax; 345 regs->ax = regs->orig_ax;
346 regs->rip -= 2; 346 regs->ip -= 2;
347 break; 347 break;
348 } 348 }
349 } 349 }
350 350
351 /* 351 /*
352 * If TF is set due to a debugger (PT_DTRACE), clear the TF 352 * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF
353 * flag so that register information in the sigcontext is 353 * flag so that register information in the sigcontext is correct.
354 * correct.
355 */ 354 */
356 if (unlikely(regs->eflags & TF_MASK)) { 355 if (unlikely(regs->flags & X86_EFLAGS_TF) &&
357 if (likely(current->ptrace & PT_DTRACE)) { 356 likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
358 current->ptrace &= ~PT_DTRACE; 357 regs->flags &= ~X86_EFLAGS_TF;
359 regs->eflags &= ~TF_MASK;
360 }
361 }
362 358
363#ifdef CONFIG_IA32_EMULATION 359#ifdef CONFIG_IA32_EMULATION
364 if (test_thread_flag(TIF_IA32)) { 360 if (test_thread_flag(TIF_IA32)) {
@@ -430,21 +426,21 @@ static void do_signal(struct pt_regs *regs)
430 } 426 }
431 427
432 /* Did we come from a system call? */ 428 /* Did we come from a system call? */
433 if ((long)regs->orig_rax >= 0) { 429 if ((long)regs->orig_ax >= 0) {
434 /* Restart the system call - no handlers present */ 430 /* Restart the system call - no handlers present */
435 long res = regs->rax; 431 long res = regs->ax;
436 switch (res) { 432 switch (res) {
437 case -ERESTARTNOHAND: 433 case -ERESTARTNOHAND:
438 case -ERESTARTSYS: 434 case -ERESTARTSYS:
439 case -ERESTARTNOINTR: 435 case -ERESTARTNOINTR:
440 regs->rax = regs->orig_rax; 436 regs->ax = regs->orig_ax;
441 regs->rip -= 2; 437 regs->ip -= 2;
442 break; 438 break;
443 case -ERESTART_RESTARTBLOCK: 439 case -ERESTART_RESTARTBLOCK:
444 regs->rax = test_thread_flag(TIF_IA32) ? 440 regs->ax = test_thread_flag(TIF_IA32) ?
445 __NR_ia32_restart_syscall : 441 __NR_ia32_restart_syscall :
446 __NR_restart_syscall; 442 __NR_restart_syscall;
447 regs->rip -= 2; 443 regs->ip -= 2;
448 break; 444 break;
449 } 445 }
450 } 446 }
@@ -461,13 +457,13 @@ void
461do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 457do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
462{ 458{
463#ifdef DEBUG_SIG 459#ifdef DEBUG_SIG
464 printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%p pending:%x\n", 460 printk("do_notify_resume flags:%x ip:%lx sp:%lx caller:%p pending:%x\n",
465 thread_info_flags, regs->rip, regs->rsp, __builtin_return_address(0),signal_pending(current)); 461 thread_info_flags, regs->ip, regs->sp, __builtin_return_address(0),signal_pending(current));
466#endif 462#endif
467 463
468 /* Pending single-step? */ 464 /* Pending single-step? */
469 if (thread_info_flags & _TIF_SINGLESTEP) { 465 if (thread_info_flags & _TIF_SINGLESTEP) {
470 regs->eflags |= TF_MASK; 466 regs->flags |= X86_EFLAGS_TF;
471 clear_thread_flag(TIF_SINGLESTEP); 467 clear_thread_flag(TIF_SINGLESTEP);
472 } 468 }
473 469
@@ -480,14 +476,20 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
480 /* deal with pending signal delivery */ 476 /* deal with pending signal delivery */
481 if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK)) 477 if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK))
482 do_signal(regs); 478 do_signal(regs);
479
480 if (thread_info_flags & _TIF_HRTICK_RESCHED)
481 hrtick_resched();
483} 482}
484 483
485void signal_fault(struct pt_regs *regs, void __user *frame, char *where) 484void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
486{ 485{
487 struct task_struct *me = current; 486 struct task_struct *me = current;
488 if (show_unhandled_signals && printk_ratelimit()) 487 if (show_unhandled_signals && printk_ratelimit()) {
489 printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n", 488 printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
490 me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax); 489 me->comm,me->pid,where,frame,regs->ip,regs->sp,regs->orig_ax);
490 print_vma_addr(" in ", regs->ip);
491 printk("\n");
492 }
491 493
492 force_sig(SIGSEGV, me); 494 force_sig(SIGSEGV, me);
493} 495}
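
Note: both signal paths replace the old two-step "check PT_DTRACE, then clear it" with a single test_and_clear_thread_flag(TIF_FORCED_TF), so the forced-TF marker is consumed atomically and TF is dropped from the saved flags only when the kernel itself set it. A stand-alone sketch of the idiom, using C11 atomics in place of the kernel's bit operations (the flag values are stand-ins apart from the TF bit position):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define TIF_FORCED_TF	(1u << 0)	/* stand-in thread flag bit */
#define X86_EFLAGS_TF	(1u << 8)	/* trap flag position in EFLAGS */

static _Atomic unsigned int thread_flags;

/* Atomically clear 'bit' and report whether it was previously set. */
static bool test_and_clear_flag(unsigned int bit)
{
	return atomic_fetch_and(&thread_flags, ~bit) & bit;
}

int main(void)
{
	unsigned int flags = X86_EFLAGS_TF;

	atomic_fetch_or(&thread_flags, TIF_FORCED_TF);	/* "kernel" forced TF */

	/* Drop TF from the saved flags only if it was kernel-forced. */
	if ((flags & X86_EFLAGS_TF) && test_and_clear_flag(TIF_FORCED_TF))
		flags &= ~X86_EFLAGS_TF;

	printf("flags=%#x\n", flags);
	return 0;
}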
diff --git a/arch/x86/kernel/smp_32.c b/arch/x86/kernel/smp_32.c
index fcaa026eb807..dc0cde9d16fb 100644
--- a/arch/x86/kernel/smp_32.c
+++ b/arch/x86/kernel/smp_32.c
@@ -159,7 +159,7 @@ void __send_IPI_shortcut(unsigned int shortcut, int vector)
159 apic_write_around(APIC_ICR, cfg); 159 apic_write_around(APIC_ICR, cfg);
160} 160}
161 161
162void fastcall send_IPI_self(int vector) 162void send_IPI_self(int vector)
163{ 163{
164 __send_IPI_shortcut(APIC_DEST_SELF, vector); 164 __send_IPI_shortcut(APIC_DEST_SELF, vector);
165} 165}
@@ -223,7 +223,7 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector)
223 */ 223 */
224 224
225 local_irq_save(flags); 225 local_irq_save(flags);
226 for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) { 226 for_each_possible_cpu(query_cpu) {
227 if (cpu_isset(query_cpu, mask)) { 227 if (cpu_isset(query_cpu, mask)) {
228 __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), 228 __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
229 vector); 229 vector);
@@ -256,13 +256,14 @@ static DEFINE_SPINLOCK(tlbstate_lock);
256 * We need to reload %cr3 since the page tables may be going 256 * We need to reload %cr3 since the page tables may be going
257 * away from under us.. 257 * away from under us..
258 */ 258 */
259void leave_mm(unsigned long cpu) 259void leave_mm(int cpu)
260{ 260{
261 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) 261 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
262 BUG(); 262 BUG();
263 cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask); 263 cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
264 load_cr3(swapper_pg_dir); 264 load_cr3(swapper_pg_dir);
265} 265}
266EXPORT_SYMBOL_GPL(leave_mm);
266 267
267/* 268/*
268 * 269 *
@@ -310,7 +311,7 @@ void leave_mm(unsigned long cpu)
310 * 2) Leave the mm if we are in the lazy tlb mode. 311 * 2) Leave the mm if we are in the lazy tlb mode.
311 */ 312 */
312 313
313fastcall void smp_invalidate_interrupt(struct pt_regs *regs) 314void smp_invalidate_interrupt(struct pt_regs *regs)
314{ 315{
315 unsigned long cpu; 316 unsigned long cpu;
316 317
@@ -638,13 +639,13 @@ static void native_smp_send_stop(void)
638 * all the work is done automatically when 639 * all the work is done automatically when
639 * we return from the interrupt. 640 * we return from the interrupt.
640 */ 641 */
641fastcall void smp_reschedule_interrupt(struct pt_regs *regs) 642void smp_reschedule_interrupt(struct pt_regs *regs)
642{ 643{
643 ack_APIC_irq(); 644 ack_APIC_irq();
644 __get_cpu_var(irq_stat).irq_resched_count++; 645 __get_cpu_var(irq_stat).irq_resched_count++;
645} 646}
646 647
647fastcall void smp_call_function_interrupt(struct pt_regs *regs) 648void smp_call_function_interrupt(struct pt_regs *regs)
648{ 649{
649 void (*func) (void *info) = call_data->func; 650 void (*func) (void *info) = call_data->func;
650 void *info = call_data->info; 651 void *info = call_data->info;
@@ -675,7 +676,7 @@ static int convert_apicid_to_cpu(int apic_id)
675{ 676{
676 int i; 677 int i;
677 678
678 for (i = 0; i < NR_CPUS; i++) { 679 for_each_possible_cpu(i) {
679 if (per_cpu(x86_cpu_to_apicid, i) == apic_id) 680 if (per_cpu(x86_cpu_to_apicid, i) == apic_id)
680 return i; 681 return i;
681 } 682 }
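
Note: two loops in this file stop scanning 0..NR_CPUS-1 and use for_each_possible_cpu(), so only CPUs that can ever exist are visited. A stand-alone sketch of the difference, with a plain bitmask standing in for cpu_possible_map:

#include <stdio.h>

#define NR_CPUS	64

/* Stand-in for cpu_possible_map: only CPUs 0-3 can ever come online. */
static const unsigned long cpu_possible_mask = 0x0fUL;

int main(void)
{
	int cpu;

	/*
	 * What for_each_possible_cpu() does in spirit: visit only the set
	 * bits instead of walking every one of the NR_CPUS slots.
	 */
	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (!(cpu_possible_mask & (1UL << cpu)))
			continue;
		printf("visiting cpu %d\n", cpu);
	}
	return 0;
}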
diff --git a/arch/x86/kernel/smp_64.c b/arch/x86/kernel/smp_64.c
index 03fa6ed559c6..2fd74b06db67 100644
--- a/arch/x86/kernel/smp_64.c
+++ b/arch/x86/kernel/smp_64.c
@@ -29,7 +29,7 @@
29#include <asm/idle.h> 29#include <asm/idle.h>
30 30
31/* 31/*
32 * Smarter SMP flushing macros. 32 * Smarter SMP flushing macros.
33 * c/o Linus Torvalds. 33 * c/o Linus Torvalds.
34 * 34 *
35 * These mean you can really definitely utterly forget about 35 * These mean you can really definitely utterly forget about
@@ -37,15 +37,15 @@
37 * 37 *
38 * Optimizations Manfred Spraul <manfred@colorfullife.com> 38 * Optimizations Manfred Spraul <manfred@colorfullife.com>
39 * 39 *
40 * More scalable flush, from Andi Kleen 40 * More scalable flush, from Andi Kleen
41 * 41 *
42 * To avoid global state use 8 different call vectors. 42 * To avoid global state use 8 different call vectors.
43 * Each CPU uses a specific vector to trigger flushes on other 43 * Each CPU uses a specific vector to trigger flushes on other
44 * CPUs. Depending on the received vector the target CPUs look into 44 * CPUs. Depending on the received vector the target CPUs look into
45 * the right per cpu variable for the flush data. 45 * the right per cpu variable for the flush data.
46 * 46 *
47 * With more than 8 CPUs they are hashed to the 8 available 47 * With more than 8 CPUs they are hashed to the 8 available
48 * vectors. The limited global vector space forces us to this right now. 48 * vectors. The limited global vector space forces us to this right now.
49 * In future when interrupts are split into per CPU domains this could be 49 * In future when interrupts are split into per CPU domains this could be
50 * fixed, at the cost of triggering multiple IPIs in some cases. 50 * fixed, at the cost of triggering multiple IPIs in some cases.
51 */ 51 */
@@ -55,7 +55,6 @@ union smp_flush_state {
55 cpumask_t flush_cpumask; 55 cpumask_t flush_cpumask;
56 struct mm_struct *flush_mm; 56 struct mm_struct *flush_mm;
57 unsigned long flush_va; 57 unsigned long flush_va;
58#define FLUSH_ALL -1ULL
59 spinlock_t tlbstate_lock; 58 spinlock_t tlbstate_lock;
60 }; 59 };
61 char pad[SMP_CACHE_BYTES]; 60 char pad[SMP_CACHE_BYTES];
@@ -67,16 +66,17 @@ union smp_flush_state {
67static DEFINE_PER_CPU(union smp_flush_state, flush_state); 66static DEFINE_PER_CPU(union smp_flush_state, flush_state);
68 67
69/* 68/*
70 * We cannot call mmdrop() because we are in interrupt context, 69 * We cannot call mmdrop() because we are in interrupt context,
71 * instead update mm->cpu_vm_mask. 70 * instead update mm->cpu_vm_mask.
72 */ 71 */
73static inline void leave_mm(int cpu) 72void leave_mm(int cpu)
74{ 73{
75 if (read_pda(mmu_state) == TLBSTATE_OK) 74 if (read_pda(mmu_state) == TLBSTATE_OK)
76 BUG(); 75 BUG();
77 cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask); 76 cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
78 load_cr3(swapper_pg_dir); 77 load_cr3(swapper_pg_dir);
79} 78}
79EXPORT_SYMBOL_GPL(leave_mm);
80 80
81/* 81/*
82 * 82 *
@@ -85,25 +85,25 @@ static inline void leave_mm(int cpu)
85 * 1) switch_mm() either 1a) or 1b) 85 * 1) switch_mm() either 1a) or 1b)
86 * 1a) thread switch to a different mm 86 * 1a) thread switch to a different mm
87 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); 87 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
88 * Stop ipi delivery for the old mm. This is not synchronized with 88 * Stop ipi delivery for the old mm. This is not synchronized with
89 * the other cpus, but smp_invalidate_interrupt ignore flush ipis 89 * the other cpus, but smp_invalidate_interrupt ignore flush ipis
90 * for the wrong mm, and in the worst case we perform a superfluous 90 * for the wrong mm, and in the worst case we perform a superfluous
91 * tlb flush. 91 * tlb flush.
92 * 1a2) set cpu mmu_state to TLBSTATE_OK 92 * 1a2) set cpu mmu_state to TLBSTATE_OK
93 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 93 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
94 * was in lazy tlb mode. 94 * was in lazy tlb mode.
95 * 1a3) update cpu active_mm 95 * 1a3) update cpu active_mm
96 * Now cpu0 accepts tlb flushes for the new mm. 96 * Now cpu0 accepts tlb flushes for the new mm.
97 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); 97 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
98 * Now the other cpus will send tlb flush ipis. 98 * Now the other cpus will send tlb flush ipis.
99 * 1a4) change cr3. 99 * 1a4) change cr3.
100 * 1b) thread switch without mm change 100 * 1b) thread switch without mm change
101 * cpu active_mm is correct, cpu0 already handles 101 * cpu active_mm is correct, cpu0 already handles
102 * flush ipis. 102 * flush ipis.
103 * 1b1) set cpu mmu_state to TLBSTATE_OK 103 * 1b1) set cpu mmu_state to TLBSTATE_OK
104 * 1b2) test_and_set the cpu bit in cpu_vm_mask. 104 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
105 * Atomically set the bit [other cpus will start sending flush ipis], 105 * Atomically set the bit [other cpus will start sending flush ipis],
106 * and test the bit. 106 * and test the bit.
107 * 1b3) if the bit was 0: leave_mm was called, flush the tlb. 107 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
108 * 2) switch %%esp, ie current 108 * 2) switch %%esp, ie current
109 * 109 *
@@ -137,12 +137,12 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
137 * orig_rax contains the negated interrupt vector. 137 * orig_rax contains the negated interrupt vector.
138 * Use that to determine where the sender put the data. 138 * Use that to determine where the sender put the data.
139 */ 139 */
140 sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START; 140 sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
141 f = &per_cpu(flush_state, sender); 141 f = &per_cpu(flush_state, sender);
142 142
143 if (!cpu_isset(cpu, f->flush_cpumask)) 143 if (!cpu_isset(cpu, f->flush_cpumask))
144 goto out; 144 goto out;
145 /* 145 /*
146 * This was a BUG() but until someone can quote me the 146 * This was a BUG() but until someone can quote me the
147 * line from the intel manual that guarantees an IPI to 147 * line from the intel manual that guarantees an IPI to
148 * multiple CPUs is retried _only_ on the erroring CPUs 148 * multiple CPUs is retried _only_ on the erroring CPUs
@@ -150,10 +150,10 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
150 * 150 *
151 * BUG(); 151 * BUG();
152 */ 152 */
153 153
154 if (f->flush_mm == read_pda(active_mm)) { 154 if (f->flush_mm == read_pda(active_mm)) {
155 if (read_pda(mmu_state) == TLBSTATE_OK) { 155 if (read_pda(mmu_state) == TLBSTATE_OK) {
156 if (f->flush_va == FLUSH_ALL) 156 if (f->flush_va == TLB_FLUSH_ALL)
157 local_flush_tlb(); 157 local_flush_tlb();
158 else 158 else
159 __flush_tlb_one(f->flush_va); 159 __flush_tlb_one(f->flush_va);
@@ -166,19 +166,22 @@ out:
166 add_pda(irq_tlb_count, 1); 166 add_pda(irq_tlb_count, 1);
167} 167}
168 168
169static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, 169void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
170 unsigned long va) 170 unsigned long va)
171{ 171{
172 int sender; 172 int sender;
173 union smp_flush_state *f; 173 union smp_flush_state *f;
174 cpumask_t cpumask = *cpumaskp;
174 175
175 /* Caller has disabled preemption */ 176 /* Caller has disabled preemption */
176 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; 177 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
177 f = &per_cpu(flush_state, sender); 178 f = &per_cpu(flush_state, sender);
178 179
179 /* Could avoid this lock when 180 /*
180 num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is 181 * Could avoid this lock when
181 probably not worth checking this for a cache-hot lock. */ 182 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
183 * probably not worth checking this for a cache-hot lock.
184 */
182 spin_lock(&f->tlbstate_lock); 185 spin_lock(&f->tlbstate_lock);
183 186
184 f->flush_mm = mm; 187 f->flush_mm = mm;
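
Note: the hunks above show both halves of the flush-IPI rendezvous: the sender hashes its CPU id onto one of the 8 invalidate vectors, and the handler recovers that vector from the complemented value the entry code left in orig_ax, then indexes the matching per-vector flush_state slot. A tiny arithmetic sketch; the vector base value here is illustrative, not taken from the APIC headers.

#include <stdio.h>

#define INVALIDATE_TLB_VECTOR_START	0xf0	/* illustrative base value */
#define NUM_INVALIDATE_TLB_VECTORS	8

int main(void)
{
	int sender_cpu = 11;

	/* Sender side: pick one of the 8 flush vectors by hashing the CPU id. */
	long vector = INVALIDATE_TLB_VECTOR_START +
		      sender_cpu % NUM_INVALIDATE_TLB_VECTORS;

	/* The interrupt entry code stores the complemented vector in orig_ax... */
	long orig_ax = ~vector;

	/* ...and the handler undoes that to find its per-vector flush slot. */
	long sender = ~orig_ax - INVALIDATE_TLB_VECTOR_START;

	printf("cpu %d uses vector %#lx, handler reads slot %ld\n",
	       sender_cpu, vector, sender);
	return 0;
}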
@@ -202,14 +205,14 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
202int __cpuinit init_smp_flush(void) 205int __cpuinit init_smp_flush(void)
203{ 206{
204 int i; 207 int i;
208
205 for_each_cpu_mask(i, cpu_possible_map) { 209 for_each_cpu_mask(i, cpu_possible_map) {
206 spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock); 210 spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
207 } 211 }
208 return 0; 212 return 0;
209} 213}
210
211core_initcall(init_smp_flush); 214core_initcall(init_smp_flush);
212 215
213void flush_tlb_current_task(void) 216void flush_tlb_current_task(void)
214{ 217{
215 struct mm_struct *mm = current->mm; 218 struct mm_struct *mm = current->mm;
@@ -221,10 +224,9 @@ void flush_tlb_current_task(void)
221 224
222 local_flush_tlb(); 225 local_flush_tlb();
223 if (!cpus_empty(cpu_mask)) 226 if (!cpus_empty(cpu_mask))
224 flush_tlb_others(cpu_mask, mm, FLUSH_ALL); 227 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
225 preempt_enable(); 228 preempt_enable();
226} 229}
227EXPORT_SYMBOL(flush_tlb_current_task);
228 230
229void flush_tlb_mm (struct mm_struct * mm) 231void flush_tlb_mm (struct mm_struct * mm)
230{ 232{
@@ -241,11 +243,10 @@ void flush_tlb_mm (struct mm_struct * mm)
241 leave_mm(smp_processor_id()); 243 leave_mm(smp_processor_id());
242 } 244 }
243 if (!cpus_empty(cpu_mask)) 245 if (!cpus_empty(cpu_mask))
244 flush_tlb_others(cpu_mask, mm, FLUSH_ALL); 246 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
245 247
246 preempt_enable(); 248 preempt_enable();
247} 249}
248EXPORT_SYMBOL(flush_tlb_mm);
249 250
250void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) 251void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
251{ 252{
@@ -259,8 +260,8 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
259 if (current->active_mm == mm) { 260 if (current->active_mm == mm) {
260 if(current->mm) 261 if(current->mm)
261 __flush_tlb_one(va); 262 __flush_tlb_one(va);
262 else 263 else
263 leave_mm(smp_processor_id()); 264 leave_mm(smp_processor_id());
264 } 265 }
265 266
266 if (!cpus_empty(cpu_mask)) 267 if (!cpus_empty(cpu_mask))
@@ -268,7 +269,6 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
268 269
269 preempt_enable(); 270 preempt_enable();
270} 271}
271EXPORT_SYMBOL(flush_tlb_page);
272 272
273static void do_flush_tlb_all(void* info) 273static void do_flush_tlb_all(void* info)
274{ 274{
@@ -325,11 +325,9 @@ void unlock_ipi_call_lock(void)
325 * this function sends a 'generic call function' IPI to all other CPU 325 * this function sends a 'generic call function' IPI to all other CPU
326 * of the system defined in the mask. 326 * of the system defined in the mask.
327 */ 327 */
328 328static int __smp_call_function_mask(cpumask_t mask,
329static int 329 void (*func)(void *), void *info,
330__smp_call_function_mask(cpumask_t mask, 330 int wait)
331 void (*func)(void *), void *info,
332 int wait)
333{ 331{
334 struct call_data_struct data; 332 struct call_data_struct data;
335 cpumask_t allbutself; 333 cpumask_t allbutself;
@@ -417,11 +415,10 @@ EXPORT_SYMBOL(smp_call_function_mask);
417 */ 415 */
418 416
419int smp_call_function_single (int cpu, void (*func) (void *info), void *info, 417int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
420 int nonatomic, int wait) 418 int nonatomic, int wait)
421{ 419{
422 /* prevent preemption and reschedule on another processor */ 420 /* prevent preemption and reschedule on another processor */
423 int ret; 421 int ret, me = get_cpu();
424 int me = get_cpu();
425 422
426 /* Can deadlock when called with interrupts disabled */ 423 /* Can deadlock when called with interrupts disabled */
427 WARN_ON(irqs_disabled()); 424 WARN_ON(irqs_disabled());
@@ -471,9 +468,9 @@ static void stop_this_cpu(void *dummy)
471 */ 468 */
472 cpu_clear(smp_processor_id(), cpu_online_map); 469 cpu_clear(smp_processor_id(), cpu_online_map);
473 disable_local_APIC(); 470 disable_local_APIC();
474 for (;;) 471 for (;;)
475 halt(); 472 halt();
476} 473}
477 474
478void smp_send_stop(void) 475void smp_send_stop(void)
479{ 476{
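
Note: the file-private FLUSH_ALL definition is dropped in favour of the shared TLB_FLUSH_ALL sentinel: flush_va carries either a single virtual address or the sentinel meaning "flush everything". A stand-alone sketch of that dispatch; the sentinel value mirrors the old -1ULL and the two flush functions are printf stand-ins, not the kernel primitives.

#include <stdio.h>

#define TLB_FLUSH_ALL	(~0UL)	/* assumed sentinel, mirroring the old -1ULL */

static void local_flush_tlb(void)		{ puts("flush the whole TLB"); }
static void flush_tlb_one(unsigned long va)	{ printf("flush page %#lx\n", va); }

/* Dispatch on flush_va the way smp_invalidate_interrupt() does. */
static void handle_flush(unsigned long flush_va)
{
	if (flush_va == TLB_FLUSH_ALL)
		local_flush_tlb();
	else
		flush_tlb_one(flush_va);
}

int main(void)
{
	handle_flush(0x7f0000001000UL);
	handle_flush(TLB_FLUSH_ALL);
	return 0;
}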
diff --git a/arch/x86/kernel/smpboot_32.c b/arch/x86/kernel/smpboot_32.c
index 4ea80cbe52e5..579b9b740c7c 100644
--- a/arch/x86/kernel/smpboot_32.c
+++ b/arch/x86/kernel/smpboot_32.c
@@ -83,7 +83,6 @@ EXPORT_SYMBOL(cpu_online_map);
83 83
84cpumask_t cpu_callin_map; 84cpumask_t cpu_callin_map;
85cpumask_t cpu_callout_map; 85cpumask_t cpu_callout_map;
86EXPORT_SYMBOL(cpu_callout_map);
87cpumask_t cpu_possible_map; 86cpumask_t cpu_possible_map;
88EXPORT_SYMBOL(cpu_possible_map); 87EXPORT_SYMBOL(cpu_possible_map);
89static cpumask_t smp_commenced_mask; 88static cpumask_t smp_commenced_mask;
@@ -92,15 +91,10 @@ static cpumask_t smp_commenced_mask;
92DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); 91DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
93EXPORT_PER_CPU_SYMBOL(cpu_info); 92EXPORT_PER_CPU_SYMBOL(cpu_info);
94 93
95/* 94/* which logical CPU number maps to which CPU (physical APIC ID) */
96 * The following static array is used during kernel startup
97 * and the x86_cpu_to_apicid_ptr contains the address of the
98 * array during this time. Is it zeroed when the per_cpu
99 * data area is removed.
100 */
101u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata = 95u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata =
102 { [0 ... NR_CPUS-1] = BAD_APICID }; 96 { [0 ... NR_CPUS-1] = BAD_APICID };
103void *x86_cpu_to_apicid_ptr; 97void *x86_cpu_to_apicid_early_ptr;
104DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID; 98DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID;
105EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid); 99EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
106 100
@@ -113,7 +107,6 @@ u8 apicid_2_node[MAX_APICID];
113extern const unsigned char trampoline_data []; 107extern const unsigned char trampoline_data [];
114extern const unsigned char trampoline_end []; 108extern const unsigned char trampoline_end [];
115static unsigned char *trampoline_base; 109static unsigned char *trampoline_base;
116static int trampoline_exec;
117 110
118static void map_cpu_to_logical_apicid(void); 111static void map_cpu_to_logical_apicid(void);
119 112
@@ -138,17 +131,13 @@ static unsigned long __cpuinit setup_trampoline(void)
138 */ 131 */
139void __init smp_alloc_memory(void) 132void __init smp_alloc_memory(void)
140{ 133{
141 trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE); 134 trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE);
142 /* 135 /*
143 * Has to be in very low memory so we can execute 136 * Has to be in very low memory so we can execute
144 * real-mode AP code. 137 * real-mode AP code.
145 */ 138 */
146 if (__pa(trampoline_base) >= 0x9F000) 139 if (__pa(trampoline_base) >= 0x9F000)
147 BUG(); 140 BUG();
148 /*
149 * Make the SMP trampoline executable:
150 */
151 trampoline_exec = set_kernel_exec((unsigned long)trampoline_base, 1);
152} 141}
153 142
154/* 143/*
@@ -213,8 +202,6 @@ valid_k7:
213 ; 202 ;
214} 203}
215 204
216extern void calibrate_delay(void);
217
218static atomic_t init_deasserted; 205static atomic_t init_deasserted;
219 206
220static void __cpuinit smp_callin(void) 207static void __cpuinit smp_callin(void)
@@ -405,7 +392,7 @@ static void __cpuinit start_secondary(void *unused)
405 setup_secondary_clock(); 392 setup_secondary_clock();
406 if (nmi_watchdog == NMI_IO_APIC) { 393 if (nmi_watchdog == NMI_IO_APIC) {
407 disable_8259A_irq(0); 394 disable_8259A_irq(0);
408 enable_NMI_through_LVT0(NULL); 395 enable_NMI_through_LVT0();
409 enable_8259A_irq(0); 396 enable_8259A_irq(0);
410 } 397 }
411 /* 398 /*
@@ -448,38 +435,38 @@ void __devinit initialize_secondary(void)
448{ 435{
449 /* 436 /*
450 * We don't actually need to load the full TSS, 437 * We don't actually need to load the full TSS,
451 * basically just the stack pointer and the eip. 438 * basically just the stack pointer and the ip.
452 */ 439 */
453 440
454 asm volatile( 441 asm volatile(
455 "movl %0,%%esp\n\t" 442 "movl %0,%%esp\n\t"
456 "jmp *%1" 443 "jmp *%1"
457 : 444 :
458 :"m" (current->thread.esp),"m" (current->thread.eip)); 445 :"m" (current->thread.sp),"m" (current->thread.ip));
459} 446}
460 447
461/* Static state in head.S used to set up a CPU */ 448/* Static state in head.S used to set up a CPU */
462extern struct { 449extern struct {
463 void * esp; 450 void * sp;
464 unsigned short ss; 451 unsigned short ss;
465} stack_start; 452} stack_start;
466 453
467#ifdef CONFIG_NUMA 454#ifdef CONFIG_NUMA
468 455
469/* which logical CPUs are on which nodes */ 456/* which logical CPUs are on which nodes */
470cpumask_t node_2_cpu_mask[MAX_NUMNODES] __read_mostly = 457cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly =
471 { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE }; 458 { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE };
472EXPORT_SYMBOL(node_2_cpu_mask); 459EXPORT_SYMBOL(node_to_cpumask_map);
473/* which node each logical CPU is on */ 460/* which node each logical CPU is on */
474int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 }; 461int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
475EXPORT_SYMBOL(cpu_2_node); 462EXPORT_SYMBOL(cpu_to_node_map);
476 463
477/* set up a mapping between cpu and node. */ 464/* set up a mapping between cpu and node. */
478static inline void map_cpu_to_node(int cpu, int node) 465static inline void map_cpu_to_node(int cpu, int node)
479{ 466{
480 printk("Mapping cpu %d to node %d\n", cpu, node); 467 printk("Mapping cpu %d to node %d\n", cpu, node);
481 cpu_set(cpu, node_2_cpu_mask[node]); 468 cpu_set(cpu, node_to_cpumask_map[node]);
482 cpu_2_node[cpu] = node; 469 cpu_to_node_map[cpu] = node;
483} 470}
484 471
485/* undo a mapping between cpu and node. */ 472/* undo a mapping between cpu and node. */
@@ -489,8 +476,8 @@ static inline void unmap_cpu_to_node(int cpu)
489 476
490 printk("Unmapping cpu %d from all nodes\n", cpu); 477 printk("Unmapping cpu %d from all nodes\n", cpu);
491 for (node = 0; node < MAX_NUMNODES; node ++) 478 for (node = 0; node < MAX_NUMNODES; node ++)
492 cpu_clear(cpu, node_2_cpu_mask[node]); 479 cpu_clear(cpu, node_to_cpumask_map[node]);
493 cpu_2_node[cpu] = 0; 480 cpu_to_node_map[cpu] = 0;
494} 481}
495#else /* !CONFIG_NUMA */ 482#else /* !CONFIG_NUMA */
496 483
@@ -668,7 +655,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
668 * target processor state. 655 * target processor state.
669 */ 656 */
670 startup_ipi_hook(phys_apicid, (unsigned long) start_secondary, 657 startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
671 (unsigned long) stack_start.esp); 658 (unsigned long) stack_start.sp);
672 659
673 /* 660 /*
674 * Run STARTUP IPI loop. 661 * Run STARTUP IPI loop.
@@ -754,7 +741,7 @@ static inline struct task_struct * __cpuinit alloc_idle_task(int cpu)
754 /* initialize thread_struct. we really want to avoid destroy 741 /* initialize thread_struct. we really want to avoid destroy
755 * idle tread 742 * idle tread
756 */ 743 */
757 idle->thread.esp = (unsigned long)task_pt_regs(idle); 744 idle->thread.sp = (unsigned long)task_pt_regs(idle);
758 init_idle(idle, cpu); 745 init_idle(idle, cpu);
759 return idle; 746 return idle;
760 } 747 }
@@ -799,7 +786,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
799 per_cpu(current_task, cpu) = idle; 786 per_cpu(current_task, cpu) = idle;
800 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); 787 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
801 788
802 idle->thread.eip = (unsigned long) start_secondary; 789 idle->thread.ip = (unsigned long) start_secondary;
803 /* start_eip had better be page-aligned! */ 790 /* start_eip had better be page-aligned! */
804 start_eip = setup_trampoline(); 791 start_eip = setup_trampoline();
805 792
@@ -807,9 +794,9 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
807 alternatives_smp_switch(1); 794 alternatives_smp_switch(1);
808 795
809 /* So we see what's up */ 796 /* So we see what's up */
810 printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip); 797 printk("Booting processor %d/%d ip %lx\n", cpu, apicid, start_eip);
811 /* Stack for startup_32 can be just as for start_secondary onwards */ 798 /* Stack for startup_32 can be just as for start_secondary onwards */
812 stack_start.esp = (void *) idle->thread.esp; 799 stack_start.sp = (void *) idle->thread.sp;
813 800
814 irq_ctx_init(cpu); 801 irq_ctx_init(cpu);
815 802
@@ -1091,7 +1078,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
1091 * Allow the user to impress friends. 1078 * Allow the user to impress friends.
1092 */ 1079 */
1093 Dprintk("Before bogomips.\n"); 1080 Dprintk("Before bogomips.\n");
1094 for (cpu = 0; cpu < NR_CPUS; cpu++) 1081 for_each_possible_cpu(cpu)
1095 if (cpu_isset(cpu, cpu_callout_map)) 1082 if (cpu_isset(cpu, cpu_callout_map))
1096 bogosum += cpu_data(cpu).loops_per_jiffy; 1083 bogosum += cpu_data(cpu).loops_per_jiffy;
1097 printk(KERN_INFO 1084 printk(KERN_INFO
@@ -1122,7 +1109,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
1122 * construct cpu_sibling_map, so that we can tell sibling CPUs 1109 * construct cpu_sibling_map, so that we can tell sibling CPUs
1123 * efficiently. 1110 * efficiently.
1124 */ 1111 */
1125 for (cpu = 0; cpu < NR_CPUS; cpu++) { 1112 for_each_possible_cpu(cpu) {
1126 cpus_clear(per_cpu(cpu_sibling_map, cpu)); 1113 cpus_clear(per_cpu(cpu_sibling_map, cpu));
1127 cpus_clear(per_cpu(cpu_core_map, cpu)); 1114 cpus_clear(per_cpu(cpu_core_map, cpu));
1128 } 1115 }
@@ -1296,12 +1283,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1296 setup_ioapic_dest(); 1283 setup_ioapic_dest();
1297#endif 1284#endif
1298 zap_low_mappings(); 1285 zap_low_mappings();
1299#ifndef CONFIG_HOTPLUG_CPU
1300 /*
1301 * Disable executability of the SMP trampoline:
1302 */
1303 set_kernel_exec((unsigned long)trampoline_base, trampoline_exec);
1304#endif
1305} 1286}
1306 1287
1307void __init smp_intr_init(void) 1288void __init smp_intr_init(void)
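
Note: the NUMA arrays in this file are only renamed (node_2_cpu_mask becomes node_to_cpumask_map, cpu_2_node becomes cpu_to_node_map); the bookkeeping stays a forward map plus a reverse bitmap. A stand-alone sketch of that pairing, with an unsigned long standing in for cpumask_t:

#include <stdio.h>

#define MAX_NUMNODES	4
#define NR_CPUS		8

/* Which node each logical CPU sits on (cpu_to_node_map in the patch). */
static int cpu_to_node[NR_CPUS];
/* Which CPUs each node holds; a plain mask stands in for cpumask_t. */
static unsigned long node_to_cpumask[MAX_NUMNODES];

static void map_cpu_to_node(int cpu, int node)
{
	node_to_cpumask[node] |= 1UL << cpu;
	cpu_to_node[cpu] = node;
}

int main(void)
{
	map_cpu_to_node(0, 0);
	map_cpu_to_node(1, 0);
	map_cpu_to_node(2, 1);

	printf("node 0 mask %#lx, cpu 2 is on node %d\n",
	       node_to_cpumask[0], cpu_to_node[2]);
	return 0;
}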
diff --git a/arch/x86/kernel/smpboot_64.c b/arch/x86/kernel/smpboot_64.c
index aaf4e1291217..d53bd6fcb428 100644
--- a/arch/x86/kernel/smpboot_64.c
+++ b/arch/x86/kernel/smpboot_64.c
@@ -65,7 +65,7 @@ int smp_num_siblings = 1;
65EXPORT_SYMBOL(smp_num_siblings); 65EXPORT_SYMBOL(smp_num_siblings);
66 66
67/* Last level cache ID of each logical CPU */ 67/* Last level cache ID of each logical CPU */
68DEFINE_PER_CPU(u8, cpu_llc_id) = BAD_APICID; 68DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;
69 69
70/* Bitmask of currently online CPUs */ 70/* Bitmask of currently online CPUs */
71cpumask_t cpu_online_map __read_mostly; 71cpumask_t cpu_online_map __read_mostly;
@@ -78,8 +78,6 @@ EXPORT_SYMBOL(cpu_online_map);
78 */ 78 */
79cpumask_t cpu_callin_map; 79cpumask_t cpu_callin_map;
80cpumask_t cpu_callout_map; 80cpumask_t cpu_callout_map;
81EXPORT_SYMBOL(cpu_callout_map);
82
83cpumask_t cpu_possible_map; 81cpumask_t cpu_possible_map;
84EXPORT_SYMBOL(cpu_possible_map); 82EXPORT_SYMBOL(cpu_possible_map);
85 83
@@ -113,10 +111,20 @@ DEFINE_PER_CPU(int, cpu_state) = { 0 };
113 * a new thread. Also avoids complicated thread destroy functionality 111 * a new thread. Also avoids complicated thread destroy functionality
114 * for idle threads. 112 * for idle threads.
115 */ 113 */
114#ifdef CONFIG_HOTPLUG_CPU
115/*
116 * Needed only for CONFIG_HOTPLUG_CPU because __cpuinitdata is
117 * removed after init for !CONFIG_HOTPLUG_CPU.
118 */
119static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
120#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x))
121#define set_idle_for_cpu(x,p) (per_cpu(idle_thread_array, x) = (p))
122#else
116struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; 123struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
117
118#define get_idle_for_cpu(x) (idle_thread_array[(x)]) 124#define get_idle_for_cpu(x) (idle_thread_array[(x)])
119#define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p)) 125#define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p))
126#endif
127
120 128
121/* 129/*
122 * Currently trivial. Write the real->protected mode 130 * Currently trivial. Write the real->protected mode
@@ -212,6 +220,7 @@ void __cpuinit smp_callin(void)
212 220
213 Dprintk("CALLIN, before setup_local_APIC().\n"); 221 Dprintk("CALLIN, before setup_local_APIC().\n");
214 setup_local_APIC(); 222 setup_local_APIC();
223 end_local_APIC_setup();
215 224
216 /* 225 /*
217 * Get our bogomips. 226 * Get our bogomips.
@@ -338,7 +347,7 @@ void __cpuinit start_secondary(void)
338 347
339 if (nmi_watchdog == NMI_IO_APIC) { 348 if (nmi_watchdog == NMI_IO_APIC) {
340 disable_8259A_irq(0); 349 disable_8259A_irq(0);
341 enable_NMI_through_LVT0(NULL); 350 enable_NMI_through_LVT0();
342 enable_8259A_irq(0); 351 enable_8259A_irq(0);
343 } 352 }
344 353
@@ -370,7 +379,7 @@ void __cpuinit start_secondary(void)
370 379
371 unlock_ipi_call_lock(); 380 unlock_ipi_call_lock();
372 381
373 setup_secondary_APIC_clock(); 382 setup_secondary_clock();
374 383
375 cpu_idle(); 384 cpu_idle();
376} 385}
@@ -384,19 +393,20 @@ static void inquire_remote_apic(int apicid)
384 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; 393 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
385 char *names[] = { "ID", "VERSION", "SPIV" }; 394 char *names[] = { "ID", "VERSION", "SPIV" };
386 int timeout; 395 int timeout;
387 unsigned int status; 396 u32 status;
388 397
389 printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid); 398 printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid);
390 399
391 for (i = 0; i < ARRAY_SIZE(regs); i++) { 400 for (i = 0; i < ARRAY_SIZE(regs); i++) {
392 printk("... APIC #%d %s: ", apicid, names[i]); 401 printk(KERN_INFO "... APIC #%d %s: ", apicid, names[i]);
393 402
394 /* 403 /*
395 * Wait for idle. 404 * Wait for idle.
396 */ 405 */
397 status = safe_apic_wait_icr_idle(); 406 status = safe_apic_wait_icr_idle();
398 if (status) 407 if (status)
399 printk("a previous APIC delivery may have failed\n"); 408 printk(KERN_CONT
409 "a previous APIC delivery may have failed\n");
400 410
401 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); 411 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
402 apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]); 412 apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]);
@@ -410,10 +420,10 @@ static void inquire_remote_apic(int apicid)
410 switch (status) { 420 switch (status) {
411 case APIC_ICR_RR_VALID: 421 case APIC_ICR_RR_VALID:
412 status = apic_read(APIC_RRR); 422 status = apic_read(APIC_RRR);
413 printk("%08x\n", status); 423 printk(KERN_CONT "%08x\n", status);
414 break; 424 break;
415 default: 425 default:
416 printk("failed\n"); 426 printk(KERN_CONT "failed\n");
417 } 427 }
418 } 428 }
419} 429}
@@ -466,7 +476,7 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta
466 */ 476 */
467 Dprintk("#startup loops: %d.\n", num_starts); 477 Dprintk("#startup loops: %d.\n", num_starts);
468 478
469 maxlvt = get_maxlvt(); 479 maxlvt = lapic_get_maxlvt();
470 480
471 for (j = 1; j <= num_starts; j++) { 481 for (j = 1; j <= num_starts; j++) {
472 Dprintk("Sending STARTUP #%d.\n",j); 482 Dprintk("Sending STARTUP #%d.\n",j);
@@ -577,7 +587,7 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
577 c_idle.idle = get_idle_for_cpu(cpu); 587 c_idle.idle = get_idle_for_cpu(cpu);
578 588
579 if (c_idle.idle) { 589 if (c_idle.idle) {
580 c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *) 590 c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *)
581 (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1); 591 (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1);
582 init_idle(c_idle.idle, cpu); 592 init_idle(c_idle.idle, cpu);
583 goto do_rest; 593 goto do_rest;
@@ -613,8 +623,8 @@ do_rest:
613 623
614 start_rip = setup_trampoline(); 624 start_rip = setup_trampoline();
615 625
616 init_rsp = c_idle.idle->thread.rsp; 626 init_rsp = c_idle.idle->thread.sp;
617 per_cpu(init_tss,cpu).rsp0 = init_rsp; 627 load_sp0(&per_cpu(init_tss, cpu), &c_idle.idle->thread);
618 initial_code = start_secondary; 628 initial_code = start_secondary;
619 clear_tsk_thread_flag(c_idle.idle, TIF_FORK); 629 clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
620 630
@@ -691,7 +701,7 @@ do_rest:
691 } 701 }
692 if (boot_error) { 702 if (boot_error) {
693 cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ 703 cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
694 clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ 704 clear_bit(cpu, (unsigned long *)&cpu_initialized); /* was set by cpu_init() */
695 clear_node_cpumask(cpu); /* was set by numa_add_cpu */ 705 clear_node_cpumask(cpu); /* was set by numa_add_cpu */
696 cpu_clear(cpu, cpu_present_map); 706 cpu_clear(cpu, cpu_present_map);
697 cpu_clear(cpu, cpu_possible_map); 707 cpu_clear(cpu, cpu_possible_map);
@@ -841,24 +851,16 @@ static int __init smp_sanity_check(unsigned max_cpus)
841 return 0; 851 return 0;
842} 852}
843 853
844/* 854static void __init smp_cpu_index_default(void)
845 * Copy apicid's found by MP_processor_info from initial array to the per cpu
846 * data area. The x86_cpu_to_apicid_init array is then expendable and the
847 * x86_cpu_to_apicid_ptr is zeroed indicating that the static array is no
848 * longer available.
849 */
850void __init smp_set_apicids(void)
851{ 855{
852 int cpu; 856 int i;
857 struct cpuinfo_x86 *c;
853 858
854 for_each_cpu_mask(cpu, cpu_possible_map) { 859 for_each_cpu_mask(i, cpu_possible_map) {
855 if (per_cpu_offset(cpu)) 860 c = &cpu_data(i);
856 per_cpu(x86_cpu_to_apicid, cpu) = 861 /* mark all to hotplug */
857 x86_cpu_to_apicid_init[cpu]; 862 c->cpu_index = NR_CPUS;
858 } 863 }
859
860 /* indicate the static array will be going away soon */
861 x86_cpu_to_apicid_ptr = NULL;
862} 864}
863 865
864/* 866/*
@@ -868,9 +870,9 @@ void __init smp_set_apicids(void)
868void __init smp_prepare_cpus(unsigned int max_cpus) 870void __init smp_prepare_cpus(unsigned int max_cpus)
869{ 871{
870 nmi_watchdog_default(); 872 nmi_watchdog_default();
873 smp_cpu_index_default();
871 current_cpu_data = boot_cpu_data; 874 current_cpu_data = boot_cpu_data;
872 current_thread_info()->cpu = 0; /* needed? */ 875 current_thread_info()->cpu = 0; /* needed? */
873 smp_set_apicids();
874 set_cpu_sibling_map(0); 876 set_cpu_sibling_map(0);
875 877
876 if (smp_sanity_check(max_cpus) < 0) { 878 if (smp_sanity_check(max_cpus) < 0) {
@@ -885,6 +887,13 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
885 */ 887 */
886 setup_local_APIC(); 888 setup_local_APIC();
887 889
890 /*
891 * Enable IO APIC before setting up error vector
892 */
893 if (!skip_ioapic_setup && nr_ioapics)
894 enable_IO_APIC();
895 end_local_APIC_setup();
896
888 if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) { 897 if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) {
889 panic("Boot APIC ID in local APIC unexpected (%d vs %d)", 898 panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
890 GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id); 899 GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id);
@@ -903,7 +912,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
903 * Set up local APIC timer on boot CPU. 912 * Set up local APIC timer on boot CPU.
904 */ 913 */
905 914
906 setup_boot_APIC_clock(); 915 setup_boot_clock();
907} 916}
908 917
909/* 918/*
@@ -912,7 +921,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
912void __init smp_prepare_boot_cpu(void) 921void __init smp_prepare_boot_cpu(void)
913{ 922{
914 int me = smp_processor_id(); 923 int me = smp_processor_id();
915 cpu_set(me, cpu_online_map); 924 /* already set me in cpu_online_map in boot_cpu_init() */
916 cpu_set(me, cpu_callout_map); 925 cpu_set(me, cpu_callout_map);
917 per_cpu(cpu_state, me) = CPU_ONLINE; 926 per_cpu(cpu_state, me) = CPU_ONLINE;
918} 927}
@@ -1010,13 +1019,13 @@ static void remove_siblinginfo(int cpu)
1010 cpu_clear(cpu, cpu_sibling_setup_map); 1019 cpu_clear(cpu, cpu_sibling_setup_map);
1011} 1020}
1012 1021
1013void remove_cpu_from_maps(void) 1022static void __ref remove_cpu_from_maps(void)
1014{ 1023{
1015 int cpu = smp_processor_id(); 1024 int cpu = smp_processor_id();
1016 1025
1017 cpu_clear(cpu, cpu_callout_map); 1026 cpu_clear(cpu, cpu_callout_map);
1018 cpu_clear(cpu, cpu_callin_map); 1027 cpu_clear(cpu, cpu_callin_map);
1019 clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ 1028 clear_bit(cpu, (unsigned long *)&cpu_initialized); /* was set by cpu_init() */
1020 clear_node_cpumask(cpu); 1029 clear_node_cpumask(cpu);
1021} 1030}
1022 1031
diff --git a/arch/x86/kernel/smpcommon_32.c b/arch/x86/kernel/smpcommon_32.c
index bbfe85a0f699..8bc38af29aef 100644
--- a/arch/x86/kernel/smpcommon_32.c
+++ b/arch/x86/kernel/smpcommon_32.c
@@ -14,10 +14,11 @@ __cpuinit void init_gdt(int cpu)
14{ 14{
15 struct desc_struct *gdt = get_cpu_gdt_table(cpu); 15 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
16 16
17 pack_descriptor((u32 *)&gdt[GDT_ENTRY_PERCPU].a, 17 pack_descriptor(&gdt[GDT_ENTRY_PERCPU],
18 (u32 *)&gdt[GDT_ENTRY_PERCPU].b,
19 __per_cpu_offset[cpu], 0xFFFFF, 18 __per_cpu_offset[cpu], 0xFFFFF,
20 0x80 | DESCTYPE_S | 0x2, 0x8); 19 0x2 | DESCTYPE_S, 0x8);
20
21 gdt[GDT_ENTRY_PERCPU].s = 1;
21 22
22 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu]; 23 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
23 per_cpu(cpu_number, cpu) = cpu; 24 per_cpu(cpu_number, cpu) = cpu;
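
Note: init_gdt() now hands pack_descriptor() the descriptor slot directly and builds a writable data segment whose base is the CPU's per-cpu offset and whose limit is 0xFFFFF. For reference, here is a stand-alone sketch of how such a segment descriptor's bit fields pack together; the field layout follows the x86 SDM, while the example base and the exact type/flag nibbles are illustrative rather than the kernel call's arguments.

#include <stdint.h>
#include <stdio.h>

/* Pack an x86 segment descriptor from base, 20-bit limit, access byte and flags nibble. */
static uint64_t pack_desc(uint32_t base, uint32_t limit,
			  uint8_t access, uint8_t flags)
{
	uint64_t d;

	d  = (uint64_t)(limit & 0x0ffff);		/* limit 15:0        */
	d |= (uint64_t)(base & 0x00ffffff) << 16;	/* base 23:0         */
	d |= (uint64_t)access << 40;			/* type, S, DPL, P   */
	d |= (uint64_t)((limit >> 16) & 0xf) << 48;	/* limit 19:16       */
	d |= (uint64_t)(flags & 0xf) << 52;		/* AVL, L, D/B, G    */
	d |= (uint64_t)((base >> 24) & 0xff) << 56;	/* base 31:24        */
	return d;
}

int main(void)
{
	/* Present, writable data segment (0x92), 32-bit with page granularity (0xC). */
	printf("%#llx\n",
	       (unsigned long long)pack_desc(0x12340000, 0xFFFFF, 0x92, 0xC));
	return 0;
}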
diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c
index 2a8713ec0f9a..b72e61359c36 100644
--- a/arch/x86/kernel/srat_32.c
+++ b/arch/x86/kernel/srat_32.c
@@ -57,8 +57,6 @@ static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
57static int num_memory_chunks; /* total number of memory chunks */ 57static int num_memory_chunks; /* total number of memory chunks */
58static u8 __initdata apicid_to_pxm[MAX_APICID]; 58static u8 __initdata apicid_to_pxm[MAX_APICID];
59 59
60extern void * boot_ioremap(unsigned long, unsigned long);
61
62/* Identify CPU proximity domains */ 60/* Identify CPU proximity domains */
63static void __init parse_cpu_affinity_structure(char *p) 61static void __init parse_cpu_affinity_structure(char *p)
64{ 62{
@@ -276,7 +274,7 @@ int __init get_memcfg_from_srat(void)
276 int tables = 0; 274 int tables = 0;
277 int i = 0; 275 int i = 0;
278 276
279 rsdp_address = acpi_find_rsdp(); 277 rsdp_address = acpi_os_get_root_pointer();
280 if (!rsdp_address) { 278 if (!rsdp_address) {
281 printk("%s: System description tables not found\n", 279 printk("%s: System description tables not found\n",
282 __FUNCTION__); 280 __FUNCTION__);
@@ -299,7 +297,7 @@ int __init get_memcfg_from_srat(void)
299 } 297 }
300 298
301 rsdt = (struct acpi_table_rsdt *) 299 rsdt = (struct acpi_table_rsdt *)
302 boot_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt)); 300 early_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt));
303 301
304 if (!rsdt) { 302 if (!rsdt) {
305 printk(KERN_WARNING 303 printk(KERN_WARNING
@@ -339,11 +337,11 @@ int __init get_memcfg_from_srat(void)
339 for (i = 0; i < tables; i++) { 337 for (i = 0; i < tables; i++) {
340 /* Map in header, then map in full table length. */ 338 /* Map in header, then map in full table length. */
341 header = (struct acpi_table_header *) 339 header = (struct acpi_table_header *)
342 boot_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header)); 340 early_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header));
343 if (!header) 341 if (!header)
344 break; 342 break;
345 header = (struct acpi_table_header *) 343 header = (struct acpi_table_header *)
346 boot_ioremap(saved_rsdt.table.table_offset_entry[i], header->length); 344 early_ioremap(saved_rsdt.table.table_offset_entry[i], header->length);
347 if (!header) 345 if (!header)
348 break; 346 break;
349 347
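
Note: get_memcfg_from_srat() keeps its two-step mapping pattern, now through early_ioremap(): map only the fixed-size ACPI header, read the table's true length from it, then re-map the full table. A stand-alone sketch of that idea; the header structure is a minimal stand-in and a static buffer plays the role of physical memory.

#include <stdint.h>
#include <stdio.h>

/* Minimal stand-in for struct acpi_table_header: signature plus total length. */
struct table_header {
	char signature[4];
	uint32_t length;	/* total table size in bytes */
};

/* Pretend "physical memory" holding a 64-byte table ("SRAT", length 64). */
static uint8_t phys[64] = { 'S', 'R', 'A', 'T', 64, 0, 0, 0 };

/* Stand-in for early_ioremap(): hand back a window of 'size' bytes. */
static const void *map(uint32_t phys_addr, uint32_t size)
{
	return (phys_addr + size <= sizeof(phys)) ? phys + phys_addr : NULL;
}

int main(void)
{
	const struct table_header *hdr;
	const uint8_t *table;

	/* Step 1: map only the header to learn the full length. */
	hdr = map(0, sizeof(*hdr));
	if (!hdr)
		return 1;

	/* Step 2: re-map the whole table now that its size is known. */
	table = map(0, hdr->length);
	printf("table %.4s, %u bytes at %p\n",
	       hdr->signature, hdr->length, (const void *)table);
	return 0;
}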
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 6fa6cf036c70..02f0f61f5b11 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -22,9 +22,23 @@ static int save_stack_stack(void *data, char *name)
22 return -1; 22 return -1;
23} 23}
24 24
25static void save_stack_address(void *data, unsigned long addr) 25static void save_stack_address(void *data, unsigned long addr, int reliable)
26{
27 struct stack_trace *trace = data;
28 if (trace->skip > 0) {
29 trace->skip--;
30 return;
31 }
32 if (trace->nr_entries < trace->max_entries)
33 trace->entries[trace->nr_entries++] = addr;
34}
35
36static void
37save_stack_address_nosched(void *data, unsigned long addr, int reliable)
26{ 38{
27 struct stack_trace *trace = (struct stack_trace *)data; 39 struct stack_trace *trace = (struct stack_trace *)data;
40 if (in_sched_functions(addr))
41 return;
28 if (trace->skip > 0) { 42 if (trace->skip > 0) {
29 trace->skip--; 43 trace->skip--;
30 return; 44 return;
@@ -40,13 +54,26 @@ static const struct stacktrace_ops save_stack_ops = {
40 .address = save_stack_address, 54 .address = save_stack_address,
41}; 55};
42 56
57static const struct stacktrace_ops save_stack_ops_nosched = {
58 .warning = save_stack_warning,
59 .warning_symbol = save_stack_warning_symbol,
60 .stack = save_stack_stack,
61 .address = save_stack_address_nosched,
62};
63
43/* 64/*
44 * Save stack-backtrace addresses into a stack_trace buffer. 65 * Save stack-backtrace addresses into a stack_trace buffer.
45 */ 66 */
46void save_stack_trace(struct stack_trace *trace) 67void save_stack_trace(struct stack_trace *trace)
47{ 68{
48 dump_trace(current, NULL, NULL, &save_stack_ops, trace); 69 dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace);
70 if (trace->nr_entries < trace->max_entries)
71 trace->entries[trace->nr_entries++] = ULONG_MAX;
72}
73
74void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
75{
76 dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace);
49 if (trace->nr_entries < trace->max_entries) 77 if (trace->nr_entries < trace->max_entries)
50 trace->entries[trace->nr_entries++] = ULONG_MAX; 78 trace->entries[trace->nr_entries++] = ULONG_MAX;
51} 79}
52EXPORT_SYMBOL(save_stack_trace);
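
The new save_stack_address()/save_stack_address_nosched() callbacks above share one small piece of logic: drop the first skip frames, store at most max_entries addresses, and terminate with ULONG_MAX if room is left. A self-contained sketch of just that capture logic, with a simplified stand-in for struct stack_trace:

#include <stdio.h>
#include <limits.h>

struct trace {
	unsigned int  nr_entries, max_entries;
	unsigned long *entries;
	int           skip;
};

static void save_address(struct trace *trace, unsigned long addr)
{
	if (trace->skip > 0) {
		trace->skip--;          /* drop uninteresting top frames */
		return;
	}
	if (trace->nr_entries < trace->max_entries)
		trace->entries[trace->nr_entries++] = addr;
}

int main(void)
{
	unsigned long buf[4];
	struct trace t = { .max_entries = 4, .entries = buf, .skip = 1 };
	unsigned long fake_frames[] = { 0x1000, 0x2000, 0x3000 };

	for (unsigned int i = 0; i < 3; i++)
		save_address(&t, fake_frames[i]);
	if (t.nr_entries < t.max_entries)      /* same terminator as above */
		t.entries[t.nr_entries++] = ULONG_MAX;

	for (unsigned int i = 0; i < t.nr_entries; i++)
		printf("%#lx\n", t.entries[i]);
	return 0;
}
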
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
new file mode 100644
index 000000000000..2ef1a5f8d675
--- /dev/null
+++ b/arch/x86/kernel/step.c
@@ -0,0 +1,203 @@
1/*
2 * x86 single-step support code, common to 32-bit and 64-bit.
3 */
4#include <linux/sched.h>
5#include <linux/mm.h>
6#include <linux/ptrace.h>
7
8unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs)
9{
10 unsigned long addr, seg;
11
12 addr = regs->ip;
13 seg = regs->cs & 0xffff;
14 if (v8086_mode(regs)) {
15 addr = (addr & 0xffff) + (seg << 4);
16 return addr;
17 }
18
19 /*
20 * We'll assume that the code segments in the GDT
21 * are all zero-based. That is largely true: the
22 * TLS segments are used for data, and the PNPBIOS
23 * and APM bios ones we just ignore here.
24 */
25 if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT) {
26 u32 *desc;
27 unsigned long base;
28
29 seg &= ~7UL;
30
31 mutex_lock(&child->mm->context.lock);
32 if (unlikely((seg >> 3) >= child->mm->context.size))
33 addr = -1L; /* bogus selector, access would fault */
34 else {
35 desc = child->mm->context.ldt + seg;
36 base = ((desc[0] >> 16) |
37 ((desc[1] & 0xff) << 16) |
38 (desc[1] & 0xff000000));
39
40 /* 16-bit code segment? */
41 if (!((desc[1] >> 22) & 1))
42 addr &= 0xffff;
43 addr += base;
44 }
45 mutex_unlock(&child->mm->context.lock);
46 }
47
48 return addr;
49}
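
convert_ip_to_linear() above rebuilds the segment base from the two 32-bit words of an LDT descriptor and truncates the IP to 16 bits when the descriptor's D bit is clear. A stand-alone sketch of that decoding, using a made-up descriptor value:

#include <stdio.h>
#include <stdint.h>

static unsigned long seg_to_linear(const uint32_t desc[2], unsigned long ip)
{
	unsigned long base;

	base = (desc[0] >> 16) |              /* base bits 15..0  */
	       ((desc[1] & 0xff) << 16) |     /* base bits 23..16 */
	       (desc[1] & 0xff000000);        /* base bits 31..24 */

	/* D/B bit clear => 16-bit code segment, only IP 15..0 counts */
	if (!((desc[1] >> 22) & 1))
		ip &= 0xffff;

	return base + ip;
}

int main(void)
{
	/* hypothetical descriptor: base 0x00120000, 32-bit code segment */
	uint32_t desc[2] = { 0x0000ffff, 0x00cf9a12 };

	printf("linear = %#lx\n", seg_to_linear(desc, 0x400));
	return 0;
}
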
50
51static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
52{
53 int i, copied;
54 unsigned char opcode[15];
55 unsigned long addr = convert_ip_to_linear(child, regs);
56
57 copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
58 for (i = 0; i < copied; i++) {
59 switch (opcode[i]) {
60 /* popf and iret */
61 case 0x9d: case 0xcf:
62 return 1;
63
64 /* CHECKME: 64 65 */
65
66 /* opcode and address size prefixes */
67 case 0x66: case 0x67:
68 continue;
69 /* irrelevant prefixes (segment overrides and repeats) */
70 case 0x26: case 0x2e:
71 case 0x36: case 0x3e:
72 case 0x64: case 0x65:
73 case 0xf0: case 0xf2: case 0xf3:
74 continue;
75
76#ifdef CONFIG_X86_64
77 case 0x40 ... 0x4f:
78 if (regs->cs != __USER_CS)
79 /* 32-bit mode: register increment */
80 return 0;
81 /* 64-bit mode: REX prefix */
82 continue;
83#endif
84
85 /* CHECKME: f2, f3 */
86
87 /*
88 * pushf: NOTE! We should probably not let
89 * the user see the TF bit being set. But
90 * it's more pain than it's worth to avoid
91 * it, and a debugger could emulate this
92 * all in user space if it _really_ cares.
93 */
94 case 0x9c:
95 default:
96 return 0;
97 }
98 }
99 return 0;
100}
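
is_setting_trap_flag() scans up to 15 opcode bytes at the instruction pointer, skipping prefix bytes, to see whether the next instruction (popf or iret) would rewrite EFLAGS.TF behind the debugger's back. The same scan over an in-memory buffer, handling only the 32-bit prefix set:

#include <stdio.h>

static int writes_eflags(const unsigned char *code, int len)
{
	for (int i = 0; i < len; i++) {
		switch (code[i]) {
		case 0x9d: case 0xcf:            /* popf, iret */
			return 1;
		case 0x66: case 0x67:            /* operand/address size */
		case 0x26: case 0x2e: case 0x36: case 0x3e:
		case 0x64: case 0x65:            /* segment overrides */
		case 0xf0: case 0xf2: case 0xf3: /* lock/rep */
			continue;                /* keep scanning past prefixes */
		default:
			return 0;                /* some other instruction */
		}
	}
	return 0;
}

int main(void)
{
	unsigned char popf_with_prefix[] = { 0x66, 0x9d };  /* 16-bit popf */
	unsigned char plain_mov[]        = { 0x89, 0xd8 };  /* mov %ebx,%eax */

	printf("%d %d\n", writes_eflags(popf_with_prefix, 2),
	                  writes_eflags(plain_mov, 2));
	return 0;
}
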
101
102/*
103 * Enable single-stepping. Return nonzero if user mode is not using TF itself.
104 */
105static int enable_single_step(struct task_struct *child)
106{
107 struct pt_regs *regs = task_pt_regs(child);
108
109 /*
110 * Always set TIF_SINGLESTEP - this guarantees that
111 * we single-step system calls etc.. This will also
112 * cause us to set TF when returning to user mode.
113 */
114 set_tsk_thread_flag(child, TIF_SINGLESTEP);
115
116 /*
117 * If TF was already set, don't do anything else
118 */
119 if (regs->flags & X86_EFLAGS_TF)
120 return 0;
121
122 /* Set TF on the kernel stack.. */
123 regs->flags |= X86_EFLAGS_TF;
124
125 /*
126 * ..but if TF is changed by the instruction we will trace,
127 * don't mark it as being "us" that set it, so that we
128 * won't clear it by hand later.
129 */
130 if (is_setting_trap_flag(child, regs))
131 return 0;
132
133 set_tsk_thread_flag(child, TIF_FORCED_TF);
134
135 return 1;
136}
137
138/*
139 * Install this value in MSR_IA32_DEBUGCTLMSR whenever child is running.
140 */
141static void write_debugctlmsr(struct task_struct *child, unsigned long val)
142{
143 child->thread.debugctlmsr = val;
144
145 if (child != current)
146 return;
147
148 wrmsrl(MSR_IA32_DEBUGCTLMSR, val);
149}
150
151/*
152 * Enable single or block step.
153 */
154static void enable_step(struct task_struct *child, bool block)
155{
156 /*
157 * Make sure block stepping (BTF) is not enabled unless it should be.
158 * Note that we don't try to worry about any is_setting_trap_flag()
159 * instructions after the first when using block stepping.
 160	 * So no one should try to use debugger block stepping in a program
161 * that uses user-mode single stepping itself.
162 */
163 if (enable_single_step(child) && block) {
164 set_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
165 write_debugctlmsr(child,
166 child->thread.debugctlmsr | DEBUGCTLMSR_BTF);
167 } else {
168 write_debugctlmsr(child,
 169			      child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF);
170
171 if (!child->thread.debugctlmsr)
172 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
173 }
174}
175
176void user_enable_single_step(struct task_struct *child)
177{
178 enable_step(child, 0);
179}
180
181void user_enable_block_step(struct task_struct *child)
182{
183 enable_step(child, 1);
184}
185
186void user_disable_single_step(struct task_struct *child)
187{
188 /*
189 * Make sure block stepping (BTF) is disabled.
190 */
191 write_debugctlmsr(child,
 192			      child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF);
193
194 if (!child->thread.debugctlmsr)
195 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
196
197 /* Always clear TIF_SINGLESTEP... */
198 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
199
200 /* But touch TF only if it was set by us.. */
201 if (test_and_clear_tsk_thread_flag(child, TIF_FORCED_TF))
202 task_pt_regs(child)->flags &= ~X86_EFLAGS_TF;
203}
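
The TF handling above only clears the trap flag on the debugger's behalf if the kernel set it in the first place, which is what TIF_FORCED_TF records. A toy model of that bookkeeping, with the is_setting_trap_flag() special case left out:

#include <stdio.h>
#include <stdbool.h>

#define X86_EFLAGS_TF 0x100UL

struct tracee {
	unsigned long flags;    /* stands in for the saved EFLAGS word */
	bool forced_tf;         /* stands in for TIF_FORCED_TF */
};

static void enable_single_step(struct tracee *t)
{
	if (t->flags & X86_EFLAGS_TF)
		return;                      /* user already runs with TF set */
	t->flags |= X86_EFLAGS_TF;
	t->forced_tf = true;                 /* remember it was us */
}

static void disable_single_step(struct tracee *t)
{
	if (t->forced_tf) {
		t->forced_tf = false;
		t->flags &= ~X86_EFLAGS_TF;  /* only undo our own change */
	}
}

int main(void)
{
	struct tracee self_stepping = { .flags = X86_EFLAGS_TF };
	struct tracee normal = { 0 };

	enable_single_step(&self_stepping);
	enable_single_step(&normal);
	disable_single_step(&self_stepping);
	disable_single_step(&normal);

	printf("self-stepping keeps TF: %d, normal ends with TF: %d\n",
	       !!(self_stepping.flags & X86_EFLAGS_TF),
	       !!(normal.flags & X86_EFLAGS_TF));
	return 0;
}
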
diff --git a/arch/x86/kernel/suspend_64.c b/arch/x86/kernel/suspend_64.c
index 2e5efaaf8800..7ac7130022f1 100644
--- a/arch/x86/kernel/suspend_64.c
+++ b/arch/x86/kernel/suspend_64.c
@@ -17,9 +17,26 @@
17/* References to section boundaries */ 17/* References to section boundaries */
18extern const void __nosave_begin, __nosave_end; 18extern const void __nosave_begin, __nosave_end;
19 19
20static void fix_processor_context(void);
21
20struct saved_context saved_context; 22struct saved_context saved_context;
21 23
22void __save_processor_state(struct saved_context *ctxt) 24/**
25 * __save_processor_state - save CPU registers before creating a
26 * hibernation image and before restoring the memory state from it
27 * @ctxt - structure to store the registers contents in
28 *
29 * NOTE: If there is a CPU register the modification of which by the
30 * boot kernel (ie. the kernel used for loading the hibernation image)
31 * might affect the operations of the restored target kernel (ie. the one
32 * saved in the hibernation image), then its contents must be saved by this
33 * function. In other words, if kernel A is hibernated and different
34 * kernel B is used for loading the hibernation image into memory, the
35 * kernel A's __save_processor_state() function must save all registers
36 * needed by kernel A, so that it can operate correctly after the resume
37 * regardless of what kernel B does in the meantime.
38 */
39static void __save_processor_state(struct saved_context *ctxt)
23{ 40{
24 kernel_fpu_begin(); 41 kernel_fpu_begin();
25 42
@@ -69,7 +86,12 @@ static void do_fpu_end(void)
69 kernel_fpu_end(); 86 kernel_fpu_end();
70} 87}
71 88
72void __restore_processor_state(struct saved_context *ctxt) 89/**
90 * __restore_processor_state - restore the contents of CPU registers saved
91 * by __save_processor_state()
92 * @ctxt - structure to load the registers contents from
93 */
94static void __restore_processor_state(struct saved_context *ctxt)
73{ 95{
74 /* 96 /*
75 * control registers 97 * control registers
@@ -113,14 +135,19 @@ void restore_processor_state(void)
113 __restore_processor_state(&saved_context); 135 __restore_processor_state(&saved_context);
114} 136}
115 137
116void fix_processor_context(void) 138static void fix_processor_context(void)
117{ 139{
118 int cpu = smp_processor_id(); 140 int cpu = smp_processor_id();
119 struct tss_struct *t = &per_cpu(init_tss, cpu); 141 struct tss_struct *t = &per_cpu(init_tss, cpu);
120 142
121 set_tss_desc(cpu,t); /* This just modifies memory; should not be necessary. But... This is necessary, because 386 hardware has concept of busy TSS or some similar stupidity. */ 143 /*
144 * This just modifies memory; should not be necessary. But... This
145 * is necessary, because 386 hardware has concept of busy TSS or some
146 * similar stupidity.
147 */
148 set_tss_desc(cpu, t);
122 149
123 cpu_gdt(cpu)[GDT_ENTRY_TSS].type = 9; 150 get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9;
124 151
125 syscall_init(); /* This sets MSR_*STAR and related */ 152 syscall_init(); /* This sets MSR_*STAR and related */
126 load_TR_desc(); /* This does ltr */ 153 load_TR_desc(); /* This does ltr */
@@ -138,7 +165,6 @@ void fix_processor_context(void)
138 loaddebug(&current->thread, 6); 165 loaddebug(&current->thread, 6);
139 loaddebug(&current->thread, 7); 166 loaddebug(&current->thread, 7);
140 } 167 }
141
142} 168}
143 169
144#ifdef CONFIG_HIBERNATION 170#ifdef CONFIG_HIBERNATION
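
fix_processor_context() above rewrites the TSS descriptor's type field to 9 because ltr faults on a descriptor already marked busy (type 11). A minimal sketch of that idea with a simplified descriptor layout, not the real x86 descriptor structure:

#include <stdio.h>

struct tss_desc {
	unsigned type : 4;      /* 9 = available TSS, 11 = busy TSS */
	unsigned s    : 1;      /* 0 = system descriptor */
	unsigned dpl  : 2;
	unsigned p    : 1;
};

int main(void)
{
	struct tss_desc d = { .type = 11, .p = 1 }; /* left busy by the old kernel */

	d.type = 9;     /* what the resume path does before load_TR_desc() */
	printf("TSS type now %u (busy bit %s)\n", d.type,
	       (d.type & 2) ? "set" : "clear");
	return 0;
}
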
diff --git a/arch/x86/kernel/suspend_asm_64.S b/arch/x86/kernel/suspend_asm_64.S
index 72f952103e50..aeb9a4d7681e 100644
--- a/arch/x86/kernel/suspend_asm_64.S
+++ b/arch/x86/kernel/suspend_asm_64.S
@@ -18,13 +18,13 @@
18 18
19ENTRY(swsusp_arch_suspend) 19ENTRY(swsusp_arch_suspend)
20 movq $saved_context, %rax 20 movq $saved_context, %rax
21 movq %rsp, pt_regs_rsp(%rax) 21 movq %rsp, pt_regs_sp(%rax)
22 movq %rbp, pt_regs_rbp(%rax) 22 movq %rbp, pt_regs_bp(%rax)
23 movq %rsi, pt_regs_rsi(%rax) 23 movq %rsi, pt_regs_si(%rax)
24 movq %rdi, pt_regs_rdi(%rax) 24 movq %rdi, pt_regs_di(%rax)
25 movq %rbx, pt_regs_rbx(%rax) 25 movq %rbx, pt_regs_bx(%rax)
26 movq %rcx, pt_regs_rcx(%rax) 26 movq %rcx, pt_regs_cx(%rax)
27 movq %rdx, pt_regs_rdx(%rax) 27 movq %rdx, pt_regs_dx(%rax)
28 movq %r8, pt_regs_r8(%rax) 28 movq %r8, pt_regs_r8(%rax)
29 movq %r9, pt_regs_r9(%rax) 29 movq %r9, pt_regs_r9(%rax)
30 movq %r10, pt_regs_r10(%rax) 30 movq %r10, pt_regs_r10(%rax)
@@ -34,7 +34,7 @@ ENTRY(swsusp_arch_suspend)
34 movq %r14, pt_regs_r14(%rax) 34 movq %r14, pt_regs_r14(%rax)
35 movq %r15, pt_regs_r15(%rax) 35 movq %r15, pt_regs_r15(%rax)
36 pushfq 36 pushfq
37 popq pt_regs_eflags(%rax) 37 popq pt_regs_flags(%rax)
38 38
39 /* save the address of restore_registers */ 39 /* save the address of restore_registers */
40 movq $restore_registers, %rax 40 movq $restore_registers, %rax
@@ -115,13 +115,13 @@ ENTRY(restore_registers)
115 115
116 /* We don't restore %rax, it must be 0 anyway */ 116 /* We don't restore %rax, it must be 0 anyway */
117 movq $saved_context, %rax 117 movq $saved_context, %rax
118 movq pt_regs_rsp(%rax), %rsp 118 movq pt_regs_sp(%rax), %rsp
119 movq pt_regs_rbp(%rax), %rbp 119 movq pt_regs_bp(%rax), %rbp
120 movq pt_regs_rsi(%rax), %rsi 120 movq pt_regs_si(%rax), %rsi
121 movq pt_regs_rdi(%rax), %rdi 121 movq pt_regs_di(%rax), %rdi
122 movq pt_regs_rbx(%rax), %rbx 122 movq pt_regs_bx(%rax), %rbx
123 movq pt_regs_rcx(%rax), %rcx 123 movq pt_regs_cx(%rax), %rcx
124 movq pt_regs_rdx(%rax), %rdx 124 movq pt_regs_dx(%rax), %rdx
125 movq pt_regs_r8(%rax), %r8 125 movq pt_regs_r8(%rax), %r8
126 movq pt_regs_r9(%rax), %r9 126 movq pt_regs_r9(%rax), %r9
127 movq pt_regs_r10(%rax), %r10 127 movq pt_regs_r10(%rax), %r10
@@ -130,7 +130,7 @@ ENTRY(restore_registers)
130 movq pt_regs_r13(%rax), %r13 130 movq pt_regs_r13(%rax), %r13
131 movq pt_regs_r14(%rax), %r14 131 movq pt_regs_r14(%rax), %r14
132 movq pt_regs_r15(%rax), %r15 132 movq pt_regs_r15(%rax), %r15
133 pushq pt_regs_eflags(%rax) 133 pushq pt_regs_flags(%rax)
134 popfq 134 popfq
135 135
136 xorq %rax, %rax 136 xorq %rax, %rax
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 907942ee6e76..bd802a5e1aa3 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -12,6 +12,7 @@
12#include <linux/file.h> 12#include <linux/file.h>
13#include <linux/utsname.h> 13#include <linux/utsname.h>
14#include <linux/personality.h> 14#include <linux/personality.h>
15#include <linux/random.h>
15 16
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
17#include <asm/ia32.h> 18#include <asm/ia32.h>
@@ -65,6 +66,7 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
65 unsigned long *end) 66 unsigned long *end)
66{ 67{
67 if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) { 68 if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) {
69 unsigned long new_begin;
68 /* This is usually used needed to map code in small 70 /* This is usually used needed to map code in small
69 model, so it needs to be in the first 31bit. Limit 71 model, so it needs to be in the first 31bit. Limit
70 it to that. This means we need to move the 72 it to that. This means we need to move the
@@ -74,6 +76,11 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
74 of playground for now. -AK */ 76 of playground for now. -AK */
75 *begin = 0x40000000; 77 *begin = 0x40000000;
76 *end = 0x80000000; 78 *end = 0x80000000;
79 if (current->flags & PF_RANDOMIZE) {
80 new_begin = randomize_range(*begin, *begin + 0x02000000, 0);
81 if (new_begin)
82 *begin = new_begin;
83 }
77 } else { 84 } else {
78 *begin = TASK_UNMAPPED_BASE; 85 *begin = TASK_UNMAPPED_BASE;
79 *end = TASK_SIZE; 86 *end = TASK_SIZE;
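
With PF_RANDOMIZE set, the MAP_32BIT window above no longer always starts at 0x40000000; up to 32MB of random offset is added to its base. A user-space sketch of that choice, with random() standing in for the kernel's randomize_range() and page alignment applied by hand:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define PAGE_MASK (~0xfffUL)

int main(void)
{
	unsigned long begin = 0x40000000UL, end = 0x80000000UL;

	srandom(time(NULL));
	begin += ((unsigned long)random() % 0x02000000UL) & PAGE_MASK;

	printf("searching [%#lx, %#lx)\n", begin, end);
	return 0;
}
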
@@ -143,6 +150,97 @@ full_search:
143 } 150 }
144} 151}
145 152
153
154unsigned long
155arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
156 const unsigned long len, const unsigned long pgoff,
157 const unsigned long flags)
158{
159 struct vm_area_struct *vma;
160 struct mm_struct *mm = current->mm;
161 unsigned long addr = addr0;
162
163 /* requested length too big for entire address space */
164 if (len > TASK_SIZE)
165 return -ENOMEM;
166
167 if (flags & MAP_FIXED)
168 return addr;
169
 170	/* for MAP_32BIT mappings we force the legacy mmap base */
171 if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT))
172 goto bottomup;
173
174 /* requesting a specific address */
175 if (addr) {
176 addr = PAGE_ALIGN(addr);
177 vma = find_vma(mm, addr);
178 if (TASK_SIZE - len >= addr &&
179 (!vma || addr + len <= vma->vm_start))
180 return addr;
181 }
182
183 /* check if free_area_cache is useful for us */
184 if (len <= mm->cached_hole_size) {
185 mm->cached_hole_size = 0;
186 mm->free_area_cache = mm->mmap_base;
187 }
188
189 /* either no address requested or can't fit in requested address hole */
190 addr = mm->free_area_cache;
191
192 /* make sure it can fit in the remaining address space */
193 if (addr > len) {
194 vma = find_vma(mm, addr-len);
195 if (!vma || addr <= vma->vm_start)
196 /* remember the address as a hint for next time */
197 return (mm->free_area_cache = addr-len);
198 }
199
200 if (mm->mmap_base < len)
201 goto bottomup;
202
203 addr = mm->mmap_base-len;
204
205 do {
206 /*
207 * Lookup failure means no vma is above this address,
208 * else if new region fits below vma->vm_start,
209 * return with success:
210 */
211 vma = find_vma(mm, addr);
212 if (!vma || addr+len <= vma->vm_start)
213 /* remember the address as a hint for next time */
214 return (mm->free_area_cache = addr);
215
216 /* remember the largest hole we saw so far */
217 if (addr + mm->cached_hole_size < vma->vm_start)
218 mm->cached_hole_size = vma->vm_start - addr;
219
220 /* try just below the current vma->vm_start */
221 addr = vma->vm_start-len;
222 } while (len < vma->vm_start);
223
224bottomup:
225 /*
226 * A failed mmap() very likely causes application failure,
227 * so fall back to the bottom-up function here. This scenario
228 * can happen with large stack limits and large mmap()
229 * allocations.
230 */
231 mm->cached_hole_size = ~0UL;
232 mm->free_area_cache = TASK_UNMAPPED_BASE;
233 addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
234 /*
235 * Restore the topdown base:
236 */
237 mm->free_area_cache = mm->mmap_base;
238 mm->cached_hole_size = ~0UL;
239
240 return addr;
241}
242
243
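
arch_get_unmapped_area_topdown() walks down from mmap_base, asking find_vma() at each candidate whether a len-byte mapping fits below the next existing region, and falls back to the bottom-up search when nothing fits. The same walk over a toy sorted VMA array, with the free_area_cache/cached_hole_size caching left out:

#include <stdio.h>

struct vma { unsigned long start, end; };

/* first region whose end is above addr, or NULL (like find_vma) */
static const struct vma *find_vma(const struct vma *v, int n, unsigned long addr)
{
	for (int i = 0; i < n; i++)
		if (addr < v[i].end)
			return &v[i];
	return NULL;
}

static unsigned long topdown(const struct vma *v, int n,
			     unsigned long base, unsigned long len)
{
	unsigned long addr = base - len;

	for (;;) {
		const struct vma *hit = find_vma(v, n, addr);

		if (!hit || addr + len <= hit->start)
			return addr;             /* gap found */
		if (hit->start < len)
			return 0;                /* no room left below */
		addr = hit->start - len;         /* retry just below this VMA */
	}
}

int main(void)
{
	struct vma vmas[] = {
		{ 0x500000, 0x600000 },
		{ 0x900000, 0xa00000 },
	};
	unsigned long a = topdown(vmas, 2, 0xb00000, 0x200000);

	printf("would map 2MB at %#lx\n", a);   /* 0x700000, just below the top VMA */
	return 0;
}
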
146asmlinkage long sys_uname(struct new_utsname __user * name) 244asmlinkage long sys_uname(struct new_utsname __user * name)
147{ 245{
148 int err; 246 int err;
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 8344c70adf61..adff5562f5fd 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -321,6 +321,8 @@ ENTRY(sys_call_table)
321 .long sys_epoll_pwait 321 .long sys_epoll_pwait
322 .long sys_utimensat /* 320 */ 322 .long sys_utimensat /* 320 */
323 .long sys_signalfd 323 .long sys_signalfd
324 .long sys_timerfd 324 .long sys_timerfd_create
325 .long sys_eventfd 325 .long sys_eventfd
326 .long sys_fallocate 326 .long sys_fallocate
327 .long sys_timerfd_settime /* 325 */
328 .long sys_timerfd_gettime
diff --git a/arch/x86/kernel/sysenter_32.c b/arch/x86/kernel/sysenter_32.c
deleted file mode 100644
index 5a2d951e2608..000000000000
--- a/arch/x86/kernel/sysenter_32.c
+++ /dev/null
@@ -1,346 +0,0 @@
1/*
2 * (C) Copyright 2002 Linus Torvalds
3 * Portions based on the vdso-randomization code from exec-shield:
4 * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar
5 *
6 * This file contains the needed initializations to support sysenter.
7 */
8
9#include <linux/init.h>
10#include <linux/smp.h>
11#include <linux/thread_info.h>
12#include <linux/sched.h>
13#include <linux/gfp.h>
14#include <linux/string.h>
15#include <linux/elf.h>
16#include <linux/mm.h>
17#include <linux/err.h>
18#include <linux/module.h>
19
20#include <asm/cpufeature.h>
21#include <asm/msr.h>
22#include <asm/pgtable.h>
23#include <asm/unistd.h>
24#include <asm/elf.h>
25#include <asm/tlbflush.h>
26
27enum {
28 VDSO_DISABLED = 0,
29 VDSO_ENABLED = 1,
30 VDSO_COMPAT = 2,
31};
32
33#ifdef CONFIG_COMPAT_VDSO
34#define VDSO_DEFAULT VDSO_COMPAT
35#else
36#define VDSO_DEFAULT VDSO_ENABLED
37#endif
38
39/*
40 * Should the kernel map a VDSO page into processes and pass its
41 * address down to glibc upon exec()?
42 */
43unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT;
44
45EXPORT_SYMBOL_GPL(vdso_enabled);
46
47static int __init vdso_setup(char *s)
48{
49 vdso_enabled = simple_strtoul(s, NULL, 0);
50
51 return 1;
52}
53
54__setup("vdso=", vdso_setup);
55
56extern asmlinkage void sysenter_entry(void);
57
58static __init void reloc_symtab(Elf32_Ehdr *ehdr,
59 unsigned offset, unsigned size)
60{
61 Elf32_Sym *sym = (void *)ehdr + offset;
62 unsigned nsym = size / sizeof(*sym);
63 unsigned i;
64
65 for(i = 0; i < nsym; i++, sym++) {
66 if (sym->st_shndx == SHN_UNDEF ||
67 sym->st_shndx == SHN_ABS)
68 continue; /* skip */
69
70 if (sym->st_shndx > SHN_LORESERVE) {
71 printk(KERN_INFO "VDSO: unexpected st_shndx %x\n",
72 sym->st_shndx);
73 continue;
74 }
75
76 switch(ELF_ST_TYPE(sym->st_info)) {
77 case STT_OBJECT:
78 case STT_FUNC:
79 case STT_SECTION:
80 case STT_FILE:
81 sym->st_value += VDSO_HIGH_BASE;
82 }
83 }
84}
85
86static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset)
87{
88 Elf32_Dyn *dyn = (void *)ehdr + offset;
89
90 for(; dyn->d_tag != DT_NULL; dyn++)
91 switch(dyn->d_tag) {
92 case DT_PLTGOT:
93 case DT_HASH:
94 case DT_STRTAB:
95 case DT_SYMTAB:
96 case DT_RELA:
97 case DT_INIT:
98 case DT_FINI:
99 case DT_REL:
100 case DT_DEBUG:
101 case DT_JMPREL:
102 case DT_VERSYM:
103 case DT_VERDEF:
104 case DT_VERNEED:
105 case DT_ADDRRNGLO ... DT_ADDRRNGHI:
106 /* definitely pointers needing relocation */
107 dyn->d_un.d_ptr += VDSO_HIGH_BASE;
108 break;
109
110 case DT_ENCODING ... OLD_DT_LOOS-1:
111 case DT_LOOS ... DT_HIOS-1:
112 /* Tags above DT_ENCODING are pointers if
113 they're even */
114 if (dyn->d_tag >= DT_ENCODING &&
115 (dyn->d_tag & 1) == 0)
116 dyn->d_un.d_ptr += VDSO_HIGH_BASE;
117 break;
118
119 case DT_VERDEFNUM:
120 case DT_VERNEEDNUM:
121 case DT_FLAGS_1:
122 case DT_RELACOUNT:
123 case DT_RELCOUNT:
124 case DT_VALRNGLO ... DT_VALRNGHI:
125 /* definitely not pointers */
126 break;
127
128 case OLD_DT_LOOS ... DT_LOOS-1:
129 case DT_HIOS ... DT_VALRNGLO-1:
130 default:
131 if (dyn->d_tag > DT_ENCODING)
132 printk(KERN_INFO "VDSO: unexpected DT_tag %x\n",
133 dyn->d_tag);
134 break;
135 }
136}
137
138static __init void relocate_vdso(Elf32_Ehdr *ehdr)
139{
140 Elf32_Phdr *phdr;
141 Elf32_Shdr *shdr;
142 int i;
143
144 BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 ||
145 !elf_check_arch(ehdr) ||
146 ehdr->e_type != ET_DYN);
147
148 ehdr->e_entry += VDSO_HIGH_BASE;
149
150 /* rebase phdrs */
151 phdr = (void *)ehdr + ehdr->e_phoff;
152 for (i = 0; i < ehdr->e_phnum; i++) {
153 phdr[i].p_vaddr += VDSO_HIGH_BASE;
154
155 /* relocate dynamic stuff */
156 if (phdr[i].p_type == PT_DYNAMIC)
157 reloc_dyn(ehdr, phdr[i].p_offset);
158 }
159
160 /* rebase sections */
161 shdr = (void *)ehdr + ehdr->e_shoff;
162 for(i = 0; i < ehdr->e_shnum; i++) {
163 if (!(shdr[i].sh_flags & SHF_ALLOC))
164 continue;
165
166 shdr[i].sh_addr += VDSO_HIGH_BASE;
167
168 if (shdr[i].sh_type == SHT_SYMTAB ||
169 shdr[i].sh_type == SHT_DYNSYM)
170 reloc_symtab(ehdr, shdr[i].sh_offset,
171 shdr[i].sh_size);
172 }
173}
174
175void enable_sep_cpu(void)
176{
177 int cpu = get_cpu();
178 struct tss_struct *tss = &per_cpu(init_tss, cpu);
179
180 if (!boot_cpu_has(X86_FEATURE_SEP)) {
181 put_cpu();
182 return;
183 }
184
185 tss->x86_tss.ss1 = __KERNEL_CS;
186 tss->x86_tss.esp1 = sizeof(struct tss_struct) + (unsigned long) tss;
187 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
188 wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.esp1, 0);
189 wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0);
190 put_cpu();
191}
192
193static struct vm_area_struct gate_vma;
194
195static int __init gate_vma_init(void)
196{
197 gate_vma.vm_mm = NULL;
198 gate_vma.vm_start = FIXADDR_USER_START;
199 gate_vma.vm_end = FIXADDR_USER_END;
200 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
201 gate_vma.vm_page_prot = __P101;
202 /*
203 * Make sure the vDSO gets into every core dump.
204 * Dumping its contents makes post-mortem fully interpretable later
205 * without matching up the same kernel and hardware config to see
206 * what PC values meant.
207 */
208 gate_vma.vm_flags |= VM_ALWAYSDUMP;
209 return 0;
210}
211
212/*
213 * These symbols are defined by vsyscall.o to mark the bounds
214 * of the ELF DSO images included therein.
215 */
216extern const char vsyscall_int80_start, vsyscall_int80_end;
217extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
218static struct page *syscall_pages[1];
219
220static void map_compat_vdso(int map)
221{
222 static int vdso_mapped;
223
224 if (map == vdso_mapped)
225 return;
226
227 vdso_mapped = map;
228
229 __set_fixmap(FIX_VDSO, page_to_pfn(syscall_pages[0]) << PAGE_SHIFT,
230 map ? PAGE_READONLY_EXEC : PAGE_NONE);
231
232 /* flush stray tlbs */
233 flush_tlb_all();
234}
235
236int __init sysenter_setup(void)
237{
238 void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
239 const void *vsyscall;
240 size_t vsyscall_len;
241
242 syscall_pages[0] = virt_to_page(syscall_page);
243
244 gate_vma_init();
245
246 printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
247
248 if (!boot_cpu_has(X86_FEATURE_SEP)) {
249 vsyscall = &vsyscall_int80_start;
250 vsyscall_len = &vsyscall_int80_end - &vsyscall_int80_start;
251 } else {
252 vsyscall = &vsyscall_sysenter_start;
253 vsyscall_len = &vsyscall_sysenter_end - &vsyscall_sysenter_start;
254 }
255
256 memcpy(syscall_page, vsyscall, vsyscall_len);
257 relocate_vdso(syscall_page);
258
259 return 0;
260}
261
262/* Defined in vsyscall-sysenter.S */
263extern void SYSENTER_RETURN;
264
265/* Setup a VMA at program startup for the vsyscall page */
266int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
267{
268 struct mm_struct *mm = current->mm;
269 unsigned long addr;
270 int ret = 0;
271 bool compat;
272
273 down_write(&mm->mmap_sem);
274
275 /* Test compat mode once here, in case someone
276 changes it via sysctl */
277 compat = (vdso_enabled == VDSO_COMPAT);
278
279 map_compat_vdso(compat);
280
281 if (compat)
282 addr = VDSO_HIGH_BASE;
283 else {
284 addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
285 if (IS_ERR_VALUE(addr)) {
286 ret = addr;
287 goto up_fail;
288 }
289
290 /*
291 * MAYWRITE to allow gdb to COW and set breakpoints
292 *
293 * Make sure the vDSO gets into every core dump.
294 * Dumping its contents makes post-mortem fully
295 * interpretable later without matching up the same
296 * kernel and hardware config to see what PC values
297 * meant.
298 */
299 ret = install_special_mapping(mm, addr, PAGE_SIZE,
300 VM_READ|VM_EXEC|
301 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
302 VM_ALWAYSDUMP,
303 syscall_pages);
304
305 if (ret)
306 goto up_fail;
307 }
308
309 current->mm->context.vdso = (void *)addr;
310 current_thread_info()->sysenter_return =
311 (void *)VDSO_SYM(&SYSENTER_RETURN);
312
313 up_fail:
314 up_write(&mm->mmap_sem);
315
316 return ret;
317}
318
319const char *arch_vma_name(struct vm_area_struct *vma)
320{
321 if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
322 return "[vdso]";
323 return NULL;
324}
325
326struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
327{
328 struct mm_struct *mm = tsk->mm;
329
330 /* Check to see if this task was created in compat vdso mode */
331 if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE)
332 return &gate_vma;
333 return NULL;
334}
335
336int in_gate_area(struct task_struct *task, unsigned long addr)
337{
338 const struct vm_area_struct *vma = get_gate_vma(task);
339
340 return vma && addr >= vma->vm_start && addr < vma->vm_end;
341}
342
343int in_gate_area_no_task(unsigned long addr)
344{
345 return 0;
346}
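
reloc_dyn() in the file removed above relied on the ELF convention that dynamic tags at or above DT_ENCODING use d_ptr when the tag is even and d_val when it is odd, so only even tags get rebased. A small stand-alone illustration over a made-up dynamic section (not a real vDSO):

#include <stdio.h>
#include <elf.h>

static void rebase_dyn(Elf32_Dyn *dyn, Elf32_Addr delta)
{
	for (; dyn->d_tag != DT_NULL; dyn++) {
		switch (dyn->d_tag) {
		case DT_PLTGOT: case DT_HASH: case DT_STRTAB:
		case DT_SYMTAB: case DT_RELA: case DT_REL:
		case DT_INIT:   case DT_FINI: case DT_JMPREL:
			dyn->d_un.d_ptr += delta;   /* always addresses */
			break;
		default:
			/* even tags above DT_ENCODING are pointers by convention */
			if (dyn->d_tag >= DT_ENCODING && (dyn->d_tag & 1) == 0)
				dyn->d_un.d_ptr += delta;
			break;
		}
	}
}

int main(void)
{
	Elf32_Dyn dyn[] = {
		{ DT_STRTAB, { .d_ptr = 0x1000 } },
		{ DT_STRSZ,  { .d_val = 0x40   } },   /* a size, stays put */
		{ DT_NULL,   { .d_val = 0      } },
	};

	rebase_dyn(dyn, 0xffffe000);
	printf("DT_STRTAB=%#x DT_STRSZ=%#x\n",
	       (unsigned)dyn[0].d_un.d_ptr, (unsigned)dyn[1].d_un.d_val);
	return 0;
}
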
diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c
new file mode 100644
index 000000000000..10b8a6f69f84
--- /dev/null
+++ b/arch/x86/kernel/test_nx.c
@@ -0,0 +1,173 @@
1/*
2 * test_nx.c: functional test for NX functionality
3 *
4 * (C) Copyright 2008 Intel Corporation
5 * Author: Arjan van de Ven <arjan@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */
12#include <linux/module.h>
13#include <linux/sort.h>
14#include <asm/uaccess.h>
15#include <asm/asm.h>
16
17extern int rodata_test_data;
18
19/*
20 * This file checks 4 things:
21 * 1) Check if the stack is not executable
22 * 2) Check if kmalloc memory is not executable
23 * 3) Check if the .rodata section is not executable
24 * 4) Check if the .data section of a module is not executable
25 *
26 * To do this, the test code tries to execute memory in stack/kmalloc/etc,
27 * and then checks if the expected trap happens.
28 *
29 * Sadly, this implies having a dynamic exception handling table entry.
30 * ... which can be done (and will make Rusty cry)... but it can only
31 * be done in a stand-alone module with only 1 entry total.
32 * (otherwise we'd have to sort and that's just too messy)
33 */
34
35
36
37/*
38 * We want to set up an exception handling point on our stack,
39 * which means a variable value. This function is rather dirty
40 * and walks the exception table of the module, looking for a magic
41 * marker and replaces it with a specific function.
42 */
43static void fudze_exception_table(void *marker, void *new)
44{
45 struct module *mod = THIS_MODULE;
46 struct exception_table_entry *extable;
47
48 /*
49 * Note: This module has only 1 exception table entry,
50 * so searching and sorting is not needed. If that changes,
51 * this would be the place to search and re-sort the exception
52 * table.
53 */
54 if (mod->num_exentries > 1) {
55 printk(KERN_ERR "test_nx: too many exception table entries!\n");
56 printk(KERN_ERR "test_nx: test results are not reliable.\n");
57 return;
58 }
59 extable = (struct exception_table_entry *)mod->extable;
60 extable[0].insn = (unsigned long)new;
61}
62
63
64/*
65 * exception tables get their symbols translated so we need
66 * to use a fake function to put in there, which we can then
67 * replace at runtime.
68 */
69void foo_label(void);
70
71/*
72 * returns 0 for not-executable, negative for executable
73 *
74 * Note: we cannot allow this function to be inlined, because
75 * that would give us more than 1 exception table entry.
76 * This in turn would break the assumptions above.
77 */
78static noinline int test_address(void *address)
79{
80 unsigned long result;
81
82 /* Set up an exception table entry for our address */
83 fudze_exception_table(&foo_label, address);
84 result = 1;
85 asm volatile(
86 "foo_label:\n"
87 "0: call *%[fake_code]\n"
88 "1:\n"
89 ".section .fixup,\"ax\"\n"
90 "2: mov %[zero], %[rslt]\n"
91 " ret\n"
92 ".previous\n"
93 _ASM_EXTABLE(0b,2b)
94 : [rslt] "=r" (result)
95 : [fake_code] "r" (address), [zero] "r" (0UL), "0" (result)
96 );
97 /* change the exception table back for the next round */
98 fudze_exception_table(address, &foo_label);
99
100 if (result)
101 return -ENODEV;
102 return 0;
103}
104
105static unsigned char test_data = 0xC3; /* 0xC3 is the opcode for "ret" */
106
107static int test_NX(void)
108{
109 int ret = 0;
110 /* 0xC3 is the opcode for "ret" */
111 char stackcode[] = {0xC3, 0x90, 0 };
112 char *heap;
113
114 test_data = 0xC3;
115
116 printk(KERN_INFO "Testing NX protection\n");
117
118 /* Test 1: check if the stack is not executable */
119 if (test_address(&stackcode)) {
120 printk(KERN_ERR "test_nx: stack was executable\n");
121 ret = -ENODEV;
122 }
123
124
125 /* Test 2: Check if the heap is executable */
126 heap = kmalloc(64, GFP_KERNEL);
127 if (!heap)
128 return -ENOMEM;
129 heap[0] = 0xC3; /* opcode for "ret" */
130
131 if (test_address(heap)) {
132 printk(KERN_ERR "test_nx: heap was executable\n");
133 ret = -ENODEV;
134 }
135 kfree(heap);
136
137 /*
 138	 * The following 2 tests currently fail; this needs to get fixed
139 * Until then, don't run them to avoid too many people getting scared
140 * by the error message
141 */
142
143#ifdef CONFIG_DEBUG_RODATA
144 /* Test 3: Check if the .rodata section is executable */
145 if (rodata_test_data != 0xC3) {
146 printk(KERN_ERR "test_nx: .rodata marker has invalid value\n");
147 ret = -ENODEV;
148 } else if (test_address(&rodata_test_data)) {
149 printk(KERN_ERR "test_nx: .rodata section is executable\n");
150 ret = -ENODEV;
151 }
152#endif
153
154#if 0
155 /* Test 4: Check if the .data section of a module is executable */
156 if (test_address(&test_data)) {
157 printk(KERN_ERR "test_nx: .data section is executable\n");
158 ret = -ENODEV;
159 }
160
161#endif
162 return 0;
163}
164
165static void test_exit(void)
166{
167}
168
169module_init(test_NX);
170module_exit(test_exit);
171MODULE_LICENSE("GPL");
172MODULE_DESCRIPTION("Testcase for the NX infrastructure");
173MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
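
test_nx.c survives the fault it provokes by patching its own exception-table entry. A plain user-space way to probe the same property is to wrap an indirect call into the buffer with a SIGSEGV handler; the mechanism below is deliberately different from the module's, only the "call into data and see whether it traps" idea is the same:

#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <setjmp.h>

static sigjmp_buf trap;

static void on_fault(int sig)
{
	(void)sig;
	siglongjmp(trap, 1);
}

/* returns 1 if calling into 'buf' faulted, i.e. the memory is not executable */
static int not_executable(void *buf)
{
	if (sigsetjmp(trap, 1))
		return 1;                       /* we took the fault */
	((void (*)(void))buf)();                /* buf starts with a ret */
	return 0;
}

int main(void)
{
	unsigned char stackcode[] = { 0xc3 };   /* opcode for "ret" */
	unsigned char *heap = malloc(16);
	struct sigaction sa = { .sa_handler = on_fault };

	if (!heap)
		return 1;
	heap[0] = 0xc3;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGSEGV, &sa, NULL);

	printf("stack NX: %s\n", not_executable(stackcode) ? "yes" : "NO");
	printf("heap  NX: %s\n", not_executable(heap) ? "yes" : "NO");
	free(heap);
	return 0;
}
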
diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c
new file mode 100644
index 000000000000..4c163772000e
--- /dev/null
+++ b/arch/x86/kernel/test_rodata.c
@@ -0,0 +1,86 @@
1/*
2 * test_rodata.c: functional test for mark_rodata_ro function
3 *
4 * (C) Copyright 2008 Intel Corporation
5 * Author: Arjan van de Ven <arjan@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */
12#include <linux/module.h>
13#include <asm/sections.h>
14extern int rodata_test_data;
15
16int rodata_test(void)
17{
18 unsigned long result;
19 unsigned long start, end;
20
21 /* test 1: read the value */
22 /* If this test fails, some previous testrun has clobbered the state */
23 if (!rodata_test_data) {
24 printk(KERN_ERR "rodata_test: test 1 fails (start data)\n");
25 return -ENODEV;
26 }
27
28 /* test 2: write to the variable; this should fault */
29 /*
30 * If this test fails, we managed to overwrite the data
31 *
32 * This is written in assembly to be able to catch the
33 * exception that is supposed to happen in the correct
34 * case
35 */
36
37 result = 1;
38 asm volatile(
39 "0: mov %[zero],(%[rodata_test])\n"
40 " mov %[zero], %[rslt]\n"
41 "1:\n"
42 ".section .fixup,\"ax\"\n"
43 "2: jmp 1b\n"
44 ".previous\n"
45 ".section __ex_table,\"a\"\n"
46 " .align 16\n"
47#ifdef CONFIG_X86_32
48 " .long 0b,2b\n"
49#else
50 " .quad 0b,2b\n"
51#endif
52 ".previous"
53 : [rslt] "=r" (result)
54 : [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL)
55 );
56
57
58 if (!result) {
59 printk(KERN_ERR "rodata_test: test data was not read only\n");
60 return -ENODEV;
61 }
62
63 /* test 3: check the value hasn't changed */
64 /* If this test fails, we managed to overwrite the data */
65 if (!rodata_test_data) {
 66		printk(KERN_ERR "rodata_test: test 3 fails (end data)\n");
67 return -ENODEV;
68 }
69 /* test 4: check if the rodata section is 4Kb aligned */
70 start = (unsigned long)__start_rodata;
71 end = (unsigned long)__end_rodata;
72 if (start & (PAGE_SIZE - 1)) {
73 printk(KERN_ERR "rodata_test: .rodata is not 4k aligned\n");
74 return -ENODEV;
75 }
76 if (end & (PAGE_SIZE - 1)) {
77 printk(KERN_ERR "rodata_test: .rodata end is not 4k aligned\n");
78 return -ENODEV;
79 }
80
81 return 0;
82}
83
84MODULE_LICENSE("GPL");
85MODULE_DESCRIPTION("Testcase for the DEBUG_RODATA infrastructure");
86MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index 8a322c96bc23..1a89e93f3f1c 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -28,98 +28,20 @@
28 * serialize accesses to xtime/lost_ticks). 28 * serialize accesses to xtime/lost_ticks).
29 */ 29 */
30 30
31#include <linux/errno.h> 31#include <linux/init.h>
32#include <linux/sched.h>
33#include <linux/kernel.h>
34#include <linux/param.h>
35#include <linux/string.h>
36#include <linux/mm.h>
37#include <linux/interrupt.h> 32#include <linux/interrupt.h>
38#include <linux/time.h> 33#include <linux/time.h>
39#include <linux/delay.h>
40#include <linux/init.h>
41#include <linux/smp.h>
42#include <linux/module.h>
43#include <linux/sysdev.h>
44#include <linux/bcd.h>
45#include <linux/efi.h>
46#include <linux/mca.h> 34#include <linux/mca.h>
47 35
48#include <asm/io.h>
49#include <asm/smp.h>
50#include <asm/irq.h>
51#include <asm/msr.h>
52#include <asm/delay.h>
53#include <asm/mpspec.h>
54#include <asm/uaccess.h>
55#include <asm/processor.h>
56#include <asm/timer.h>
57#include <asm/time.h>
58
59#include "mach_time.h"
60
61#include <linux/timex.h>
62
63#include <asm/hpet.h>
64
65#include <asm/arch_hooks.h> 36#include <asm/arch_hooks.h>
66 37#include <asm/hpet.h>
67#include "io_ports.h" 38#include <asm/time.h>
68
69#include <asm/i8259.h>
70 39
71#include "do_timer.h" 40#include "do_timer.h"
72 41
73unsigned int cpu_khz; /* Detected as we calibrate the TSC */ 42unsigned int cpu_khz; /* Detected as we calibrate the TSC */
74EXPORT_SYMBOL(cpu_khz); 43EXPORT_SYMBOL(cpu_khz);
75 44
76DEFINE_SPINLOCK(rtc_lock);
77EXPORT_SYMBOL(rtc_lock);
78
79/*
80 * This is a special lock that is owned by the CPU and holds the index
81 * register we are working with. It is required for NMI access to the
82 * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
83 */
84volatile unsigned long cmos_lock = 0;
85EXPORT_SYMBOL(cmos_lock);
86
87/* Routines for accessing the CMOS RAM/RTC. */
88unsigned char rtc_cmos_read(unsigned char addr)
89{
90 unsigned char val;
91 lock_cmos_prefix(addr);
92 outb_p(addr, RTC_PORT(0));
93 val = inb_p(RTC_PORT(1));
94 lock_cmos_suffix(addr);
95 return val;
96}
97EXPORT_SYMBOL(rtc_cmos_read);
98
99void rtc_cmos_write(unsigned char val, unsigned char addr)
100{
101 lock_cmos_prefix(addr);
102 outb_p(addr, RTC_PORT(0));
103 outb_p(val, RTC_PORT(1));
104 lock_cmos_suffix(addr);
105}
106EXPORT_SYMBOL(rtc_cmos_write);
107
108static int set_rtc_mmss(unsigned long nowtime)
109{
110 int retval;
111 unsigned long flags;
112
113 /* gets recalled with irq locally disabled */
114 /* XXX - does irqsave resolve this? -johnstul */
115 spin_lock_irqsave(&rtc_lock, flags);
116 retval = set_wallclock(nowtime);
117 spin_unlock_irqrestore(&rtc_lock, flags);
118
119 return retval;
120}
121
122
123int timer_ack; 45int timer_ack;
124 46
125unsigned long profile_pc(struct pt_regs *regs) 47unsigned long profile_pc(struct pt_regs *regs)
@@ -127,17 +49,17 @@ unsigned long profile_pc(struct pt_regs *regs)
127 unsigned long pc = instruction_pointer(regs); 49 unsigned long pc = instruction_pointer(regs);
128 50
129#ifdef CONFIG_SMP 51#ifdef CONFIG_SMP
130 if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs) && 52 if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs) &&
131 in_lock_functions(pc)) { 53 in_lock_functions(pc)) {
132#ifdef CONFIG_FRAME_POINTER 54#ifdef CONFIG_FRAME_POINTER
133 return *(unsigned long *)(regs->ebp + 4); 55 return *(unsigned long *)(regs->bp + 4);
134#else 56#else
135 unsigned long *sp = (unsigned long *)&regs->esp; 57 unsigned long *sp = (unsigned long *)&regs->sp;
136 58
137 /* Return address is either directly at stack pointer 59 /* Return address is either directly at stack pointer
138 or above a saved eflags. Eflags has bits 22-31 zero, 60 or above a saved flags. Eflags has bits 22-31 zero,
139 kernel addresses don't. */ 61 kernel addresses don't. */
140 if (sp[0] >> 22) 62 if (sp[0] >> 22)
141 return sp[0]; 63 return sp[0];
142 if (sp[1] >> 22) 64 if (sp[1] >> 22)
143 return sp[1]; 65 return sp[1];
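
profile_pc() above tells a saved flags word apart from a return address by looking at bits 22..31: EFLAGS has them architecturally zero, while 32-bit kernel text addresses (0xc0000000 and up) do not. The whole heuristic fits in one line; sample values below are made up:

#include <stdio.h>

static int looks_like_return_address(unsigned long word)
{
	return (word >> 22) != 0;
}

int main(void)
{
	unsigned long saved_eflags = 0x00000246UL;  /* IF|ZF|PF, typical flags */
	unsigned long kernel_text  = 0xc01a2b3cUL;  /* 32-bit kernel address */

	printf("%d %d\n", looks_like_return_address(saved_eflags),
	                  looks_like_return_address(kernel_text));
	return 0;
}
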
@@ -193,26 +115,6 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
193 return IRQ_HANDLED; 115 return IRQ_HANDLED;
194} 116}
195 117
196/* not static: needed by APM */
197unsigned long read_persistent_clock(void)
198{
199 unsigned long retval;
200 unsigned long flags;
201
202 spin_lock_irqsave(&rtc_lock, flags);
203
204 retval = get_wallclock();
205
206 spin_unlock_irqrestore(&rtc_lock, flags);
207
208 return retval;
209}
210
211int update_persistent_clock(struct timespec now)
212{
213 return set_rtc_mmss(now.tv_sec);
214}
215
216extern void (*late_time_init)(void); 118extern void (*late_time_init)(void);
217/* Duplicate of time_init() below, with hpet_enable part added */ 119/* Duplicate of time_init() below, with hpet_enable part added */
218void __init hpet_time_init(void) 120void __init hpet_time_init(void)
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index 368b1942b39a..0380795121a6 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -11,43 +11,18 @@
11 * RTC support code taken from arch/i386/kernel/timers/time_hpet.c 11 * RTC support code taken from arch/i386/kernel/timers/time_hpet.c
12 */ 12 */
13 13
14#include <linux/kernel.h> 14#include <linux/clockchips.h>
15#include <linux/sched.h>
16#include <linux/interrupt.h>
17#include <linux/init.h> 15#include <linux/init.h>
18#include <linux/mc146818rtc.h> 16#include <linux/interrupt.h>
19#include <linux/time.h>
20#include <linux/ioport.h>
21#include <linux/module.h> 17#include <linux/module.h>
22#include <linux/device.h> 18#include <linux/time.h>
23#include <linux/sysdev.h>
24#include <linux/bcd.h>
25#include <linux/notifier.h>
26#include <linux/cpu.h>
27#include <linux/kallsyms.h>
28#include <linux/acpi.h>
29#include <linux/clockchips.h>
30 19
31#ifdef CONFIG_ACPI
32#include <acpi/achware.h> /* for PM timer frequency */
33#include <acpi/acpi_bus.h>
34#endif
35#include <asm/i8253.h> 20#include <asm/i8253.h>
36#include <asm/pgtable.h>
37#include <asm/vsyscall.h>
38#include <asm/timex.h>
39#include <asm/proto.h>
40#include <asm/hpet.h>
41#include <asm/sections.h>
42#include <linux/hpet.h>
43#include <asm/apic.h>
44#include <asm/hpet.h> 21#include <asm/hpet.h>
45#include <asm/mpspec.h>
46#include <asm/nmi.h> 22#include <asm/nmi.h>
47#include <asm/vgtod.h> 23#include <asm/vgtod.h>
48 24#include <asm/time.h>
49DEFINE_SPINLOCK(rtc_lock); 25#include <asm/timer.h>
50EXPORT_SYMBOL(rtc_lock);
51 26
52volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; 27volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
53 28
@@ -56,10 +31,10 @@ unsigned long profile_pc(struct pt_regs *regs)
56 unsigned long pc = instruction_pointer(regs); 31 unsigned long pc = instruction_pointer(regs);
57 32
58 /* Assume the lock function has either no stack frame or a copy 33 /* Assume the lock function has either no stack frame or a copy
59 of eflags from PUSHF 34 of flags from PUSHF
60 Eflags always has bits 22 and up cleared unlike kernel addresses. */ 35 Eflags always has bits 22 and up cleared unlike kernel addresses. */
61 if (!user_mode(regs) && in_lock_functions(pc)) { 36 if (!user_mode(regs) && in_lock_functions(pc)) {
62 unsigned long *sp = (unsigned long *)regs->rsp; 37 unsigned long *sp = (unsigned long *)regs->sp;
63 if (sp[0] >> 22) 38 if (sp[0] >> 22)
64 return sp[0]; 39 return sp[0];
65 if (sp[1] >> 22) 40 if (sp[1] >> 22)
@@ -69,82 +44,6 @@ unsigned long profile_pc(struct pt_regs *regs)
69} 44}
70EXPORT_SYMBOL(profile_pc); 45EXPORT_SYMBOL(profile_pc);
71 46
72/*
73 * In order to set the CMOS clock precisely, set_rtc_mmss has to be called 500
74 * ms after the second nowtime has started, because when nowtime is written
75 * into the registers of the CMOS clock, it will jump to the next second
76 * precisely 500 ms later. Check the Motorola MC146818A or Dallas DS12887 data
77 * sheet for details.
78 */
79
80static int set_rtc_mmss(unsigned long nowtime)
81{
82 int retval = 0;
83 int real_seconds, real_minutes, cmos_minutes;
84 unsigned char control, freq_select;
85 unsigned long flags;
86
87/*
88 * set_rtc_mmss is called when irqs are enabled, so disable irqs here
89 */
90 spin_lock_irqsave(&rtc_lock, flags);
91/*
92 * Tell the clock it's being set and stop it.
93 */
94 control = CMOS_READ(RTC_CONTROL);
95 CMOS_WRITE(control | RTC_SET, RTC_CONTROL);
96
97 freq_select = CMOS_READ(RTC_FREQ_SELECT);
98 CMOS_WRITE(freq_select | RTC_DIV_RESET2, RTC_FREQ_SELECT);
99
100 cmos_minutes = CMOS_READ(RTC_MINUTES);
101 BCD_TO_BIN(cmos_minutes);
102
103/*
104 * since we're only adjusting minutes and seconds, don't interfere with hour
105 * overflow. This avoids messing with unknown time zones but requires your RTC
106 * not to be off by more than 15 minutes. Since we're calling it only when
107 * our clock is externally synchronized using NTP, this shouldn't be a problem.
108 */
109
110 real_seconds = nowtime % 60;
111 real_minutes = nowtime / 60;
112 if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1)
113 real_minutes += 30; /* correct for half hour time zone */
114 real_minutes %= 60;
115
116 if (abs(real_minutes - cmos_minutes) >= 30) {
117 printk(KERN_WARNING "time.c: can't update CMOS clock "
118 "from %d to %d\n", cmos_minutes, real_minutes);
119 retval = -1;
120 } else {
121 BIN_TO_BCD(real_seconds);
122 BIN_TO_BCD(real_minutes);
123 CMOS_WRITE(real_seconds, RTC_SECONDS);
124 CMOS_WRITE(real_minutes, RTC_MINUTES);
125 }
126
127/*
128 * The following flags have to be released exactly in this order, otherwise the
129 * DS12887 (popular MC146818A clone with integrated battery and quartz) will
130 * not reset the oscillator and will not update precisely 500 ms later. You
131 * won't find this mentioned in the Dallas Semiconductor data sheets, but who
132 * believes data sheets anyway ... -- Markus Kuhn
133 */
134
135 CMOS_WRITE(control, RTC_CONTROL);
136 CMOS_WRITE(freq_select, RTC_FREQ_SELECT);
137
138 spin_unlock_irqrestore(&rtc_lock, flags);
139
140 return retval;
141}
142
143int update_persistent_clock(struct timespec now)
144{
145 return set_rtc_mmss(now.tv_sec);
146}
147
148static irqreturn_t timer_event_interrupt(int irq, void *dev_id) 47static irqreturn_t timer_event_interrupt(int irq, void *dev_id)
149{ 48{
150 add_pda(irq0_irqs, 1); 49 add_pda(irq0_irqs, 1);
@@ -154,67 +53,10 @@ static irqreturn_t timer_event_interrupt(int irq, void *dev_id)
154 return IRQ_HANDLED; 53 return IRQ_HANDLED;
155} 54}
156 55
157unsigned long read_persistent_clock(void)
158{
159 unsigned int year, mon, day, hour, min, sec;
160 unsigned long flags;
161 unsigned century = 0;
162
163 spin_lock_irqsave(&rtc_lock, flags);
164 /*
165 * if UIP is clear, then we have >= 244 microseconds before RTC
166 * registers will be updated. Spec sheet says that this is the
167 * reliable way to read RTC - registers invalid (off bus) during update
168 */
169 while ((CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP))
170 cpu_relax();
171
172
173 /* now read all RTC registers while stable with interrupts disabled */
174 sec = CMOS_READ(RTC_SECONDS);
175 min = CMOS_READ(RTC_MINUTES);
176 hour = CMOS_READ(RTC_HOURS);
177 day = CMOS_READ(RTC_DAY_OF_MONTH);
178 mon = CMOS_READ(RTC_MONTH);
179 year = CMOS_READ(RTC_YEAR);
180#ifdef CONFIG_ACPI
181 if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
182 acpi_gbl_FADT.century)
183 century = CMOS_READ(acpi_gbl_FADT.century);
184#endif
185 spin_unlock_irqrestore(&rtc_lock, flags);
186
187 /*
188 * We know that x86-64 always uses BCD format, no need to check the
189 * config register.
190 */
191
192 BCD_TO_BIN(sec);
193 BCD_TO_BIN(min);
194 BCD_TO_BIN(hour);
195 BCD_TO_BIN(day);
196 BCD_TO_BIN(mon);
197 BCD_TO_BIN(year);
198
199 if (century) {
200 BCD_TO_BIN(century);
201 year += century * 100;
202 printk(KERN_INFO "Extended CMOS year: %d\n", century * 100);
203 } else {
204 /*
205 * x86-64 systems only exists since 2002.
206 * This will work up to Dec 31, 2100
207 */
208 year += 2000;
209 }
210
211 return mktime(year, mon, day, hour, min, sec);
212}
213
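
The RTC code removed above (it moves to shared code) deals with two details worth keeping in mind: CMOS registers hold BCD values, and set_rtc_mmss() rounds the minute difference to the nearest half hour so half-hour time zones do not confuse the comparison. A stand-alone sketch of both, with made-up register values:

#include <stdio.h>
#include <stdlib.h>

static unsigned bcd_to_bin(unsigned v) { return (v & 0x0f) + (v >> 4) * 10; }
static unsigned bin_to_bcd(unsigned v) { return ((v / 10) << 4) | (v % 10); }

int main(void)
{
	unsigned cmos_minutes = bcd_to_bin(0x47);  /* raw CMOS register: 47 min */
	int real_minutes = 15;                     /* minutes of the synced system time */

	/* same half-hour-timezone correction as the removed set_rtc_mmss() */
	if (((abs(real_minutes - (int)cmos_minutes) + 15) / 30) & 1)
		real_minutes += 30;
	real_minutes %= 60;

	if (abs(real_minutes - (int)cmos_minutes) < 30)
		printf("CMOS %u min -> writing %d min (BCD %#x)\n",
		       cmos_minutes, real_minutes, bin_to_bcd(real_minutes));
	else
		printf("RTC is off by more than 30 minutes, leaving it alone\n");
	return 0;
}
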
214/* calibrate_cpu is used on systems with fixed rate TSCs to determine 56/* calibrate_cpu is used on systems with fixed rate TSCs to determine
215 * processor frequency */ 57 * processor frequency */
216#define TICK_COUNT 100000000 58#define TICK_COUNT 100000000
217static unsigned int __init tsc_calibrate_cpu_khz(void) 59unsigned long __init native_calculate_cpu_khz(void)
218{ 60{
219 int tsc_start, tsc_now; 61 int tsc_start, tsc_now;
220 int i, no_ctr_free; 62 int i, no_ctr_free;
@@ -241,7 +83,7 @@ static unsigned int __init tsc_calibrate_cpu_khz(void)
241 rdtscl(tsc_start); 83 rdtscl(tsc_start);
242 do { 84 do {
243 rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); 85 rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
244 tsc_now = get_cycles_sync(); 86 tsc_now = get_cycles();
245 } while ((tsc_now - tsc_start) < TICK_COUNT); 87 } while ((tsc_now - tsc_start) < TICK_COUNT);
246 88
247 local_irq_restore(flags); 89 local_irq_restore(flags);
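
The calibration loop above samples two free-running counters over the same interval, so the unknown rate follows from the known one by a simple ratio. The exact scaling done by the routine is not visible in this hunk; the sketch below only illustrates the ratio arithmetic with made-up deltas:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t ref_delta   = 100000000;   /* ticks of the known-rate counter */
	uint64_t ref_khz     = 2400000;     /* its rate: 2.4 GHz in kHz */
	uint64_t other_delta = 91666667;    /* ticks of the counter being measured */

	uint64_t other_khz = other_delta * ref_khz / ref_delta;

	printf("measured rate: %llu kHz (%llu.%03llu MHz)\n",
	       (unsigned long long)other_khz,
	       (unsigned long long)(other_khz / 1000),
	       (unsigned long long)(other_khz % 1000));
	return 0;
}
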
@@ -264,20 +106,22 @@ static struct irqaction irq0 = {
264 .name = "timer" 106 .name = "timer"
265}; 107};
266 108
267void __init time_init(void) 109void __init hpet_time_init(void)
268{ 110{
269 if (!hpet_enable()) 111 if (!hpet_enable())
270 setup_pit_timer(); 112 setup_pit_timer();
271 113
272 setup_irq(0, &irq0); 114 setup_irq(0, &irq0);
115}
273 116
117void __init time_init(void)
118{
274 tsc_calibrate(); 119 tsc_calibrate();
275 120
276 cpu_khz = tsc_khz; 121 cpu_khz = tsc_khz;
277 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && 122 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
278 boot_cpu_data.x86_vendor == X86_VENDOR_AMD && 123 (boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
279 boot_cpu_data.x86 == 16) 124 cpu_khz = calculate_cpu_khz();
280 cpu_khz = tsc_calibrate_cpu_khz();
281 125
282 if (unsynchronized_tsc()) 126 if (unsynchronized_tsc())
283 mark_tsc_unstable("TSCs unsynchronized"); 127 mark_tsc_unstable("TSCs unsynchronized");
@@ -290,4 +134,5 @@ void __init time_init(void)
290 printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", 134 printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
291 cpu_khz / 1000, cpu_khz % 1000); 135 cpu_khz / 1000, cpu_khz % 1000);
292 init_tsc_clocksource(); 136 init_tsc_clocksource();
137 late_time_init = choose_time_init();
293} 138}
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
new file mode 100644
index 000000000000..6dfd4e76661a
--- /dev/null
+++ b/arch/x86/kernel/tls.c
@@ -0,0 +1,213 @@
1#include <linux/kernel.h>
2#include <linux/errno.h>
3#include <linux/sched.h>
4#include <linux/user.h>
5#include <linux/regset.h>
6
7#include <asm/uaccess.h>
8#include <asm/desc.h>
9#include <asm/system.h>
10#include <asm/ldt.h>
11#include <asm/processor.h>
12#include <asm/proto.h>
13
14#include "tls.h"
15
16/*
17 * sys_alloc_thread_area: get a yet unused TLS descriptor index.
18 */
19static int get_free_idx(void)
20{
21 struct thread_struct *t = &current->thread;
22 int idx;
23
24 for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
25 if (desc_empty(&t->tls_array[idx]))
26 return idx + GDT_ENTRY_TLS_MIN;
27 return -ESRCH;
28}
29
30static void set_tls_desc(struct task_struct *p, int idx,
31 const struct user_desc *info, int n)
32{
33 struct thread_struct *t = &p->thread;
34 struct desc_struct *desc = &t->tls_array[idx - GDT_ENTRY_TLS_MIN];
35 int cpu;
36
37 /*
38 * We must not get preempted while modifying the TLS.
39 */
40 cpu = get_cpu();
41
42 while (n-- > 0) {
43 if (LDT_empty(info))
44 desc->a = desc->b = 0;
45 else
46 fill_ldt(desc, info);
47 ++info;
48 ++desc;
49 }
50
51 if (t == &current->thread)
52 load_TLS(t, cpu);
53
54 put_cpu();
55}
56
57/*
58 * Set a given TLS descriptor:
59 */
60int do_set_thread_area(struct task_struct *p, int idx,
61 struct user_desc __user *u_info,
62 int can_allocate)
63{
64 struct user_desc info;
65
66 if (copy_from_user(&info, u_info, sizeof(info)))
67 return -EFAULT;
68
69 if (idx == -1)
70 idx = info.entry_number;
71
72 /*
73 * index -1 means the kernel should try to find and
74 * allocate an empty descriptor:
75 */
76 if (idx == -1 && can_allocate) {
77 idx = get_free_idx();
78 if (idx < 0)
79 return idx;
80 if (put_user(idx, &u_info->entry_number))
81 return -EFAULT;
82 }
83
84 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
85 return -EINVAL;
86
87 set_tls_desc(p, idx, &info, 1);
88
89 return 0;
90}
91
92asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
93{
94 return do_set_thread_area(current, -1, u_info, 1);
95}
96
97
98/*
99 * Get the current Thread-Local Storage area:
100 */
101
102static void fill_user_desc(struct user_desc *info, int idx,
103 const struct desc_struct *desc)
104
105{
106 memset(info, 0, sizeof(*info));
107 info->entry_number = idx;
108 info->base_addr = get_desc_base(desc);
109 info->limit = get_desc_limit(desc);
110 info->seg_32bit = desc->d;
111 info->contents = desc->type >> 2;
112 info->read_exec_only = !(desc->type & 2);
113 info->limit_in_pages = desc->g;
114 info->seg_not_present = !desc->p;
115 info->useable = desc->avl;
116#ifdef CONFIG_X86_64
117 info->lm = desc->l;
118#endif
119}
120
121int do_get_thread_area(struct task_struct *p, int idx,
122 struct user_desc __user *u_info)
123{
124 struct user_desc info;
125
126 if (idx == -1 && get_user(idx, &u_info->entry_number))
127 return -EFAULT;
128
129 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
130 return -EINVAL;
131
132 fill_user_desc(&info, idx,
133 &p->thread.tls_array[idx - GDT_ENTRY_TLS_MIN]);
134
135 if (copy_to_user(u_info, &info, sizeof(info)))
136 return -EFAULT;
137 return 0;
138}
139
140asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
141{
142 return do_get_thread_area(current, -1, u_info);
143}
144
145int regset_tls_active(struct task_struct *target,
146 const struct user_regset *regset)
147{
148 struct thread_struct *t = &target->thread;
149 int n = GDT_ENTRY_TLS_ENTRIES;
150 while (n > 0 && desc_empty(&t->tls_array[n - 1]))
151 --n;
152 return n;
153}
154
155int regset_tls_get(struct task_struct *target, const struct user_regset *regset,
156 unsigned int pos, unsigned int count,
157 void *kbuf, void __user *ubuf)
158{
159 const struct desc_struct *tls;
160
161 if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
162 (pos % sizeof(struct user_desc)) != 0 ||
163 (count % sizeof(struct user_desc)) != 0)
164 return -EINVAL;
165
166 pos /= sizeof(struct user_desc);
167 count /= sizeof(struct user_desc);
168
169 tls = &target->thread.tls_array[pos];
170
171 if (kbuf) {
172 struct user_desc *info = kbuf;
173 while (count-- > 0)
174 fill_user_desc(info++, GDT_ENTRY_TLS_MIN + pos++,
175 tls++);
176 } else {
177 struct user_desc __user *u_info = ubuf;
178 while (count-- > 0) {
179 struct user_desc info;
180 fill_user_desc(&info, GDT_ENTRY_TLS_MIN + pos++, tls++);
181 if (__copy_to_user(u_info++, &info, sizeof(info)))
182 return -EFAULT;
183 }
184 }
185
186 return 0;
187}
188
189int regset_tls_set(struct task_struct *target, const struct user_regset *regset,
190 unsigned int pos, unsigned int count,
191 const void *kbuf, const void __user *ubuf)
192{
193 struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES];
194 const struct user_desc *info;
195
196 if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
197 (pos % sizeof(struct user_desc)) != 0 ||
198 (count % sizeof(struct user_desc)) != 0)
199 return -EINVAL;
200
201 if (kbuf)
202 info = kbuf;
203 else if (__copy_from_user(infobuf, ubuf, count))
204 return -EFAULT;
205 else
206 info = infobuf;
207
208 set_tls_desc(target,
209 GDT_ENTRY_TLS_MIN + (pos / sizeof(struct user_desc)),
210 info, count / sizeof(struct user_desc));
211
212 return 0;
213}
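
regset_tls_get() and regset_tls_set() above receive pos and count in bytes; both must be whole multiples of sizeof(struct user_desc) and are converted into descriptor indices relative to GDT_ENTRY_TLS_MIN. A stand-alone sketch of that validation and conversion, with constants mirroring the x86 values (3 TLS slots starting at GDT entry 6, 16-byte user_desc):

#include <stdio.h>

#define GDT_ENTRY_TLS_MIN      6
#define GDT_ENTRY_TLS_ENTRIES  3
#define USER_DESC_SIZE         16      /* sizeof(struct user_desc) */

static int tls_range(unsigned int pos, unsigned int count,
		     int *first_idx, int *n)
{
	if (pos > GDT_ENTRY_TLS_ENTRIES * USER_DESC_SIZE ||
	    (pos % USER_DESC_SIZE) != 0 ||
	    (count % USER_DESC_SIZE) != 0)
		return -1;                       /* -EINVAL in the kernel */

	*first_idx = GDT_ENTRY_TLS_MIN + pos / USER_DESC_SIZE;
	*n = count / USER_DESC_SIZE;
	return 0;
}

int main(void)
{
	int idx, n;

	if (tls_range(16, 32, &idx, &n) == 0)
		printf("request covers GDT entries %d..%d\n", idx, idx + n - 1);
	if (tls_range(8, 16, &idx, &n) != 0)
		printf("misaligned request rejected\n");
	return 0;
}
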
diff --git a/arch/x86/kernel/tls.h b/arch/x86/kernel/tls.h
new file mode 100644
index 000000000000..2f083a2fe216
--- /dev/null
+++ b/arch/x86/kernel/tls.h
@@ -0,0 +1,21 @@
1/*
2 * Internal declarations for x86 TLS implementation functions.
3 *
4 * Copyright (C) 2007 Red Hat, Inc. All rights reserved.
5 *
6 * This copyrighted material is made available to anyone wishing to use,
7 * modify, copy, or redistribute it subject to the terms and conditions
8 * of the GNU General Public License v.2.
9 *
10 * Red Hat Author: Roland McGrath.
11 */
12
13#ifndef _ARCH_X86_KERNEL_TLS_H
14
15#include <linux/regset.h>
16
17extern user_regset_active_fn regset_tls_active;
18extern user_regset_get_fn regset_tls_get;
19extern user_regset_set_fn regset_tls_set;
20
21#endif /* _ARCH_X86_KERNEL_TLS_H */
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 7e16d675eb85..e6757aaa202b 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -31,9 +31,10 @@
31#include <linux/mmzone.h> 31#include <linux/mmzone.h>
32#include <asm/cpu.h> 32#include <asm/cpu.h>
33 33
34static struct i386_cpu cpu_devices[NR_CPUS]; 34static DEFINE_PER_CPU(struct x86_cpu, cpu_devices);
35 35
36int __cpuinit arch_register_cpu(int num) 36#ifdef CONFIG_HOTPLUG_CPU
37int arch_register_cpu(int num)
37{ 38{
38 /* 39 /*
39 * CPU0 cannot be offlined due to several 40 * CPU0 cannot be offlined due to several
@@ -44,21 +45,22 @@ int __cpuinit arch_register_cpu(int num)
44 * Also certain PCI quirks require not to enable hotplug control 45 * Also certain PCI quirks require not to enable hotplug control
45 * for all CPU's. 46 * for all CPU's.
46 */ 47 */
47#ifdef CONFIG_HOTPLUG_CPU
48 if (num) 48 if (num)
49 cpu_devices[num].cpu.hotpluggable = 1; 49 per_cpu(cpu_devices, num).cpu.hotpluggable = 1;
50#endif 50 return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
51
52 return register_cpu(&cpu_devices[num].cpu, num);
53} 51}
52EXPORT_SYMBOL(arch_register_cpu);
54 53
55#ifdef CONFIG_HOTPLUG_CPU
56void arch_unregister_cpu(int num) 54void arch_unregister_cpu(int num)
57{ 55{
58 return unregister_cpu(&cpu_devices[num].cpu); 56 return unregister_cpu(&per_cpu(cpu_devices, num).cpu);
59} 57}
60EXPORT_SYMBOL(arch_register_cpu);
61EXPORT_SYMBOL(arch_unregister_cpu); 58EXPORT_SYMBOL(arch_unregister_cpu);
59#else
60static int __init arch_register_cpu(int num)
61{
62 return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
63}
62#endif /*CONFIG_HOTPLUG_CPU*/ 64#endif /*CONFIG_HOTPLUG_CPU*/
63 65
64static int __init topology_init(void) 66static int __init topology_init(void)
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
index 9bcc1c6aca3d..64580679861e 100644
--- a/arch/x86/kernel/trampoline_32.S
+++ b/arch/x86/kernel/trampoline_32.S
@@ -11,12 +11,7 @@
11 * trampoline page to make our stack and everything else 11 * trampoline page to make our stack and everything else
12 * is a mystery. 12 * is a mystery.
13 * 13 *
14 * In fact we don't actually need a stack so we don't 14 * We jump into arch/x86/kernel/head_32.S.
15 * set one up.
16 *
17 * We jump into the boot/compressed/head.S code. So you'd
18 * better be running a compressed kernel image or you
19 * won't get very far.
20 * 15 *
21 * On entry to trampoline_data, the processor is in real mode 16 * On entry to trampoline_data, the processor is in real mode
22 * with 16-bit addressing and 16-bit data. CS has some value 17 * with 16-bit addressing and 16-bit data. CS has some value
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
index e30b67c6a9f5..4aedd0bcee4c 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/kernel/trampoline_64.S
@@ -10,9 +10,6 @@
10 * trampoline page to make our stack and everything else 10 * trampoline page to make our stack and everything else
11 * is a mystery. 11 * is a mystery.
12 * 12 *
13 * In fact we don't actually need a stack so we don't
14 * set one up.
15 *
16 * On entry to trampoline_data, the processor is in real mode 13 * On entry to trampoline_data, the processor is in real mode
17 * with 16-bit addressing and 16-bit data. CS has some value 14 * with 16-bit addressing and 16-bit data. CS has some value
18 * and IP is zero. Thus, data addresses need to be absolute 15 * and IP is zero. Thus, data addresses need to be absolute
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
index c88bbffcaa03..b22c01e05a18 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -76,7 +76,8 @@ char ignore_fpu_irq = 0;
76 * F0 0F bug workaround.. We have a special link segment 76 * F0 0F bug workaround.. We have a special link segment
77 * for this. 77 * for this.
78 */ 78 */
79struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, }; 79gate_desc idt_table[256]
80 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
80 81
81asmlinkage void divide_error(void); 82asmlinkage void divide_error(void);
82asmlinkage void debug(void); 83asmlinkage void debug(void);
@@ -101,6 +102,34 @@ asmlinkage void machine_check(void);
101int kstack_depth_to_print = 24; 102int kstack_depth_to_print = 24;
102static unsigned int code_bytes = 64; 103static unsigned int code_bytes = 64;
103 104
105void printk_address(unsigned long address, int reliable)
106{
107#ifdef CONFIG_KALLSYMS
108 unsigned long offset = 0, symsize;
109 const char *symname;
110 char *modname;
111 char *delim = ":";
112 char namebuf[128];
113 char reliab[4] = "";
114
115 symname = kallsyms_lookup(address, &symsize, &offset,
116 &modname, namebuf);
117 if (!symname) {
118 printk(" [<%08lx>]\n", address);
119 return;
120 }
121 if (!reliable)
122 strcpy(reliab, "? ");
123
124 if (!modname)
125 modname = delim = "";
126 printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
127 address, reliab, delim, modname, delim, symname, offset, symsize);
128#else
129 printk(" [<%08lx>]\n", address);
130#endif
131}
132
104static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size) 133static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size)
105{ 134{
106 return p > (void *)tinfo && 135 return p > (void *)tinfo &&
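The printk_address() added above resolves a text address to module/symbol/offset via kallsyms_lookup() and prefixes entries that are not frame-pointer verified with "? ". A hedged user-space analogue of the lookup-and-format step, using dladdr() in place of kallsyms (build with cc -rdynamic file.c -ldl):

#define _GNU_SOURCE
#include <stdio.h>
#include <dlfcn.h>

static void print_address(void *addr, int reliable)
{
        Dl_info info;

        if (!dladdr(addr, &info) || !info.dli_sname) {
                printf(" [<%p>]\n", addr);      /* no symbol found */
                return;
        }
        printf(" [<%p>] %s%s+0x%lx\n", addr,
               reliable ? "" : "? ",            /* mark guessed entries */
               info.dli_sname,
               (unsigned long)((char *)addr - (char *)info.dli_saddr));
}

int main(void)
{
        print_address((void *)main, 1);
        print_address((char *)main + 3, 0);
        return 0;
}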
@@ -114,48 +143,35 @@ struct stack_frame {
114}; 143};
115 144
116static inline unsigned long print_context_stack(struct thread_info *tinfo, 145static inline unsigned long print_context_stack(struct thread_info *tinfo,
117 unsigned long *stack, unsigned long ebp, 146 unsigned long *stack, unsigned long bp,
118 const struct stacktrace_ops *ops, void *data) 147 const struct stacktrace_ops *ops, void *data)
119{ 148{
120#ifdef CONFIG_FRAME_POINTER 149 struct stack_frame *frame = (struct stack_frame *)bp;
121 struct stack_frame *frame = (struct stack_frame *)ebp;
122 while (valid_stack_ptr(tinfo, frame, sizeof(*frame))) {
123 struct stack_frame *next;
124 unsigned long addr;
125 150
126 addr = frame->return_address;
127 ops->address(data, addr);
128 /*
129 * break out of recursive entries (such as
130 * end_of_stack_stop_unwind_function). Also,
131 * we can never allow a frame pointer to
132 * move downwards!
133 */
134 next = frame->next_frame;
135 if (next <= frame)
136 break;
137 frame = next;
138 }
139#else
140 while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) { 151 while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) {
141 unsigned long addr; 152 unsigned long addr;
142 153
143 addr = *stack++; 154 addr = *stack;
144 if (__kernel_text_address(addr)) 155 if (__kernel_text_address(addr)) {
145 ops->address(data, addr); 156 if ((unsigned long) stack == bp + 4) {
157 ops->address(data, addr, 1);
158 frame = frame->next_frame;
159 bp = (unsigned long) frame;
160 } else {
161 ops->address(data, addr, bp == 0);
162 }
163 }
164 stack++;
146 } 165 }
147#endif 166 return bp;
148 return ebp;
149} 167}
150 168
151#define MSG(msg) ops->warning(data, msg) 169#define MSG(msg) ops->warning(data, msg)
152 170
153void dump_trace(struct task_struct *task, struct pt_regs *regs, 171void dump_trace(struct task_struct *task, struct pt_regs *regs,
154 unsigned long *stack, 172 unsigned long *stack, unsigned long bp,
155 const struct stacktrace_ops *ops, void *data) 173 const struct stacktrace_ops *ops, void *data)
156{ 174{
157 unsigned long ebp = 0;
158
159 if (!task) 175 if (!task)
160 task = current; 176 task = current;
161 177
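The rewritten print_context_stack() above scans every word on the stack, but marks an entry reliable only when it sits exactly one word above the current frame pointer (bp + 4 here, bp + 8 on 64-bit) and then advances bp through frame->next_frame. A best-effort user-space sketch of a plain frame-pointer walk, assuming the compiler keeps frame pointers (-fno-omit-frame-pointer) and the conventional x86 frame layout; above main() the chain normally ends at a zeroed frame pointer:

#include <stdio.h>

struct stack_frame {
        struct stack_frame *next_frame;         /* saved caller bp */
        unsigned long return_address;
};

static void __attribute__((noinline)) dump_frames(void)
{
        struct stack_frame *frame = __builtin_frame_address(0);
        int depth = 0;

        while (frame && depth++ < 16) {
                printf(" [<%#lx>]\n", frame->return_address);
                /* Stop at the end of the chain, and never follow a frame
                 * pointer that moves downwards. */
                if (!frame->next_frame || frame->next_frame <= frame)
                        break;
                frame = frame->next_frame;
        }
}

static void __attribute__((noinline)) level2(void) { dump_frames(); }
static void __attribute__((noinline)) level1(void) { level2(); }

int main(void)
{
        level1();
        return 0;
}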
@@ -163,17 +179,17 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
163 unsigned long dummy; 179 unsigned long dummy;
164 stack = &dummy; 180 stack = &dummy;
165 if (task != current) 181 if (task != current)
166 stack = (unsigned long *)task->thread.esp; 182 stack = (unsigned long *)task->thread.sp;
167 } 183 }
168 184
169#ifdef CONFIG_FRAME_POINTER 185#ifdef CONFIG_FRAME_POINTER
170 if (!ebp) { 186 if (!bp) {
171 if (task == current) { 187 if (task == current) {
172 /* Grab ebp right from our regs */ 188 /* Grab bp right from our regs */
173 asm ("movl %%ebp, %0" : "=r" (ebp) : ); 189 asm ("movl %%ebp, %0" : "=r" (bp) : );
174 } else { 190 } else {
175 /* ebp is the last reg pushed by switch_to */ 191 /* bp is the last reg pushed by switch_to */
176 ebp = *(unsigned long *) task->thread.esp; 192 bp = *(unsigned long *) task->thread.sp;
177 } 193 }
178 } 194 }
179#endif 195#endif
@@ -182,7 +198,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
182 struct thread_info *context; 198 struct thread_info *context;
183 context = (struct thread_info *) 199 context = (struct thread_info *)
184 ((unsigned long)stack & (~(THREAD_SIZE - 1))); 200 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
185 ebp = print_context_stack(context, stack, ebp, ops, data); 201 bp = print_context_stack(context, stack, bp, ops, data);
186 /* Should be after the line below, but somewhere 202 /* Should be after the line below, but somewhere
187 in early boot context comes out corrupted and we 203 in early boot context comes out corrupted and we
188 can't reference it -AK */ 204 can't reference it -AK */
@@ -217,9 +233,11 @@ static int print_trace_stack(void *data, char *name)
217/* 233/*
218 * Print one address/symbol entries per line. 234 * Print one address/symbol entries per line.
219 */ 235 */
220static void print_trace_address(void *data, unsigned long addr) 236static void print_trace_address(void *data, unsigned long addr, int reliable)
221{ 237{
222 printk("%s [<%08lx>] ", (char *)data, addr); 238 printk("%s [<%08lx>] ", (char *)data, addr);
239 if (!reliable)
240 printk("? ");
223 print_symbol("%s\n", addr); 241 print_symbol("%s\n", addr);
224 touch_nmi_watchdog(); 242 touch_nmi_watchdog();
225} 243}
@@ -233,32 +251,32 @@ static const struct stacktrace_ops print_trace_ops = {
233 251
234static void 252static void
235show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 253show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
236 unsigned long * stack, char *log_lvl) 254 unsigned long *stack, unsigned long bp, char *log_lvl)
237{ 255{
238 dump_trace(task, regs, stack, &print_trace_ops, log_lvl); 256 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
239 printk("%s =======================\n", log_lvl); 257 printk("%s =======================\n", log_lvl);
240} 258}
241 259
242void show_trace(struct task_struct *task, struct pt_regs *regs, 260void show_trace(struct task_struct *task, struct pt_regs *regs,
243 unsigned long * stack) 261 unsigned long *stack, unsigned long bp)
244{ 262{
245 show_trace_log_lvl(task, regs, stack, ""); 263 show_trace_log_lvl(task, regs, stack, bp, "");
246} 264}
247 265
248static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 266static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
249 unsigned long *esp, char *log_lvl) 267 unsigned long *sp, unsigned long bp, char *log_lvl)
250{ 268{
251 unsigned long *stack; 269 unsigned long *stack;
252 int i; 270 int i;
253 271
254 if (esp == NULL) { 272 if (sp == NULL) {
255 if (task) 273 if (task)
256 esp = (unsigned long*)task->thread.esp; 274 sp = (unsigned long*)task->thread.sp;
257 else 275 else
258 esp = (unsigned long *)&esp; 276 sp = (unsigned long *)&sp;
259 } 277 }
260 278
261 stack = esp; 279 stack = sp;
262 for(i = 0; i < kstack_depth_to_print; i++) { 280 for(i = 0; i < kstack_depth_to_print; i++) {
263 if (kstack_end(stack)) 281 if (kstack_end(stack))
264 break; 282 break;
@@ -267,13 +285,13 @@ static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
267 printk("%08lx ", *stack++); 285 printk("%08lx ", *stack++);
268 } 286 }
269 printk("\n%sCall Trace:\n", log_lvl); 287 printk("\n%sCall Trace:\n", log_lvl);
270 show_trace_log_lvl(task, regs, esp, log_lvl); 288 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
271} 289}
272 290
273void show_stack(struct task_struct *task, unsigned long *esp) 291void show_stack(struct task_struct *task, unsigned long *sp)
274{ 292{
275 printk(" "); 293 printk(" ");
276 show_stack_log_lvl(task, NULL, esp, ""); 294 show_stack_log_lvl(task, NULL, sp, 0, "");
277} 295}
278 296
279/* 297/*
@@ -282,13 +300,19 @@ void show_stack(struct task_struct *task, unsigned long *esp)
282void dump_stack(void) 300void dump_stack(void)
283{ 301{
284 unsigned long stack; 302 unsigned long stack;
303 unsigned long bp = 0;
304
305#ifdef CONFIG_FRAME_POINTER
306 if (!bp)
307 asm("movl %%ebp, %0" : "=r" (bp):);
308#endif
285 309
286 printk("Pid: %d, comm: %.20s %s %s %.*s\n", 310 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
287 current->pid, current->comm, print_tainted(), 311 current->pid, current->comm, print_tainted(),
288 init_utsname()->release, 312 init_utsname()->release,
289 (int)strcspn(init_utsname()->version, " "), 313 (int)strcspn(init_utsname()->version, " "),
290 init_utsname()->version); 314 init_utsname()->version);
291 show_trace(current, NULL, &stack); 315 show_trace(current, NULL, &stack, bp);
292} 316}
293 317
294EXPORT_SYMBOL(dump_stack); 318EXPORT_SYMBOL(dump_stack);
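When no bp value is handed in, dump_stack() now snapshots the current frame pointer with a one-line extended-asm statement; the "=r" output constraint lets the compiler pick any register for the result. A short sketch of the same idiom, x86 only:

#include <stdio.h>

int main(void)
{
        unsigned long bp = 0;

#if defined(__x86_64__)
        asm("movq %%rbp, %0" : "=r" (bp));
#elif defined(__i386__)
        asm("movl %%ebp, %0" : "=r" (bp));
#endif
        printf("frame pointer: %#lx\n", bp);
        return 0;
}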
@@ -307,30 +331,30 @@ void show_registers(struct pt_regs *regs)
307 * time of the fault.. 331 * time of the fault..
308 */ 332 */
309 if (!user_mode_vm(regs)) { 333 if (!user_mode_vm(regs)) {
310 u8 *eip; 334 u8 *ip;
311 unsigned int code_prologue = code_bytes * 43 / 64; 335 unsigned int code_prologue = code_bytes * 43 / 64;
312 unsigned int code_len = code_bytes; 336 unsigned int code_len = code_bytes;
313 unsigned char c; 337 unsigned char c;
314 338
315 printk("\n" KERN_EMERG "Stack: "); 339 printk("\n" KERN_EMERG "Stack: ");
316 show_stack_log_lvl(NULL, regs, &regs->esp, KERN_EMERG); 340 show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
317 341
318 printk(KERN_EMERG "Code: "); 342 printk(KERN_EMERG "Code: ");
319 343
320 eip = (u8 *)regs->eip - code_prologue; 344 ip = (u8 *)regs->ip - code_prologue;
321 if (eip < (u8 *)PAGE_OFFSET || 345 if (ip < (u8 *)PAGE_OFFSET ||
322 probe_kernel_address(eip, c)) { 346 probe_kernel_address(ip, c)) {
323 /* try starting at EIP */ 347 /* try starting at EIP */
324 eip = (u8 *)regs->eip; 348 ip = (u8 *)regs->ip;
325 code_len = code_len - code_prologue + 1; 349 code_len = code_len - code_prologue + 1;
326 } 350 }
327 for (i = 0; i < code_len; i++, eip++) { 351 for (i = 0; i < code_len; i++, ip++) {
328 if (eip < (u8 *)PAGE_OFFSET || 352 if (ip < (u8 *)PAGE_OFFSET ||
329 probe_kernel_address(eip, c)) { 353 probe_kernel_address(ip, c)) {
330 printk(" Bad EIP value."); 354 printk(" Bad EIP value.");
331 break; 355 break;
332 } 356 }
333 if (eip == (u8 *)regs->eip) 357 if (ip == (u8 *)regs->ip)
334 printk("<%02x> ", c); 358 printk("<%02x> ", c);
335 else 359 else
336 printk("%02x ", c); 360 printk("%02x ", c);
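The show_registers() hunk above prints code_bytes worth of opcode bytes, starting code_bytes*43/64 bytes before the faulting instruction and bracketing the byte at ip itself. A hedged user-space analogue that hex-dumps its own text segment around a function address; the kernel must go through probe_kernel_address() because ip may be bogus, here reading our own mapped code is assumed safe:

#include <stdio.h>

static void dump_code(const unsigned char *ip, unsigned int code_bytes)
{
        unsigned int prologue = code_bytes * 43 / 64;   /* bytes before ip */
        const unsigned char *p = ip - prologue;
        unsigned int i;

        printf("Code: ");
        for (i = 0; i < code_bytes; i++, p++)
                printf(p == ip ? "<%02x> " : "%02x ", *p);
        printf("\n");
}

int main(void)
{
        /* Casting a function pointer to a data pointer is non-portable but
         * works on the usual x86 targets; the marked byte is arbitrary. */
        dump_code((const unsigned char *)main + 43, 64);
        return 0;
}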
@@ -339,18 +363,57 @@ void show_registers(struct pt_regs *regs)
339 printk("\n"); 363 printk("\n");
340} 364}
341 365
342int is_valid_bugaddr(unsigned long eip) 366int is_valid_bugaddr(unsigned long ip)
343{ 367{
344 unsigned short ud2; 368 unsigned short ud2;
345 369
346 if (eip < PAGE_OFFSET) 370 if (ip < PAGE_OFFSET)
347 return 0; 371 return 0;
348 if (probe_kernel_address((unsigned short *)eip, ud2)) 372 if (probe_kernel_address((unsigned short *)ip, ud2))
349 return 0; 373 return 0;
350 374
351 return ud2 == 0x0b0f; 375 return ud2 == 0x0b0f;
352} 376}
353 377
378static int die_counter;
379
380int __kprobes __die(const char * str, struct pt_regs * regs, long err)
381{
382 unsigned long sp;
383 unsigned short ss;
384
385 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
386#ifdef CONFIG_PREEMPT
387 printk("PREEMPT ");
388#endif
389#ifdef CONFIG_SMP
390 printk("SMP ");
391#endif
392#ifdef CONFIG_DEBUG_PAGEALLOC
393 printk("DEBUG_PAGEALLOC");
394#endif
395 printk("\n");
396
397 if (notify_die(DIE_OOPS, str, regs, err,
398 current->thread.trap_no, SIGSEGV) !=
399 NOTIFY_STOP) {
400 show_registers(regs);
401 /* Executive summary in case the oops scrolled away */
402 sp = (unsigned long) (&regs->sp);
403 savesegment(ss, ss);
404 if (user_mode(regs)) {
405 sp = regs->sp;
406 ss = regs->ss & 0xffff;
407 }
408 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
409 print_symbol("%s", regs->ip);
410 printk(" SS:ESP %04x:%08lx\n", ss, sp);
411 return 0;
412 } else {
413 return 1;
414 }
415}
416
354/* 417/*
355 * This is gone through when something in the kernel has done something bad and 418 * This is gone through when something in the kernel has done something bad and
356 * is about to be terminated. 419 * is about to be terminated.
@@ -366,7 +429,6 @@ void die(const char * str, struct pt_regs * regs, long err)
366 .lock_owner = -1, 429 .lock_owner = -1,
367 .lock_owner_depth = 0 430 .lock_owner_depth = 0
368 }; 431 };
369 static int die_counter;
370 unsigned long flags; 432 unsigned long flags;
371 433
372 oops_enter(); 434 oops_enter();
@@ -382,43 +444,13 @@ void die(const char * str, struct pt_regs * regs, long err)
382 raw_local_irq_save(flags); 444 raw_local_irq_save(flags);
383 445
384 if (++die.lock_owner_depth < 3) { 446 if (++die.lock_owner_depth < 3) {
385 unsigned long esp; 447 report_bug(regs->ip, regs);
386 unsigned short ss;
387
388 report_bug(regs->eip, regs);
389 448
390 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, 449 if (__die(str, regs, err))
391 ++die_counter);
392#ifdef CONFIG_PREEMPT
393 printk("PREEMPT ");
394#endif
395#ifdef CONFIG_SMP
396 printk("SMP ");
397#endif
398#ifdef CONFIG_DEBUG_PAGEALLOC
399 printk("DEBUG_PAGEALLOC");
400#endif
401 printk("\n");
402
403 if (notify_die(DIE_OOPS, str, regs, err,
404 current->thread.trap_no, SIGSEGV) !=
405 NOTIFY_STOP) {
406 show_registers(regs);
407 /* Executive summary in case the oops scrolled away */
408 esp = (unsigned long) (&regs->esp);
409 savesegment(ss, ss);
410 if (user_mode(regs)) {
411 esp = regs->esp;
412 ss = regs->xss & 0xffff;
413 }
414 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip);
415 print_symbol("%s", regs->eip);
416 printk(" SS:ESP %04x:%08lx\n", ss, esp);
417 }
418 else
419 regs = NULL; 450 regs = NULL;
420 } else 451 } else {
421 printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); 452 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
453 }
422 454
423 bust_spinlocks(0); 455 bust_spinlocks(0);
424 die.lock_owner = -1; 456 die.lock_owner = -1;
@@ -454,7 +486,7 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
454{ 486{
455 struct task_struct *tsk = current; 487 struct task_struct *tsk = current;
456 488
457 if (regs->eflags & VM_MASK) { 489 if (regs->flags & VM_MASK) {
458 if (vm86) 490 if (vm86)
459 goto vm86_trap; 491 goto vm86_trap;
460 goto trap_signal; 492 goto trap_signal;
@@ -500,7 +532,7 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
500} 532}
501 533
502#define DO_ERROR(trapnr, signr, str, name) \ 534#define DO_ERROR(trapnr, signr, str, name) \
503fastcall void do_##name(struct pt_regs * regs, long error_code) \ 535void do_##name(struct pt_regs * regs, long error_code) \
504{ \ 536{ \
505 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 537 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
506 == NOTIFY_STOP) \ 538 == NOTIFY_STOP) \
@@ -509,7 +541,7 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \
509} 541}
510 542
511#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \ 543#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
512fastcall void do_##name(struct pt_regs * regs, long error_code) \ 544void do_##name(struct pt_regs * regs, long error_code) \
513{ \ 545{ \
514 siginfo_t info; \ 546 siginfo_t info; \
515 if (irq) \ 547 if (irq) \
@@ -525,7 +557,7 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \
525} 557}
526 558
527#define DO_VM86_ERROR(trapnr, signr, str, name) \ 559#define DO_VM86_ERROR(trapnr, signr, str, name) \
528fastcall void do_##name(struct pt_regs * regs, long error_code) \ 560void do_##name(struct pt_regs * regs, long error_code) \
529{ \ 561{ \
530 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 562 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
531 == NOTIFY_STOP) \ 563 == NOTIFY_STOP) \
@@ -534,26 +566,27 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \
534} 566}
535 567
536#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ 568#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
537fastcall void do_##name(struct pt_regs * regs, long error_code) \ 569void do_##name(struct pt_regs * regs, long error_code) \
538{ \ 570{ \
539 siginfo_t info; \ 571 siginfo_t info; \
540 info.si_signo = signr; \ 572 info.si_signo = signr; \
541 info.si_errno = 0; \ 573 info.si_errno = 0; \
542 info.si_code = sicode; \ 574 info.si_code = sicode; \
543 info.si_addr = (void __user *)siaddr; \ 575 info.si_addr = (void __user *)siaddr; \
576 trace_hardirqs_fixup(); \
544 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 577 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
545 == NOTIFY_STOP) \ 578 == NOTIFY_STOP) \
546 return; \ 579 return; \
547 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ 580 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
548} 581}
549 582
550DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip) 583DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
551#ifndef CONFIG_KPROBES 584#ifndef CONFIG_KPROBES
552DO_VM86_ERROR( 3, SIGTRAP, "int3", int3) 585DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
553#endif 586#endif
554DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow) 587DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
555DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds) 588DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
556DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip, 0) 589DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
557DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) 590DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
558DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) 591DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
559DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) 592DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
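Dropping fastcall only changes the declared calling convention; the DO_ERROR*() family still stamps out one do_<name>() handler per trap vector through ## token pasting. A stripped-down illustration of that generator pattern (not the kernel macros themselves):

#include <stdio.h>

#define DO_ERROR(trapnr, str, name)                                     \
void do_##name(long error_code)                                         \
{                                                                       \
        printf("trap %d (%s), error %ld\n", trapnr, str, error_code);  \
}

DO_ERROR( 4, "overflow", overflow)
DO_ERROR(10, "invalid TSS", invalid_TSS)

int main(void)
{
        do_overflow(0);
        do_invalid_TSS(8);
        return 0;
}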
@@ -561,7 +594,7 @@ DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
561DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) 594DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
562DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1) 595DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)
563 596
564fastcall void __kprobes do_general_protection(struct pt_regs * regs, 597void __kprobes do_general_protection(struct pt_regs * regs,
565 long error_code) 598 long error_code)
566{ 599{
567 int cpu = get_cpu(); 600 int cpu = get_cpu();
@@ -595,7 +628,7 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs,
595 } 628 }
596 put_cpu(); 629 put_cpu();
597 630
598 if (regs->eflags & VM_MASK) 631 if (regs->flags & VM_MASK)
599 goto gp_in_vm86; 632 goto gp_in_vm86;
600 633
601 if (!user_mode(regs)) 634 if (!user_mode(regs))
@@ -604,11 +637,14 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs,
604 current->thread.error_code = error_code; 637 current->thread.error_code = error_code;
605 current->thread.trap_no = 13; 638 current->thread.trap_no = 13;
606 if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) && 639 if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
607 printk_ratelimit()) 640 printk_ratelimit()) {
608 printk(KERN_INFO 641 printk(KERN_INFO
609 "%s[%d] general protection eip:%lx esp:%lx error:%lx\n", 642 "%s[%d] general protection ip:%lx sp:%lx error:%lx",
610 current->comm, task_pid_nr(current), 643 current->comm, task_pid_nr(current),
611 regs->eip, regs->esp, error_code); 644 regs->ip, regs->sp, error_code);
645 print_vma_addr(" in ", regs->ip);
646 printk("\n");
647 }
612 648
613 force_sig(SIGSEGV, current); 649 force_sig(SIGSEGV, current);
614 return; 650 return;
@@ -704,8 +740,8 @@ void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
704 */ 740 */
705 bust_spinlocks(1); 741 bust_spinlocks(1);
706 printk(KERN_EMERG "%s", msg); 742 printk(KERN_EMERG "%s", msg);
707 printk(" on CPU%d, eip %08lx, registers:\n", 743 printk(" on CPU%d, ip %08lx, registers:\n",
708 smp_processor_id(), regs->eip); 744 smp_processor_id(), regs->ip);
709 show_registers(regs); 745 show_registers(regs);
710 console_silent(); 746 console_silent();
711 spin_unlock(&nmi_print_lock); 747 spin_unlock(&nmi_print_lock);
@@ -762,7 +798,7 @@ static __kprobes void default_do_nmi(struct pt_regs * regs)
762 798
763static int ignore_nmis; 799static int ignore_nmis;
764 800
765fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code) 801__kprobes void do_nmi(struct pt_regs * regs, long error_code)
766{ 802{
767 int cpu; 803 int cpu;
768 804
@@ -791,7 +827,7 @@ void restart_nmi(void)
791} 827}
792 828
793#ifdef CONFIG_KPROBES 829#ifdef CONFIG_KPROBES
794fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code) 830void __kprobes do_int3(struct pt_regs *regs, long error_code)
795{ 831{
796 trace_hardirqs_fixup(); 832 trace_hardirqs_fixup();
797 833
@@ -827,7 +863,7 @@ fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
827 * find every occurrence of the TF bit that could be saved away even 863 * find every occurrence of the TF bit that could be saved away even
828 * by user code) 864 * by user code)
829 */ 865 */
830fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code) 866void __kprobes do_debug(struct pt_regs * regs, long error_code)
831{ 867{
832 unsigned int condition; 868 unsigned int condition;
833 struct task_struct *tsk = current; 869 struct task_struct *tsk = current;
@@ -836,24 +872,30 @@ fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code)
836 872
837 get_debugreg(condition, 6); 873 get_debugreg(condition, 6);
838 874
875 /*
876 * The processor cleared BTF, so don't mark that we need it set.
877 */
878 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
879 tsk->thread.debugctlmsr = 0;
880
839 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, 881 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
840 SIGTRAP) == NOTIFY_STOP) 882 SIGTRAP) == NOTIFY_STOP)
841 return; 883 return;
842 /* It's safe to allow irq's after DR6 has been saved */ 884 /* It's safe to allow irq's after DR6 has been saved */
843 if (regs->eflags & X86_EFLAGS_IF) 885 if (regs->flags & X86_EFLAGS_IF)
844 local_irq_enable(); 886 local_irq_enable();
845 887
846 /* Mask out spurious debug traps due to lazy DR7 setting */ 888 /* Mask out spurious debug traps due to lazy DR7 setting */
847 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { 889 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
848 if (!tsk->thread.debugreg[7]) 890 if (!tsk->thread.debugreg7)
849 goto clear_dr7; 891 goto clear_dr7;
850 } 892 }
851 893
852 if (regs->eflags & VM_MASK) 894 if (regs->flags & VM_MASK)
853 goto debug_vm86; 895 goto debug_vm86;
854 896
855 /* Save debug status register where ptrace can see it */ 897 /* Save debug status register where ptrace can see it */
856 tsk->thread.debugreg[6] = condition; 898 tsk->thread.debugreg6 = condition;
857 899
858 /* 900 /*
859 * Single-stepping through TF: make sure we ignore any events in 901 * Single-stepping through TF: make sure we ignore any events in
@@ -885,7 +927,7 @@ debug_vm86:
885 927
886clear_TF_reenable: 928clear_TF_reenable:
887 set_tsk_thread_flag(tsk, TIF_SINGLESTEP); 929 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
888 regs->eflags &= ~TF_MASK; 930 regs->flags &= ~TF_MASK;
889 return; 931 return;
890} 932}
891 933
@@ -894,7 +936,7 @@ clear_TF_reenable:
894 * the correct behaviour even in the presence of the asynchronous 936 * the correct behaviour even in the presence of the asynchronous
895 * IRQ13 behaviour 937 * IRQ13 behaviour
896 */ 938 */
897void math_error(void __user *eip) 939void math_error(void __user *ip)
898{ 940{
899 struct task_struct * task; 941 struct task_struct * task;
900 siginfo_t info; 942 siginfo_t info;
@@ -910,7 +952,7 @@ void math_error(void __user *eip)
910 info.si_signo = SIGFPE; 952 info.si_signo = SIGFPE;
911 info.si_errno = 0; 953 info.si_errno = 0;
912 info.si_code = __SI_FAULT; 954 info.si_code = __SI_FAULT;
913 info.si_addr = eip; 955 info.si_addr = ip;
914 /* 956 /*
915 * (~cwd & swd) will mask out exceptions that are not set to unmasked 957 * (~cwd & swd) will mask out exceptions that are not set to unmasked
916 * status. 0x3f is the exception bits in these regs, 0x200 is the 958 * status. 0x3f is the exception bits in these regs, 0x200 is the
@@ -953,13 +995,13 @@ void math_error(void __user *eip)
953 force_sig_info(SIGFPE, &info, task); 995 force_sig_info(SIGFPE, &info, task);
954} 996}
955 997
956fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code) 998void do_coprocessor_error(struct pt_regs * regs, long error_code)
957{ 999{
958 ignore_fpu_irq = 1; 1000 ignore_fpu_irq = 1;
959 math_error((void __user *)regs->eip); 1001 math_error((void __user *)regs->ip);
960} 1002}
961 1003
962static void simd_math_error(void __user *eip) 1004static void simd_math_error(void __user *ip)
963{ 1005{
964 struct task_struct * task; 1006 struct task_struct * task;
965 siginfo_t info; 1007 siginfo_t info;
@@ -975,7 +1017,7 @@ static void simd_math_error(void __user *eip)
975 info.si_signo = SIGFPE; 1017 info.si_signo = SIGFPE;
976 info.si_errno = 0; 1018 info.si_errno = 0;
977 info.si_code = __SI_FAULT; 1019 info.si_code = __SI_FAULT;
978 info.si_addr = eip; 1020 info.si_addr = ip;
979 /* 1021 /*
980 * The SIMD FPU exceptions are handled a little differently, as there 1022 * The SIMD FPU exceptions are handled a little differently, as there
981 * is only a single status/control register. Thus, to determine which 1023 * is only a single status/control register. Thus, to determine which
@@ -1007,19 +1049,19 @@ static void simd_math_error(void __user *eip)
1007 force_sig_info(SIGFPE, &info, task); 1049 force_sig_info(SIGFPE, &info, task);
1008} 1050}
1009 1051
1010fastcall void do_simd_coprocessor_error(struct pt_regs * regs, 1052void do_simd_coprocessor_error(struct pt_regs * regs,
1011 long error_code) 1053 long error_code)
1012{ 1054{
1013 if (cpu_has_xmm) { 1055 if (cpu_has_xmm) {
1014 /* Handle SIMD FPU exceptions on PIII+ processors. */ 1056 /* Handle SIMD FPU exceptions on PIII+ processors. */
1015 ignore_fpu_irq = 1; 1057 ignore_fpu_irq = 1;
1016 simd_math_error((void __user *)regs->eip); 1058 simd_math_error((void __user *)regs->ip);
1017 } else { 1059 } else {
1018 /* 1060 /*
1019 * Handle strange cache flush from user space exception 1061 * Handle strange cache flush from user space exception
1020 * in all other cases. This is undocumented behaviour. 1062 * in all other cases. This is undocumented behaviour.
1021 */ 1063 */
1022 if (regs->eflags & VM_MASK) { 1064 if (regs->flags & VM_MASK) {
1023 handle_vm86_fault((struct kernel_vm86_regs *)regs, 1065 handle_vm86_fault((struct kernel_vm86_regs *)regs,
1024 error_code); 1066 error_code);
1025 return; 1067 return;
@@ -1031,7 +1073,7 @@ fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
1031 } 1073 }
1032} 1074}
1033 1075
1034fastcall void do_spurious_interrupt_bug(struct pt_regs * regs, 1076void do_spurious_interrupt_bug(struct pt_regs * regs,
1035 long error_code) 1077 long error_code)
1036{ 1078{
1037#if 0 1079#if 0
@@ -1040,7 +1082,7 @@ fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
1040#endif 1082#endif
1041} 1083}
1042 1084
1043fastcall unsigned long patch_espfix_desc(unsigned long uesp, 1085unsigned long patch_espfix_desc(unsigned long uesp,
1044 unsigned long kesp) 1086 unsigned long kesp)
1045{ 1087{
1046 struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt; 1088 struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
@@ -1094,51 +1136,17 @@ asmlinkage void math_emulate(long arg)
1094 1136
1095#endif /* CONFIG_MATH_EMULATION */ 1137#endif /* CONFIG_MATH_EMULATION */
1096 1138
1097/*
1098 * This needs to use 'idt_table' rather than 'idt', and
1099 * thus use the _nonmapped_ version of the IDT, as the
1100 * Pentium F0 0F bugfix can have resulted in the mapped
1101 * IDT being write-protected.
1102 */
1103void set_intr_gate(unsigned int n, void *addr)
1104{
1105 _set_gate(n, DESCTYPE_INT, addr, __KERNEL_CS);
1106}
1107
1108/*
1109 * This routine sets up an interrupt gate at directory privilege level 3.
1110 */
1111static inline void set_system_intr_gate(unsigned int n, void *addr)
1112{
1113 _set_gate(n, DESCTYPE_INT | DESCTYPE_DPL3, addr, __KERNEL_CS);
1114}
1115
1116static void __init set_trap_gate(unsigned int n, void *addr)
1117{
1118 _set_gate(n, DESCTYPE_TRAP, addr, __KERNEL_CS);
1119}
1120
1121static void __init set_system_gate(unsigned int n, void *addr)
1122{
1123 _set_gate(n, DESCTYPE_TRAP | DESCTYPE_DPL3, addr, __KERNEL_CS);
1124}
1125
1126static void __init set_task_gate(unsigned int n, unsigned int gdt_entry)
1127{
1128 _set_gate(n, DESCTYPE_TASK, (void *)0, (gdt_entry<<3));
1129}
1130
1131 1139
1132void __init trap_init(void) 1140void __init trap_init(void)
1133{ 1141{
1134 int i; 1142 int i;
1135 1143
1136#ifdef CONFIG_EISA 1144#ifdef CONFIG_EISA
1137 void __iomem *p = ioremap(0x0FFFD9, 4); 1145 void __iomem *p = early_ioremap(0x0FFFD9, 4);
1138 if (readl(p) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) { 1146 if (readl(p) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) {
1139 EISA_bus = 1; 1147 EISA_bus = 1;
1140 } 1148 }
1141 iounmap(p); 1149 early_iounmap(p, 4);
1142#endif 1150#endif
1143 1151
1144#ifdef CONFIG_X86_LOCAL_APIC 1152#ifdef CONFIG_X86_LOCAL_APIC
@@ -1168,17 +1176,12 @@ void __init trap_init(void)
1168#endif 1176#endif
1169 set_trap_gate(19,&simd_coprocessor_error); 1177 set_trap_gate(19,&simd_coprocessor_error);
1170 1178
1179 /*
1180 * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
1181 * Generate a build-time error if the alignment is wrong.
1182 */
1183 BUILD_BUG_ON(offsetof(struct task_struct, thread.i387.fxsave) & 15);
1171 if (cpu_has_fxsr) { 1184 if (cpu_has_fxsr) {
1172 /*
1173 * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
1174 * Generates a compile-time "error: zero width for bit-field" if
1175 * the alignment is wrong.
1176 */
1177 struct fxsrAlignAssert {
1178 int _:!(offsetof(struct task_struct,
1179 thread.i387.fxsave) & 15);
1180 };
1181
1182 printk(KERN_INFO "Enabling fast FPU save and restore... "); 1185 printk(KERN_INFO "Enabling fast FPU save and restore... ");
1183 set_in_cr4(X86_CR4_OSFXSR); 1186 set_in_cr4(X86_CR4_OSFXSR);
1184 printk("done.\n"); 1187 printk("done.\n");
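The last traps_32.c hunk replaces the zero-width-bitfield trick with BUILD_BUG_ON(), which still turns a misaligned thread.i387.fxsave into a build failure. A standalone sketch showing both forms of the compile-time assertion; BUILD_BUG_ON here is a local stand-in, the kernel's definition differs in detail:

#include <stddef.h>
#include <stdio.h>

#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2 * !!(condition)]))

struct i387_demo {
        char pad[16];
        char fxsave[512] __attribute__((aligned(16)));
};

struct thread_demo {
        char other[32];
        struct i387_demo i387;
};

int main(void)
{
        /* Old style: a zero bit-field width is a compile error, so this
         * only builds when the offset is a multiple of 16. */
        struct fxsr_align_assert {
                int _:!(offsetof(struct thread_demo, i387.fxsave) & 15);
        };

        /* New style: a negative array size is a compile error for the
         * same reason. */
        BUILD_BUG_ON(offsetof(struct thread_demo, i387.fxsave) & 15);

        printf("fxsave offset: %zu\n",
               offsetof(struct thread_demo, i387.fxsave));
        return 0;
}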
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index d11525ad81b4..efc66df728b6 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -74,22 +74,24 @@ asmlinkage void alignment_check(void);
74asmlinkage void machine_check(void); 74asmlinkage void machine_check(void);
75asmlinkage void spurious_interrupt_bug(void); 75asmlinkage void spurious_interrupt_bug(void);
76 76
77static unsigned int code_bytes = 64;
78
77static inline void conditional_sti(struct pt_regs *regs) 79static inline void conditional_sti(struct pt_regs *regs)
78{ 80{
79 if (regs->eflags & X86_EFLAGS_IF) 81 if (regs->flags & X86_EFLAGS_IF)
80 local_irq_enable(); 82 local_irq_enable();
81} 83}
82 84
83static inline void preempt_conditional_sti(struct pt_regs *regs) 85static inline void preempt_conditional_sti(struct pt_regs *regs)
84{ 86{
85 preempt_disable(); 87 preempt_disable();
86 if (regs->eflags & X86_EFLAGS_IF) 88 if (regs->flags & X86_EFLAGS_IF)
87 local_irq_enable(); 89 local_irq_enable();
88} 90}
89 91
90static inline void preempt_conditional_cli(struct pt_regs *regs) 92static inline void preempt_conditional_cli(struct pt_regs *regs)
91{ 93{
92 if (regs->eflags & X86_EFLAGS_IF) 94 if (regs->flags & X86_EFLAGS_IF)
93 local_irq_disable(); 95 local_irq_disable();
94 /* Make sure to not schedule here because we could be running 96 /* Make sure to not schedule here because we could be running
95 on an exception stack. */ 97 on an exception stack. */
@@ -98,14 +100,15 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
98 100
99int kstack_depth_to_print = 12; 101int kstack_depth_to_print = 12;
100 102
101#ifdef CONFIG_KALLSYMS 103void printk_address(unsigned long address, int reliable)
102void printk_address(unsigned long address)
103{ 104{
105#ifdef CONFIG_KALLSYMS
104 unsigned long offset = 0, symsize; 106 unsigned long offset = 0, symsize;
105 const char *symname; 107 const char *symname;
106 char *modname; 108 char *modname;
107 char *delim = ":"; 109 char *delim = ":";
108 char namebuf[128]; 110 char namebuf[KSYM_NAME_LEN];
111 char reliab[4] = "";
109 112
110 symname = kallsyms_lookup(address, &symsize, &offset, 113 symname = kallsyms_lookup(address, &symsize, &offset,
111 &modname, namebuf); 114 &modname, namebuf);
@@ -113,17 +116,17 @@ void printk_address(unsigned long address)
113 printk(" [<%016lx>]\n", address); 116 printk(" [<%016lx>]\n", address);
114 return; 117 return;
115 } 118 }
119 if (!reliable)
120 strcpy(reliab, "? ");
121
116 if (!modname) 122 if (!modname)
117 modname = delim = ""; 123 modname = delim = "";
118 printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n", 124 printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
119 address, delim, modname, delim, symname, offset, symsize); 125 address, reliab, delim, modname, delim, symname, offset, symsize);
120}
121#else 126#else
122void printk_address(unsigned long address)
123{
124 printk(" [<%016lx>]\n", address); 127 printk(" [<%016lx>]\n", address);
125}
126#endif 128#endif
129}
127 130
128static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 131static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
129 unsigned *usedp, char **idp) 132 unsigned *usedp, char **idp)
@@ -208,14 +211,53 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
208 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack 211 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
209 */ 212 */
210 213
211static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) 214static inline int valid_stack_ptr(struct thread_info *tinfo,
215 void *p, unsigned int size, void *end)
216{
217 void *t = tinfo;
218 if (end) {
219 if (p < end && p >= (end-THREAD_SIZE))
220 return 1;
221 else
222 return 0;
223 }
224 return p > t && p < t + THREAD_SIZE - size;
225}
226
227/* The form of the top of the frame on the stack */
228struct stack_frame {
229 struct stack_frame *next_frame;
230 unsigned long return_address;
231};
232
233
234static inline unsigned long print_context_stack(struct thread_info *tinfo,
235 unsigned long *stack, unsigned long bp,
236 const struct stacktrace_ops *ops, void *data,
237 unsigned long *end)
212{ 238{
213 void *t = (void *)tinfo; 239 struct stack_frame *frame = (struct stack_frame *)bp;
214 return p > t && p < t + THREAD_SIZE - 3; 240
241 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
242 unsigned long addr;
243
244 addr = *stack;
245 if (__kernel_text_address(addr)) {
246 if ((unsigned long) stack == bp + 8) {
247 ops->address(data, addr, 1);
248 frame = frame->next_frame;
249 bp = (unsigned long) frame;
250 } else {
251 ops->address(data, addr, bp == 0);
252 }
253 }
254 stack++;
255 }
256 return bp;
215} 257}
216 258
217void dump_trace(struct task_struct *tsk, struct pt_regs *regs, 259void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
218 unsigned long *stack, 260 unsigned long *stack, unsigned long bp,
219 const struct stacktrace_ops *ops, void *data) 261 const struct stacktrace_ops *ops, void *data)
220{ 262{
221 const unsigned cpu = get_cpu(); 263 const unsigned cpu = get_cpu();
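The 64-bit valid_stack_ptr() above now takes an optional end pointer: with it, a candidate pointer must fall in the THREAD_SIZE bytes ending at end (an IRQ or exception stack), otherwise the check falls back to the thread_info-relative window. A small sketch of the same two-mode range test, with an illustrative THREAD_SIZE:

#include <stdio.h>

#define THREAD_SIZE 8192UL      /* illustrative value, not the kernel's */

static int valid_stack_ptr(char *tinfo, char *p, unsigned int size, char *end)
{
        if (end)
                return p < end && p >= end - THREAD_SIZE;
        return p > tinfo && p < tinfo + THREAD_SIZE - size;
}

int main(void)
{
        static char stack[THREAD_SIZE];

        /* In range, thread_info-relative check: */
        printf("%d\n", valid_stack_ptr(stack, stack + 100, 8, NULL));
        /* In range, explicit end pointer: */
        printf("%d\n", valid_stack_ptr(stack, stack + 100, 8,
                                       stack + THREAD_SIZE));
        /* Too close to the top to hold a full word: */
        printf("%d\n", valid_stack_ptr(stack, stack + THREAD_SIZE - 4,
                                       8, NULL));
        return 0;
}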
@@ -225,36 +267,28 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
225 267
226 if (!tsk) 268 if (!tsk)
227 tsk = current; 269 tsk = current;
270 tinfo = task_thread_info(tsk);
228 271
229 if (!stack) { 272 if (!stack) {
230 unsigned long dummy; 273 unsigned long dummy;
231 stack = &dummy; 274 stack = &dummy;
232 if (tsk && tsk != current) 275 if (tsk && tsk != current)
233 stack = (unsigned long *)tsk->thread.rsp; 276 stack = (unsigned long *)tsk->thread.sp;
234 } 277 }
235 278
236 /* 279#ifdef CONFIG_FRAME_POINTER
237 * Print function call entries within a stack. 'cond' is the 280 if (!bp) {
238 * "end of stackframe" condition, that the 'stack++' 281 if (tsk == current) {
239 * iteration will eventually trigger. 282 /* Grab bp right from our regs */
240 */ 283 asm("movq %%rbp, %0" : "=r" (bp):);
241#define HANDLE_STACK(cond) \ 284 } else {
242 do while (cond) { \ 285 /* bp is the last reg pushed by switch_to */
243 unsigned long addr = *stack++; \ 286 bp = *(unsigned long *) tsk->thread.sp;
244 /* Use unlocked access here because except for NMIs \ 287 }
245 we should be already protected against module unloads */ \ 288 }
246 if (__kernel_text_address(addr)) { \ 289#endif
247 /* \ 290
248 * If the address is either in the text segment of the \ 291
249 * kernel, or in the region which contains vmalloc'ed \
250 * memory, it *may* be the address of a calling \
251 * routine; if so, print it so that someone tracing \
252 * down the cause of the crash will be able to figure \
253 * out the call path that was taken. \
254 */ \
255 ops->address(data, addr); \
256 } \
257 } while (0)
258 292
259 /* 293 /*
260 * Print function call entries in all stacks, starting at the 294 * Print function call entries in all stacks, starting at the
@@ -270,7 +304,9 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
270 if (estack_end) { 304 if (estack_end) {
271 if (ops->stack(data, id) < 0) 305 if (ops->stack(data, id) < 0)
272 break; 306 break;
273 HANDLE_STACK (stack < estack_end); 307
308 bp = print_context_stack(tinfo, stack, bp, ops,
309 data, estack_end);
274 ops->stack(data, "<EOE>"); 310 ops->stack(data, "<EOE>");
275 /* 311 /*
276 * We link to the next stack via the 312 * We link to the next stack via the
@@ -288,7 +324,8 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
288 if (stack >= irqstack && stack < irqstack_end) { 324 if (stack >= irqstack && stack < irqstack_end) {
289 if (ops->stack(data, "IRQ") < 0) 325 if (ops->stack(data, "IRQ") < 0)
290 break; 326 break;
291 HANDLE_STACK (stack < irqstack_end); 327 bp = print_context_stack(tinfo, stack, bp,
328 ops, data, irqstack_end);
292 /* 329 /*
293 * We link to the next stack (which would be 330 * We link to the next stack (which would be
294 * the process stack normally) the last 331 * the process stack normally) the last
@@ -306,9 +343,7 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
306 /* 343 /*
307 * This handles the process stack: 344 * This handles the process stack:
308 */ 345 */
309 tinfo = task_thread_info(tsk); 346 bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
310 HANDLE_STACK (valid_stack_ptr(tinfo, stack));
311#undef HANDLE_STACK
312 put_cpu(); 347 put_cpu();
313} 348}
314EXPORT_SYMBOL(dump_trace); 349EXPORT_SYMBOL(dump_trace);
@@ -331,10 +366,10 @@ static int print_trace_stack(void *data, char *name)
331 return 0; 366 return 0;
332} 367}
333 368
334static void print_trace_address(void *data, unsigned long addr) 369static void print_trace_address(void *data, unsigned long addr, int reliable)
335{ 370{
336 touch_nmi_watchdog(); 371 touch_nmi_watchdog();
337 printk_address(addr); 372 printk_address(addr, reliable);
338} 373}
339 374
340static const struct stacktrace_ops print_trace_ops = { 375static const struct stacktrace_ops print_trace_ops = {
@@ -345,15 +380,17 @@ static const struct stacktrace_ops print_trace_ops = {
345}; 380};
346 381
347void 382void
348show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack) 383show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack,
384 unsigned long bp)
349{ 385{
350 printk("\nCall Trace:\n"); 386 printk("\nCall Trace:\n");
351 dump_trace(tsk, regs, stack, &print_trace_ops, NULL); 387 dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL);
352 printk("\n"); 388 printk("\n");
353} 389}
354 390
355static void 391static void
356_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp) 392_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
393 unsigned long bp)
357{ 394{
358 unsigned long *stack; 395 unsigned long *stack;
359 int i; 396 int i;
@@ -364,14 +401,14 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
364 // debugging aid: "show_stack(NULL, NULL);" prints the 401 // debugging aid: "show_stack(NULL, NULL);" prints the
365 // back trace for this cpu. 402 // back trace for this cpu.
366 403
367 if (rsp == NULL) { 404 if (sp == NULL) {
368 if (tsk) 405 if (tsk)
369 rsp = (unsigned long *)tsk->thread.rsp; 406 sp = (unsigned long *)tsk->thread.sp;
370 else 407 else
371 rsp = (unsigned long *)&rsp; 408 sp = (unsigned long *)&sp;
372 } 409 }
373 410
374 stack = rsp; 411 stack = sp;
375 for(i=0; i < kstack_depth_to_print; i++) { 412 for(i=0; i < kstack_depth_to_print; i++) {
376 if (stack >= irqstack && stack <= irqstack_end) { 413 if (stack >= irqstack && stack <= irqstack_end) {
377 if (stack == irqstack_end) { 414 if (stack == irqstack_end) {
@@ -387,12 +424,12 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
387 printk(" %016lx", *stack++); 424 printk(" %016lx", *stack++);
388 touch_nmi_watchdog(); 425 touch_nmi_watchdog();
389 } 426 }
390 show_trace(tsk, regs, rsp); 427 show_trace(tsk, regs, sp, bp);
391} 428}
392 429
393void show_stack(struct task_struct *tsk, unsigned long * rsp) 430void show_stack(struct task_struct *tsk, unsigned long * sp)
394{ 431{
395 _show_stack(tsk, NULL, rsp); 432 _show_stack(tsk, NULL, sp, 0);
396} 433}
397 434
398/* 435/*
@@ -401,13 +438,19 @@ void show_stack(struct task_struct *tsk, unsigned long * rsp)
401void dump_stack(void) 438void dump_stack(void)
402{ 439{
403 unsigned long dummy; 440 unsigned long dummy;
441 unsigned long bp = 0;
442
443#ifdef CONFIG_FRAME_POINTER
444 if (!bp)
445 asm("movq %%rbp, %0" : "=r" (bp):);
446#endif
404 447
405 printk("Pid: %d, comm: %.20s %s %s %.*s\n", 448 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
406 current->pid, current->comm, print_tainted(), 449 current->pid, current->comm, print_tainted(),
407 init_utsname()->release, 450 init_utsname()->release,
408 (int)strcspn(init_utsname()->version, " "), 451 (int)strcspn(init_utsname()->version, " "),
409 init_utsname()->version); 452 init_utsname()->version);
410 show_trace(NULL, NULL, &dummy); 453 show_trace(NULL, NULL, &dummy, bp);
411} 454}
412 455
413EXPORT_SYMBOL(dump_stack); 456EXPORT_SYMBOL(dump_stack);
@@ -415,12 +458,15 @@ EXPORT_SYMBOL(dump_stack);
415void show_registers(struct pt_regs *regs) 458void show_registers(struct pt_regs *regs)
416{ 459{
417 int i; 460 int i;
418 int in_kernel = !user_mode(regs); 461 unsigned long sp;
419 unsigned long rsp;
420 const int cpu = smp_processor_id(); 462 const int cpu = smp_processor_id();
421 struct task_struct *cur = cpu_pda(cpu)->pcurrent; 463 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
464 u8 *ip;
465 unsigned int code_prologue = code_bytes * 43 / 64;
466 unsigned int code_len = code_bytes;
422 467
423 rsp = regs->rsp; 468 sp = regs->sp;
469 ip = (u8 *) regs->ip - code_prologue;
424 printk("CPU %d ", cpu); 470 printk("CPU %d ", cpu);
425 __show_regs(regs); 471 __show_regs(regs);
426 printk("Process %s (pid: %d, threadinfo %p, task %p)\n", 472 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
@@ -430,45 +476,43 @@ void show_registers(struct pt_regs *regs)
430 * When in-kernel, we also print out the stack and code at the 476 * When in-kernel, we also print out the stack and code at the
431 * time of the fault.. 477 * time of the fault..
432 */ 478 */
433 if (in_kernel) { 479 if (!user_mode(regs)) {
480 unsigned char c;
434 printk("Stack: "); 481 printk("Stack: ");
435 _show_stack(NULL, regs, (unsigned long*)rsp); 482 _show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
436 483 printk("\n");
437 printk("\nCode: "); 484
438 if (regs->rip < PAGE_OFFSET) 485 printk(KERN_EMERG "Code: ");
439 goto bad; 486 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
440 487 /* try starting at RIP */
441 for (i=0; i<20; i++) { 488 ip = (u8 *) regs->ip;
442 unsigned char c; 489 code_len = code_len - code_prologue + 1;
443 if (__get_user(c, &((unsigned char*)regs->rip)[i])) { 490 }
444bad: 491 for (i = 0; i < code_len; i++, ip++) {
492 if (ip < (u8 *)PAGE_OFFSET ||
493 probe_kernel_address(ip, c)) {
445 printk(" Bad RIP value."); 494 printk(" Bad RIP value.");
446 break; 495 break;
447 } 496 }
448 printk("%02x ", c); 497 if (ip == (u8 *)regs->ip)
498 printk("<%02x> ", c);
499 else
500 printk("%02x ", c);
449 } 501 }
450 } 502 }
451 printk("\n"); 503 printk("\n");
452} 504}
453 505
454int is_valid_bugaddr(unsigned long rip) 506int is_valid_bugaddr(unsigned long ip)
455{ 507{
456 unsigned short ud2; 508 unsigned short ud2;
457 509
458 if (__copy_from_user(&ud2, (const void __user *) rip, sizeof(ud2))) 510 if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
459 return 0; 511 return 0;
460 512
461 return ud2 == 0x0b0f; 513 return ud2 == 0x0b0f;
462} 514}
463 515
464#ifdef CONFIG_BUG
465void out_of_line_bug(void)
466{
467 BUG();
468}
469EXPORT_SYMBOL(out_of_line_bug);
470#endif
471
472static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; 516static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
473static int die_owner = -1; 517static int die_owner = -1;
474static unsigned int die_nest_count; 518static unsigned int die_nest_count;
@@ -496,7 +540,7 @@ unsigned __kprobes long oops_begin(void)
496 return flags; 540 return flags;
497} 541}
498 542
499void __kprobes oops_end(unsigned long flags) 543void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
500{ 544{
501 die_owner = -1; 545 die_owner = -1;
502 bust_spinlocks(0); 546 bust_spinlocks(0);
@@ -505,12 +549,17 @@ void __kprobes oops_end(unsigned long flags)
505 /* Nest count reaches zero, release the lock. */ 549 /* Nest count reaches zero, release the lock. */
506 __raw_spin_unlock(&die_lock); 550 __raw_spin_unlock(&die_lock);
507 raw_local_irq_restore(flags); 551 raw_local_irq_restore(flags);
552 if (!regs) {
553 oops_exit();
554 return;
555 }
508 if (panic_on_oops) 556 if (panic_on_oops)
509 panic("Fatal exception"); 557 panic("Fatal exception");
510 oops_exit(); 558 oops_exit();
559 do_exit(signr);
511} 560}
512 561
513void __kprobes __die(const char * str, struct pt_regs * regs, long err) 562int __kprobes __die(const char * str, struct pt_regs * regs, long err)
514{ 563{
515 static int die_counter; 564 static int die_counter;
516 printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter); 565 printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter);
@@ -524,15 +573,17 @@ void __kprobes __die(const char * str, struct pt_regs * regs, long err)
524 printk("DEBUG_PAGEALLOC"); 573 printk("DEBUG_PAGEALLOC");
525#endif 574#endif
526 printk("\n"); 575 printk("\n");
527 notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV); 576 if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
577 return 1;
528 show_registers(regs); 578 show_registers(regs);
529 add_taint(TAINT_DIE); 579 add_taint(TAINT_DIE);
530 /* Executive summary in case the oops scrolled away */ 580 /* Executive summary in case the oops scrolled away */
531 printk(KERN_ALERT "RIP "); 581 printk(KERN_ALERT "RIP ");
532 printk_address(regs->rip); 582 printk_address(regs->ip, 1);
533 printk(" RSP <%016lx>\n", regs->rsp); 583 printk(" RSP <%016lx>\n", regs->sp);
534 if (kexec_should_crash(current)) 584 if (kexec_should_crash(current))
535 crash_kexec(regs); 585 crash_kexec(regs);
586 return 0;
536} 587}
537 588
538void die(const char * str, struct pt_regs * regs, long err) 589void die(const char * str, struct pt_regs * regs, long err)
@@ -540,11 +591,11 @@ void die(const char * str, struct pt_regs * regs, long err)
540 unsigned long flags = oops_begin(); 591 unsigned long flags = oops_begin();
541 592
542 if (!user_mode(regs)) 593 if (!user_mode(regs))
543 report_bug(regs->rip, regs); 594 report_bug(regs->ip, regs);
544 595
545 __die(str, regs, err); 596 if (__die(str, regs, err))
546 oops_end(flags); 597 regs = NULL;
547 do_exit(SIGSEGV); 598 oops_end(flags, regs, SIGSEGV);
548} 599}
549 600
550void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) 601void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
@@ -561,10 +612,10 @@ void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
561 crash_kexec(regs); 612 crash_kexec(regs);
562 if (do_panic || panic_on_oops) 613 if (do_panic || panic_on_oops)
563 panic("Non maskable interrupt"); 614 panic("Non maskable interrupt");
564 oops_end(flags); 615 oops_end(flags, NULL, SIGBUS);
565 nmi_exit(); 616 nmi_exit();
566 local_irq_enable(); 617 local_irq_enable();
567 do_exit(SIGSEGV); 618 do_exit(SIGBUS);
568} 619}
569 620
570static void __kprobes do_trap(int trapnr, int signr, char *str, 621static void __kprobes do_trap(int trapnr, int signr, char *str,
@@ -588,11 +639,14 @@ static void __kprobes do_trap(int trapnr, int signr, char *str,
588 tsk->thread.trap_no = trapnr; 639 tsk->thread.trap_no = trapnr;
589 640
590 if (show_unhandled_signals && unhandled_signal(tsk, signr) && 641 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
591 printk_ratelimit()) 642 printk_ratelimit()) {
592 printk(KERN_INFO 643 printk(KERN_INFO
593 "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", 644 "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
594 tsk->comm, tsk->pid, str, 645 tsk->comm, tsk->pid, str,
595 regs->rip, regs->rsp, error_code); 646 regs->ip, regs->sp, error_code);
647 print_vma_addr(" in ", regs->ip);
648 printk("\n");
649 }
596 650
597 if (info) 651 if (info)
598 force_sig_info(signr, info, tsk); 652 force_sig_info(signr, info, tsk);
@@ -602,19 +656,12 @@ static void __kprobes do_trap(int trapnr, int signr, char *str,
602 } 656 }
603 657
604 658
605 /* kernel trap */ 659 if (!fixup_exception(regs)) {
606 { 660 tsk->thread.error_code = error_code;
607 const struct exception_table_entry *fixup; 661 tsk->thread.trap_no = trapnr;
608 fixup = search_exception_tables(regs->rip); 662 die(str, regs, error_code);
609 if (fixup)
610 regs->rip = fixup->fixup;
611 else {
612 tsk->thread.error_code = error_code;
613 tsk->thread.trap_no = trapnr;
614 die(str, regs, error_code);
615 }
616 return;
617 } 663 }
664 return;
618} 665}
619 666
620#define DO_ERROR(trapnr, signr, str, name) \ 667#define DO_ERROR(trapnr, signr, str, name) \
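The hunk above folds the open-coded search_exception_tables()/regs->rip rewrite into fixup_exception(): look the faulting address up in a sorted table and, on a hit, redirect the saved instruction pointer to the fixup landing pad. A hedged user-space sketch of that lookup step using bsearch(); the addresses are just numbers here, not real code:

#include <stdio.h>
#include <stdlib.h>

struct exception_table_entry {
        unsigned long insn;     /* address that may fault */
        unsigned long fixup;    /* where to resume instead */
};

static int cmp_entry(const void *key, const void *elem)
{
        unsigned long ip = *(const unsigned long *)key;
        const struct exception_table_entry *e = elem;

        if (ip < e->insn)
                return -1;
        return ip > e->insn;
}

/* Table must be sorted by 'insn' for bsearch(). */
static struct exception_table_entry extable[] = {
        { 0x1000, 0x2000 },
        { 0x1010, 0x2040 },
        { 0x1024, 0x2080 },
};

static int fixup_exception(unsigned long *ip)
{
        struct exception_table_entry *e;

        e = bsearch(ip, extable, sizeof(extable) / sizeof(extable[0]),
                    sizeof(extable[0]), cmp_entry);
        if (!e)
                return 0;
        *ip = e->fixup;         /* resume at the fixup address */
        return 1;
}

int main(void)
{
        unsigned long ip = 0x1010;

        if (fixup_exception(&ip))
                printf("resuming at %#lx\n", ip);       /* 0x2040 */
        return 0;
}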
@@ -635,6 +682,7 @@ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
635 info.si_errno = 0; \ 682 info.si_errno = 0; \
636 info.si_code = sicode; \ 683 info.si_code = sicode; \
637 info.si_addr = (void __user *)siaddr; \ 684 info.si_addr = (void __user *)siaddr; \
685 trace_hardirqs_fixup(); \
638 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 686 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
639 == NOTIFY_STOP) \ 687 == NOTIFY_STOP) \
640 return; \ 688 return; \
@@ -642,10 +690,10 @@ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
642 do_trap(trapnr, signr, str, regs, error_code, &info); \ 690 do_trap(trapnr, signr, str, regs, error_code, &info); \
643} 691}
644 692
645DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip) 693DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
646DO_ERROR( 4, SIGSEGV, "overflow", overflow) 694DO_ERROR( 4, SIGSEGV, "overflow", overflow)
647DO_ERROR( 5, SIGSEGV, "bounds", bounds) 695DO_ERROR( 5, SIGSEGV, "bounds", bounds)
648DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip) 696DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
649DO_ERROR( 7, SIGSEGV, "device not available", device_not_available) 697DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
650DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) 698DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
651DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) 699DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
@@ -693,32 +741,28 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
693 tsk->thread.trap_no = 13; 741 tsk->thread.trap_no = 13;
694 742
695 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && 743 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
696 printk_ratelimit()) 744 printk_ratelimit()) {
697 printk(KERN_INFO 745 printk(KERN_INFO
698 "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", 746 "%s[%d] general protection ip:%lx sp:%lx error:%lx",
699 tsk->comm, tsk->pid, 747 tsk->comm, tsk->pid,
700 regs->rip, regs->rsp, error_code); 748 regs->ip, regs->sp, error_code);
749 print_vma_addr(" in ", regs->ip);
750 printk("\n");
751 }
701 752
702 force_sig(SIGSEGV, tsk); 753 force_sig(SIGSEGV, tsk);
703 return; 754 return;
704 } 755 }
705 756
706 /* kernel gp */ 757 if (fixup_exception(regs))
707 { 758 return;
708 const struct exception_table_entry *fixup;
709 fixup = search_exception_tables(regs->rip);
710 if (fixup) {
711 regs->rip = fixup->fixup;
712 return;
713 }
714 759
715 tsk->thread.error_code = error_code; 760 tsk->thread.error_code = error_code;
716 tsk->thread.trap_no = 13; 761 tsk->thread.trap_no = 13;
717 if (notify_die(DIE_GPF, "general protection fault", regs, 762 if (notify_die(DIE_GPF, "general protection fault", regs,
718 error_code, 13, SIGSEGV) == NOTIFY_STOP) 763 error_code, 13, SIGSEGV) == NOTIFY_STOP)
719 return; 764 return;
720 die("general protection fault", regs, error_code); 765 die("general protection fault", regs, error_code);
721 }
722} 766}
723 767
724static __kprobes void 768static __kprobes void
@@ -831,15 +875,15 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
831{ 875{
832 struct pt_regs *regs = eregs; 876 struct pt_regs *regs = eregs;
833 /* Did already sync */ 877 /* Did already sync */
834 if (eregs == (struct pt_regs *)eregs->rsp) 878 if (eregs == (struct pt_regs *)eregs->sp)
835 ; 879 ;
836 /* Exception from user space */ 880 /* Exception from user space */
837 else if (user_mode(eregs)) 881 else if (user_mode(eregs))
838 regs = task_pt_regs(current); 882 regs = task_pt_regs(current);
839 /* Exception from kernel and interrupts are enabled. Move to 883 /* Exception from kernel and interrupts are enabled. Move to
840 kernel process stack. */ 884 kernel process stack. */
841 else if (eregs->eflags & X86_EFLAGS_IF) 885 else if (eregs->flags & X86_EFLAGS_IF)
842 regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs)); 886 regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
843 if (eregs != regs) 887 if (eregs != regs)
844 *regs = *eregs; 888 *regs = *eregs;
845 return regs; 889 return regs;
@@ -857,6 +901,12 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs,
857 901
858 get_debugreg(condition, 6); 902 get_debugreg(condition, 6);
859 903
904 /*
905 * The processor cleared BTF, so don't mark that we need it set.
906 */
907 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
908 tsk->thread.debugctlmsr = 0;
909
860 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, 910 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
861 SIGTRAP) == NOTIFY_STOP) 911 SIGTRAP) == NOTIFY_STOP)
862 return; 912 return;
@@ -872,27 +922,14 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs,
872 922
873 tsk->thread.debugreg6 = condition; 923 tsk->thread.debugreg6 = condition;
874 924
875 /* Mask out spurious TF errors due to lazy TF clearing */ 925
926 /*
927 * Single-stepping through TF: make sure we ignore any events in
928 * kernel space (but re-enable TF when returning to user mode).
929 */
876 if (condition & DR_STEP) { 930 if (condition & DR_STEP) {
877 /*
878 * The TF error should be masked out only if the current
879 * process is not traced and if the TRAP flag has been set
880 * previously by a tracing process (condition detected by
881 * the PT_DTRACE flag); remember that the i386 TRAP flag
882 * can be modified by the process itself in user mode,
883 * allowing programs to debug themselves without the ptrace()
884 * interface.
885 */
886 if (!user_mode(regs)) 931 if (!user_mode(regs))
887 goto clear_TF_reenable; 932 goto clear_TF_reenable;
888 /*
889 * Was the TF flag set by a debugger? If so, clear it now,
890 * so that register information is correct.
891 */
892 if (tsk->ptrace & PT_DTRACE) {
893 regs->eflags &= ~TF_MASK;
894 tsk->ptrace &= ~PT_DTRACE;
895 }
896 } 933 }
897 934
898 /* Ok, finally something we can handle */ 935 /* Ok, finally something we can handle */
@@ -901,7 +938,7 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs,
901 info.si_signo = SIGTRAP; 938 info.si_signo = SIGTRAP;
902 info.si_errno = 0; 939 info.si_errno = 0;
903 info.si_code = TRAP_BRKPT; 940 info.si_code = TRAP_BRKPT;
904 info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL; 941 info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
905 force_sig_info(SIGTRAP, &info, tsk); 942 force_sig_info(SIGTRAP, &info, tsk);
906 943
907clear_dr7: 944clear_dr7:
@@ -911,18 +948,15 @@ clear_dr7:
911 948
912clear_TF_reenable: 949clear_TF_reenable:
913 set_tsk_thread_flag(tsk, TIF_SINGLESTEP); 950 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
914 regs->eflags &= ~TF_MASK; 951 regs->flags &= ~X86_EFLAGS_TF;
915 preempt_conditional_cli(regs); 952 preempt_conditional_cli(regs);
916} 953}
917 954
918static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) 955static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
919{ 956{
920 const struct exception_table_entry *fixup; 957 if (fixup_exception(regs))
921 fixup = search_exception_tables(regs->rip);
922 if (fixup) {
923 regs->rip = fixup->fixup;
924 return 1; 958 return 1;
925 } 959
926 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); 960 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
927 /* Illegal floating point operation in the kernel */ 961 /* Illegal floating point operation in the kernel */
928 current->thread.trap_no = trapnr; 962 current->thread.trap_no = trapnr;
@@ -937,7 +971,7 @@ static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
937 */ 971 */
938asmlinkage void do_coprocessor_error(struct pt_regs *regs) 972asmlinkage void do_coprocessor_error(struct pt_regs *regs)
939{ 973{
940 void __user *rip = (void __user *)(regs->rip); 974 void __user *ip = (void __user *)(regs->ip);
941 struct task_struct * task; 975 struct task_struct * task;
942 siginfo_t info; 976 siginfo_t info;
943 unsigned short cwd, swd; 977 unsigned short cwd, swd;
@@ -957,7 +991,7 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs)
957 info.si_signo = SIGFPE; 991 info.si_signo = SIGFPE;
958 info.si_errno = 0; 992 info.si_errno = 0;
959 info.si_code = __SI_FAULT; 993 info.si_code = __SI_FAULT;
960 info.si_addr = rip; 994 info.si_addr = ip;
961 /* 995 /*
962 * (~cwd & swd) will mask out exceptions that are not set to unmasked 996 * (~cwd & swd) will mask out exceptions that are not set to unmasked
963 * status. 0x3f is the exception bits in these regs, 0x200 is the 997 * status. 0x3f is the exception bits in these regs, 0x200 is the
@@ -1006,7 +1040,7 @@ asmlinkage void bad_intr(void)
1006 1040
1007asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) 1041asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
1008{ 1042{
1009 void __user *rip = (void __user *)(regs->rip); 1043 void __user *ip = (void __user *)(regs->ip);
1010 struct task_struct * task; 1044 struct task_struct * task;
1011 siginfo_t info; 1045 siginfo_t info;
1012 unsigned short mxcsr; 1046 unsigned short mxcsr;
@@ -1026,7 +1060,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
1026 info.si_signo = SIGFPE; 1060 info.si_signo = SIGFPE;
1027 info.si_errno = 0; 1061 info.si_errno = 0;
1028 info.si_code = __SI_FAULT; 1062 info.si_code = __SI_FAULT;
1029 info.si_addr = rip; 1063 info.si_addr = ip;
1030 /* 1064 /*
1031 * The SIMD FPU exceptions are handled a little differently, as there 1065 * The SIMD FPU exceptions are handled a little differently, as there
1032 * is only a single status/control register. Thus, to determine which 1066 * is only a single status/control register. Thus, to determine which
@@ -1088,6 +1122,7 @@ asmlinkage void math_state_restore(void)
1088 task_thread_info(me)->status |= TS_USEDFPU; 1122 task_thread_info(me)->status |= TS_USEDFPU;
1089 me->fpu_counter++; 1123 me->fpu_counter++;
1090} 1124}
1125EXPORT_SYMBOL_GPL(math_state_restore);
1091 1126
1092void __init trap_init(void) 1127void __init trap_init(void)
1093{ 1128{
@@ -1143,3 +1178,14 @@ static int __init kstack_setup(char *s)
1143 return 0; 1178 return 0;
1144} 1179}
1145early_param("kstack", kstack_setup); 1180early_param("kstack", kstack_setup);
1181
1182
1183static int __init code_bytes_setup(char *s)
1184{
1185 code_bytes = simple_strtoul(s, NULL, 0);
1186 if (code_bytes > 8192)
1187 code_bytes = 8192;
1188
1189 return 1;
1190}
1191__setup("code_bytes=", code_bytes_setup);
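Note on the traps_64.c change above: the open-coded exception-table walks in do_general_protection() and kernel_math_error() are collapsed into a single fixup_exception() call. A minimal sketch of such a helper, assuming the usual search_exception_tables() interface (the helper itself is not part of the hunks shown here):

int fixup_exception(struct pt_regs *regs)
{
        const struct exception_table_entry *fixup;

        /* Look for a fixup entry covering the faulting instruction. */
        fixup = search_exception_tables(regs->ip);
        if (fixup) {
                /* Resume at the recovery stub rather than re-faulting. */
                regs->ip = fixup->fixup;
                return 1;
        }
        return 0;
}

The return value is what lets the callers above stay as one-liners: non-zero means the fault was handled and the handler can simply return.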
diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
index 9ebc0dab66b4..43517e324be8 100644
--- a/arch/x86/kernel/tsc_32.c
+++ b/arch/x86/kernel/tsc_32.c
@@ -5,6 +5,7 @@
5#include <linux/jiffies.h> 5#include <linux/jiffies.h>
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/dmi.h> 7#include <linux/dmi.h>
8#include <linux/percpu.h>
8 9
9#include <asm/delay.h> 10#include <asm/delay.h>
10#include <asm/tsc.h> 11#include <asm/tsc.h>
@@ -23,8 +24,6 @@ static int tsc_enabled;
23unsigned int tsc_khz; 24unsigned int tsc_khz;
24EXPORT_SYMBOL_GPL(tsc_khz); 25EXPORT_SYMBOL_GPL(tsc_khz);
25 26
26int tsc_disable;
27
28#ifdef CONFIG_X86_TSC 27#ifdef CONFIG_X86_TSC
29static int __init tsc_setup(char *str) 28static int __init tsc_setup(char *str)
30{ 29{
@@ -39,8 +38,7 @@ static int __init tsc_setup(char *str)
39 */ 38 */
40static int __init tsc_setup(char *str) 39static int __init tsc_setup(char *str)
41{ 40{
42 tsc_disable = 1; 41 setup_clear_cpu_cap(X86_FEATURE_TSC);
43
44 return 1; 42 return 1;
45} 43}
46#endif 44#endif
@@ -80,13 +78,31 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable);
80 * 78 *
81 * -johnstul@us.ibm.com "math is hard, lets go shopping!" 79 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
82 */ 80 */
83unsigned long cyc2ns_scale __read_mostly;
84 81
85#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ 82DEFINE_PER_CPU(unsigned long, cyc2ns);
86 83
87static inline void set_cyc2ns_scale(unsigned long cpu_khz) 84static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
88{ 85{
89 cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; 86 unsigned long flags, prev_scale, *scale;
87 unsigned long long tsc_now, ns_now;
88
89 local_irq_save(flags);
90 sched_clock_idle_sleep_event();
91
92 scale = &per_cpu(cyc2ns, cpu);
93
94 rdtscll(tsc_now);
95 ns_now = __cycles_2_ns(tsc_now);
96
97 prev_scale = *scale;
98 if (cpu_khz)
99 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
100
101 /*
102 * Start smoothly with the new frequency:
103 */
104 sched_clock_idle_wakeup_event(0);
105 local_irq_restore(flags);
90} 106}
91 107
92/* 108/*
@@ -239,7 +255,9 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data)
239 ref_freq, freq->new); 255 ref_freq, freq->new);
240 if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { 256 if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
241 tsc_khz = cpu_khz; 257 tsc_khz = cpu_khz;
242 set_cyc2ns_scale(cpu_khz); 258 preempt_disable();
259 set_cyc2ns_scale(cpu_khz, smp_processor_id());
260 preempt_enable();
243 /* 261 /*
244 * TSC based sched_clock turns 262 * TSC based sched_clock turns
245 * to junk w/ cpufreq 263 * to junk w/ cpufreq
@@ -333,6 +351,11 @@ __cpuinit int unsynchronized_tsc(void)
333{ 351{
334 if (!cpu_has_tsc || tsc_unstable) 352 if (!cpu_has_tsc || tsc_unstable)
335 return 1; 353 return 1;
354
355 /* Anything with constant TSC should be synchronized */
356 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
357 return 0;
358
336 /* 359 /*
337 * Intel systems are normally all synchronized. 360 * Intel systems are normally all synchronized.
338 * Exceptions must mark TSC as unstable: 361 * Exceptions must mark TSC as unstable:
@@ -367,7 +390,9 @@ static inline void check_geode_tsc_reliable(void) { }
367 390
368void __init tsc_init(void) 391void __init tsc_init(void)
369{ 392{
370 if (!cpu_has_tsc || tsc_disable) 393 int cpu;
394
395 if (!cpu_has_tsc)
371 goto out_no_tsc; 396 goto out_no_tsc;
372 397
373 cpu_khz = calculate_cpu_khz(); 398 cpu_khz = calculate_cpu_khz();
@@ -380,7 +405,15 @@ void __init tsc_init(void)
380 (unsigned long)cpu_khz / 1000, 405 (unsigned long)cpu_khz / 1000,
381 (unsigned long)cpu_khz % 1000); 406 (unsigned long)cpu_khz % 1000);
382 407
383 set_cyc2ns_scale(cpu_khz); 408 /*
409 * Secondary CPUs do not run through tsc_init(), so set up
410 * all the scale factors for all CPUs, assuming the same
411 * speed as the bootup CPU. (cpufreq notifiers will fix this
412 * up if their speed diverges)
413 */
414 for_each_possible_cpu(cpu)
415 set_cyc2ns_scale(cpu_khz, cpu);
416
384 use_tsc_delay(); 417 use_tsc_delay();
385 418
386 /* Check and install the TSC clocksource */ 419 /* Check and install the TSC clocksource */
@@ -403,10 +436,5 @@ void __init tsc_init(void)
403 return; 436 return;
404 437
405out_no_tsc: 438out_no_tsc:
406 /* 439 setup_clear_cpu_cap(X86_FEATURE_TSC);
407 * Set the tsc_disable flag if there's no TSC support, this
408 * makes it a fast flag for the kernel to see whether it
409 * should be using the TSC.
410 */
411 tsc_disable = 1;
412} 440}
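Note on the tsc_32.c change above: the single global cyc2ns_scale becomes a per-CPU value, and the cycles-to-nanoseconds conversion goes through __cycles_2_ns(). A sketch of that conversion, assuming CYC2NS_SCALE_FACTOR stays at 10 as in the removed #define (the real helper lives in a header, not in this hunk):

/*
 * scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) / cpu_khz, so
 * ns    = (cycles * scale) >> CYC2NS_SCALE_FACTOR.
 */
static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
{
        return (cyc * per_cpu(cyc2ns, smp_processor_id())) >> CYC2NS_SCALE_FACTOR;
}

Because the scale is now per CPU, set_cyc2ns_scale() has to run with preemption disabled when it uses smp_processor_id(), which is why the cpufreq notifier above wraps the call in preempt_disable()/preempt_enable().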
diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c
index 9c70af45b42b..947554ddabb6 100644
--- a/arch/x86/kernel/tsc_64.c
+++ b/arch/x86/kernel/tsc_64.c
@@ -10,6 +10,7 @@
10 10
11#include <asm/hpet.h> 11#include <asm/hpet.h>
12#include <asm/timex.h> 12#include <asm/timex.h>
13#include <asm/timer.h>
13 14
14static int notsc __initdata = 0; 15static int notsc __initdata = 0;
15 16
@@ -18,19 +19,51 @@ EXPORT_SYMBOL(cpu_khz);
18unsigned int tsc_khz; 19unsigned int tsc_khz;
19EXPORT_SYMBOL(tsc_khz); 20EXPORT_SYMBOL(tsc_khz);
20 21
21static unsigned int cyc2ns_scale __read_mostly; 22/* Accelerators for sched_clock()
23 * convert from cycles(64bits) => nanoseconds (64bits)
24 * basic equation:
25 * ns = cycles / (freq / ns_per_sec)
26 * ns = cycles * (ns_per_sec / freq)
27 * ns = cycles * (10^9 / (cpu_khz * 10^3))
28 * ns = cycles * (10^6 / cpu_khz)
29 *
30 * Then we use scaling math (suggested by george@mvista.com) to get:
31 * ns = cycles * (10^6 * SC / cpu_khz) / SC
32 * ns = cycles * cyc2ns_scale / SC
33 *
34 * And since SC is a constant power of two, we can convert the div
35 * into a shift.
36 *
37 * We can use khz divisor instead of mhz to keep a better precision, since
38 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
39 * (mathieu.desnoyers@polymtl.ca)
40 *
41 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
42 */
43DEFINE_PER_CPU(unsigned long, cyc2ns);
22 44
23static inline void set_cyc2ns_scale(unsigned long khz) 45static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
24{ 46{
25 cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / khz; 47 unsigned long flags, prev_scale, *scale;
26} 48 unsigned long long tsc_now, ns_now;
27 49
28static unsigned long long cycles_2_ns(unsigned long long cyc) 50 local_irq_save(flags);
29{ 51 sched_clock_idle_sleep_event();
30 return (cyc * cyc2ns_scale) >> NS_SCALE; 52
53 scale = &per_cpu(cyc2ns, cpu);
54
55 rdtscll(tsc_now);
56 ns_now = __cycles_2_ns(tsc_now);
57
58 prev_scale = *scale;
59 if (cpu_khz)
60 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
61
62 sched_clock_idle_wakeup_event(0);
63 local_irq_restore(flags);
31} 64}
32 65
33unsigned long long sched_clock(void) 66unsigned long long native_sched_clock(void)
34{ 67{
35 unsigned long a = 0; 68 unsigned long a = 0;
36 69
@@ -44,12 +77,27 @@ unsigned long long sched_clock(void)
44 return cycles_2_ns(a); 77 return cycles_2_ns(a);
45} 78}
46 79
80/* We need to define a real function for sched_clock, to override the
81 weak default version */
82#ifdef CONFIG_PARAVIRT
83unsigned long long sched_clock(void)
84{
85 return paravirt_sched_clock();
86}
87#else
88unsigned long long
89sched_clock(void) __attribute__((alias("native_sched_clock")));
90#endif
91
92
47static int tsc_unstable; 93static int tsc_unstable;
48 94
49inline int check_tsc_unstable(void) 95int check_tsc_unstable(void)
50{ 96{
51 return tsc_unstable; 97 return tsc_unstable;
52} 98}
99EXPORT_SYMBOL_GPL(check_tsc_unstable);
100
53#ifdef CONFIG_CPU_FREQ 101#ifdef CONFIG_CPU_FREQ
54 102
55/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency 103/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
@@ -100,7 +148,9 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
100 mark_tsc_unstable("cpufreq changes"); 148 mark_tsc_unstable("cpufreq changes");
101 } 149 }
102 150
103 set_cyc2ns_scale(tsc_khz_ref); 151 preempt_disable();
152 set_cyc2ns_scale(tsc_khz_ref, smp_processor_id());
153 preempt_enable();
104 154
105 return 0; 155 return 0;
106} 156}
@@ -133,12 +183,12 @@ static unsigned long __init tsc_read_refs(unsigned long *pm,
133 int i; 183 int i;
134 184
135 for (i = 0; i < MAX_RETRIES; i++) { 185 for (i = 0; i < MAX_RETRIES; i++) {
136 t1 = get_cycles_sync(); 186 t1 = get_cycles();
137 if (hpet) 187 if (hpet)
138 *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; 188 *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
139 else 189 else
140 *pm = acpi_pm_read_early(); 190 *pm = acpi_pm_read_early();
141 t2 = get_cycles_sync(); 191 t2 = get_cycles();
142 if ((t2 - t1) < SMI_TRESHOLD) 192 if ((t2 - t1) < SMI_TRESHOLD)
143 return t2; 193 return t2;
144 } 194 }
@@ -151,7 +201,7 @@ static unsigned long __init tsc_read_refs(unsigned long *pm,
151void __init tsc_calibrate(void) 201void __init tsc_calibrate(void)
152{ 202{
153 unsigned long flags, tsc1, tsc2, tr1, tr2, pm1, pm2, hpet1, hpet2; 203 unsigned long flags, tsc1, tsc2, tr1, tr2, pm1, pm2, hpet1, hpet2;
154 int hpet = is_hpet_enabled(); 204 int hpet = is_hpet_enabled(), cpu;
155 205
156 local_irq_save(flags); 206 local_irq_save(flags);
157 207
@@ -162,9 +212,9 @@ void __init tsc_calibrate(void)
162 outb(0xb0, 0x43); 212 outb(0xb0, 0x43);
163 outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); 213 outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
164 outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); 214 outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42);
165 tr1 = get_cycles_sync(); 215 tr1 = get_cycles();
166 while ((inb(0x61) & 0x20) == 0); 216 while ((inb(0x61) & 0x20) == 0);
167 tr2 = get_cycles_sync(); 217 tr2 = get_cycles();
168 218
169 tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); 219 tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL);
170 220
@@ -206,7 +256,9 @@ void __init tsc_calibrate(void)
206 } 256 }
207 257
208 tsc_khz = tsc2 / tsc1; 258 tsc_khz = tsc2 / tsc1;
209 set_cyc2ns_scale(tsc_khz); 259
260 for_each_possible_cpu(cpu)
261 set_cyc2ns_scale(tsc_khz, cpu);
210} 262}
211 263
212/* 264/*
@@ -222,17 +274,9 @@ __cpuinit int unsynchronized_tsc(void)
222 if (apic_is_clustered_box()) 274 if (apic_is_clustered_box())
223 return 1; 275 return 1;
224#endif 276#endif
225 /* Most intel systems have synchronized TSCs except for 277
226 multi node systems */ 278 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
227 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
228#ifdef CONFIG_ACPI
229 /* But TSC doesn't tick in C3 so don't use it there */
230 if (acpi_gbl_FADT.header.length > 0 &&
231 acpi_gbl_FADT.C3latency < 1000)
232 return 1;
233#endif
234 return 0; 279 return 0;
235 }
236 280
237 /* Assume multi socket systems are not synchronized */ 281 /* Assume multi socket systems are not synchronized */
238 return num_present_cpus() > 1; 282 return num_present_cpus() > 1;
@@ -250,13 +294,13 @@ __setup("notsc", notsc_setup);
250/* clock source code: */ 294/* clock source code: */
251static cycle_t read_tsc(void) 295static cycle_t read_tsc(void)
252{ 296{
253 cycle_t ret = (cycle_t)get_cycles_sync(); 297 cycle_t ret = (cycle_t)get_cycles();
254 return ret; 298 return ret;
255} 299}
256 300
257static cycle_t __vsyscall_fn vread_tsc(void) 301static cycle_t __vsyscall_fn vread_tsc(void)
258{ 302{
259 cycle_t ret = (cycle_t)get_cycles_sync(); 303 cycle_t ret = (cycle_t)vget_cycles();
260 return ret; 304 return ret;
261} 305}
262 306
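Note on the comment block in tsc_64.c above: the scaling arithmetic is easy to sanity-check with small numbers. A standalone illustration (the 2.0 GHz figure is only an example, not taken from the patch):

#include <stdio.h>

#define CYC2NS_SCALE_FACTOR 10                  /* 2^10, as in the kernel code */
#define NSEC_PER_MSEC 1000000UL

int main(void)
{
        unsigned long cpu_khz = 2000000;        /* example: a 2.0 GHz CPU */
        unsigned long scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) / cpu_khz;
        unsigned long long cycles = 1000;

        /* 1000 cycles at 2.0 GHz should print as 500 ns (scale == 512) */
        printf("scale=%lu ns=%llu\n", scale,
               (cycles * scale) >> CYC2NS_SCALE_FACTOR);
        return 0;
}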
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 9125efe66a06..0577825cf89b 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -46,7 +46,7 @@ static __cpuinit void check_tsc_warp(void)
46 cycles_t start, now, prev, end; 46 cycles_t start, now, prev, end;
47 int i; 47 int i;
48 48
49 start = get_cycles_sync(); 49 start = get_cycles();
50 /* 50 /*
51 * The measurement runs for 20 msecs: 51 * The measurement runs for 20 msecs:
52 */ 52 */
@@ -61,18 +61,18 @@ static __cpuinit void check_tsc_warp(void)
61 */ 61 */
62 __raw_spin_lock(&sync_lock); 62 __raw_spin_lock(&sync_lock);
63 prev = last_tsc; 63 prev = last_tsc;
64 now = get_cycles_sync(); 64 now = get_cycles();
65 last_tsc = now; 65 last_tsc = now;
66 __raw_spin_unlock(&sync_lock); 66 __raw_spin_unlock(&sync_lock);
67 67
68 /* 68 /*
69 * Be nice every now and then (and also check whether 69 * Be nice every now and then (and also check whether
70 * measurement is done [we also insert a 100 million 70 * measurement is done [we also insert a 10 million
71 * loops safety exit, so we dont lock up in case the 71 * loops safety exit, so we dont lock up in case the
72 * TSC readout is totally broken]): 72 * TSC readout is totally broken]):
73 */ 73 */
74 if (unlikely(!(i & 7))) { 74 if (unlikely(!(i & 7))) {
75 if (now > end || i > 100000000) 75 if (now > end || i > 10000000)
76 break; 76 break;
77 cpu_relax(); 77 cpu_relax();
78 touch_nmi_watchdog(); 78 touch_nmi_watchdog();
@@ -87,7 +87,11 @@ static __cpuinit void check_tsc_warp(void)
87 nr_warps++; 87 nr_warps++;
88 __raw_spin_unlock(&sync_lock); 88 __raw_spin_unlock(&sync_lock);
89 } 89 }
90 90 }
91 if (!(now-start)) {
92 printk("Warning: zero tsc calibration delta: %Ld [max: %Ld]\n",
93 now-start, end-start);
94 WARN_ON(1);
91 } 95 }
92} 96}
93 97
@@ -129,24 +133,24 @@ void __cpuinit check_tsc_sync_source(int cpu)
129 while (atomic_read(&stop_count) != cpus-1) 133 while (atomic_read(&stop_count) != cpus-1)
130 cpu_relax(); 134 cpu_relax();
131 135
132 /*
133 * Reset it - just in case we boot another CPU later:
134 */
135 atomic_set(&start_count, 0);
136
137 if (nr_warps) { 136 if (nr_warps) {
138 printk("\n"); 137 printk("\n");
139 printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," 138 printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs,"
140 " turning off TSC clock.\n", max_warp); 139 " turning off TSC clock.\n", max_warp);
141 mark_tsc_unstable("check_tsc_sync_source failed"); 140 mark_tsc_unstable("check_tsc_sync_source failed");
142 nr_warps = 0;
143 max_warp = 0;
144 last_tsc = 0;
145 } else { 141 } else {
146 printk(" passed.\n"); 142 printk(" passed.\n");
147 } 143 }
148 144
149 /* 145 /*
146 * Reset it - just in case we boot another CPU later:
147 */
148 atomic_set(&start_count, 0);
149 nr_warps = 0;
150 max_warp = 0;
151 last_tsc = 0;
152
153 /*
150 * Let the target continue with the bootup: 154 * Let the target continue with the bootup:
151 */ 155 */
152 atomic_inc(&stop_count); 156 atomic_inc(&stop_count);
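Note on the tsc_sync.c change above: check_tsc_warp() has the CPUs being compared take turns publishing TSC readings under sync_lock; a "warp" is any reading that is lower than the one published immediately before it. A stripped-down sketch of that detection step (plain integers instead of cycles_t, locking omitted):

static unsigned long long last_tsc, max_warp;
static int nr_warps;

static void record_tsc_reading(unsigned long long now)
{
        unsigned long long prev = last_tsc;

        last_tsc = now;
        if (prev > now) {               /* time appeared to run backwards */
                nr_warps++;
                if (prev - now > max_warp)
                        max_warp = prev - now;
        }
}

The hunks above also move the reset of these counters so that nr_warps, max_warp and last_tsc are cleared even on the "passed" path, and add a warning when the measurement loop sees a zero TSC delta.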
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 157e4bedd3c5..738c2104df30 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -70,10 +70,10 @@
70/* 70/*
71 * 8- and 16-bit register defines.. 71 * 8- and 16-bit register defines..
72 */ 72 */
73#define AL(regs) (((unsigned char *)&((regs)->pt.eax))[0]) 73#define AL(regs) (((unsigned char *)&((regs)->pt.ax))[0])
74#define AH(regs) (((unsigned char *)&((regs)->pt.eax))[1]) 74#define AH(regs) (((unsigned char *)&((regs)->pt.ax))[1])
75#define IP(regs) (*(unsigned short *)&((regs)->pt.eip)) 75#define IP(regs) (*(unsigned short *)&((regs)->pt.ip))
76#define SP(regs) (*(unsigned short *)&((regs)->pt.esp)) 76#define SP(regs) (*(unsigned short *)&((regs)->pt.sp))
77 77
78/* 78/*
79 * virtual flags (16 and 32-bit versions) 79 * virtual flags (16 and 32-bit versions)
@@ -93,12 +93,12 @@ static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
93{ 93{
94 int ret = 0; 94 int ret = 0;
95 95
96 /* kernel_vm86_regs is missing xgs, so copy everything up to 96 /* kernel_vm86_regs is missing gs, so copy everything up to
97 (but not including) orig_eax, and then rest including orig_eax. */ 97 (but not including) orig_eax, and then rest including orig_eax. */
98 ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_eax)); 98 ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_ax));
99 ret += copy_to_user(&user->orig_eax, &regs->pt.orig_eax, 99 ret += copy_to_user(&user->orig_eax, &regs->pt.orig_ax,
100 sizeof(struct kernel_vm86_regs) - 100 sizeof(struct kernel_vm86_regs) -
101 offsetof(struct kernel_vm86_regs, pt.orig_eax)); 101 offsetof(struct kernel_vm86_regs, pt.orig_ax));
102 102
103 return ret; 103 return ret;
104} 104}
@@ -110,18 +110,17 @@ static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
110{ 110{
111 int ret = 0; 111 int ret = 0;
112 112
113 /* copy eax-xfs inclusive */ 113 /* copy ax-fs inclusive */
114 ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_eax)); 114 ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_ax));
115 /* copy orig_eax-__gsh+extra */ 115 /* copy orig_ax-__gsh+extra */
116 ret += copy_from_user(&regs->pt.orig_eax, &user->orig_eax, 116 ret += copy_from_user(&regs->pt.orig_ax, &user->orig_eax,
117 sizeof(struct kernel_vm86_regs) - 117 sizeof(struct kernel_vm86_regs) -
118 offsetof(struct kernel_vm86_regs, pt.orig_eax) + 118 offsetof(struct kernel_vm86_regs, pt.orig_ax) +
119 extra); 119 extra);
120 return ret; 120 return ret;
121} 121}
122 122
123struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs)); 123struct pt_regs * save_v86_state(struct kernel_vm86_regs * regs)
124struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
125{ 124{
126 struct tss_struct *tss; 125 struct tss_struct *tss;
127 struct pt_regs *ret; 126 struct pt_regs *ret;
@@ -138,7 +137,7 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
138 printk("no vm86_info: BAD\n"); 137 printk("no vm86_info: BAD\n");
139 do_exit(SIGSEGV); 138 do_exit(SIGSEGV);
140 } 139 }
141 set_flags(regs->pt.eflags, VEFLAGS, VIF_MASK | current->thread.v86mask); 140 set_flags(regs->pt.flags, VEFLAGS, VIF_MASK | current->thread.v86mask);
142 tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs,regs); 141 tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs,regs);
143 tmp += put_user(current->thread.screen_bitmap,&current->thread.vm86_info->screen_bitmap); 142 tmp += put_user(current->thread.screen_bitmap,&current->thread.vm86_info->screen_bitmap);
144 if (tmp) { 143 if (tmp) {
@@ -147,15 +146,15 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
147 } 146 }
148 147
149 tss = &per_cpu(init_tss, get_cpu()); 148 tss = &per_cpu(init_tss, get_cpu());
150 current->thread.esp0 = current->thread.saved_esp0; 149 current->thread.sp0 = current->thread.saved_sp0;
151 current->thread.sysenter_cs = __KERNEL_CS; 150 current->thread.sysenter_cs = __KERNEL_CS;
152 load_esp0(tss, &current->thread); 151 load_sp0(tss, &current->thread);
153 current->thread.saved_esp0 = 0; 152 current->thread.saved_sp0 = 0;
154 put_cpu(); 153 put_cpu();
155 154
156 ret = KVM86->regs32; 155 ret = KVM86->regs32;
157 156
158 ret->xfs = current->thread.saved_fs; 157 ret->fs = current->thread.saved_fs;
159 loadsegment(gs, current->thread.saved_gs); 158 loadsegment(gs, current->thread.saved_gs);
160 159
161 return ret; 160 return ret;
@@ -197,7 +196,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
197 196
198asmlinkage int sys_vm86old(struct pt_regs regs) 197asmlinkage int sys_vm86old(struct pt_regs regs)
199{ 198{
200 struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.ebx; 199 struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.bx;
201 struct kernel_vm86_struct info; /* declare this _on top_, 200 struct kernel_vm86_struct info; /* declare this _on top_,
202 * this avoids wasting of stack space. 201 * this avoids wasting of stack space.
203 * This remains on the stack until we 202 * This remains on the stack until we
@@ -207,7 +206,7 @@ asmlinkage int sys_vm86old(struct pt_regs regs)
207 int tmp, ret = -EPERM; 206 int tmp, ret = -EPERM;
208 207
209 tsk = current; 208 tsk = current;
210 if (tsk->thread.saved_esp0) 209 if (tsk->thread.saved_sp0)
211 goto out; 210 goto out;
212 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, 211 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
213 offsetof(struct kernel_vm86_struct, vm86plus) - 212 offsetof(struct kernel_vm86_struct, vm86plus) -
@@ -237,12 +236,12 @@ asmlinkage int sys_vm86(struct pt_regs regs)
237 struct vm86plus_struct __user *v86; 236 struct vm86plus_struct __user *v86;
238 237
239 tsk = current; 238 tsk = current;
240 switch (regs.ebx) { 239 switch (regs.bx) {
241 case VM86_REQUEST_IRQ: 240 case VM86_REQUEST_IRQ:
242 case VM86_FREE_IRQ: 241 case VM86_FREE_IRQ:
243 case VM86_GET_IRQ_BITS: 242 case VM86_GET_IRQ_BITS:
244 case VM86_GET_AND_RESET_IRQ: 243 case VM86_GET_AND_RESET_IRQ:
245 ret = do_vm86_irq_handling(regs.ebx, (int)regs.ecx); 244 ret = do_vm86_irq_handling(regs.bx, (int)regs.cx);
246 goto out; 245 goto out;
247 case VM86_PLUS_INSTALL_CHECK: 246 case VM86_PLUS_INSTALL_CHECK:
248 /* NOTE: on old vm86 stuff this will return the error 247 /* NOTE: on old vm86 stuff this will return the error
@@ -256,9 +255,9 @@ asmlinkage int sys_vm86(struct pt_regs regs)
256 255
257 /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */ 256 /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */
258 ret = -EPERM; 257 ret = -EPERM;
259 if (tsk->thread.saved_esp0) 258 if (tsk->thread.saved_sp0)
260 goto out; 259 goto out;
261 v86 = (struct vm86plus_struct __user *)regs.ecx; 260 v86 = (struct vm86plus_struct __user *)regs.cx;
262 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, 261 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
263 offsetof(struct kernel_vm86_struct, regs32) - 262 offsetof(struct kernel_vm86_struct, regs32) -
264 sizeof(info.regs)); 263 sizeof(info.regs));
@@ -281,23 +280,23 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
281/* 280/*
282 * make sure the vm86() system call doesn't try to do anything silly 281 * make sure the vm86() system call doesn't try to do anything silly
283 */ 282 */
284 info->regs.pt.xds = 0; 283 info->regs.pt.ds = 0;
285 info->regs.pt.xes = 0; 284 info->regs.pt.es = 0;
286 info->regs.pt.xfs = 0; 285 info->regs.pt.fs = 0;
287 286
288/* we are clearing gs later just before "jmp resume_userspace", 287/* we are clearing gs later just before "jmp resume_userspace",
289 * because it is not saved/restored. 288 * because it is not saved/restored.
290 */ 289 */
291 290
292/* 291/*
293 * The eflags register is also special: we cannot trust that the user 292 * The flags register is also special: we cannot trust that the user
294 * has set it up safely, so this makes sure interrupt etc flags are 293 * has set it up safely, so this makes sure interrupt etc flags are
295 * inherited from protected mode. 294 * inherited from protected mode.
296 */ 295 */
297 VEFLAGS = info->regs.pt.eflags; 296 VEFLAGS = info->regs.pt.flags;
298 info->regs.pt.eflags &= SAFE_MASK; 297 info->regs.pt.flags &= SAFE_MASK;
299 info->regs.pt.eflags |= info->regs32->eflags & ~SAFE_MASK; 298 info->regs.pt.flags |= info->regs32->flags & ~SAFE_MASK;
300 info->regs.pt.eflags |= VM_MASK; 299 info->regs.pt.flags |= VM_MASK;
301 300
302 switch (info->cpu_type) { 301 switch (info->cpu_type) {
303 case CPU_286: 302 case CPU_286:
@@ -315,18 +314,18 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
315 } 314 }
316 315
317/* 316/*
318 * Save old state, set default return value (%eax) to 0 317 * Save old state, set default return value (%ax) to 0
319 */ 318 */
320 info->regs32->eax = 0; 319 info->regs32->ax = 0;
321 tsk->thread.saved_esp0 = tsk->thread.esp0; 320 tsk->thread.saved_sp0 = tsk->thread.sp0;
322 tsk->thread.saved_fs = info->regs32->xfs; 321 tsk->thread.saved_fs = info->regs32->fs;
323 savesegment(gs, tsk->thread.saved_gs); 322 savesegment(gs, tsk->thread.saved_gs);
324 323
325 tss = &per_cpu(init_tss, get_cpu()); 324 tss = &per_cpu(init_tss, get_cpu());
326 tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; 325 tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
327 if (cpu_has_sep) 326 if (cpu_has_sep)
328 tsk->thread.sysenter_cs = 0; 327 tsk->thread.sysenter_cs = 0;
329 load_esp0(tss, &tsk->thread); 328 load_sp0(tss, &tsk->thread);
330 put_cpu(); 329 put_cpu();
331 330
332 tsk->thread.screen_bitmap = info->screen_bitmap; 331 tsk->thread.screen_bitmap = info->screen_bitmap;
@@ -352,7 +351,7 @@ static inline void return_to_32bit(struct kernel_vm86_regs * regs16, int retval)
352 struct pt_regs * regs32; 351 struct pt_regs * regs32;
353 352
354 regs32 = save_v86_state(regs16); 353 regs32 = save_v86_state(regs16);
355 regs32->eax = retval; 354 regs32->ax = retval;
356 __asm__ __volatile__("movl %0,%%esp\n\t" 355 __asm__ __volatile__("movl %0,%%esp\n\t"
357 "movl %1,%%ebp\n\t" 356 "movl %1,%%ebp\n\t"
358 "jmp resume_userspace" 357 "jmp resume_userspace"
@@ -373,30 +372,30 @@ static inline void clear_IF(struct kernel_vm86_regs * regs)
373 372
374static inline void clear_TF(struct kernel_vm86_regs * regs) 373static inline void clear_TF(struct kernel_vm86_regs * regs)
375{ 374{
376 regs->pt.eflags &= ~TF_MASK; 375 regs->pt.flags &= ~TF_MASK;
377} 376}
378 377
379static inline void clear_AC(struct kernel_vm86_regs * regs) 378static inline void clear_AC(struct kernel_vm86_regs * regs)
380{ 379{
381 regs->pt.eflags &= ~AC_MASK; 380 regs->pt.flags &= ~AC_MASK;
382} 381}
383 382
384/* It is correct to call set_IF(regs) from the set_vflags_* 383/* It is correct to call set_IF(regs) from the set_vflags_*
385 * functions. However someone forgot to call clear_IF(regs) 384 * functions. However someone forgot to call clear_IF(regs)
386 * in the opposite case. 385 * in the opposite case.
387 * After the command sequence CLI PUSHF STI POPF you should 386 * After the command sequence CLI PUSHF STI POPF you should
388 * end up with interrups disabled, but you ended up with 387 * end up with interrupts disabled, but you ended up with
389 * interrupts enabled. 388 * interrupts enabled.
390 * ( I was testing my own changes, but the only bug I 389 * ( I was testing my own changes, but the only bug I
391 * could find was in a function I had not changed. ) 390 * could find was in a function I had not changed. )
392 * [KD] 391 * [KD]
393 */ 392 */
394 393
395static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs) 394static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs * regs)
396{ 395{
397 set_flags(VEFLAGS, eflags, current->thread.v86mask); 396 set_flags(VEFLAGS, flags, current->thread.v86mask);
398 set_flags(regs->pt.eflags, eflags, SAFE_MASK); 397 set_flags(regs->pt.flags, flags, SAFE_MASK);
399 if (eflags & IF_MASK) 398 if (flags & IF_MASK)
400 set_IF(regs); 399 set_IF(regs);
401 else 400 else
402 clear_IF(regs); 401 clear_IF(regs);
@@ -405,7 +404,7 @@ static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs
405static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs) 404static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs)
406{ 405{
407 set_flags(VFLAGS, flags, current->thread.v86mask); 406 set_flags(VFLAGS, flags, current->thread.v86mask);
408 set_flags(regs->pt.eflags, flags, SAFE_MASK); 407 set_flags(regs->pt.flags, flags, SAFE_MASK);
409 if (flags & IF_MASK) 408 if (flags & IF_MASK)
410 set_IF(regs); 409 set_IF(regs);
411 else 410 else
@@ -414,7 +413,7 @@ static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_reg
414 413
415static inline unsigned long get_vflags(struct kernel_vm86_regs * regs) 414static inline unsigned long get_vflags(struct kernel_vm86_regs * regs)
416{ 415{
417 unsigned long flags = regs->pt.eflags & RETURN_MASK; 416 unsigned long flags = regs->pt.flags & RETURN_MASK;
418 417
419 if (VEFLAGS & VIF_MASK) 418 if (VEFLAGS & VIF_MASK)
420 flags |= IF_MASK; 419 flags |= IF_MASK;
@@ -518,7 +517,7 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
518 unsigned long __user *intr_ptr; 517 unsigned long __user *intr_ptr;
519 unsigned long segoffs; 518 unsigned long segoffs;
520 519
521 if (regs->pt.xcs == BIOSSEG) 520 if (regs->pt.cs == BIOSSEG)
522 goto cannot_handle; 521 goto cannot_handle;
523 if (is_revectored(i, &KVM86->int_revectored)) 522 if (is_revectored(i, &KVM86->int_revectored))
524 goto cannot_handle; 523 goto cannot_handle;
@@ -530,9 +529,9 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
530 if ((segoffs >> 16) == BIOSSEG) 529 if ((segoffs >> 16) == BIOSSEG)
531 goto cannot_handle; 530 goto cannot_handle;
532 pushw(ssp, sp, get_vflags(regs), cannot_handle); 531 pushw(ssp, sp, get_vflags(regs), cannot_handle);
533 pushw(ssp, sp, regs->pt.xcs, cannot_handle); 532 pushw(ssp, sp, regs->pt.cs, cannot_handle);
534 pushw(ssp, sp, IP(regs), cannot_handle); 533 pushw(ssp, sp, IP(regs), cannot_handle);
535 regs->pt.xcs = segoffs >> 16; 534 regs->pt.cs = segoffs >> 16;
536 SP(regs) -= 6; 535 SP(regs) -= 6;
537 IP(regs) = segoffs & 0xffff; 536 IP(regs) = segoffs & 0xffff;
538 clear_TF(regs); 537 clear_TF(regs);
@@ -549,7 +548,7 @@ int handle_vm86_trap(struct kernel_vm86_regs * regs, long error_code, int trapno
549 if (VMPI.is_vm86pus) { 548 if (VMPI.is_vm86pus) {
550 if ( (trapno==3) || (trapno==1) ) 549 if ( (trapno==3) || (trapno==1) )
551 return_to_32bit(regs, VM86_TRAP + (trapno << 8)); 550 return_to_32bit(regs, VM86_TRAP + (trapno << 8));
552 do_int(regs, trapno, (unsigned char __user *) (regs->pt.xss << 4), SP(regs)); 551 do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs));
553 return 0; 552 return 0;
554 } 553 }
555 if (trapno !=1) 554 if (trapno !=1)
@@ -585,10 +584,10 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
585 handle_vm86_trap(regs, 0, 1); \ 584 handle_vm86_trap(regs, 0, 1); \
586 return; } while (0) 585 return; } while (0)
587 586
588 orig_flags = *(unsigned short *)&regs->pt.eflags; 587 orig_flags = *(unsigned short *)&regs->pt.flags;
589 588
590 csp = (unsigned char __user *) (regs->pt.xcs << 4); 589 csp = (unsigned char __user *) (regs->pt.cs << 4);
591 ssp = (unsigned char __user *) (regs->pt.xss << 4); 590 ssp = (unsigned char __user *) (regs->pt.ss << 4);
592 sp = SP(regs); 591 sp = SP(regs);
593 ip = IP(regs); 592 ip = IP(regs);
594 593
@@ -675,7 +674,7 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
675 SP(regs) += 6; 674 SP(regs) += 6;
676 } 675 }
677 IP(regs) = newip; 676 IP(regs) = newip;
678 regs->pt.xcs = newcs; 677 regs->pt.cs = newcs;
679 CHECK_IF_IN_TRAP; 678 CHECK_IF_IN_TRAP;
680 if (data32) { 679 if (data32) {
681 set_vflags_long(newflags, regs); 680 set_vflags_long(newflags, regs);
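Note on the vm86_32.c change above: apart from dropping FASTCALL, this is a mechanical rename of pt_regs fields (eax/eip/esp/eflags/xcs/... become ax/ip/sp/flags/cs/...). The flag handling in these hunks leans on set_flags(); a sketch of that helper, assuming the usual definition elsewhere in this file (it is not part of the hunks shown):

/*
 * Copy only the bits selected by 'mask' from 'newflags' into 'dest',
 * leaving every other bit of 'dest' untouched.  SAFE_MASK is what keeps
 * vm86 code from flipping privileged EFLAGS bits such as IF and IOPL.
 */
#define set_flags(dest, newflags, mask) \
        ((dest) = ((dest) & ~(mask)) | ((newflags) & (mask)))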
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index f02bad68abaa..12affe1f9bce 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -62,7 +62,10 @@ static struct {
62 void (*cpuid)(void /* non-c */); 62 void (*cpuid)(void /* non-c */);
63 void (*_set_ldt)(u32 selector); 63 void (*_set_ldt)(u32 selector);
64 void (*set_tr)(u32 selector); 64 void (*set_tr)(u32 selector);
65 void (*set_kernel_stack)(u32 selector, u32 esp0); 65 void (*write_idt_entry)(struct desc_struct *, int, u32, u32);
66 void (*write_gdt_entry)(struct desc_struct *, int, u32, u32);
67 void (*write_ldt_entry)(struct desc_struct *, int, u32, u32);
68 void (*set_kernel_stack)(u32 selector, u32 sp0);
66 void (*allocate_page)(u32, u32, u32, u32, u32); 69 void (*allocate_page)(u32, u32, u32, u32, u32);
67 void (*release_page)(u32, u32); 70 void (*release_page)(u32, u32);
68 void (*set_pte)(pte_t, pte_t *, unsigned); 71 void (*set_pte)(pte_t, pte_t *, unsigned);
@@ -88,13 +91,13 @@ struct vmi_timer_ops vmi_timer_ops;
88#define IRQ_PATCH_DISABLE 5 91#define IRQ_PATCH_DISABLE 5
89 92
90static inline void patch_offset(void *insnbuf, 93static inline void patch_offset(void *insnbuf,
91 unsigned long eip, unsigned long dest) 94 unsigned long ip, unsigned long dest)
92{ 95{
93 *(unsigned long *)(insnbuf+1) = dest-eip-5; 96 *(unsigned long *)(insnbuf+1) = dest-ip-5;
94} 97}
95 98
96static unsigned patch_internal(int call, unsigned len, void *insnbuf, 99static unsigned patch_internal(int call, unsigned len, void *insnbuf,
97 unsigned long eip) 100 unsigned long ip)
98{ 101{
99 u64 reloc; 102 u64 reloc;
100 struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc; 103 struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
@@ -103,13 +106,13 @@ static unsigned patch_internal(int call, unsigned len, void *insnbuf,
103 case VMI_RELOCATION_CALL_REL: 106 case VMI_RELOCATION_CALL_REL:
104 BUG_ON(len < 5); 107 BUG_ON(len < 5);
105 *(char *)insnbuf = MNEM_CALL; 108 *(char *)insnbuf = MNEM_CALL;
106 patch_offset(insnbuf, eip, (unsigned long)rel->eip); 109 patch_offset(insnbuf, ip, (unsigned long)rel->eip);
107 return 5; 110 return 5;
108 111
109 case VMI_RELOCATION_JUMP_REL: 112 case VMI_RELOCATION_JUMP_REL:
110 BUG_ON(len < 5); 113 BUG_ON(len < 5);
111 *(char *)insnbuf = MNEM_JMP; 114 *(char *)insnbuf = MNEM_JMP;
112 patch_offset(insnbuf, eip, (unsigned long)rel->eip); 115 patch_offset(insnbuf, ip, (unsigned long)rel->eip);
113 return 5; 116 return 5;
114 117
115 case VMI_RELOCATION_NOP: 118 case VMI_RELOCATION_NOP:
@@ -131,25 +134,25 @@ static unsigned patch_internal(int call, unsigned len, void *insnbuf,
131 * sequence. The callee does nop padding for us. 134 * sequence. The callee does nop padding for us.
132 */ 135 */
133static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, 136static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
134 unsigned long eip, unsigned len) 137 unsigned long ip, unsigned len)
135{ 138{
136 switch (type) { 139 switch (type) {
137 case PARAVIRT_PATCH(pv_irq_ops.irq_disable): 140 case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
138 return patch_internal(VMI_CALL_DisableInterrupts, len, 141 return patch_internal(VMI_CALL_DisableInterrupts, len,
139 insns, eip); 142 insns, ip);
140 case PARAVIRT_PATCH(pv_irq_ops.irq_enable): 143 case PARAVIRT_PATCH(pv_irq_ops.irq_enable):
141 return patch_internal(VMI_CALL_EnableInterrupts, len, 144 return patch_internal(VMI_CALL_EnableInterrupts, len,
142 insns, eip); 145 insns, ip);
143 case PARAVIRT_PATCH(pv_irq_ops.restore_fl): 146 case PARAVIRT_PATCH(pv_irq_ops.restore_fl):
144 return patch_internal(VMI_CALL_SetInterruptMask, len, 147 return patch_internal(VMI_CALL_SetInterruptMask, len,
145 insns, eip); 148 insns, ip);
146 case PARAVIRT_PATCH(pv_irq_ops.save_fl): 149 case PARAVIRT_PATCH(pv_irq_ops.save_fl):
147 return patch_internal(VMI_CALL_GetInterruptMask, len, 150 return patch_internal(VMI_CALL_GetInterruptMask, len,
148 insns, eip); 151 insns, ip);
149 case PARAVIRT_PATCH(pv_cpu_ops.iret): 152 case PARAVIRT_PATCH(pv_cpu_ops.iret):
150 return patch_internal(VMI_CALL_IRET, len, insns, eip); 153 return patch_internal(VMI_CALL_IRET, len, insns, ip);
151 case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit): 154 case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret):
152 return patch_internal(VMI_CALL_SYSEXIT, len, insns, eip); 155 return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip);
153 default: 156 default:
154 break; 157 break;
155 } 158 }
@@ -157,36 +160,36 @@ static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
157} 160}
158 161
159/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */ 162/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */
160static void vmi_cpuid(unsigned int *eax, unsigned int *ebx, 163static void vmi_cpuid(unsigned int *ax, unsigned int *bx,
161 unsigned int *ecx, unsigned int *edx) 164 unsigned int *cx, unsigned int *dx)
162{ 165{
163 int override = 0; 166 int override = 0;
164 if (*eax == 1) 167 if (*ax == 1)
165 override = 1; 168 override = 1;
166 asm volatile ("call *%6" 169 asm volatile ("call *%6"
167 : "=a" (*eax), 170 : "=a" (*ax),
168 "=b" (*ebx), 171 "=b" (*bx),
169 "=c" (*ecx), 172 "=c" (*cx),
170 "=d" (*edx) 173 "=d" (*dx)
171 : "0" (*eax), "2" (*ecx), "r" (vmi_ops.cpuid)); 174 : "0" (*ax), "2" (*cx), "r" (vmi_ops.cpuid));
172 if (override) { 175 if (override) {
173 if (disable_pse) 176 if (disable_pse)
174 *edx &= ~X86_FEATURE_PSE; 177 *dx &= ~X86_FEATURE_PSE;
175 if (disable_pge) 178 if (disable_pge)
176 *edx &= ~X86_FEATURE_PGE; 179 *dx &= ~X86_FEATURE_PGE;
177 if (disable_sep) 180 if (disable_sep)
178 *edx &= ~X86_FEATURE_SEP; 181 *dx &= ~X86_FEATURE_SEP;
179 if (disable_tsc) 182 if (disable_tsc)
180 *edx &= ~X86_FEATURE_TSC; 183 *dx &= ~X86_FEATURE_TSC;
181 if (disable_mtrr) 184 if (disable_mtrr)
182 *edx &= ~X86_FEATURE_MTRR; 185 *dx &= ~X86_FEATURE_MTRR;
183 } 186 }
184} 187}
185 188
186static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new) 189static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
187{ 190{
188 if (gdt[nr].a != new->a || gdt[nr].b != new->b) 191 if (gdt[nr].a != new->a || gdt[nr].b != new->b)
189 write_gdt_entry(gdt, nr, new->a, new->b); 192 write_gdt_entry(gdt, nr, new, 0);
190} 193}
191 194
192static void vmi_load_tls(struct thread_struct *t, unsigned int cpu) 195static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
@@ -200,12 +203,12 @@ static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
200static void vmi_set_ldt(const void *addr, unsigned entries) 203static void vmi_set_ldt(const void *addr, unsigned entries)
201{ 204{
202 unsigned cpu = smp_processor_id(); 205 unsigned cpu = smp_processor_id();
203 u32 low, high; 206 struct desc_struct desc;
204 207
205 pack_descriptor(&low, &high, (unsigned long)addr, 208 pack_descriptor(&desc, (unsigned long)addr,
206 entries * sizeof(struct desc_struct) - 1, 209 entries * sizeof(struct desc_struct) - 1,
207 DESCTYPE_LDT, 0); 210 DESC_LDT, 0);
208 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, low, high); 211 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, &desc, DESC_LDT);
209 vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0); 212 vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0);
210} 213}
211 214
@@ -214,17 +217,37 @@ static void vmi_set_tr(void)
214 vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct)); 217 vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
215} 218}
216 219
217static void vmi_load_esp0(struct tss_struct *tss, 220static void vmi_write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
221{
222 u32 *idt_entry = (u32 *)g;
223 vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[1]);
224}
225
226static void vmi_write_gdt_entry(struct desc_struct *dt, int entry,
227 const void *desc, int type)
228{
229 u32 *gdt_entry = (u32 *)desc;
230 vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[1]);
231}
232
233static void vmi_write_ldt_entry(struct desc_struct *dt, int entry,
234 const void *desc)
235{
236 u32 *ldt_entry = (u32 *)desc;
237 vmi_ops.write_idt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
238}
239
240static void vmi_load_sp0(struct tss_struct *tss,
218 struct thread_struct *thread) 241 struct thread_struct *thread)
219{ 242{
220 tss->x86_tss.esp0 = thread->esp0; 243 tss->x86_tss.sp0 = thread->sp0;
221 244
222 /* This can only happen when SEP is enabled, no need to test "SEP"arately */ 245 /* This can only happen when SEP is enabled, no need to test "SEP"arately */
223 if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { 246 if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
224 tss->x86_tss.ss1 = thread->sysenter_cs; 247 tss->x86_tss.ss1 = thread->sysenter_cs;
225 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); 248 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
226 } 249 }
227 vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.esp0); 250 vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.sp0);
228} 251}
229 252
230static void vmi_flush_tlb_user(void) 253static void vmi_flush_tlb_user(void)
@@ -375,7 +398,7 @@ static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn)
375 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); 398 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
376} 399}
377 400
378static void vmi_allocate_pd(u32 pfn) 401static void vmi_allocate_pd(struct mm_struct *mm, u32 pfn)
379{ 402{
380 /* 403 /*
381 * This call comes in very early, before mem_map is setup. 404 * This call comes in very early, before mem_map is setup.
@@ -452,7 +475,7 @@ static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep
452static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval) 475static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
453{ 476{
454#ifdef CONFIG_X86_PAE 477#ifdef CONFIG_X86_PAE
455 const pte_t pte = { pmdval.pmd, pmdval.pmd >> 32 }; 478 const pte_t pte = { .pte = pmdval.pmd };
456 vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD); 479 vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD);
457#else 480#else
458 const pte_t pte = { pmdval.pud.pgd.pgd }; 481 const pte_t pte = { pmdval.pud.pgd.pgd };
@@ -485,21 +508,21 @@ static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t
485static void vmi_set_pud(pud_t *pudp, pud_t pudval) 508static void vmi_set_pud(pud_t *pudp, pud_t pudval)
486{ 509{
487 /* Um, eww */ 510 /* Um, eww */
488 const pte_t pte = { pudval.pgd.pgd, pudval.pgd.pgd >> 32 }; 511 const pte_t pte = { .pte = pudval.pgd.pgd };
489 vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD); 512 vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD);
490 vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP); 513 vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
491} 514}
492 515
493static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 516static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
494{ 517{
495 const pte_t pte = { 0 }; 518 const pte_t pte = { .pte = 0 };
496 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); 519 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
497 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); 520 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
498} 521}
499 522
500static void vmi_pmd_clear(pmd_t *pmd) 523static void vmi_pmd_clear(pmd_t *pmd)
501{ 524{
502 const pte_t pte = { 0 }; 525 const pte_t pte = { .pte = 0 };
503 vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD); 526 vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD);
504 vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD); 527 vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
505} 528}
@@ -790,10 +813,13 @@ static inline int __init activate_vmi(void)
790 para_fill(pv_cpu_ops.store_idt, GetIDT); 813 para_fill(pv_cpu_ops.store_idt, GetIDT);
791 para_fill(pv_cpu_ops.store_tr, GetTR); 814 para_fill(pv_cpu_ops.store_tr, GetTR);
792 pv_cpu_ops.load_tls = vmi_load_tls; 815 pv_cpu_ops.load_tls = vmi_load_tls;
793 para_fill(pv_cpu_ops.write_ldt_entry, WriteLDTEntry); 816 para_wrap(pv_cpu_ops.write_ldt_entry, vmi_write_ldt_entry,
794 para_fill(pv_cpu_ops.write_gdt_entry, WriteGDTEntry); 817 write_ldt_entry, WriteLDTEntry);
795 para_fill(pv_cpu_ops.write_idt_entry, WriteIDTEntry); 818 para_wrap(pv_cpu_ops.write_gdt_entry, vmi_write_gdt_entry,
796 para_wrap(pv_cpu_ops.load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack); 819 write_gdt_entry, WriteGDTEntry);
820 para_wrap(pv_cpu_ops.write_idt_entry, vmi_write_idt_entry,
821 write_idt_entry, WriteIDTEntry);
822 para_wrap(pv_cpu_ops.load_sp0, vmi_load_sp0, set_kernel_stack, UpdateKernelStack);
797 para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask); 823 para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
798 para_fill(pv_cpu_ops.io_delay, IODelay); 824 para_fill(pv_cpu_ops.io_delay, IODelay);
799 825
@@ -870,7 +896,7 @@ static inline int __init activate_vmi(void)
870 * the backend. They are performance critical anyway, so requiring 896 * the backend. They are performance critical anyway, so requiring
871 * a patch is not a big problem. 897 * a patch is not a big problem.
872 */ 898 */
873 pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0; 899 pv_cpu_ops.irq_enable_syscall_ret = (void *)0xfeedbab0;
874 pv_cpu_ops.iret = (void *)0xbadbab0; 900 pv_cpu_ops.iret = (void *)0xbadbab0;
875 901
876#ifdef CONFIG_SMP 902#ifdef CONFIG_SMP
@@ -963,19 +989,19 @@ static int __init parse_vmi(char *arg)
963 return -EINVAL; 989 return -EINVAL;
964 990
965 if (!strcmp(arg, "disable_pge")) { 991 if (!strcmp(arg, "disable_pge")) {
966 clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); 992 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
967 disable_pge = 1; 993 disable_pge = 1;
968 } else if (!strcmp(arg, "disable_pse")) { 994 } else if (!strcmp(arg, "disable_pse")) {
969 clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); 995 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PSE);
970 disable_pse = 1; 996 disable_pse = 1;
971 } else if (!strcmp(arg, "disable_sep")) { 997 } else if (!strcmp(arg, "disable_sep")) {
972 clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability); 998 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP);
973 disable_sep = 1; 999 disable_sep = 1;
974 } else if (!strcmp(arg, "disable_tsc")) { 1000 } else if (!strcmp(arg, "disable_tsc")) {
975 clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability); 1001 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC);
976 disable_tsc = 1; 1002 disable_tsc = 1;
977 } else if (!strcmp(arg, "disable_mtrr")) { 1003 } else if (!strcmp(arg, "disable_mtrr")) {
978 clear_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability); 1004 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_MTRR);
979 disable_mtrr = 1; 1005 disable_mtrr = 1;
980 } else if (!strcmp(arg, "disable_timer")) { 1006 } else if (!strcmp(arg, "disable_timer")) {
981 disable_vmi_timer = 1; 1007 disable_vmi_timer = 1;
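Note on the vmi_32.c change above: patch_internal() rewrites a paravirt call site in place as a direct 5-byte CALL or JMP to the hypervisor-provided target, and patch_offset() stores a rel32 displacement measured from the end of the instruction. A small sketch of that encoding (0xe8 is the standard x86 CALL rel32 opcode; MNEM_CALL/MNEM_JMP are defined elsewhere in this file):

/* Emit "call rel32" into insnbuf, which will live at address 'ip'. */
static void emit_call_rel32(unsigned char *insnbuf,
                            unsigned long ip, unsigned long dest)
{
        insnbuf[0] = 0xe8;                      /* CALL rel32 */
        /* displacement is relative to the byte after the 5-byte insn */
        *(unsigned int *)(insnbuf + 1) = dest - ip - 5;
}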
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index b1b5ab08b26e..a2b030780aa9 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -35,7 +35,6 @@
35#include <asm/i8253.h> 35#include <asm/i8253.h>
36 36
37#include <irq_vectors.h> 37#include <irq_vectors.h>
38#include "io_ports.h"
39 38
40#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) 39#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
41#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) 40#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
@@ -238,7 +237,7 @@ static void __devinit vmi_time_init_clockevent(void)
238void __init vmi_time_init(void) 237void __init vmi_time_init(void)
239{ 238{
240 /* Disable PIT: BIOSes start PIT CH0 with 18.2Hz periodic. */ 239 /* Disable PIT: BIOSes start PIT CH0 with 18.2Hz periodic. */
241 outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ 240 outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
242 241
243 vmi_time_init_clockevent(); 242 vmi_time_init_clockevent();
244 setup_irq(0, &vmi_clock_action); 243 setup_irq(0, &vmi_clock_action);
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 7d72cce00529..f1148ac8abe3 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -8,12 +8,6 @@
8 * put it inside the section definition. 8 * put it inside the section definition.
9 */ 9 */
10 10
11/* Don't define absolute symbols until and unless you know that symbol
12 * value is should remain constant even if kernel image is relocated
13 * at run time. Absolute symbols are not relocated. If symbol value should
14 * change if kernel is relocated, make the symbol section relative and
15 * put it inside the section definition.
16 */
17#define LOAD_OFFSET __PAGE_OFFSET 11#define LOAD_OFFSET __PAGE_OFFSET
18 12
19#include <asm-generic/vmlinux.lds.h> 13#include <asm-generic/vmlinux.lds.h>
@@ -44,6 +38,8 @@ SECTIONS
44 38
45 /* read-only */ 39 /* read-only */
46 .text : AT(ADDR(.text) - LOAD_OFFSET) { 40 .text : AT(ADDR(.text) - LOAD_OFFSET) {
41 . = ALIGN(4096); /* not really needed, already page aligned */
42 *(.text.page_aligned)
47 TEXT_TEXT 43 TEXT_TEXT
48 SCHED_TEXT 44 SCHED_TEXT
49 LOCK_TEXT 45 LOCK_TEXT
@@ -131,10 +127,12 @@ SECTIONS
131 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { 127 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
132 __init_begin = .; 128 __init_begin = .;
133 _sinittext = .; 129 _sinittext = .;
134 *(.init.text) 130 INIT_TEXT
135 _einittext = .; 131 _einittext = .;
136 } 132 }
137 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) } 133 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
134 INIT_DATA
135 }
138 . = ALIGN(16); 136 . = ALIGN(16);
139 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { 137 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
140 __setup_start = .; 138 __setup_start = .;
@@ -169,8 +167,12 @@ SECTIONS
169 } 167 }
170 /* .exit.text is discarded at runtime, not link time, to deal with references 168 /* .exit.text is discarded at runtime, not link time, to deal with references
171 from .altinstructions and .eh_frame */ 169 from .altinstructions and .eh_frame */
172 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } 170 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
173 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } 171 EXIT_TEXT
172 }
173 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
174 EXIT_DATA
175 }
174#if defined(CONFIG_BLK_DEV_INITRD) 176#if defined(CONFIG_BLK_DEV_INITRD)
175 . = ALIGN(4096); 177 . = ALIGN(4096);
176 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { 178 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
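Note on the vmlinux_32.lds.S change above: the literal *(.init.text) / *(.init.data) / *(.exit.*) input-section lists are replaced by the INIT_TEXT, INIT_DATA, EXIT_TEXT and EXIT_DATA macros from asm-generic/vmlinux.lds.h, so every architecture collects the same set of init/exit sections in one place. Roughly, and as an assumption about that header rather than something shown in this patch, INIT_TEXT expands along these lines:

#define INIT_TEXT                       \
        *(.init.text)                   \
        DEV_DISCARD(init.text)          \
        CPU_DISCARD(init.text)          \
        MEM_DISCARD(init.text)

where the *_DISCARD() helpers fold the .devinit/.cpuinit/.meminit text into the freeable init area only when the corresponding hotplug support is configured out.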
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index ba8ea97abd21..0992b9946c6f 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -37,16 +37,15 @@ SECTIONS
37 KPROBES_TEXT 37 KPROBES_TEXT
38 *(.fixup) 38 *(.fixup)
39 *(.gnu.warning) 39 *(.gnu.warning)
40 } :text = 0x9090 40 _etext = .; /* End of text section */
41 /* out-of-line lock text */ 41 } :text = 0x9090
42 .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) }
43
44 _etext = .; /* End of text section */
45 42
46 . = ALIGN(16); /* Exception table */ 43 . = ALIGN(16); /* Exception table */
47 __start___ex_table = .; 44 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
48 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } 45 __start___ex_table = .;
49 __stop___ex_table = .; 46 *(__ex_table)
47 __stop___ex_table = .;
48 }
50 49
51 NOTES :text :note 50 NOTES :text :note
52 51
@@ -155,12 +154,15 @@ SECTIONS
155 __init_begin = .; 154 __init_begin = .;
156 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { 155 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
157 _sinittext = .; 156 _sinittext = .;
158 *(.init.text) 157 INIT_TEXT
159 _einittext = .; 158 _einittext = .;
160 } 159 }
161 __initdata_begin = .; 160 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
162 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) } 161 __initdata_begin = .;
163 __initdata_end = .; 162 INIT_DATA
163 __initdata_end = .;
164 }
165
164 . = ALIGN(16); 166 . = ALIGN(16);
165 __setup_start = .; 167 __setup_start = .;
166 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) } 168 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) }
@@ -176,6 +178,14 @@ SECTIONS
176 } 178 }
177 __con_initcall_end = .; 179 __con_initcall_end = .;
178 SECURITY_INIT 180 SECURITY_INIT
181
182 . = ALIGN(8);
183 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
184 __parainstructions = .;
185 *(.parainstructions)
186 __parainstructions_end = .;
187 }
188
179 . = ALIGN(8); 189 . = ALIGN(8);
180 __alt_instructions = .; 190 __alt_instructions = .;
181 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { 191 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
@@ -187,8 +197,12 @@ SECTIONS
187 } 197 }
188 /* .exit.text is discard at runtime, not link time, to deal with references 198 /* .exit.text is discard at runtime, not link time, to deal with references
189 from .altinstructions and .eh_frame */ 199 from .altinstructions and .eh_frame */
190 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } 200 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
191 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } 201 EXIT_TEXT
202 }
203 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
204 EXIT_DATA
205 }
192 206
193/* vdso blob that is mapped into user space */ 207/* vdso blob that is mapped into user space */
194 vdso_start = . ; 208 vdso_start = . ;
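Beyond the same INIT_TEXT/EXIT_TEXT conversion, the 64-bit script moves _etext and the __start___ex_table/__stop___ex_table markers inside their output-section definitions, making them section-relative so they stay correct if the image is relocated (the rationale spelled out in the comment at the top of the 32-bit script), and it adds a .parainstructions section to collect paravirt patch sites. A small sketch of why the bracketing symbols matter; this is roughly how a consumer walks the region the script delimits (struct layout shown as on x86-64 of this era, for illustration only):

/* Walk the table that the linker script brackets with
 * __start___ex_table/__stop___ex_table and return the fixup
 * address for a faulting instruction, or 0 if none matches. */
struct exception_table_entry {
	unsigned long insn, fixup;
};

extern struct exception_table_entry __start___ex_table[];
extern struct exception_table_entry __stop___ex_table[];

static unsigned long demo_search_extable(unsigned long faulting_insn)
{
	struct exception_table_entry *e;

	for (e = __start___ex_table; e < __stop___ex_table; e++)
		if (e->insn == faulting_insn)
			return e->fixup;
	return 0;
}

(The kernel's real search_extable() binary-searches a sorted table; a linear scan keeps the sketch short.)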
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 414caf0c5f9a..d971210a6d36 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -25,21 +25,24 @@ static int __init vsmp_init(void)
25 return 0; 25 return 0;
26 26
27 /* Check if we are running on a ScaleMP vSMP box */ 27 /* Check if we are running on a ScaleMP vSMP box */
28 if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) != PCI_VENDOR_ID_SCALEMP) || 28 if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) !=
29 (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != PCI_DEVICE_ID_SCALEMP_VSMP_CTL)) 29 PCI_VENDOR_ID_SCALEMP) ||
30 (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) !=
31 PCI_DEVICE_ID_SCALEMP_VSMP_CTL))
30 return 0; 32 return 0;
31 33
32 /* set vSMP magic bits to indicate vSMP capable kernel */ 34 /* set vSMP magic bits to indicate vSMP capable kernel */
33 address = ioremap(read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0), 8); 35 address = ioremap(read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0), 8);
34 cap = readl(address); 36 cap = readl(address);
35 ctl = readl(address + 4); 37 ctl = readl(address + 4);
36 printk("vSMP CTL: capabilities:0x%08x control:0x%08x\n", cap, ctl); 38 printk(KERN_INFO "vSMP CTL: capabilities:0x%08x control:0x%08x\n",
39 cap, ctl);
37 if (cap & ctl & (1 << 4)) { 40 if (cap & ctl & (1 << 4)) {
38 /* Turn on vSMP IRQ fastpath handling (see system.h) */ 41 /* Turn on vSMP IRQ fastpath handling (see system.h) */
39 ctl &= ~(1 << 4); 42 ctl &= ~(1 << 4);
40 writel(ctl, address + 4); 43 writel(ctl, address + 4);
41 ctl = readl(address + 4); 44 ctl = readl(address + 4);
42 printk("vSMP CTL: control set to:0x%08x\n", ctl); 45 printk(KERN_INFO "vSMP CTL: control set to:0x%08x\n", ctl);
43 } 46 }
44 47
45 iounmap(address); 48 iounmap(address);
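The vsmp_64.c changes are cosmetic: the long conditional and the printk() calls are wrapped to fit the line-length limit, and both messages gain an explicit KERN_INFO log level. For reference, a minimal sketch of the same MMIO pattern the function uses (map a BAR, read-modify-write a 32-bit control word, log at KERN_INFO); the offset and bit number here are illustrative, not ScaleMP's actual layout:

#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/io.h>

static void demo_clear_ctl_bit(unsigned long phys, int bit)
{
	void __iomem *base = ioremap(phys, 8);
	u32 ctl;

	if (!base)
		return;

	ctl = readl(base + 4);		/* control word at offset 4 */
	ctl &= ~(1u << bit);
	writel(ctl, base + 4);
	printk(KERN_INFO "demo: control set to 0x%08x\n", readl(base + 4));
	iounmap(base);
}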
diff --git a/arch/x86/kernel/vsyscall-int80_32.S b/arch/x86/kernel/vsyscall-int80_32.S
deleted file mode 100644
index 103cab6aa7c0..000000000000
--- a/arch/x86/kernel/vsyscall-int80_32.S
+++ /dev/null
@@ -1,53 +0,0 @@
1/*
2 * Code for the vsyscall page. This version uses the old int $0x80 method.
3 *
4 * NOTE:
5 * 1) __kernel_vsyscall _must_ be first in this page.
6 * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S
7 * for details.
8 */
9
10 .text
11 .globl __kernel_vsyscall
12 .type __kernel_vsyscall,@function
13__kernel_vsyscall:
14.LSTART_vsyscall:
15 int $0x80
16 ret
17.LEND_vsyscall:
18 .size __kernel_vsyscall,.-.LSTART_vsyscall
19 .previous
20
21 .section .eh_frame,"a",@progbits
22.LSTARTFRAMEDLSI:
23 .long .LENDCIEDLSI-.LSTARTCIEDLSI
24.LSTARTCIEDLSI:
25 .long 0 /* CIE ID */
26 .byte 1 /* Version number */
27 .string "zR" /* NUL-terminated augmentation string */
28 .uleb128 1 /* Code alignment factor */
29 .sleb128 -4 /* Data alignment factor */
30 .byte 8 /* Return address register column */
31 .uleb128 1 /* Augmentation value length */
32 .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
33 .byte 0x0c /* DW_CFA_def_cfa */
34 .uleb128 4
35 .uleb128 4
36 .byte 0x88 /* DW_CFA_offset, column 0x8 */
37 .uleb128 1
38 .align 4
39.LENDCIEDLSI:
40 .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */
41.LSTARTFDEDLSI:
42 .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */
43 .long .LSTART_vsyscall-. /* PC-relative start address */
44 .long .LEND_vsyscall-.LSTART_vsyscall
45 .uleb128 0
46 .align 4
47.LENDFDEDLSI:
48 .previous
49
50/*
51 * Get the common code for the sigreturn entry points.
52 */
53#include "vsyscall-sigreturn_32.S"
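The deleted vsyscall-int80_32.S provided the int $0x80 flavour of the 32-bit vsyscall page: the stub itself is just "int $0x80; ret", and the bulk of the file is DWARF unwind data for it. For orientation, a user-space sketch of the convention that stub wrapped on i386: the syscall number goes in %eax, arguments (if any) in %ebx, %ecx, %edx, %esi, %edi, %ebp, and the result comes back in %eax. getpid takes no arguments, which keeps the example short; this only assembles for 32-bit x86:

#include <stdio.h>
#include <sys/syscall.h>

int main(void)
{
	long pid;

	/* Equivalent of what __kernel_vsyscall did in the int80 variant. */
	asm volatile("int $0x80" : "=a" (pid) : "0" (SYS_getpid));
	printf("pid = %ld\n", pid);
	return 0;
}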
diff --git a/arch/x86/kernel/vsyscall-note_32.S b/arch/x86/kernel/vsyscall-note_32.S
deleted file mode 100644
index fcf376a37f79..000000000000
--- a/arch/x86/kernel/vsyscall-note_32.S
+++ /dev/null
@@ -1,45 +0,0 @@
1/*
2 * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text.
3 * Here we can supply some information useful to userland.
4 */
5
6#include <linux/version.h>
7#include <linux/elfnote.h>
8
9/* Ideally this would use UTS_NAME, but using a quoted string here
10 doesn't work. Remember to change this when changing the
11 kernel's name. */
12ELFNOTE_START(Linux, 0, "a")
13 .long LINUX_VERSION_CODE
14ELFNOTE_END
15
16#ifdef CONFIG_XEN
17/*
18 * Add a special note telling glibc's dynamic linker a fake hardware
19 * flavor that it will use to choose the search path for libraries in the
20 * same way it uses real hardware capabilities like "mmx".
21 * We supply "nosegneg" as the fake capability, to indicate that we
22 * do not like negative offsets in instructions using segment overrides,
23 * since we implement those inefficiently. This makes it possible to
24 * install libraries optimized to avoid those access patterns in someplace
25 * like /lib/i686/tls/nosegneg. Note that an /etc/ld.so.conf.d/file
26 * corresponding to the bits here is needed to make ldconfig work right.
27 * It should contain:
28 * hwcap 1 nosegneg
29 * to match the mapping of bit to name that we give here.
30 *
31 * At runtime, the fake hardware feature will be considered to be present
32 * if its bit is set in the mask word. So, we start with the mask 0, and
33 * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen.
34 */
35
36#include "../../x86/xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */
37
38 .globl VDSO_NOTE_MASK
39ELFNOTE_START(GNU, 2, "a")
40 .long 1 /* ncaps */
41VDSO_NOTE_MASK:
42 .long 0 /* mask */
43 .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */
44ELFNOTE_END
45#endif
diff --git a/arch/x86/kernel/vsyscall-sigreturn_32.S b/arch/x86/kernel/vsyscall-sigreturn_32.S
deleted file mode 100644
index a92262f41659..000000000000
--- a/arch/x86/kernel/vsyscall-sigreturn_32.S
+++ /dev/null
@@ -1,143 +0,0 @@
1/*
2 * Common code for the sigreturn entry points on the vsyscall page.
3 * So far this code is the same for both int80 and sysenter versions.
4 * This file is #include'd by vsyscall-*.S to define them after the
5 * vsyscall entry point. The kernel assumes that the addresses of these
6 * routines are constant for all vsyscall implementations.
7 */
8
9#include <asm/unistd.h>
10#include <asm/asm-offsets.h>
11
12
13/* XXX
14 Should these be named "_sigtramp" or something?
15*/
16
17 .text
18 .org __kernel_vsyscall+32,0x90
19 .globl __kernel_sigreturn
20 .type __kernel_sigreturn,@function
21__kernel_sigreturn:
22.LSTART_sigreturn:
23 popl %eax /* XXX does this mean it needs unwind info? */
24 movl $__NR_sigreturn, %eax
25 int $0x80
26.LEND_sigreturn:
27 .size __kernel_sigreturn,.-.LSTART_sigreturn
28
29 .balign 32
30 .globl __kernel_rt_sigreturn
31 .type __kernel_rt_sigreturn,@function
32__kernel_rt_sigreturn:
33.LSTART_rt_sigreturn:
34 movl $__NR_rt_sigreturn, %eax
35 int $0x80
36.LEND_rt_sigreturn:
37 .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
38 .balign 32
39 .previous
40
41 .section .eh_frame,"a",@progbits
42.LSTARTFRAMEDLSI1:
43 .long .LENDCIEDLSI1-.LSTARTCIEDLSI1
44.LSTARTCIEDLSI1:
45 .long 0 /* CIE ID */
46 .byte 1 /* Version number */
47 .string "zRS" /* NUL-terminated augmentation string */
48 .uleb128 1 /* Code alignment factor */
49 .sleb128 -4 /* Data alignment factor */
50 .byte 8 /* Return address register column */
51 .uleb128 1 /* Augmentation value length */
52 .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
53 .byte 0 /* DW_CFA_nop */
54 .align 4
55.LENDCIEDLSI1:
56 .long .LENDFDEDLSI1-.LSTARTFDEDLSI1 /* Length FDE */
57.LSTARTFDEDLSI1:
58 .long .LSTARTFDEDLSI1-.LSTARTFRAMEDLSI1 /* CIE pointer */
59 /* HACK: The dwarf2 unwind routines will subtract 1 from the
60 return address to get an address in the middle of the
61 presumed call instruction. Since we didn't get here via
62 a call, we need to include the nop before the real start
63 to make up for it. */
64 .long .LSTART_sigreturn-1-. /* PC-relative start address */
65 .long .LEND_sigreturn-.LSTART_sigreturn+1
66 .uleb128 0 /* Augmentation */
67 /* What follows are the instructions for the table generation.
68 We record the locations of each register saved. This is
69 complicated by the fact that the "CFA" is always assumed to
70 be the value of the stack pointer in the caller. This means
71 that we must define the CFA of this body of code to be the
72 saved value of the stack pointer in the sigcontext. Which
73 also means that there is no fixed relation to the other
74 saved registers, which means that we must use DW_CFA_expression
75 to compute their addresses. It also means that when we
76 adjust the stack with the popl, we have to do it all over again. */
77
78#define do_cfa_expr(offset) \
79 .byte 0x0f; /* DW_CFA_def_cfa_expression */ \
80 .uleb128 1f-0f; /* length */ \
810: .byte 0x74; /* DW_OP_breg4 */ \
82 .sleb128 offset; /* offset */ \
83 .byte 0x06; /* DW_OP_deref */ \
841:
85
86#define do_expr(regno, offset) \
87 .byte 0x10; /* DW_CFA_expression */ \
88 .uleb128 regno; /* regno */ \
89 .uleb128 1f-0f; /* length */ \
900: .byte 0x74; /* DW_OP_breg4 */ \
91 .sleb128 offset; /* offset */ \
921:
93
94 do_cfa_expr(SIGCONTEXT_esp+4)
95 do_expr(0, SIGCONTEXT_eax+4)
96 do_expr(1, SIGCONTEXT_ecx+4)
97 do_expr(2, SIGCONTEXT_edx+4)
98 do_expr(3, SIGCONTEXT_ebx+4)
99 do_expr(5, SIGCONTEXT_ebp+4)
100 do_expr(6, SIGCONTEXT_esi+4)
101 do_expr(7, SIGCONTEXT_edi+4)
102 do_expr(8, SIGCONTEXT_eip+4)
103
104 .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */
105
106 do_cfa_expr(SIGCONTEXT_esp)
107 do_expr(0, SIGCONTEXT_eax)
108 do_expr(1, SIGCONTEXT_ecx)
109 do_expr(2, SIGCONTEXT_edx)
110 do_expr(3, SIGCONTEXT_ebx)
111 do_expr(5, SIGCONTEXT_ebp)
112 do_expr(6, SIGCONTEXT_esi)
113 do_expr(7, SIGCONTEXT_edi)
114 do_expr(8, SIGCONTEXT_eip)
115
116 .align 4
117.LENDFDEDLSI1:
118
119 .long .LENDFDEDLSI2-.LSTARTFDEDLSI2 /* Length FDE */
120.LSTARTFDEDLSI2:
121 .long .LSTARTFDEDLSI2-.LSTARTFRAMEDLSI1 /* CIE pointer */
122 /* HACK: See above wrt unwind library assumptions. */
123 .long .LSTART_rt_sigreturn-1-. /* PC-relative start address */
124 .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1
125 .uleb128 0 /* Augmentation */
126 /* What follows are the instructions for the table generation.
127 We record the locations of each register saved. This is
128 slightly less complicated than the above, since we don't
129 modify the stack pointer in the process. */
130
131 do_cfa_expr(RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esp)
132 do_expr(0, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eax)
133 do_expr(1, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ecx)
134 do_expr(2, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edx)
135 do_expr(3, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebx)
136 do_expr(5, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebp)
137 do_expr(6, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esi)
138 do_expr(7, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edi)
139 do_expr(8, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eip)
140
141 .align 4
142.LENDFDEDLSI2:
143 .previous
diff --git a/arch/x86/kernel/vsyscall-sysenter_32.S b/arch/x86/kernel/vsyscall-sysenter_32.S
deleted file mode 100644
index ed879bf42995..000000000000
--- a/arch/x86/kernel/vsyscall-sysenter_32.S
+++ /dev/null
@@ -1,122 +0,0 @@
1/*
2 * Code for the vsyscall page. This version uses the sysenter instruction.
3 *
4 * NOTE:
5 * 1) __kernel_vsyscall _must_ be first in this page.
6 * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S
7 * for details.
8 */
9
10/*
11 * The caller puts arg2 in %ecx, which gets pushed. The kernel will use
12 * %ecx itself for arg2. The pushing is because the sysexit instruction
13 * (found in entry.S) requires that we clobber %ecx with the desired %esp.
14 * User code might expect that %ecx is unclobbered though, as it would be
15 * for returning via the iret instruction, so we must push and pop.
16 *
17 * The caller puts arg3 in %edx, which the sysexit instruction requires
18 * for %eip. Thus, exactly as for arg2, we must push and pop.
19 *
20 * Arg6 is different. The caller puts arg6 in %ebp. Since the sysenter
21 * instruction clobbers %esp, the user's %esp won't even survive entry
22 * into the kernel. We store %esp in %ebp. Code in entry.S must fetch
23 * arg6 from the stack.
24 *
25 * You can not use this vsyscall for the clone() syscall because the
26 * three dwords on the parent stack do not get copied to the child.
27 */
28 .text
29 .globl __kernel_vsyscall
30 .type __kernel_vsyscall,@function
31__kernel_vsyscall:
32.LSTART_vsyscall:
33 push %ecx
34.Lpush_ecx:
35 push %edx
36.Lpush_edx:
37 push %ebp
38.Lenter_kernel:
39 movl %esp,%ebp
40 sysenter
41
42 /* 7: align return point with nop's to make disassembly easier */
43 .space 7,0x90
44
45 /* 14: System call restart point is here! (SYSENTER_RETURN-2) */
46 jmp .Lenter_kernel
47 /* 16: System call normal return point is here! */
48 .globl SYSENTER_RETURN /* Symbol used by sysenter.c */
49SYSENTER_RETURN:
50 pop %ebp
51.Lpop_ebp:
52 pop %edx
53.Lpop_edx:
54 pop %ecx
55.Lpop_ecx:
56 ret
57.LEND_vsyscall:
58 .size __kernel_vsyscall,.-.LSTART_vsyscall
59 .previous
60
61 .section .eh_frame,"a",@progbits
62.LSTARTFRAMEDLSI:
63 .long .LENDCIEDLSI-.LSTARTCIEDLSI
64.LSTARTCIEDLSI:
65 .long 0 /* CIE ID */
66 .byte 1 /* Version number */
67 .string "zR" /* NUL-terminated augmentation string */
68 .uleb128 1 /* Code alignment factor */
69 .sleb128 -4 /* Data alignment factor */
70 .byte 8 /* Return address register column */
71 .uleb128 1 /* Augmentation value length */
72 .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
73 .byte 0x0c /* DW_CFA_def_cfa */
74 .uleb128 4
75 .uleb128 4
76 .byte 0x88 /* DW_CFA_offset, column 0x8 */
77 .uleb128 1
78 .align 4
79.LENDCIEDLSI:
80 .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */
81.LSTARTFDEDLSI:
82 .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */
83 .long .LSTART_vsyscall-. /* PC-relative start address */
84 .long .LEND_vsyscall-.LSTART_vsyscall
85 .uleb128 0
86 /* What follows are the instructions for the table generation.
87 We have to record all changes of the stack pointer. */
88 .byte 0x04 /* DW_CFA_advance_loc4 */
89 .long .Lpush_ecx-.LSTART_vsyscall
90 .byte 0x0e /* DW_CFA_def_cfa_offset */
91 .byte 0x08 /* RA at offset 8 now */
92 .byte 0x04 /* DW_CFA_advance_loc4 */
93 .long .Lpush_edx-.Lpush_ecx
94 .byte 0x0e /* DW_CFA_def_cfa_offset */
95 .byte 0x0c /* RA at offset 12 now */
96 .byte 0x04 /* DW_CFA_advance_loc4 */
97 .long .Lenter_kernel-.Lpush_edx
98 .byte 0x0e /* DW_CFA_def_cfa_offset */
99 .byte 0x10 /* RA at offset 16 now */
100 .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */
101 /* Finally the epilogue. */
102 .byte 0x04 /* DW_CFA_advance_loc4 */
103 .long .Lpop_ebp-.Lenter_kernel
104 .byte 0x0e /* DW_CFA_def_cfa_offset */
105 .byte 0x0c /* RA at offset 12 now */
106 .byte 0xc5 /* DW_CFA_restore %ebp */
107 .byte 0x04 /* DW_CFA_advance_loc4 */
108 .long .Lpop_edx-.Lpop_ebp
109 .byte 0x0e /* DW_CFA_def_cfa_offset */
110 .byte 0x08 /* RA at offset 8 now */
111 .byte 0x04 /* DW_CFA_advance_loc4 */
112 .long .Lpop_ecx-.Lpop_edx
113 .byte 0x0e /* DW_CFA_def_cfa_offset */
114 .byte 0x04 /* RA at offset 4 now */
115 .align 4
116.LENDFDEDLSI:
117 .previous
118
119/*
120 * Get the common code for the sigreturn entry points.
121 */
122#include "vsyscall-sigreturn_32.S"
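The header comment of the deleted sysenter variant explains the calling convention: %ecx and %edx are pushed because sysexit consumes them for the return %esp/%eip, and %ebp (which carried arg6) is pushed because sysenter destroys %esp, so the user stack pointer is stashed in %ebp and entry.S fetches arg6 from the stack. User space does not pick between the int80 and sysenter stubs itself; it calls whichever __kernel_vsyscall the kernel exposed, typically located through the AT_SYSINFO auxv entry. A hedged, illustrative user-space sketch of that lookup (getauxval() is a later glibc convenience, used here only to keep the example short; 32-bit x86 only):

#include <stdio.h>
#include <sys/auxv.h>
#include <sys/syscall.h>

int main(void)
{
	unsigned long vsyscall = getauxval(AT_SYSINFO);
	long pid;

	if (!vsyscall)
		return 1;	/* no vsyscall entry advertised */

	/* Syscall number in %eax, call through __kernel_vsyscall. */
	asm volatile("call *%1"
		     : "=a" (pid)
		     : "r" (vsyscall), "0" (SYS_getpid)
		     : "memory");
	printf("pid = %ld\n", pid);
	return 0;
}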
diff --git a/arch/x86/kernel/vsyscall_32.S b/arch/x86/kernel/vsyscall_32.S
deleted file mode 100644
index a5ab3dc4fd25..000000000000
--- a/arch/x86/kernel/vsyscall_32.S
+++ /dev/null
@@ -1,15 +0,0 @@
1#include <linux/init.h>
2
3__INITDATA
4
5 .globl vsyscall_int80_start, vsyscall_int80_end
6vsyscall_int80_start:
7 .incbin "arch/x86/kernel/vsyscall-int80_32.so"
8vsyscall_int80_end:
9
10 .globl vsyscall_sysenter_start, vsyscall_sysenter_end
11vsyscall_sysenter_start:
12 .incbin "arch/x86/kernel/vsyscall-sysenter_32.so"
13vsyscall_sysenter_end:
14
15__FINIT
diff --git a/arch/x86/kernel/vsyscall_32.lds.S b/arch/x86/kernel/vsyscall_32.lds.S
deleted file mode 100644
index 4a8b0ed9b8fb..000000000000
--- a/arch/x86/kernel/vsyscall_32.lds.S
+++ /dev/null
@@ -1,67 +0,0 @@
1/*
2 * Linker script for vsyscall DSO. The vsyscall page is an ELF shared
3 * object prelinked to its virtual address, and with only one read-only
4 * segment (that fits in one page). This script controls its layout.
5 */
6#include <asm/asm-offsets.h>
7
8SECTIONS
9{
10 . = VDSO_PRELINK_asm + SIZEOF_HEADERS;
11
12 .hash : { *(.hash) } :text
13 .gnu.hash : { *(.gnu.hash) }
14 .dynsym : { *(.dynsym) }
15 .dynstr : { *(.dynstr) }
16 .gnu.version : { *(.gnu.version) }
17 .gnu.version_d : { *(.gnu.version_d) }
18 .gnu.version_r : { *(.gnu.version_r) }
19
20 /* This linker script is used both with -r and with -shared.
21 For the layouts to match, we need to skip more than enough
22 space for the dynamic symbol table et al. If this amount
23 is insufficient, ld -shared will barf. Just increase it here. */
24 . = VDSO_PRELINK_asm + 0x400;
25
26 .text : { *(.text) } :text =0x90909090
27 .note : { *(.note.*) } :text :note
28 .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
29 .eh_frame : { KEEP (*(.eh_frame)) } :text
30 .dynamic : { *(.dynamic) } :text :dynamic
31 .useless : {
32 *(.got.plt) *(.got)
33 *(.data .data.* .gnu.linkonce.d.*)
34 *(.dynbss)
35 *(.bss .bss.* .gnu.linkonce.b.*)
36 } :text
37}
38
39/*
40 * We must supply the ELF program headers explicitly to get just one
41 * PT_LOAD segment, and set the flags explicitly to make segments read-only.
42 */
43PHDRS
44{
45 text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
46 dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
47 note PT_NOTE FLAGS(4); /* PF_R */
48 eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */
49}
50
51/*
52 * This controls what symbols we export from the DSO.
53 */
54VERSION
55{
56 LINUX_2.5 {
57 global:
58 __kernel_vsyscall;
59 __kernel_sigreturn;
60 __kernel_rt_sigreturn;
61
62 local: *;
63 };
64}
65
66/* The ELF entry point can be used to set the AT_SYSINFO value. */
67ENTRY(__kernel_vsyscall);
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index ad4005c6d4a1..3f8242774580 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -43,7 +43,7 @@
43#include <asm/vgtod.h> 43#include <asm/vgtod.h>
44 44
45#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) 45#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
46#define __syscall_clobber "r11","rcx","memory" 46#define __syscall_clobber "r11","cx","memory"
47#define __pa_vsymbol(x) \ 47#define __pa_vsymbol(x) \
48 ({unsigned long v; \ 48 ({unsigned long v; \
49 extern char __vsyscall_0; \ 49 extern char __vsyscall_0; \
@@ -190,7 +190,7 @@ time_t __vsyscall(1) vtime(time_t *t)
190long __vsyscall(2) 190long __vsyscall(2)
191vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) 191vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
192{ 192{
193 unsigned int dummy, p; 193 unsigned int p;
194 unsigned long j = 0; 194 unsigned long j = 0;
195 195
196 /* Fast cache - only recompute value once per jiffies and avoid 196 /* Fast cache - only recompute value once per jiffies and avoid
@@ -205,7 +205,7 @@ vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
205 p = tcache->blob[1]; 205 p = tcache->blob[1];
206 } else if (__vgetcpu_mode == VGETCPU_RDTSCP) { 206 } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
207 /* Load per CPU data from RDTSCP */ 207 /* Load per CPU data from RDTSCP */
208 rdtscp(dummy, dummy, p); 208 native_read_tscp(&p);
209 } else { 209 } else {
210 /* Load per CPU data from GDT */ 210 /* Load per CPU data from GDT */
211 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); 211 asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
@@ -297,7 +297,7 @@ static void __cpuinit vsyscall_set_cpu(int cpu)
297 /* Store cpu number in limit so that it can be loaded quickly 297 /* Store cpu number in limit so that it can be loaded quickly
298 in user space in vgetcpu. 298 in user space in vgetcpu.
299 12 bits for the CPU and 8 bits for the node. */ 299 12 bits for the CPU and 8 bits for the node. */
300 d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU); 300 d = (unsigned long *)(get_cpu_gdt_table(cpu) + GDT_ENTRY_PER_CPU);
301 *d = 0x0f40000000000ULL; 301 *d = 0x0f40000000000ULL;
302 *d |= cpu; 302 *d |= cpu;
303 *d |= (node & 0xf) << 12; 303 *d |= (node & 0xf) << 12;
@@ -319,7 +319,7 @@ cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
319 return NOTIFY_DONE; 319 return NOTIFY_DONE;
320} 320}
321 321
322static void __init map_vsyscall(void) 322void __init map_vsyscall(void)
323{ 323{
324 extern char __vsyscall_0; 324 extern char __vsyscall_0;
325 unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); 325 unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
@@ -335,7 +335,6 @@ static int __init vsyscall_init(void)
335 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); 335 BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
336 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); 336 BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
337 BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu)); 337 BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
338 map_vsyscall();
339#ifdef CONFIG_SYSCTL 338#ifdef CONFIG_SYSCTL
340 register_sysctl_table(kernel_root_table2); 339 register_sysctl_table(kernel_root_table2);
341#endif 340#endif
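Three substantive changes in vsyscall_64.c: vgetcpu() now calls the native_read_tscp() helper instead of the open-coded rdtscp() macro (dropping the dummy variable), the GDT pointer is obtained via get_cpu_gdt_table() rather than cpu_gdt(), and map_vsyscall() is made non-static with its call removed from vsyscall_init(), so the mapping is now established by the caller elsewhere in setup. For readers following the vgetcpu() path, a sketch of what the helper amounts to: RDTSCP returns the TSC in EDX:EAX and, in ECX, the TSC_AUX MSR that vsyscall_set_cpu() packs with the CPU and node number (this mirrors the helper's shape; see asm/msr.h for the real definition):

static inline unsigned long long demo_read_tscp(unsigned int *aux)
{
	unsigned long low, high;

	/* RDTSCP: EDX:EAX = TSC, ECX = TSC_AUX (cpu/node cookie). */
	asm volatile("rdtscp" : "=a" (low), "=d" (high), "=c" (*aux));
	return low | (high << 32);
}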
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 77c25b307635..a66e9c1a0537 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -8,6 +8,7 @@
8#include <asm/processor.h> 8#include <asm/processor.h>
9#include <asm/uaccess.h> 9#include <asm/uaccess.h>
10#include <asm/pgtable.h> 10#include <asm/pgtable.h>
11#include <asm/desc.h>
11 12
12EXPORT_SYMBOL(kernel_thread); 13EXPORT_SYMBOL(kernel_thread);
13 14
@@ -34,13 +35,6 @@ EXPORT_SYMBOL(__copy_from_user_inatomic);
34EXPORT_SYMBOL(copy_page); 35EXPORT_SYMBOL(copy_page);
35EXPORT_SYMBOL(clear_page); 36EXPORT_SYMBOL(clear_page);
36 37
37#ifdef CONFIG_SMP
38extern void __write_lock_failed(rwlock_t *rw);
39extern void __read_lock_failed(rwlock_t *rw);
40EXPORT_SYMBOL(__write_lock_failed);
41EXPORT_SYMBOL(__read_lock_failed);
42#endif
43
44/* Export string functions. We normally rely on gcc builtin for most of these, 38/* Export string functions. We normally rely on gcc builtin for most of these,
45 but gcc sometimes decides not to inline them. */ 39 but gcc sometimes decides not to inline them. */
46#undef memcpy 40#undef memcpy
@@ -60,3 +54,8 @@ EXPORT_SYMBOL(init_level4_pgt);
60EXPORT_SYMBOL(load_gs_index); 54EXPORT_SYMBOL(load_gs_index);
61 55
62EXPORT_SYMBOL(_proxy_pda); 56EXPORT_SYMBOL(_proxy_pda);
57
58#ifdef CONFIG_PARAVIRT
59/* Virtualized guests may want to use it */
60EXPORT_SYMBOL_GPL(cpu_gdt_descr);
61#endif
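In x8664_ksyms_64.c the __write_lock_failed/__read_lock_failed exports are dropped, <asm/desc.h> is included, and cpu_gdt_descr is exported GPL-only so paravirtualized guests can reach it. The export macros are the whole mechanism here, so a minimal illustrative sketch (demo_counter is hypothetical): a symbol defined in the core kernel becomes resolvable by modules once it carries EXPORT_SYMBOL (any module) or EXPORT_SYMBOL_GPL (modules with a GPL-compatible MODULE_LICENSE only):

#include <linux/module.h>

int demo_counter;
EXPORT_SYMBOL_GPL(demo_counter);

/* In a module, the matching declaration is resolved against the
 * export at load time: */
extern int demo_counter;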